From 17d9ddc72fb8bba0d4f67868c9c612e472a594a9 Mon Sep 17 00:00:00 2001
From: "Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>
Date: Wed, 10 Feb 2010 15:23:44 -0800
Subject: rbtree: Add support for augmented rbtrees

Add support for augmented rbtrees in core rbtree code.

This will be used in subsequent patches, in x86 PAT code, which needs
interval trees to efficiently keep track of PAT ranges.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
LKML-Reference: <20100210232343.GA11465@linux-os.sc.intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/rbtree.txt | 58 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/rbtree.h   |  5 ++++-
 lib/rbtree.c             | 48 +++++++++++++++++++++++++++++++++++----
 3 files changed, 106 insertions(+), 5 deletions(-)

diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt
index aae8355d316..221f38be98f 100644
--- a/Documentation/rbtree.txt
+++ b/Documentation/rbtree.txt
@@ -190,3 +190,61 @@ Example:
   for (node = rb_first(&mytree); node; node = rb_next(node))
 	printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring);
 
+Support for Augmented rbtrees
+-----------------------------
+
+Augmented rbtree is an rbtree with "some" additional data stored in each node.
+This data can be used to augment some new functionality to rbtree.
+Augmented rbtree is an optional feature built on top of basic rbtree
+infrastructure. rbtree user who wants this feature will have an augment
+callback function in rb_root initialized.
+
+This callback function will be called from rbtree core routines whenever
+a node has a change in one or both of its children. It is the responsibility
+of the callback function to recalculate the additional data that is in the
+rb node using new children information. Note that if this new additional
+data affects the parent node's additional data, then callback function has
+to handle it and do the recursive updates.
+
+
+Interval tree is an example of augmented rb tree. Reference -
+"Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein.
+More details about interval trees:
+
+Classical rbtree has a single key and it cannot be directly used to store
+interval ranges like [lo:hi] and do a quick lookup for any overlap with a new
+lo:hi or to find whether there is an exact match for a new lo:hi.
+
+However, rbtree can be augmented to store such interval ranges in a structured
+way making it possible to do efficient lookup and exact match.
+
+This "extra information" stored in each node is the maximum hi
+(max_hi) value among all the nodes that are its descendents. This
+information can be maintained at each node just be looking at the node
+and its immediate children. And this will be used in O(log n) lookup
+for lowest match (lowest start address among all possible matches)
+with something like:
+
+find_lowest_match(lo, hi, node)
+{
+	lowest_match = NULL;
+	while (node) {
+		if (max_hi(node->left) > lo) {
+			// Lowest overlap if any must be on left side
+			node = node->left;
+		} else if (overlap(lo, hi, node)) {
+			lowest_match = node;
+			break;
+		} else if (lo > node->lo) {
+			// Lowest overlap if any must be on right side
+			node = node->right;
+		} else {
+			break;
+		}
+	}
+	return lowest_match;
+}
+
+Finding exact match will be to first find lowest match and then to follow
+successor nodes looking for exact match, until the start of a node is beyond
+the hi value we are looking for.
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 9c295411d01..8e33a256ea0 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -110,6 +110,7 @@ struct rb_node
 struct rb_root
 {
 	struct rb_node *rb_node;
+	void (*augment_cb)(struct rb_node *node);
 };
 
 
@@ -129,7 +130,9 @@ static inline void rb_set_color(struct rb_node *rb, int color)
 	rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
 }
 
-#define RB_ROOT	(struct rb_root) { NULL, }
+#define RB_ROOT	(struct rb_root) { NULL, NULL, }
+#define RB_AUGMENT_ROOT(x)	(struct rb_root) { NULL, x}
+
 #define	rb_entry(ptr, type, member) container_of(ptr, type, member)
 
 #define RB_EMPTY_ROOT(root)	((root)->rb_node == NULL)
diff --git a/lib/rbtree.c b/lib/rbtree.c
index e2aa3be2985..15e10b1afdd 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -44,6 +44,11 @@ static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
 	else
 		root->rb_node = right;
 	rb_set_parent(node, right);
+
+	if (root->augment_cb) {
+		root->augment_cb(node);
+		root->augment_cb(right);
+	}
 }
 
 static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
@@ -67,12 +72,20 @@ static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
 	else
 		root->rb_node = left;
 	rb_set_parent(node, left);
+
+	if (root->augment_cb) {
+		root->augment_cb(node);
+		root->augment_cb(left);
+	}
 }
 
 void rb_insert_color(struct rb_node *node, struct rb_root *root)
 {
 	struct rb_node *parent, *gparent;
 
+	if (root->augment_cb)
+		root->augment_cb(node);
+
 	while ((parent = rb_parent(node)) && rb_is_red(parent))
 	{
 		gparent = rb_parent(parent);
@@ -227,12 +240,15 @@ void rb_erase(struct rb_node *node, struct rb_root *root)
 	else
 	{
 		struct rb_node *old = node, *left;
+		int old_parent_cb = 0;
+		int successor_parent_cb = 0;
 
 		node = node->rb_right;
 		while ((left = node->rb_left) != NULL)
 			node = left;
 
 		if (rb_parent(old)) {
+			old_parent_cb = 1;
 			if (rb_parent(old)->rb_left == old)
 				rb_parent(old)->rb_left = node;
 			else
@@ -247,8 +263,10 @@ void rb_erase(struct rb_node *node, struct rb_root *root)
 		if (parent == old) {
 			parent = node;
 		} else {
+			successor_parent_cb = 1;
 			if (child)
 				rb_set_parent(child, parent);
+
 			parent->rb_left = child;
 
 			node->rb_right = old->rb_right;
@@ -259,6 +277,24 @@ void rb_erase(struct rb_node *node, struct rb_root *root)
 		node->rb_left = old->rb_left;
 		rb_set_parent(old->rb_left, node);
 
+		if (root->augment_cb) {
+			/*
+			 * Here, three different nodes can have new children.
+			 * The parent of the successor node that was selected
+			 * to replace the node to be erased.
+			 * The node that is getting erased and is now replaced
+			 * by its successor.
+			 * The parent of the node getting erased-replaced.
+			 */
+			if (successor_parent_cb)
+				root->augment_cb(parent);
+
+			root->augment_cb(node);
+
+			if (old_parent_cb)
+				root->augment_cb(rb_parent(old));
+		}
+
 		goto color;
 	}
 
@@ -267,15 +303,19 @@ void rb_erase(struct rb_node *node, struct rb_root *root)
 
 	if (child)
 		rb_set_parent(child, parent);
-	if (parent)
-	{
+
+	if (parent) {
 		if (parent->rb_left == node)
 			parent->rb_left = child;
 		else
 			parent->rb_right = child;
-	}
-	else
+
+		if (root->augment_cb)
+			root->augment_cb(parent);
+
+	} else {
 		root->rb_node = child;
+	}
 
  color:
 	if (color == RB_BLACK)
-- 
cgit v1.2.3


From be5a0c126ad1dea2128dc5aef12c87083518d1ab Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Wed, 10 Feb 2010 11:57:06 -0800
Subject: x86, pat: Preparatory changes in pat.c for bigger rbtree change

Minor changes in pat.c to cleanup code and make it smoother to introduce
bigger rbtree only change in the following patch. The changes are cleaup
only and should not have any functional impact.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
LKML-Reference: <20100210195909.792781000@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/mm/pat.c          | 170 +++++++++++++++++++++++----------------------
 arch/x86/mm/pat_internal.h |  28 ++++++++
 2 files changed, 116 insertions(+), 82 deletions(-)
 create mode 100644 arch/x86/mm/pat_internal.h

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index ae9648eb1c7..628e507b793 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -30,6 +30,8 @@
 #include <asm/pat.h>
 #include <asm/io.h>
 
+#include "pat_internal.h"
+
 #ifdef CONFIG_X86_PAT
 int __read_mostly pat_enabled = 1;
 
@@ -53,19 +55,15 @@ static inline void pat_disable(const char *reason)
 #endif
 
 
-static int debug_enable;
+int pat_debug_enable;
 
 static int __init pat_debug_setup(char *str)
 {
-	debug_enable = 1;
+	pat_debug_enable = 1;
 	return 0;
 }
 __setup("debugpat", pat_debug_setup);
 
-#define dprintk(fmt, arg...) \
-	do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
-
-
 static u64 __read_mostly boot_pat_state;
 
 enum {
@@ -132,17 +130,6 @@ void pat_init(void)
 
 #undef PAT
 
-static char *cattr_name(unsigned long flags)
-{
-	switch (flags & _PAGE_CACHE_MASK) {
-	case _PAGE_CACHE_UC:		return "uncached";
-	case _PAGE_CACHE_UC_MINUS:	return "uncached-minus";
-	case _PAGE_CACHE_WB:		return "write-back";
-	case _PAGE_CACHE_WC:		return "write-combining";
-	default:			return "broken";
-	}
-}
-
 /*
  * The global memtype list keeps track of memory type for specific
  * physical memory areas. Conflicting memory types in different
@@ -159,14 +146,6 @@ static char *cattr_name(unsigned long flags)
  * memtype_lock protects both the linear list and rbtree.
  */
 
-struct memtype {
-	u64			start;
-	u64			end;
-	unsigned long		type;
-	struct list_head	nd;
-	struct rb_node		rb;
-};
-
 static struct rb_root memtype_rbroot = RB_ROOT;
 static LIST_HEAD(memtype_list);
 static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype list */
@@ -349,6 +328,64 @@ static int free_ram_pages_type(u64 start, u64 end)
 	return 0;
 }
 
+static int memtype_check_insert(struct memtype *new, unsigned long *new_type)
+{
+	struct memtype *entry;
+	u64 start, end;
+	unsigned long actual_type;
+	struct list_head *where;
+	int err = 0;
+
+	start = new->start;
+	end = new->end;
+	actual_type = new->type;
+
+	/* Search for existing mapping that overlaps the current range */
+	where = NULL;
+	list_for_each_entry(entry, &memtype_list, nd) {
+		if (end <= entry->start) {
+			where = entry->nd.prev;
+			break;
+		} else if (start <= entry->start) { /* end > entry->start */
+			err = chk_conflict(new, entry, new_type);
+			if (!err) {
+				dprintk("Overlap at 0x%Lx-0x%Lx\n",
+					entry->start, entry->end);
+				where = entry->nd.prev;
+			}
+			break;
+		} else if (start < entry->end) { /* start > entry->start */
+			err = chk_conflict(new, entry, new_type);
+			if (!err) {
+				dprintk("Overlap at 0x%Lx-0x%Lx\n",
+					entry->start, entry->end);
+
+				/*
+				 * Move to right position in the linked
+				 * list to add this new entry
+				 */
+				list_for_each_entry_continue(entry,
+							&memtype_list, nd) {
+					if (start <= entry->start) {
+						where = entry->nd.prev;
+						break;
+					}
+				}
+			}
+			break;
+		}
+	}
+	if (!err) {
+		if (where)
+			list_add(&new->nd, where);
+		else
+			list_add_tail(&new->nd, &memtype_list);
+
+		memtype_rb_insert(&memtype_rbroot, new);
+	}
+	return err;
+}
+
 /*
  * req_type typically has one of the:
  * - _PAGE_CACHE_WB
@@ -364,9 +401,8 @@ static int free_ram_pages_type(u64 start, u64 end)
 int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 		    unsigned long *new_type)
 {
-	struct memtype *new, *entry;
+	struct memtype *new;
 	unsigned long actual_type;
-	struct list_head *where;
 	int is_range_ram;
 	int err = 0;
 
@@ -423,42 +459,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 	spin_lock(&memtype_lock);
 
-	/* Search for existing mapping that overlaps the current range */
-	where = NULL;
-	list_for_each_entry(entry, &memtype_list, nd) {
-		if (end <= entry->start) {
-			where = entry->nd.prev;
-			break;
-		} else if (start <= entry->start) { /* end > entry->start */
-			err = chk_conflict(new, entry, new_type);
-			if (!err) {
-				dprintk("Overlap at 0x%Lx-0x%Lx\n",
-					entry->start, entry->end);
-				where = entry->nd.prev;
-			}
-			break;
-		} else if (start < entry->end) { /* start > entry->start */
-			err = chk_conflict(new, entry, new_type);
-			if (!err) {
-				dprintk("Overlap at 0x%Lx-0x%Lx\n",
-					entry->start, entry->end);
-
-				/*
-				 * Move to right position in the linked
-				 * list to add this new entry
-				 */
-				list_for_each_entry_continue(entry,
-							&memtype_list, nd) {
-					if (start <= entry->start) {
-						where = entry->nd.prev;
-						break;
-					}
-				}
-			}
-			break;
-		}
-	}
-
+	err = memtype_check_insert(new, new_type);
 	if (err) {
 		printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
 		       "track %s, req %s\n",
@@ -469,13 +470,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 		return err;
 	}
 
-	if (where)
-		list_add(&new->nd, where);
-	else
-		list_add_tail(&new->nd, &memtype_list);
-
-	memtype_rb_insert(&memtype_rbroot, new);
-
 	spin_unlock(&memtype_lock);
 
 	dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
@@ -937,28 +931,40 @@ EXPORT_SYMBOL_GPL(pgprot_writecombine);
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
 
 /* get Nth element of the linked list */
-static struct memtype *memtype_get_idx(loff_t pos)
+static int copy_memtype_nth_element(struct memtype *out, loff_t pos)
 {
-	struct memtype *list_node, *print_entry;
+	struct memtype *list_node;
 	int i = 1;
 
-	print_entry  = kmalloc(sizeof(struct memtype), GFP_KERNEL);
-	if (!print_entry)
-		return NULL;
-
-	spin_lock(&memtype_lock);
 	list_for_each_entry(list_node, &memtype_list, nd) {
 		if (pos == i) {
-			*print_entry = *list_node;
-			spin_unlock(&memtype_lock);
-			return print_entry;
+			*out = *list_node;
+			return 0;
 		}
 		++i;
 	}
+	return 1;
+}
+
+static struct memtype *memtype_get_idx(loff_t pos)
+{
+	struct memtype *print_entry;
+	int ret;
+
+	print_entry  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
+	if (!print_entry)
+		return NULL;
+
+	spin_lock(&memtype_lock);
+	ret = copy_memtype_nth_element(print_entry, pos);
 	spin_unlock(&memtype_lock);
-	kfree(print_entry);
 
-	return NULL;
+	if (!ret) {
+		return print_entry;
+	} else {
+		kfree(print_entry);
+		return NULL;
+	}
 }
 
 static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
new file mode 100644
index 00000000000..6c98780eb73
--- /dev/null
+++ b/arch/x86/mm/pat_internal.h
@@ -0,0 +1,28 @@
+#ifndef __PAT_INTERNAL_H_
+#define __PAT_INTERNAL_H_
+
+extern int pat_debug_enable;
+
+#define dprintk(fmt, arg...) \
+	do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
+
+struct memtype {
+	u64			start;
+	u64			end;
+	unsigned long		type;
+	struct list_head	nd;
+	struct rb_node		rb;
+};
+
+static inline char *cattr_name(unsigned long flags)
+{
+	switch (flags & _PAGE_CACHE_MASK) {
+	case _PAGE_CACHE_UC:		return "uncached";
+	case _PAGE_CACHE_UC_MINUS:	return "uncached-minus";
+	case _PAGE_CACHE_WB:		return "write-back";
+	case _PAGE_CACHE_WC:		return "write-combining";
+	default:			return "broken";
+	}
+}
+
+#endif /* __PAT_INTERNAL_H_ */
-- 
cgit v1.2.3


From 9e41a49aab88a5a6c8f4875bf10a5543bc321f2d Mon Sep 17 00:00:00 2001
From: "Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>
Date: Wed, 10 Feb 2010 15:26:07 -0800
Subject: x86, pat: Migrate to rbtree only backend for pat memtype management

Move pat backend to fully rbtree based implementation from the existing
rbtree and linked list hybrid.

New rbtree based solution uses interval trees (augmented rbtrees) in
order to store the PAT ranges. The new code seprates out the pat backend
to pat_rbtree.c file, making is cleaner. The change also makes the PAT
lookup, reserve and free operations more optimal, as we don't have to
traverse linear linked list of few tens of entries in normal case.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
LKML-Reference: <20100210232607.GB11465@linux-os.sc.intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/mm/Makefile       |   1 +
 arch/x86/mm/pat.c          | 209 +---------------------------------
 arch/x86/mm/pat_internal.h |  20 +++-
 arch/x86/mm/pat_rbtree.c   | 271 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 296 insertions(+), 205 deletions(-)
 create mode 100644 arch/x86/mm/pat_rbtree.c

diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 06630d26e56..a4c768397ba 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -6,6 +6,7 @@ nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_physaddr.o		:= $(nostackp)
 CFLAGS_setup_nx.o		:= $(nostackp)
 
+obj-$(CONFIG_X86_PAT)		+= pat_rbtree.o
 obj-$(CONFIG_SMP)		+= tlb.o
 
 obj-$(CONFIG_X86_32)		+= pgtable_32.o iomap_32.o
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 628e507b793..951011166ef 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -130,65 +130,7 @@ void pat_init(void)
 
 #undef PAT
 
-/*
- * The global memtype list keeps track of memory type for specific
- * physical memory areas. Conflicting memory types in different
- * mappings can cause CPU cache corruption. To avoid this we keep track.
- *
- * The list is sorted based on starting address and can contain multiple
- * entries for each address (this allows reference counting for overlapping
- * areas). All the aliases have the same cache attributes of course.
- * Zero attributes are represented as holes.
- *
- * The data structure is a list that is also organized as an rbtree
- * sorted on the start address of memtype range.
- *
- * memtype_lock protects both the linear list and rbtree.
- */
-
-static struct rb_root memtype_rbroot = RB_ROOT;
-static LIST_HEAD(memtype_list);
-static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype list */
-
-static struct memtype *memtype_rb_search(struct rb_root *root, u64 start)
-{
-	struct rb_node *node = root->rb_node;
-	struct memtype *last_lower = NULL;
-
-	while (node) {
-		struct memtype *data = container_of(node, struct memtype, rb);
-
-		if (data->start < start) {
-			last_lower = data;
-			node = node->rb_right;
-		} else if (data->start > start) {
-			node = node->rb_left;
-		} else
-			return data;
-	}
-
-	/* Will return NULL if there is no entry with its start <= start */
-	return last_lower;
-}
-
-static void memtype_rb_insert(struct rb_root *root, struct memtype *data)
-{
-	struct rb_node **new = &(root->rb_node);
-	struct rb_node *parent = NULL;
-
-	while (*new) {
-		struct memtype *this = container_of(*new, struct memtype, rb);
-
-		parent = *new;
-		if (data->start <= this->start)
-			new = &((*new)->rb_left);
-		else if (data->start > this->start)
-			new = &((*new)->rb_right);
-	}
-
-	rb_link_node(&data->rb, parent, new);
-	rb_insert_color(&data->rb, root);
-}
+static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */
 
 /*
  * Does intersection of PAT memory type and MTRR memory type and returns
@@ -216,33 +158,6 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
 	return req_type;
 }
 
-static int
-chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
-{
-	if (new->type != entry->type) {
-		if (type) {
-			new->type = entry->type;
-			*type = entry->type;
-		} else
-			goto conflict;
-	}
-
-	 /* check overlaps with more than one entry in the list */
-	list_for_each_entry_continue(entry, &memtype_list, nd) {
-		if (new->end <= entry->start)
-			break;
-		else if (new->type != entry->type)
-			goto conflict;
-	}
-	return 0;
-
- conflict:
-	printk(KERN_INFO "%s:%d conflicting memory types "
-	       "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start,
-	       new->end, cattr_name(new->type), cattr_name(entry->type));
-	return -EBUSY;
-}
-
 static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
 {
 	int ram_page = 0, not_rampage = 0;
@@ -328,64 +243,6 @@ static int free_ram_pages_type(u64 start, u64 end)
 	return 0;
 }
 
-static int memtype_check_insert(struct memtype *new, unsigned long *new_type)
-{
-	struct memtype *entry;
-	u64 start, end;
-	unsigned long actual_type;
-	struct list_head *where;
-	int err = 0;
-
-	start = new->start;
-	end = new->end;
-	actual_type = new->type;
-
-	/* Search for existing mapping that overlaps the current range */
-	where = NULL;
-	list_for_each_entry(entry, &memtype_list, nd) {
-		if (end <= entry->start) {
-			where = entry->nd.prev;
-			break;
-		} else if (start <= entry->start) { /* end > entry->start */
-			err = chk_conflict(new, entry, new_type);
-			if (!err) {
-				dprintk("Overlap at 0x%Lx-0x%Lx\n",
-					entry->start, entry->end);
-				where = entry->nd.prev;
-			}
-			break;
-		} else if (start < entry->end) { /* start > entry->start */
-			err = chk_conflict(new, entry, new_type);
-			if (!err) {
-				dprintk("Overlap at 0x%Lx-0x%Lx\n",
-					entry->start, entry->end);
-
-				/*
-				 * Move to right position in the linked
-				 * list to add this new entry
-				 */
-				list_for_each_entry_continue(entry,
-							&memtype_list, nd) {
-					if (start <= entry->start) {
-						where = entry->nd.prev;
-						break;
-					}
-				}
-			}
-			break;
-		}
-	}
-	if (!err) {
-		if (where)
-			list_add(&new->nd, where);
-		else
-			list_add_tail(&new->nd, &memtype_list);
-
-		memtype_rb_insert(&memtype_rbroot, new);
-	}
-	return err;
-}
-
 /*
  * req_type typically has one of the:
  * - _PAGE_CACHE_WB
@@ -459,7 +316,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 	spin_lock(&memtype_lock);
 
-	err = memtype_check_insert(new, new_type);
+	err = rbt_memtype_check_insert(new, new_type);
 	if (err) {
 		printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
 		       "track %s, req %s\n",
@@ -481,7 +338,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 
 int free_memtype(u64 start, u64 end)
 {
-	struct memtype *entry, *saved_entry;
 	int err = -EINVAL;
 	int is_range_ram;
 
@@ -505,46 +361,7 @@ int free_memtype(u64 start, u64 end)
 	}
 
 	spin_lock(&memtype_lock);
-
-	entry = memtype_rb_search(&memtype_rbroot, start);
-	if (unlikely(entry == NULL))
-		goto unlock_ret;
-
-	/*
-	 * Saved entry points to an entry with start same or less than what
-	 * we searched for. Now go through the list in both directions to look
-	 * for the entry that matches with both start and end, with list stored
-	 * in sorted start address
-	 */
-	saved_entry = entry;
-	list_for_each_entry_from(entry, &memtype_list, nd) {
-		if (entry->start == start && entry->end == end) {
-			rb_erase(&entry->rb, &memtype_rbroot);
-			list_del(&entry->nd);
-			kfree(entry);
-			err = 0;
-			break;
-		} else if (entry->start > start) {
-			break;
-		}
-	}
-
-	if (!err)
-		goto unlock_ret;
-
-	entry = saved_entry;
-	list_for_each_entry_reverse(entry, &memtype_list, nd) {
-		if (entry->start == start && entry->end == end) {
-			rb_erase(&entry->rb, &memtype_rbroot);
-			list_del(&entry->nd);
-			kfree(entry);
-			err = 0;
-			break;
-		} else if (entry->start < start) {
-			break;
-		}
-	}
-unlock_ret:
+	err = rbt_memtype_erase(start, end);
 	spin_unlock(&memtype_lock);
 
 	if (err) {
@@ -593,7 +410,7 @@ static unsigned long lookup_memtype(u64 paddr)
 
 	spin_lock(&memtype_lock);
 
-	entry = memtype_rb_search(&memtype_rbroot, paddr);
+	entry = rbt_memtype_lookup(paddr);
 	if (entry != NULL)
 		rettype = entry->type;
 	else
@@ -930,22 +747,6 @@ EXPORT_SYMBOL_GPL(pgprot_writecombine);
 
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
 
-/* get Nth element of the linked list */
-static int copy_memtype_nth_element(struct memtype *out, loff_t pos)
-{
-	struct memtype *list_node;
-	int i = 1;
-
-	list_for_each_entry(list_node, &memtype_list, nd) {
-		if (pos == i) {
-			*out = *list_node;
-			return 0;
-		}
-		++i;
-	}
-	return 1;
-}
-
 static struct memtype *memtype_get_idx(loff_t pos)
 {
 	struct memtype *print_entry;
@@ -956,7 +757,7 @@ static struct memtype *memtype_get_idx(loff_t pos)
 		return NULL;
 
 	spin_lock(&memtype_lock);
-	ret = copy_memtype_nth_element(print_entry, pos);
+	ret = rbt_memtype_copy_nth_element(print_entry, pos);
 	spin_unlock(&memtype_lock);
 
 	if (!ret) {
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index 6c98780eb73..4f39eefa3e6 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -9,8 +9,8 @@ extern int pat_debug_enable;
 struct memtype {
 	u64			start;
 	u64			end;
+	u64			subtree_max_end;
 	unsigned long		type;
-	struct list_head	nd;
 	struct rb_node		rb;
 };
 
@@ -25,4 +25,22 @@ static inline char *cattr_name(unsigned long flags)
 	}
 }
 
+#ifdef CONFIG_X86_PAT
+extern int rbt_memtype_check_insert(struct memtype *new,
+					unsigned long *new_type);
+extern int rbt_memtype_erase(u64 start, u64 end);
+extern struct memtype *rbt_memtype_lookup(u64 addr);
+extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
+#else
+static inline int rbt_memtype_check_insert(struct memtype *new,
+					unsigned long *new_type)
+{ return 0; }
+static inline int rbt_memtype_erase(u64 start, u64 end)
+{ return 0; }
+static inline struct memtype *rbt_memtype_lookup(u64 addr)
+{ return NULL; }
+static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
+{ return 0; }
+#endif
+
 #endif /* __PAT_INTERNAL_H_ */
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
new file mode 100644
index 00000000000..9063f40b638
--- /dev/null
+++ b/arch/x86/mm/pat_rbtree.c
@@ -0,0 +1,271 @@
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *          Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Interval tree (augmented rbtree) used to store the PAT memory type
+ * reservations.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+
+#include <asm/pgtable.h>
+#include <asm/pat.h>
+
+#include "pat_internal.h"
+
+/*
+ * The memtype tree keeps track of memory type for specific
+ * physical memory areas. Without proper tracking, conflicting memory
+ * types in different mappings can cause CPU cache corruption.
+ *
+ * The tree is an interval tree (augmented rbtree) with tree ordered
+ * on starting address. Tree can contain multiple entries for
+ * different regions which overlap. All the aliases have the same
+ * cache attributes of course.
+ *
+ * memtype_lock protects the rbtree.
+ */
+
+static void memtype_rb_augment_cb(struct rb_node *node);
+static struct rb_root memtype_rbroot = RB_AUGMENT_ROOT(&memtype_rb_augment_cb);
+
+static int is_node_overlap(struct memtype *node, u64 start, u64 end)
+{
+	if (node->start >= end || node->end <= start)
+		return 0;
+
+	return 1;
+}
+
+static u64 get_subtree_max_end(struct rb_node *node)
+{
+	u64 ret = 0;
+	if (node) {
+		struct memtype *data = container_of(node, struct memtype, rb);
+		ret = data->subtree_max_end;
+	}
+	return ret;
+}
+
+/* Update 'subtree_max_end' for a node, based on node and its children */
+static void update_node_max_end(struct rb_node *node)
+{
+	struct memtype *data;
+	u64 max_end, child_max_end;
+
+	if (!node)
+		return;
+
+	data = container_of(node, struct memtype, rb);
+	max_end = data->end;
+
+	child_max_end = get_subtree_max_end(node->rb_right);
+	if (child_max_end > max_end)
+		max_end = child_max_end;
+
+	child_max_end = get_subtree_max_end(node->rb_left);
+	if (child_max_end > max_end)
+		max_end = child_max_end;
+
+	data->subtree_max_end = max_end;
+}
+
+/* Update 'subtree_max_end' for a node and all its ancestors */
+static void update_path_max_end(struct rb_node *node)
+{
+	u64 old_max_end, new_max_end;
+
+	while (node) {
+		struct memtype *data = container_of(node, struct memtype, rb);
+
+		old_max_end = data->subtree_max_end;
+		update_node_max_end(node);
+		new_max_end = data->subtree_max_end;
+
+		if (new_max_end == old_max_end)
+			break;
+
+		node = rb_parent(node);
+	}
+}
+
+/* Find the first (lowest start addr) overlapping range from rb tree */
+static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
+				u64 start, u64 end)
+{
+	struct rb_node *node = root->rb_node;
+	struct memtype *last_lower = NULL;
+
+	while (node) {
+		struct memtype *data = container_of(node, struct memtype, rb);
+
+		if (get_subtree_max_end(node->rb_left) > start) {
+			/* Lowest overlap if any must be on left side */
+			node = node->rb_left;
+		} else if (is_node_overlap(data, start, end)) {
+			last_lower = data;
+			break;
+		} else if (start >= data->start) {
+			/* Lowest overlap if any must be on right side */
+			node = node->rb_right;
+		} else {
+			break;
+		}
+	}
+	return last_lower; /* Returns NULL if there is no overlap */
+}
+
+static struct memtype *memtype_rb_exact_match(struct rb_root *root,
+				u64 start, u64 end)
+{
+	struct memtype *match;
+
+	match = memtype_rb_lowest_match(root, start, end);
+	while (match != NULL && match->start < end) {
+		struct rb_node *node;
+
+		if (match->start == start && match->end == end)
+			return match;
+
+		node = rb_next(&match->rb);
+		if (node)
+			match = container_of(node, struct memtype, rb);
+		else
+			match = NULL;
+	}
+
+	return NULL; /* Returns NULL if there is no exact match */
+}
+
+static int memtype_rb_check_conflict(struct rb_root *root,
+				u64 start, u64 end,
+				unsigned long reqtype, unsigned long *newtype)
+{
+	struct rb_node *node;
+	struct memtype *match;
+	int found_type = reqtype;
+
+	match = memtype_rb_lowest_match(&memtype_rbroot, start, end);
+	if (match == NULL)
+		goto success;
+
+	if (match->type != found_type && newtype == NULL)
+		goto failure;
+
+	dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
+	found_type = match->type;
+
+	node = rb_next(&match->rb);
+	while (node) {
+		match = container_of(node, struct memtype, rb);
+
+		if (match->start >= end) /* Checked all possible matches */
+			goto success;
+
+		if (is_node_overlap(match, start, end) &&
+		    match->type != found_type) {
+			goto failure;
+		}
+
+		node = rb_next(&match->rb);
+	}
+success:
+	if (newtype)
+		*newtype = found_type;
+
+	return 0;
+
+failure:
+	printk(KERN_INFO "%s:%d conflicting memory types "
+		"%Lx-%Lx %s<->%s\n", current->comm, current->pid, start,
+		end, cattr_name(found_type), cattr_name(match->type));
+	return -EBUSY;
+}
+
+static void memtype_rb_augment_cb(struct rb_node *node)
+{
+	if (node)
+		update_path_max_end(node);
+}
+
+static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
+{
+	struct rb_node **node = &(root->rb_node);
+	struct rb_node *parent = NULL;
+
+	while (*node) {
+		struct memtype *data = container_of(*node, struct memtype, rb);
+
+		parent = *node;
+		if (newdata->start <= data->start)
+			node = &((*node)->rb_left);
+		else if (newdata->start > data->start)
+			node = &((*node)->rb_right);
+	}
+
+	rb_link_node(&newdata->rb, parent, node);
+	rb_insert_color(&newdata->rb, root);
+}
+
+int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
+{
+	int err = 0;
+
+	err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end,
+						new->type, ret_type);
+
+	if (!err) {
+		new->type = *ret_type;
+		memtype_rb_insert(&memtype_rbroot, new);
+	}
+	return err;
+}
+
+int rbt_memtype_erase(u64 start, u64 end)
+{
+	struct memtype *data;
+
+	data = memtype_rb_exact_match(&memtype_rbroot, start, end);
+	if (!data)
+		return -EINVAL;
+
+	rb_erase(&data->rb, &memtype_rbroot);
+	return 0;
+}
+
+struct memtype *rbt_memtype_lookup(u64 addr)
+{
+	struct memtype *data;
+	data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
+	return data;
+}
+
+#if defined(CONFIG_DEBUG_FS)
+int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
+{
+	struct rb_node *node;
+	int i = 1;
+
+	node = rb_first(&memtype_rbroot);
+	while (node && pos != i) {
+		node = rb_next(node);
+		i++;
+	}
+
+	if (node) { /* pos == i */
+		struct memtype *this = container_of(node, struct memtype, rb);
+		*out = *this;
+		return 0;
+	} else {
+		return 1;
+	}
+}
+#endif
-- 
cgit v1.2.3


From b3ac891b67bd4b1fc728d1c784cad1212dea433d Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Wed, 24 Feb 2010 10:54:22 +0100
Subject: x86: Add support for lock prefix in alternatives

The current lock prefix UP/SMP alternative code doesn't allow
LOCK_PREFIX to be used in alternatives code.

This patch solves the problem by adding a new LOCK_PREFIX_ALTERNATIVE_PATCH
macro that only records the lock prefix location but does not emit
the prefix.

The user of this macro can then start any alternative sequence with
"lock" and have it UP/SMP patched.

To make this work, the UP/SMP alternative code is changed to do the
lock/DS prefix switching only if the byte actually contains a lock or
DS prefix.

Thus, if an alternative without the "lock" is selected, it will now do
nothing instead of clobbering the code.

Changes in v2:
- Naming change
- Change label to not conflict with alternatives

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267005265-27958-2-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/alternative.h | 8 +++++---
 arch/x86/kernel/alternative.c      | 6 ++++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 3b5b828767b..55fee12cea6 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -28,12 +28,14 @@
  */
 
 #ifdef CONFIG_SMP
-#define LOCK_PREFIX \
+#define LOCK_PREFIX_HERE \
 		".section .smp_locks,\"a\"\n"	\
 		_ASM_ALIGN "\n"			\
-		_ASM_PTR "661f\n" /* address */	\
+		_ASM_PTR "671f\n" /* address */	\
 		".previous\n"			\
-		"661:\n\tlock; "
+		"671:"
+
+#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "
 
 #else /* ! CONFIG_SMP */
 #define LOCK_PREFIX ""
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2589ea4c60c..80b222ea4cf 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -244,7 +244,8 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 		if (*ptr > text_end)
 			continue;
 		/* turn DS segment override prefix into lock prefix */
-		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+		if (**ptr == 0x3e)
+			text_poke(*ptr, ((unsigned char []){0xf0}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
@@ -263,7 +264,8 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
 		if (*ptr > text_end)
 			continue;
 		/* turn lock prefix into DS segment override prefix */
-		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+		if (**ptr == 0xf0)
+			text_poke(*ptr, ((unsigned char []){0x3E}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
-- 
cgit v1.2.3


From 9c76b38476b18c45f97098a10b0176b321eba3ea Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Wed, 24 Feb 2010 10:54:23 +0100
Subject: x86-32: Allow UP/SMP lock replacement in cmpxchg64

Use the functionality just introduced in the previous patch: mark the
lock prefixes in cmpxchg64 alternatives for UP removal.

Changes in v2:
- Naming change

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267005265-27958-3-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/cmpxchg_32.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index ffb9bb6b6c3..8859e12dd3c 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -271,7 +271,8 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
 	__typeof__(*(ptr)) __ret;				\
 	__typeof__(*(ptr)) __old = (o);				\
 	__typeof__(*(ptr)) __new = (n);				\
-	alternative_io("call cmpxchg8b_emu",			\
+	alternative_io(LOCK_PREFIX_HERE				\
+			"call cmpxchg8b_emu",			\
 			"lock; cmpxchg8b (%%esi)" ,		\
 		       X86_FEATURE_CX8,				\
 		       "=A" (__ret),				\
-- 
cgit v1.2.3


From 86a8938078a8bb518c5376de493e348c7490d506 Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Wed, 24 Feb 2010 10:54:24 +0100
Subject: lib: Add self-test for atomic64_t

This patch adds self-test on boot code for atomic64_t.

This has been used to test the later changes in this patchset.

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267005265-27958-4-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 lib/Kconfig.debug   |   7 +++
 lib/Makefile        |   2 +
 lib/atomic64_test.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 167 insertions(+)
 create mode 100644 lib/atomic64_test.c

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 25c3ed594c5..3676c517a07 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1054,6 +1054,13 @@ config DMA_API_DEBUG
 	  This option causes a performance degredation.  Use only if you want
 	  to debug device drivers. If unsure, say N.
 
+config ATOMIC64_SELFTEST
+	bool "Perform an atomic64_t self-test at boot"
+	help
+	  Enable this option to test the atomic64_t functions at boot.
+
+	  If unsure, say N.
+
 source "samples/Kconfig"
 
 source "lib/Kconfig.kgdb"
diff --git a/lib/Makefile b/lib/Makefile
index 347ad8db29d..4af4786fe28 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -99,6 +99,8 @@ obj-$(CONFIG_GENERIC_CSUM) += checksum.o
 
 obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o
 
+obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o
+
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
 
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
new file mode 100644
index 00000000000..4ff649e46ba
--- /dev/null
+++ b/lib/atomic64_test.c
@@ -0,0 +1,158 @@
+/*
+ * Testsuite for atomic64_t functions
+ *
+ * Copyright © 2010  Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <linux/init.h>
+#include <asm/atomic.h>
+
+#define INIT(c) do { atomic64_set(&v, c); r = c; } while (0)
+static __init int test_atomic64(void)
+{
+	long long v0 = 0xaaa31337c001d00dLL;
+	long long v1 = 0xdeadbeefdeafcafeLL;
+	long long v2 = 0xfaceabadf00df001LL;
+	long long onestwos = 0x1111111122222222LL;
+	long long one = 1LL;
+
+	atomic64_t v = ATOMIC64_INIT(v0);
+	long long r = v0;
+	BUG_ON(v.counter != r);
+
+	atomic64_set(&v, v1);
+	r = v1;
+	BUG_ON(v.counter != r);
+	BUG_ON(atomic64_read(&v) != r);
+
+	INIT(v0);
+	atomic64_add(onestwos, &v);
+	r += onestwos;
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	atomic64_add(-one, &v);
+	r += -one;
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	r += onestwos;
+	BUG_ON(atomic64_add_return(onestwos, &v) != r);
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	r += -one;
+	BUG_ON(atomic64_add_return(-one, &v) != r);
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	atomic64_sub(onestwos, &v);
+	r -= onestwos;
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	atomic64_sub(-one, &v);
+	r -= -one;
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	r -= onestwos;
+	BUG_ON(atomic64_sub_return(onestwos, &v) != r);
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	r -= -one;
+	BUG_ON(atomic64_sub_return(-one, &v) != r);
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	atomic64_inc(&v);
+	r += one;
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	r += one;
+	BUG_ON(atomic64_inc_return(&v) != r);
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	atomic64_dec(&v);
+	r -= one;
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	r -= one;
+	BUG_ON(atomic64_dec_return(&v) != r);
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	BUG_ON(atomic64_xchg(&v, v1) != v0);
+	r = v1;
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	BUG_ON(atomic64_cmpxchg(&v, v0, v1) != v0);
+	r = v1;
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	BUG_ON(atomic64_cmpxchg(&v, v2, v1) != v0);
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	BUG_ON(!atomic64_add_unless(&v, one, v0));
+	BUG_ON(v.counter != r);
+
+	INIT(v0);
+	BUG_ON(atomic64_add_unless(&v, one, v1));
+	r += one;
+	BUG_ON(v.counter != r);
+
+	INIT(onestwos);
+	BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1));
+	r -= one;
+	BUG_ON(v.counter != r);
+
+	INIT(0);
+	BUG_ON(atomic64_dec_if_positive(&v) != -one);
+	BUG_ON(v.counter != r);
+
+	INIT(-one);
+	BUG_ON(atomic64_dec_if_positive(&v) != (-one - one));
+	BUG_ON(v.counter != r);
+
+	INIT(onestwos);
+	BUG_ON(atomic64_inc_not_zero(&v));
+	r += one;
+	BUG_ON(v.counter != r);
+
+	INIT(0);
+	BUG_ON(!atomic64_inc_not_zero(&v));
+	BUG_ON(v.counter != r);
+
+	INIT(-one);
+	BUG_ON(atomic64_inc_not_zero(&v));
+	r += one;
+	BUG_ON(v.counter != r);
+
+#ifdef CONFIG_X86
+	printk(KERN_INFO "atomic64 test passed for %s+ platform %s CX8 and %s SSE\n",
+#ifdef CONFIG_X86_CMPXCHG64
+			"586",
+#else
+			"386",
+#endif
+			boot_cpu_has(X86_FEATURE_CX8) ? "with" : "without",
+			boot_cpu_has(X86_FEATURE_XMM) ? "with" : "without");
+#else
+	printk(KERN_INFO "atomic64 test passed\n");
+#endif
+
+	return 0;
+}
+
+core_initcall(test_atomic64);
-- 
cgit v1.2.3


From a7e926abc3adfbd2e5e20d2b46177adb4e313915 Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Wed, 24 Feb 2010 10:54:25 +0100
Subject: x86-32: Rewrite 32-bit atomic64 functions in assembly

This patch replaces atomic64_32.c with two assembly implementations,
one for 386/486 machines using pushf/cli/popf and one for 586+ machines
using cmpxchg8b.

The cmpxchg8b implementation provides the following advantages over the
current one:

1. Implements atomic64_add_unless, atomic64_dec_if_positive and
   atomic64_inc_not_zero

2. Uses the ZF flag changed by cmpxchg8b instead of doing a comparison

3. Uses custom register calling conventions that reduce or eliminate
   register moves to suit cmpxchg8b

4. Reads the initial value instead of using cmpxchg8b to do that.
   Currently we use lock xaddl and movl, which seems the fastest.

5. Does not use the lock prefix for atomic64_set
   64-bit writes are already atomic, so we don't need that.
   We still need it for atomic64_read to avoid restoring a value
   changed in the meantime.

6. Allocates registers as well or better than gcc

The 386 implementation provides support for 386 and 486 machines.
386/486 SMP is not supported (we dropped it), but such support can be
added easily if desired.

A pure assembly implementation is required due to the custom calling
conventions, and desire to use %ebp in atomic64_add_return (we need
7 registers...), as well as the ability to use pushf/popf in the 386
code without an intermediate pop/push.

The parameter names are changed to match the convention in atomic_64.h

Changes in v3 (due to rebasing to tip/x86/asm):
- Patches atomic64_32.h instead of atomic_32.h
- Uses the CALL alternative mechanism from commit
  1b1d9258181bae199dc940f4bd0298126b9a73d9

Changes in v2:
- Merged 386 and cx8 support in the same patch
- 386 support now done in assembly, C code no longer used at all
- cmpxchg64 is used for atomic64_cmpxchg
- stop using macros, use one-line inline functions instead
- miscellanous changes and improvements

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267005265-27958-5-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/atomic64_32.h | 278 ++++++++++++++++++++++++++++---------
 arch/x86/lib/Makefile              |   3 +-
 arch/x86/lib/atomic64_32.c         | 273 +++++++-----------------------------
 arch/x86/lib/atomic64_386_32.S     | 175 +++++++++++++++++++++++
 arch/x86/lib/atomic64_cx8_32.S     | 225 ++++++++++++++++++++++++++++++
 5 files changed, 664 insertions(+), 290 deletions(-)
 create mode 100644 arch/x86/lib/atomic64_386_32.S
 create mode 100644 arch/x86/lib/atomic64_cx8_32.S

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 03027bf28de..2a934aa19a4 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -14,109 +14,193 @@ typedef struct {
 
 #define ATOMIC64_INIT(val)	{ (val) }
 
-extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
+#ifdef CONFIG_X86_CMPXCHG64
+#define ATOMIC64_ALTERNATIVE_(f, g) "call atomic64_" #g "_cx8"
+#else
+#define ATOMIC64_ALTERNATIVE_(f, g) ALTERNATIVE("call atomic64_" #f "_386", "call atomic64_" #g "_cx8", X86_FEATURE_CX8)
+#endif
+
+#define ATOMIC64_ALTERNATIVE(f) ATOMIC64_ALTERNATIVE_(f, f)
+
+/**
+ * atomic64_cmpxchg - cmpxchg atomic64 variable
+ * @p: pointer to type atomic64_t
+ * @o: expected value
+ * @n: new value
+ *
+ * Atomically sets @v to @n if it was equal to @o and returns
+ * the old value.
+ */
+
+static inline long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n)
+{
+	return cmpxchg64(&v->counter, o, n);
+}
 
 /**
  * atomic64_xchg - xchg atomic64 variable
- * @ptr:      pointer to type atomic64_t
- * @new_val:  value to assign
+ * @v: pointer to type atomic64_t
+ * @n: value to assign
  *
- * Atomically xchgs the value of @ptr to @new_val and returns
+ * Atomically xchgs the value of @v to @n and returns
  * the old value.
  */
-extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
+static inline long long atomic64_xchg(atomic64_t *v, long long n)
+{
+	long long o;
+	unsigned high = (unsigned)(n >> 32);
+	unsigned low = (unsigned)n;
+	asm volatile(ATOMIC64_ALTERNATIVE(xchg)
+		     : "=A" (o), "+b" (low), "+c" (high)
+		     : "S" (v)
+		     : "memory"
+		     );
+	return o;
+}
 
 /**
  * atomic64_set - set atomic64 variable
- * @ptr:      pointer to type atomic64_t
- * @new_val:  value to assign
+ * @v: pointer to type atomic64_t
+ * @n: value to assign
  *
- * Atomically sets the value of @ptr to @new_val.
+ * Atomically sets the value of @v to @n.
  */
-extern void atomic64_set(atomic64_t *ptr, u64 new_val);
+static inline void atomic64_set(atomic64_t *v, long long i)
+{
+	unsigned high = (unsigned)(i >> 32);
+	unsigned low = (unsigned)i;
+	asm volatile(ATOMIC64_ALTERNATIVE(set)
+		     : "+b" (low), "+c" (high)
+		     : "S" (v)
+		     : "eax", "edx", "memory"
+		     );
+}
 
 /**
  * atomic64_read - read atomic64 variable
- * @ptr:      pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
  *
- * Atomically reads the value of @ptr and returns it.
+ * Atomically reads the value of @v and returns it.
  */
-static inline u64 atomic64_read(atomic64_t *ptr)
+static inline long long atomic64_read(atomic64_t *v)
 {
-	u64 res;
-
-	/*
-	 * Note, we inline this atomic64_t primitive because
-	 * it only clobbers EAX/EDX and leaves the others
-	 * untouched. We also (somewhat subtly) rely on the
-	 * fact that cmpxchg8b returns the current 64-bit value
-	 * of the memory location we are touching:
-	 */
-	asm volatile(
-		"mov %%ebx, %%eax\n\t"
-		"mov %%ecx, %%edx\n\t"
-		LOCK_PREFIX "cmpxchg8b %1\n"
-			: "=&A" (res)
-			: "m" (*ptr)
-		);
-
-	return res;
-}
-
-extern u64 atomic64_read(atomic64_t *ptr);
+	long long r;
+	asm volatile(ATOMIC64_ALTERNATIVE(read)
+		     : "=A" (r), "+c" (v)
+		     : : "memory"
+		     );
+	return r;
+ }
 
 /**
  * atomic64_add_return - add and return
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
  *
- * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ * Atomically adds @i to @v and returns @i + *@v
  */
-extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
+static inline long long atomic64_add_return(long long i, atomic64_t *v)
+{
+	asm volatile(ATOMIC64_ALTERNATIVE(add_return)
+		     : "+A" (i), "+c" (v)
+		     : : "memory"
+		     );
+	return i;
+}
 
 /*
  * Other variants with different arithmetic operators:
  */
-extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
-extern u64 atomic64_inc_return(atomic64_t *ptr);
-extern u64 atomic64_dec_return(atomic64_t *ptr);
+static inline long long atomic64_sub_return(long long i, atomic64_t *v)
+{
+	asm volatile(ATOMIC64_ALTERNATIVE(sub_return)
+		     : "+A" (i), "+c" (v)
+		     : : "memory"
+		     );
+	return i;
+}
+
+static inline long long atomic64_inc_return(atomic64_t *v)
+{
+	long long a;
+	asm volatile(ATOMIC64_ALTERNATIVE(inc_return)
+		     : "=A" (a)
+		     : "S" (v)
+		     : "memory", "ecx"
+		     );
+	return a;
+}
+
+static inline long long atomic64_dec_return(atomic64_t *v)
+{
+	long long a;
+	asm volatile(ATOMIC64_ALTERNATIVE(dec_return)
+		     : "=A" (a)
+		     : "S" (v)
+		     : "memory", "ecx"
+		     );
+	return a;
+}
 
 /**
  * atomic64_add - add integer to atomic64 variable
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
  *
- * Atomically adds @delta to @ptr.
+ * Atomically adds @i to @v.
  */
-extern void atomic64_add(u64 delta, atomic64_t *ptr);
+static inline long long atomic64_add(long long i, atomic64_t *v)
+{
+	asm volatile(ATOMIC64_ALTERNATIVE_(add, add_return)
+		     : "+A" (i), "+c" (v)
+		     : : "memory"
+		     );
+	return i;
+}
 
 /**
  * atomic64_sub - subtract the atomic64 variable
- * @delta: integer value to subtract
- * @ptr:   pointer to type atomic64_t
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
  *
- * Atomically subtracts @delta from @ptr.
+ * Atomically subtracts @i from @v.
  */
-extern void atomic64_sub(u64 delta, atomic64_t *ptr);
+static inline long long atomic64_sub(long long i, atomic64_t *v)
+{
+	asm volatile(ATOMIC64_ALTERNATIVE_(sub, sub_return)
+		     : "+A" (i), "+c" (v)
+		     : : "memory"
+		     );
+	return i;
+}
 
 /**
  * atomic64_sub_and_test - subtract value from variable and test result
- * @delta: integer value to subtract
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr and returns
+ * @i: integer value to subtract
+ * @v: pointer to type atomic64_t
+  *
+ * Atomically subtracts @i from @v and returns
  * true if the result is zero, or false for all
  * other cases.
  */
-extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
+static inline int atomic64_sub_and_test(long long i, atomic64_t *v)
+{
+	return atomic64_sub_return(i, v) == 0;
+}
 
 /**
  * atomic64_inc - increment atomic64 variable
- * @ptr: pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
  *
- * Atomically increments @ptr by 1.
+ * Atomically increments @v by 1.
  */
-extern void atomic64_inc(atomic64_t *ptr);
+static inline void atomic64_inc(atomic64_t *v)
+{
+	asm volatile(ATOMIC64_ALTERNATIVE_(inc, inc_return)
+		     : : "S" (v)
+		     : "memory", "eax", "ecx", "edx"
+		     );
+}
 
 /**
  * atomic64_dec - decrement atomic64 variable
@@ -124,37 +208,97 @@ extern void atomic64_inc(atomic64_t *ptr);
  *
  * Atomically decrements @ptr by 1.
  */
-extern void atomic64_dec(atomic64_t *ptr);
+static inline void atomic64_dec(atomic64_t *v)
+{
+	asm volatile(ATOMIC64_ALTERNATIVE_(dec, dec_return)
+		     : : "S" (v)
+		     : "memory", "eax", "ecx", "edx"
+		     );
+}
 
 /**
  * atomic64_dec_and_test - decrement and test
- * @ptr: pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
  *
- * Atomically decrements @ptr by 1 and
+ * Atomically decrements @v by 1 and
  * returns true if the result is 0, or false for all other
  * cases.
  */
-extern int atomic64_dec_and_test(atomic64_t *ptr);
+static inline int atomic64_dec_and_test(atomic64_t *v)
+{
+	return atomic64_dec_return(v) == 0;
+}
 
 /**
  * atomic64_inc_and_test - increment and test
- * @ptr: pointer to type atomic64_t
+ * @v: pointer to type atomic64_t
  *
- * Atomically increments @ptr by 1
+ * Atomically increments @v by 1
  * and returns true if the result is zero, or false for all
  * other cases.
  */
-extern int atomic64_inc_and_test(atomic64_t *ptr);
+static inline int atomic64_inc_and_test(atomic64_t *v)
+{
+	return atomic64_inc_return(v) == 0;
+}
 
 /**
  * atomic64_add_negative - add and test if negative
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
+ * @i: integer value to add
+ * @v: pointer to type atomic64_t
  *
- * Atomically adds @delta to @ptr and returns true
+ * Atomically adds @i to @v and returns true
  * if the result is negative, or false when
  * result is greater than or equal to zero.
  */
-extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
+static inline int atomic64_add_negative(long long i, atomic64_t *v)
+{
+	return atomic64_add_return(i, v) < 0;
+}
+
+/**
+ * atomic64_add_unless - add unless the number is a given value
+ * @v: pointer of type atomic64_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as it was not @u.
+ * Returns non-zero if @v was not @u, and zero otherwise.
+ */
+static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
+{
+	unsigned low = (unsigned)u;
+	unsigned high = (unsigned)(u >> 32);
+	asm volatile(ATOMIC64_ALTERNATIVE(add_unless) "\n\t"
+		     : "+A" (a), "+c" (v), "+S" (low), "+D" (high)
+		     : : "memory");
+	return (int)a;
+}
+
+
+static inline int atomic64_inc_not_zero(atomic64_t *v)
+{
+	int r;
+	asm volatile(ATOMIC64_ALTERNATIVE(inc_not_zero)
+		     : "=a" (r)
+		     : "S" (v)
+		     : "ecx", "edx", "memory"
+		     );
+	return r;
+}
+
+static inline long long atomic64_dec_if_positive(atomic64_t *v)
+{
+	long long r;
+	asm volatile(ATOMIC64_ALTERNATIVE(dec_if_positive)
+		     : "=A" (r)
+		     : "S" (v)
+		     : "ecx", "memory"
+		     );
+	return r;
+}
+
+#undef ATOMIC64_ALTERNATIVE
+#undef ATOMIC64_ALTERNATIVE_
 
 #endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index cffd754f303..05d686bbbe9 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -26,11 +26,12 @@ obj-y += msr.o msr-reg.o msr-reg-export.o
 
 ifeq ($(CONFIG_X86_32),y)
         obj-y += atomic64_32.o
+        lib-y += atomic64_cx8_32.o
         lib-y += checksum_32.o
         lib-y += strstr_32.o
         lib-y += semaphore_32.o string_32.o
 ifneq ($(CONFIG_X86_CMPXCHG64),y)
-        lib-y += cmpxchg8b_emu.o
+        lib-y += cmpxchg8b_emu.o atomic64_386_32.o
 endif
         lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
 else
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index 824fa0be55a..540179e8e9f 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -6,225 +6,54 @@
 #include <asm/cmpxchg.h>
 #include <asm/atomic.h>
 
-static noinline u64 cmpxchg8b(u64 *ptr, u64 old, u64 new)
-{
-	u32 low = new;
-	u32 high = new >> 32;
-
-	asm volatile(
-		LOCK_PREFIX "cmpxchg8b %1\n"
-		     : "+A" (old), "+m" (*ptr)
-		     :  "b" (low),  "c" (high)
-		     );
-	return old;
-}
-
-u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val)
-{
-	return cmpxchg8b(&ptr->counter, old_val, new_val);
-}
-EXPORT_SYMBOL(atomic64_cmpxchg);
-
-/**
- * atomic64_xchg - xchg atomic64 variable
- * @ptr:      pointer to type atomic64_t
- * @new_val:  value to assign
- *
- * Atomically xchgs the value of @ptr to @new_val and returns
- * the old value.
- */
-u64 atomic64_xchg(atomic64_t *ptr, u64 new_val)
-{
-	/*
-	 * Try first with a (possibly incorrect) assumption about
-	 * what we have there. We'll do two loops most likely,
-	 * but we'll get an ownership MESI transaction straight away
-	 * instead of a read transaction followed by a
-	 * flush-for-ownership transaction:
-	 */
-	u64 old_val, real_val = 0;
-
-	do {
-		old_val = real_val;
-
-		real_val = atomic64_cmpxchg(ptr, old_val, new_val);
-
-	} while (real_val != old_val);
-
-	return old_val;
-}
-EXPORT_SYMBOL(atomic64_xchg);
-
-/**
- * atomic64_set - set atomic64 variable
- * @ptr:      pointer to type atomic64_t
- * @new_val:  value to assign
- *
- * Atomically sets the value of @ptr to @new_val.
- */
-void atomic64_set(atomic64_t *ptr, u64 new_val)
-{
-	atomic64_xchg(ptr, new_val);
-}
-EXPORT_SYMBOL(atomic64_set);
-
-/**
-EXPORT_SYMBOL(atomic64_read);
- * atomic64_add_return - add and return
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns @delta + *@ptr
- */
-noinline u64 atomic64_add_return(u64 delta, atomic64_t *ptr)
-{
-	/*
-	 * Try first with a (possibly incorrect) assumption about
-	 * what we have there. We'll do two loops most likely,
-	 * but we'll get an ownership MESI transaction straight away
-	 * instead of a read transaction followed by a
-	 * flush-for-ownership transaction:
-	 */
-	u64 old_val, new_val, real_val = 0;
-
-	do {
-		old_val = real_val;
-		new_val = old_val + delta;
-
-		real_val = atomic64_cmpxchg(ptr, old_val, new_val);
-
-	} while (real_val != old_val);
-
-	return new_val;
-}
-EXPORT_SYMBOL(atomic64_add_return);
-
-u64 atomic64_sub_return(u64 delta, atomic64_t *ptr)
-{
-	return atomic64_add_return(-delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_sub_return);
-
-u64 atomic64_inc_return(atomic64_t *ptr)
-{
-	return atomic64_add_return(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc_return);
-
-u64 atomic64_dec_return(atomic64_t *ptr)
-{
-	return atomic64_sub_return(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec_return);
-
-/**
- * atomic64_add - add integer to atomic64 variable
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr.
- */
-void atomic64_add(u64 delta, atomic64_t *ptr)
-{
-	atomic64_add_return(delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_add);
-
-/**
- * atomic64_sub - subtract the atomic64 variable
- * @delta: integer value to subtract
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr.
- */
-void atomic64_sub(u64 delta, atomic64_t *ptr)
-{
-	atomic64_add(-delta, ptr);
-}
-EXPORT_SYMBOL(atomic64_sub);
-
-/**
- * atomic64_sub_and_test - subtract value from variable and test result
- * @delta: integer value to subtract
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically subtracts @delta from @ptr and returns
- * true if the result is zero, or false for all
- * other cases.
- */
-int atomic64_sub_and_test(u64 delta, atomic64_t *ptr)
-{
-	u64 new_val = atomic64_sub_return(delta, ptr);
-
-	return new_val == 0;
-}
-EXPORT_SYMBOL(atomic64_sub_and_test);
-
-/**
- * atomic64_inc - increment atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1.
- */
-void atomic64_inc(atomic64_t *ptr)
-{
-	atomic64_add(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc);
-
-/**
- * atomic64_dec - decrement atomic64 variable
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1.
- */
-void atomic64_dec(atomic64_t *ptr)
-{
-	atomic64_sub(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec);
-
-/**
- * atomic64_dec_and_test - decrement and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically decrements @ptr by 1 and
- * returns true if the result is 0, or false for all other
- * cases.
- */
-int atomic64_dec_and_test(atomic64_t *ptr)
-{
-	return atomic64_sub_and_test(1, ptr);
-}
-EXPORT_SYMBOL(atomic64_dec_and_test);
-
-/**
- * atomic64_inc_and_test - increment and test
- * @ptr: pointer to type atomic64_t
- *
- * Atomically increments @ptr by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-int atomic64_inc_and_test(atomic64_t *ptr)
-{
-	return atomic64_sub_and_test(-1, ptr);
-}
-EXPORT_SYMBOL(atomic64_inc_and_test);
-
-/**
- * atomic64_add_negative - add and test if negative
- * @delta: integer value to add
- * @ptr:   pointer to type atomic64_t
- *
- * Atomically adds @delta to @ptr and returns true
- * if the result is negative, or false when
- * result is greater than or equal to zero.
- */
-int atomic64_add_negative(u64 delta, atomic64_t *ptr)
-{
-	s64 new_val = atomic64_add_return(delta, ptr);
-
-	return new_val < 0;
-}
-EXPORT_SYMBOL(atomic64_add_negative);
+long long atomic64_read_cx8(long long, const atomic64_t *v);
+EXPORT_SYMBOL(atomic64_read_cx8);
+long long atomic64_set_cx8(long long, const atomic64_t *v);
+EXPORT_SYMBOL(atomic64_set_cx8);
+long long atomic64_xchg_cx8(long long, unsigned high);
+EXPORT_SYMBOL(atomic64_xchg_cx8);
+long long atomic64_add_return_cx8(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_add_return_cx8);
+long long atomic64_sub_return_cx8(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_sub_return_cx8);
+long long atomic64_inc_return_cx8(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_return_cx8);
+long long atomic64_dec_return_cx8(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_return_cx8);
+long long atomic64_dec_if_positive_cx8(atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_if_positive_cx8);
+int atomic64_inc_not_zero_cx8(atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_not_zero_cx8);
+int atomic64_add_unless_cx8(atomic64_t *v, long long a, long long u);
+EXPORT_SYMBOL(atomic64_add_unless_cx8);
+
+#ifndef CONFIG_X86_CMPXCHG64
+long long atomic64_read_386(long long, const atomic64_t *v);
+EXPORT_SYMBOL(atomic64_read_386);
+long long atomic64_set_386(long long, const atomic64_t *v);
+EXPORT_SYMBOL(atomic64_set_386);
+long long atomic64_xchg_386(long long, unsigned high);
+EXPORT_SYMBOL(atomic64_xchg_386);
+long long atomic64_add_return_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_add_return_386);
+long long atomic64_sub_return_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_sub_return_386);
+long long atomic64_inc_return_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_return_386);
+long long atomic64_dec_return_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_return_386);
+long long atomic64_add_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_add_386);
+long long atomic64_sub_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_sub_386);
+long long atomic64_inc_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_386);
+long long atomic64_dec_386(long long a, atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_386);
+long long atomic64_dec_if_positive_386(atomic64_t *v);
+EXPORT_SYMBOL(atomic64_dec_if_positive_386);
+int atomic64_inc_not_zero_386(atomic64_t *v);
+EXPORT_SYMBOL(atomic64_inc_not_zero_386);
+int atomic64_add_unless_386(atomic64_t *v, long long a, long long u);
+EXPORT_SYMBOL(atomic64_add_unless_386);
+#endif
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
new file mode 100644
index 00000000000..5db07fe4a0c
--- /dev/null
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -0,0 +1,175 @@
+/*
+ * atomic64_t for 386/486
+ *
+ * Copyright © 2010  Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+/* if you want SMP support, implement these with real spinlocks */
+.macro LOCK reg
+	pushfl
+	CFI_ADJUST_CFA_OFFSET 4
+	cli
+.endm
+
+.macro UNLOCK reg
+	popfl
+	CFI_ADJUST_CFA_OFFSET -4
+.endm
+
+.macro BEGIN func reg
+$v = \reg
+
+ENTRY(atomic64_\func\()_386)
+	CFI_STARTPROC
+	LOCK $v
+
+.macro RETURN
+	UNLOCK $v
+	ret
+.endm
+
+.macro END_
+	CFI_ENDPROC
+ENDPROC(atomic64_\func\()_386)
+.purgem RETURN
+.purgem END_
+.purgem END
+.endm
+
+.macro END
+RETURN
+END_
+.endm
+.endm
+
+BEGIN read %ecx
+	movl  ($v), %eax
+	movl 4($v), %edx
+END
+
+BEGIN set %esi
+	movl %ebx,  ($v)
+	movl %ecx, 4($v)
+END
+
+BEGIN xchg %esi
+	movl  ($v), %eax
+	movl 4($v), %edx
+	movl %ebx,  ($v)
+	movl %ecx, 4($v)
+END
+
+BEGIN add %ecx
+	addl %eax,  ($v)
+	adcl %edx, 4($v)
+END
+
+BEGIN add_return %ecx
+	addl  ($v), %eax
+	adcl 4($v), %edx
+	movl %eax,  ($v)
+	movl %edx, 4($v)
+END
+
+BEGIN sub %ecx
+	subl %eax,  ($v)
+	sbbl %edx, 4($v)
+END
+
+BEGIN sub_return %ecx
+	negl %edx
+	negl %eax
+	sbbl $0, %edx
+	addl  ($v), %eax
+	adcl 4($v), %edx
+	movl %eax,  ($v)
+	movl %edx, 4($v)
+END
+
+BEGIN inc %esi
+	addl $1,  ($v)
+	adcl $0, 4($v)
+END
+
+BEGIN inc_return %esi
+	movl  ($v), %eax
+	movl 4($v), %edx
+	addl $1, %eax
+	adcl $0, %edx
+	movl %eax,  ($v)
+	movl %edx, 4($v)
+END
+
+BEGIN dec %esi
+	subl $1,  ($v)
+	sbbl $0, 4($v)
+END
+
+BEGIN dec_return %esi
+	movl  ($v), %eax
+	movl 4($v), %edx
+	subl $1, %eax
+	sbbl $0, %edx
+	movl %eax,  ($v)
+	movl %edx, 4($v)
+END
+
+BEGIN add_unless %ecx
+	addl %eax, %esi
+	adcl %edx, %edi
+	addl  ($v), %eax
+	adcl 4($v), %edx
+	cmpl %eax, %esi
+	je 3f
+1:
+	movl %eax,  ($v)
+	movl %edx, 4($v)
+	xorl %eax, %eax
+2:
+RETURN
+3:
+	cmpl %edx, %edi
+	jne 1b
+	movl $1, %eax
+	jmp 2b
+END_
+
+BEGIN inc_not_zero %esi
+	movl  ($v), %eax
+	movl 4($v), %edx
+	testl %eax, %eax
+	je 3f
+1:
+	addl $1, %eax
+	adcl $0, %edx
+	movl %eax,  ($v)
+	movl %edx, 4($v)
+	xorl %eax, %eax
+2:
+RETURN
+3:
+	testl %edx, %edx
+	jne 1b
+	movl $1, %eax
+	jmp 2b
+END_
+
+BEGIN dec_if_positive %esi
+	movl  ($v), %eax
+	movl 4($v), %edx
+	subl $1, %eax
+	sbbl $0, %edx
+	js 1f
+	movl %eax,  ($v)
+	movl %edx, 4($v)
+1:
+END
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
new file mode 100644
index 00000000000..e49c4ebca9f
--- /dev/null
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -0,0 +1,225 @@
+/*
+ * atomic64_t for 586+
+ *
+ * Copyright © 2010  Luca Barbieri
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+.macro SAVE reg
+	pushl %\reg
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET \reg, 0
+.endm
+
+.macro RESTORE reg
+	popl %\reg
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE \reg
+.endm
+
+.macro read64 reg
+	movl %ebx, %eax
+	movl %ecx, %edx
+/* we need LOCK_PREFIX since otherwise cmpxchg8b always does the write */
+	LOCK_PREFIX
+	cmpxchg8b (\reg)
+.endm
+
+ENTRY(atomic64_read_cx8)
+	CFI_STARTPROC
+
+	read64 %ecx
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_read_cx8)
+
+ENTRY(atomic64_set_cx8)
+	CFI_STARTPROC
+
+1:
+/* we don't need LOCK_PREFIX since aligned 64-bit writes
+ * are atomic on 586 and newer */
+	cmpxchg8b (%esi)
+	jne 1b
+
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_set_cx8)
+
+ENTRY(atomic64_xchg_cx8)
+	CFI_STARTPROC
+
+	movl %ebx, %eax
+	movl %ecx, %edx
+1:
+	LOCK_PREFIX
+	cmpxchg8b (%esi)
+	jne 1b
+
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_xchg_cx8)
+
+.macro addsub_return func ins insc
+ENTRY(atomic64_\func\()_return_cx8)
+	CFI_STARTPROC
+	SAVE ebp
+	SAVE ebx
+	SAVE esi
+	SAVE edi
+
+	movl %eax, %esi
+	movl %edx, %edi
+	movl %ecx, %ebp
+
+	read64 %ebp
+1:
+	movl %eax, %ebx
+	movl %edx, %ecx
+	\ins\()l %esi, %ebx
+	\insc\()l %edi, %ecx
+	LOCK_PREFIX
+	cmpxchg8b (%ebp)
+	jne 1b
+
+10:
+	movl %ebx, %eax
+	movl %ecx, %edx
+	RESTORE edi
+	RESTORE esi
+	RESTORE ebx
+	RESTORE ebp
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_\func\()_return_cx8)
+.endm
+
+addsub_return add add adc
+addsub_return sub sub sbb
+
+.macro incdec_return func ins insc
+ENTRY(atomic64_\func\()_return_cx8)
+	CFI_STARTPROC
+	SAVE ebx
+
+	read64 %esi
+1:
+	movl %eax, %ebx
+	movl %edx, %ecx
+	\ins\()l $1, %ebx
+	\insc\()l $0, %ecx
+	LOCK_PREFIX
+	cmpxchg8b (%esi)
+	jne 1b
+
+10:
+	movl %ebx, %eax
+	movl %ecx, %edx
+	RESTORE ebx
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_\func\()_return_cx8)
+.endm
+
+incdec_return inc add adc
+incdec_return dec sub sbb
+
+ENTRY(atomic64_dec_if_positive_cx8)
+	CFI_STARTPROC
+	SAVE ebx
+
+	read64 %esi
+1:
+	movl %eax, %ebx
+	movl %edx, %ecx
+	subl $1, %ebx
+	sbb $0, %ecx
+	js 2f
+	LOCK_PREFIX
+	cmpxchg8b (%esi)
+	jne 1b
+
+2:
+	movl %ebx, %eax
+	movl %ecx, %edx
+	RESTORE ebx
+	ret
+	CFI_ENDPROC
+ENDPROC(atomic64_dec_if_positive_cx8)
+
+ENTRY(atomic64_add_unless_cx8)
+	CFI_STARTPROC
+	SAVE ebp
+	SAVE ebx
+/* these just push these two parameters on the stack */
+	SAVE edi
+	SAVE esi
+
+	movl %ecx, %ebp
+	movl %eax, %esi
+	movl %edx, %edi
+
+	read64 %ebp
+1:
+	cmpl %eax, 0(%esp)
+	je 4f
+2:
+	movl %eax, %ebx
+	movl %edx, %ecx
+	addl %esi, %ebx
+	adcl %edi, %ecx
+	LOCK_PREFIX
+	cmpxchg8b (%ebp)
+	jne 1b
+
+	xorl %eax, %eax
+3:
+	addl $8, %esp
+	CFI_ADJUST_CFA_OFFSET -8
+	RESTORE ebx
+	RESTORE ebp
+	ret
+4:
+	cmpl %edx, 4(%esp)
+	jne 2b
+	movl $1, %eax
+	jmp 3b
+	CFI_ENDPROC
+ENDPROC(atomic64_add_unless_cx8)
+
+ENTRY(atomic64_inc_not_zero_cx8)
+	CFI_STARTPROC
+	SAVE ebx
+
+	read64 %esi
+1:
+	testl %eax, %eax
+	je 4f
+2:
+	movl %eax, %ebx
+	movl %edx, %ecx
+	addl $1, %ebx
+	adcl $0, %ecx
+	LOCK_PREFIX
+	cmpxchg8b (%esi)
+	jne 1b
+
+	xorl %eax, %eax
+3:
+	RESTORE ebx
+	ret
+4:
+	testl %edx, %edx
+	jne 2b
+	movl $1, %eax
+	jmp 3b
+	CFI_ENDPROC
+ENDPROC(atomic64_inc_not_zero_cx8)
-- 
cgit v1.2.3


From 8f4f202b335144bf5be5c9e5b1bc9477ecdae958 Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Fri, 26 Feb 2010 12:22:40 +0100
Subject: lib: Only test atomic64_dec_if_positive on archs having it

Currently atomic64_dec_if_positive() is only supported by PowerPC,
MIPS and x86-32.

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267183361-20775-1-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 lib/atomic64_test.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index 4ff649e46ba..0effcacbebd 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -112,6 +112,7 @@ static __init int test_atomic64(void)
 	r += one;
 	BUG_ON(v.counter != r);
 
+#if defined(CONFIG_X86_32) || defined(CONFIG_MIPS) || defined(CONFIG_PPC) || defined(_ASM_GENERIC_ATOMIC64_H)
 	INIT(onestwos);
 	BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1));
 	r -= one;
@@ -124,6 +125,9 @@ static __init int test_atomic64(void)
 	INIT(-one);
 	BUG_ON(atomic64_dec_if_positive(&v) != (-one - one));
 	BUG_ON(v.counter != r);
+#else
+#warning Please implement atomic64_dec_if_positive for your architecture, and add it to the IF above
+#endif
 
 	INIT(onestwos);
 	BUG_ON(atomic64_inc_not_zero(&v));
-- 
cgit v1.2.3


From d7f6de1e9c4a12e11ba7186c70f0f40caa76f590 Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Fri, 26 Feb 2010 12:22:41 +0100
Subject: x86: Implement atomic[64]_dec_if_positive()

Add support for atomic_dec_if_positive(), and
atomic64_dec_if_positive() for x86-64.

atomic64_dec_if_positive() for x86-32 was already implemented in a previous patch.

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267183361-20775-2-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/atomic.h      | 23 +++++++++++++++++++++++
 arch/x86/include/asm/atomic64_64.h | 23 +++++++++++++++++++++++
 lib/atomic64_test.c                |  2 +-
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 8f8217b9bda..706c69492c1 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -246,6 +246,29 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 
 #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
 
+/*
+ * atomic_dec_if_positive - decrement by 1 if old value positive
+ * @v: pointer of type atomic_t
+ *
+ * The function returns the old value of *v minus 1, even if
+ * the atomic variable, v, was not decremented.
+ */
+static inline int atomic_dec_if_positive(atomic_t *v)
+{
+	int c, old, dec;
+	c = atomic_read(v);
+	for (;;) {
+		dec = c - 1;
+		if (unlikely(dec < 0))
+			break;
+		old = atomic_cmpxchg((v), c, dec);
+		if (likely(old == c))
+			break;
+		c = old;
+	}
+	return dec;
+}
+
 /**
  * atomic_inc_short - increment of a short integer
  * @v: pointer to type int
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 51c5b405692..4d6e2cd6c88 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -221,4 +221,27 @@ static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
 
 #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
 
+/*
+ * atomic64_dec_if_positive - decrement by 1 if old value positive
+ * @v: pointer of type atomic_t
+ *
+ * The function returns the old value of *v minus 1, even if
+ * the atomic variable, v, was not decremented.
+ */
+static inline long atomic64_dec_if_positive(atomic64_t *v)
+{
+	long c, old, dec;
+	c = atomic64_read(v);
+	for (;;) {
+		dec = c - 1;
+		if (unlikely(dec < 0))
+			break;
+		old = atomic64_cmpxchg((v), c, dec);
+		if (likely(old == c))
+			break;
+		c = old;
+	}
+	return dec;
+}
+
 #endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index 0effcacbebd..58efdabb384 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -112,7 +112,7 @@ static __init int test_atomic64(void)
 	r += one;
 	BUG_ON(v.counter != r);
 
-#if defined(CONFIG_X86_32) || defined(CONFIG_MIPS) || defined(CONFIG_PPC) || defined(_ASM_GENERIC_ATOMIC64_H)
+#if defined(CONFIG_X86) || defined(CONFIG_MIPS) || defined(CONFIG_PPC) || defined(_ASM_GENERIC_ATOMIC64_H)
 	INIT(onestwos);
 	BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1));
 	r -= one;
-- 
cgit v1.2.3


From 9efbcd590243045111670c171a951923b877b57d Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Mon, 1 Mar 2010 19:55:45 +0100
Subject: lib: Fix atomic64_add_unless test

atomic64_add_unless must return 1 if it perfomed the add and 0 otherwise.
The test assumed the opposite convention.

Reported-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267469749-11878-2-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 lib/atomic64_test.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index 58efdabb384..ee8e6de8b41 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -104,11 +104,11 @@ static __init int test_atomic64(void)
 	BUG_ON(v.counter != r);
 
 	INIT(v0);
-	BUG_ON(!atomic64_add_unless(&v, one, v0));
+	BUG_ON(atomic64_add_unless(&v, one, v0));
 	BUG_ON(v.counter != r);
 
 	INIT(v0);
-	BUG_ON(atomic64_add_unless(&v, one, v1));
+	BUG_ON(!atomic64_add_unless(&v, one, v1));
 	r += one;
 	BUG_ON(v.counter != r);
 
-- 
cgit v1.2.3


From 6e6104fe085026e6ef82cc5cc303d6c8ceb7e411 Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Mon, 1 Mar 2010 19:55:46 +0100
Subject: x86-32: Fix atomic64_add_unless return value convention

atomic64_add_unless must return 1 if it perfomed the add and 0 otherwise.
The implementation did the opposite thing.

Reported-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267469749-11878-3-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/lib/atomic64_386_32.S | 4 ++--
 arch/x86/lib/atomic64_cx8_32.S | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 5db07fe4a0c..a2f847c88b8 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -133,13 +133,13 @@ BEGIN add_unless %ecx
 1:
 	movl %eax,  ($v)
 	movl %edx, 4($v)
-	xorl %eax, %eax
+	movl $1, %eax
 2:
 RETURN
 3:
 	cmpl %edx, %edi
 	jne 1b
-	movl $1, %eax
+	xorl %eax, %eax
 	jmp 2b
 END_
 
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index e49c4ebca9f..d0e37b189f8 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -180,7 +180,7 @@ ENTRY(atomic64_add_unless_cx8)
 	cmpxchg8b (%ebp)
 	jne 1b
 
-	xorl %eax, %eax
+	movl $1, %eax
 3:
 	addl $8, %esp
 	CFI_ADJUST_CFA_OFFSET -8
@@ -190,7 +190,7 @@ ENTRY(atomic64_add_unless_cx8)
 4:
 	cmpl %edx, 4(%esp)
 	jne 2b
-	movl $1, %eax
+	xorl %eax, %eax
 	jmp 3b
 	CFI_ENDPROC
 ENDPROC(atomic64_add_unless_cx8)
-- 
cgit v1.2.3


From 97577896f6b9c056fa0a5e9f6a608110cb3dcd33 Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Mon, 1 Mar 2010 19:55:47 +0100
Subject: lib: Fix atomic64_add_unless return value convention

atomic64_add_unless must return 1 if it perfomed the add and 0 otherwise.
The generic implementation did the opposite thing.

Reported-by: H. Peter Anvin <hpa@zytor.com>
Confirmed-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267469749-11878-4-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 lib/atomic64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/atomic64.c b/lib/atomic64.c
index 8bee16ec752..a21c12bc727 100644
--- a/lib/atomic64.c
+++ b/lib/atomic64.c
@@ -162,12 +162,12 @@ int atomic64_add_unless(atomic64_t *v, long long a, long long u)
 {
 	unsigned long flags;
 	spinlock_t *lock = lock_addr(v);
-	int ret = 1;
+	int ret = 0;
 
 	spin_lock_irqsave(lock, flags);
 	if (v->counter != u) {
 		v->counter += a;
-		ret = 0;
+		ret = 1;
 	}
 	spin_unlock_irqrestore(lock, flags);
 	return ret;
-- 
cgit v1.2.3


From 25a304f277ad70166eeae25a4958d2049005c33a Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Mon, 1 Mar 2010 19:55:48 +0100
Subject: lib: Fix atomic64_inc_not_zero test

atomic64_inc_not_zero must return 1 if it perfomed the add and 0 otherwise.
The test assumed the opposite convention.

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267469749-11878-5-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 lib/atomic64_test.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index ee8e6de8b41..f7bb706c9c3 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -130,16 +130,16 @@ static __init int test_atomic64(void)
 #endif
 
 	INIT(onestwos);
-	BUG_ON(atomic64_inc_not_zero(&v));
+	BUG_ON(!atomic64_inc_not_zero(&v));
 	r += one;
 	BUG_ON(v.counter != r);
 
 	INIT(0);
-	BUG_ON(!atomic64_inc_not_zero(&v));
+	BUG_ON(atomic64_inc_not_zero(&v));
 	BUG_ON(v.counter != r);
 
 	INIT(-one);
-	BUG_ON(atomic64_inc_not_zero(&v));
+	BUG_ON(!atomic64_inc_not_zero(&v));
 	r += one;
 	BUG_ON(v.counter != r);
 
-- 
cgit v1.2.3


From f3e83131469e29032a700217aa394996107b8fc5 Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Mon, 1 Mar 2010 19:55:49 +0100
Subject: x86-32: Fix atomic64_inc_not_zero return value convention

atomic64_inc_not_zero must return 1 if it perfomed the add and 0 otherwise.
It was doing the opposite thing.

Signed-off-by: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267469749-11878-6-git-send-email-luca@luca-barbieri.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/lib/atomic64_386_32.S | 3 +--
 arch/x86/lib/atomic64_cx8_32.S | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index a2f847c88b8..4a5979aa688 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -153,13 +153,12 @@ BEGIN inc_not_zero %esi
 	adcl $0, %edx
 	movl %eax,  ($v)
 	movl %edx, 4($v)
-	xorl %eax, %eax
+	movl $1, %eax
 2:
 RETURN
 3:
 	testl %edx, %edx
 	jne 1b
-	movl $1, %eax
 	jmp 2b
 END_
 
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index d0e37b189f8..71e080de335 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -212,14 +212,13 @@ ENTRY(atomic64_inc_not_zero_cx8)
 	cmpxchg8b (%esi)
 	jne 1b
 
-	xorl %eax, %eax
+	movl $1, %eax
 3:
 	RESTORE ebx
 	ret
 4:
 	testl %edx, %edx
 	jne 2b
-	movl $1, %eax
 	jmp 3b
 	CFI_ENDPROC
 ENDPROC(atomic64_inc_not_zero_cx8)
-- 
cgit v1.2.3


From a5c9161f27c3e1ae6c0094d262f03a7e98262181 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 1 Mar 2010 11:49:23 -0800
Subject: x86, atomic64: In selftest, distinguish x86-64 from 586+

The x86-64 implementation of the atomics is totally different from the
i586+ implementation, which makes it quite confusing to call it
"586+".  Also fix indentation, and add "i" for "i386" and "i586" as
used elsewhere in the kernel.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267005265-27958-4-git-send-email-luca@luca-barbieri.com>
---
 lib/atomic64_test.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index f7bb706c9c3..65e482caf5e 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -144,14 +144,16 @@ static __init int test_atomic64(void)
 	BUG_ON(v.counter != r);
 
 #ifdef CONFIG_X86
-	printk(KERN_INFO "atomic64 test passed for %s+ platform %s CX8 and %s SSE\n",
-#ifdef CONFIG_X86_CMPXCHG64
-			"586",
+	printk(KERN_INFO "atomic64 test passed for %s platform %s CX8 and %s SSE\n",
+#ifdef CONFIG_X86_64
+	       "x86-64",
+#elif defined(CONFIG_X86_CMPXCHG64)
+	       "i586+",
 #else
-			"386",
+	       "i386+",
 #endif
-			boot_cpu_has(X86_FEATURE_CX8) ? "with" : "without",
-			boot_cpu_has(X86_FEATURE_XMM) ? "with" : "without");
+	       boot_cpu_has(X86_FEATURE_CX8) ? "with" : "without",
+	       boot_cpu_has(X86_FEATURE_XMM) ? "with" : "without");
 #else
 	printk(KERN_INFO "atomic64 test passed\n");
 #endif
-- 
cgit v1.2.3


From 4daa2a8093ecd1148270a1fc64e99f072b8c2901 Mon Sep 17 00:00:00 2001
From: "Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>
Date: Wed, 24 Feb 2010 13:43:55 -0800
Subject: x86, pat: In rbt_memtype_check_insert(), update new->type only if
 valid

new->type should only change when there is a valid ret_type. Otherwise
the requested type and return type should be same.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
LKML-Reference: <20100224214355.GA16431@linux-os.sc.intel.com>
Tested-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/mm/pat_rbtree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 9063f40b638..07de4cb8cc3 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -223,7 +223,9 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
 						new->type, ret_type);
 
 	if (!err) {
-		new->type = *ret_type;
+		if (ret_type)
+			new->type = *ret_type;
+
 		memtype_rb_insert(&memtype_rbroot, new);
 	}
 	return err;
-- 
cgit v1.2.3


From 372e22ef0a87d5fc10d387791f9f19721115820c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 1 Mar 2010 22:06:25 -0800
Subject: x86, docbook: Fix errors from x86 headers merger

Fix docbook errors for x86 headers that were recently merged:

docproc: arch/x86/include/asm/atomic_32.h: No such file or directory
docproc: arch/x86/include/asm/io_32.h: No such file or directory

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
LKML-Reference: <20100301220625.bad46c19.randy.dunlap@oracle.com>
Cc: Brian Gerst <brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/DocBook/device-drivers.tmpl | 2 +-
 Documentation/DocBook/deviceiobook.tmpl   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl
index f9a6e2c75f1..1b2dd4fc3db 100644
--- a/Documentation/DocBook/device-drivers.tmpl
+++ b/Documentation/DocBook/device-drivers.tmpl
@@ -45,7 +45,7 @@
      </sect1>
 
      <sect1><title>Atomic and pointer manipulation</title>
-!Iarch/x86/include/asm/atomic_32.h
+!Iarch/x86/include/asm/atomic.h
 !Iarch/x86/include/asm/unaligned.h
      </sect1>
 
diff --git a/Documentation/DocBook/deviceiobook.tmpl b/Documentation/DocBook/deviceiobook.tmpl
index 3ed88126ab8..c1ed6a49e59 100644
--- a/Documentation/DocBook/deviceiobook.tmpl
+++ b/Documentation/DocBook/deviceiobook.tmpl
@@ -316,7 +316,7 @@ CPU B:  spin_unlock_irqrestore(&amp;dev_lock, flags)
 
   <chapter id="pubfunctions">
      <title>Public Functions Provided</title>
-!Iarch/x86/include/asm/io_32.h
+!Iarch/x86/include/asm/io.h
 !Elib/iomap.c
   </chapter>
 
-- 
cgit v1.2.3


From ced918eb748ce30b3aace549fd17540e40ffdca0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 17 Feb 2010 16:47:10 +0000
Subject: i8253: Convert i8253_lock to raw_spinlock

i8253_lock needs to be a real spinlock in preempt-rt, i.e. it can
not be converted to a sleeping lock.

Convert it to raw_spinlock and fix up all users.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Acked-by: Takashi Iwai <tiwai@suse.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <20100217163751.030764372@linutronix.de>
---
 arch/mips/include/asm/i8253.h     |  2 +-
 arch/mips/kernel/i8253.c          | 14 +++++++-------
 arch/x86/include/asm/i8253.h      |  2 +-
 arch/x86/kernel/apm_32.c          |  4 ++--
 arch/x86/kernel/i8253.c           | 14 +++++++-------
 drivers/block/hd.c                |  4 ++--
 drivers/input/gameport/gameport.c |  4 ++--
 drivers/input/joystick/analog.c   |  4 ++--
 drivers/input/misc/pcspkr.c       |  6 +++---
 sound/drivers/pcsp/pcsp.h         |  2 +-
 sound/drivers/pcsp/pcsp_input.c   |  4 ++--
 sound/drivers/pcsp/pcsp_lib.c     | 12 ++++++------
 12 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/arch/mips/include/asm/i8253.h b/arch/mips/include/asm/i8253.h
index 032ca73f181..48bb8237299 100644
--- a/arch/mips/include/asm/i8253.h
+++ b/arch/mips/include/asm/i8253.h
@@ -12,7 +12,7 @@
 #define PIT_CH0			0x40
 #define PIT_CH2			0x42
 
-extern spinlock_t i8253_lock;
+extern raw_spinlock_t i8253_lock;
 
 extern void setup_pit_timer(void);
 
diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c
index ed5c441615e..94794062a17 100644
--- a/arch/mips/kernel/i8253.c
+++ b/arch/mips/kernel/i8253.c
@@ -15,7 +15,7 @@
 #include <asm/io.h>
 #include <asm/time.h>
 
-DEFINE_SPINLOCK(i8253_lock);
+DEFINE_RAW_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
 
 /*
@@ -26,7 +26,7 @@ EXPORT_SYMBOL(i8253_lock);
 static void init_pit_timer(enum clock_event_mode mode,
 			   struct clock_event_device *evt)
 {
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 
 	switch(mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
@@ -55,7 +55,7 @@ static void init_pit_timer(enum clock_event_mode mode,
 		/* Nothing to do here */
 		break;
 	}
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 }
 
 /*
@@ -65,10 +65,10 @@ static void init_pit_timer(enum clock_event_mode mode,
  */
 static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
 {
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 	outb_p(delta & 0xff , PIT_CH0);	/* LSB */
 	outb(delta >> 8 , PIT_CH0);	/* MSB */
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 
 	return 0;
 }
@@ -137,7 +137,7 @@ static cycle_t pit_read(struct clocksource *cs)
 	static int old_count;
 	static u32 old_jifs;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	/*
 	 * Although our caller may have the read side of xtime_lock,
 	 * this is now a seqlock, and we are cheating in this routine
@@ -183,7 +183,7 @@ static cycle_t pit_read(struct clocksource *cs)
 	old_count = count;
 	old_jifs = jifs;
 
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
 	count = (LATCH - 1) - count;
 
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
index 1edbf89680f..fc1f579fb96 100644
--- a/arch/x86/include/asm/i8253.h
+++ b/arch/x86/include/asm/i8253.h
@@ -6,7 +6,7 @@
 #define PIT_CH0			0x40
 #define PIT_CH2			0x42
 
-extern spinlock_t i8253_lock;
+extern raw_spinlock_t i8253_lock;
 
 extern struct clock_event_device *global_clock_event;
 
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 031aa887b0e..c4f9182ca3a 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1224,7 +1224,7 @@ static void reinit_timer(void)
 #ifdef INIT_TIMER_AFTER_SUSPEND
 	unsigned long flags;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	/* set the clock to HZ */
 	outb_pit(0x34, PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */
 	udelay(10);
@@ -1232,7 +1232,7 @@ static void reinit_timer(void)
 	udelay(10);
 	outb_pit(LATCH >> 8, PIT_CH0);	/* MSB */
 	udelay(10);
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 #endif
 }
 
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 23c167925a5..2dfd3159744 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -16,7 +16,7 @@
 #include <asm/hpet.h>
 #include <asm/smp.h>
 
-DEFINE_SPINLOCK(i8253_lock);
+DEFINE_RAW_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
 
 /*
@@ -33,7 +33,7 @@ struct clock_event_device *global_clock_event;
 static void init_pit_timer(enum clock_event_mode mode,
 			   struct clock_event_device *evt)
 {
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 
 	switch (mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
@@ -62,7 +62,7 @@ static void init_pit_timer(enum clock_event_mode mode,
 		/* Nothing to do here */
 		break;
 	}
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 }
 
 /*
@@ -72,10 +72,10 @@ static void init_pit_timer(enum clock_event_mode mode,
  */
 static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
 {
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 	outb_pit(delta & 0xff , PIT_CH0);	/* LSB */
 	outb_pit(delta >> 8 , PIT_CH0);		/* MSB */
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 
 	return 0;
 }
@@ -130,7 +130,7 @@ static cycle_t pit_read(struct clocksource *cs)
 	int count;
 	u32 jifs;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	/*
 	 * Although our caller may have the read side of xtime_lock,
 	 * this is now a seqlock, and we are cheating in this routine
@@ -176,7 +176,7 @@ static cycle_t pit_read(struct clocksource *cs)
 	old_count = count;
 	old_jifs = jifs;
 
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
 	count = (LATCH - 1) - count;
 
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 5116c65c07c..b9868ad0278 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -165,12 +165,12 @@ unsigned long read_timer(void)
 	unsigned long t, flags;
 	int i;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	t = jiffies * 11932;
 	outb_p(0, 0x43);
 	i = inb_p(0x40);
 	i |= inb(0x40) << 8;
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 	return(t - i);
 }
 #endif
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index 7e18bcf05a6..46239e47a26 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -59,11 +59,11 @@ static unsigned int get_time_pit(void)
 	unsigned long flags;
 	unsigned int count;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 	outb_p(0x00, 0x43);
 	count = inb_p(0x40);
 	count |= inb_p(0x40) << 8;
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
 	return count;
 }
diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c
index 1c0b529c06a..4afe0a3b488 100644
--- a/drivers/input/joystick/analog.c
+++ b/drivers/input/joystick/analog.c
@@ -146,11 +146,11 @@ static unsigned int get_time_pit(void)
         unsigned long flags;
         unsigned int count;
 
-        spin_lock_irqsave(&i8253_lock, flags);
+        raw_spin_lock_irqsave(&i8253_lock, flags);
         outb_p(0x00, 0x43);
         count = inb_p(0x40);
         count |= inb_p(0x40) << 8;
-        spin_unlock_irqrestore(&i8253_lock, flags);
+        raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
         return count;
 }
diff --git a/drivers/input/misc/pcspkr.c b/drivers/input/misc/pcspkr.c
index ea4e1fd1265..f080dd31499 100644
--- a/drivers/input/misc/pcspkr.c
+++ b/drivers/input/misc/pcspkr.c
@@ -30,7 +30,7 @@ MODULE_ALIAS("platform:pcspkr");
 #include <asm/i8253.h>
 #else
 #include <asm/8253pit.h>
-static DEFINE_SPINLOCK(i8253_lock);
+static DEFINE_RAW_SPINLOCK(i8253_lock);
 #endif
 
 static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int code, int value)
@@ -50,7 +50,7 @@ static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int c
 	if (value > 20 && value < 32767)
 		count = PIT_TICK_RATE / value;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 
 	if (count) {
 		/* set command for counter 2, 2 byte write */
@@ -65,7 +65,7 @@ static int pcspkr_event(struct input_dev *dev, unsigned int type, unsigned int c
 		outb(inb_p(0x61) & 0xFC, 0x61);
 	}
 
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 
 	return 0;
 }
diff --git a/sound/drivers/pcsp/pcsp.h b/sound/drivers/pcsp/pcsp.h
index 1e123077923..4ff6c8cc507 100644
--- a/sound/drivers/pcsp/pcsp.h
+++ b/sound/drivers/pcsp/pcsp.h
@@ -16,7 +16,7 @@
 #include <asm/i8253.h>
 #else
 #include <asm/8253pit.h>
-static DEFINE_SPINLOCK(i8253_lock);
+static DEFINE_RAW_SPINLOCK(i8253_lock);
 #endif
 
 #define PCSP_SOUND_VERSION 0x400	/* read 4.00 */
diff --git a/sound/drivers/pcsp/pcsp_input.c b/sound/drivers/pcsp/pcsp_input.c
index 0444cdeb4be..b5e2b54c260 100644
--- a/sound/drivers/pcsp/pcsp_input.c
+++ b/sound/drivers/pcsp/pcsp_input.c
@@ -21,7 +21,7 @@ static void pcspkr_do_sound(unsigned int count)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&i8253_lock, flags);
+	raw_spin_lock_irqsave(&i8253_lock, flags);
 
 	if (count) {
 		/* set command for counter 2, 2 byte write */
@@ -36,7 +36,7 @@ static void pcspkr_do_sound(unsigned int count)
 		outb(inb_p(0x61) & 0xFC, 0x61);
 	}
 
-	spin_unlock_irqrestore(&i8253_lock, flags);
+	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 }
 
 void pcspkr_stop_sound(void)
diff --git a/sound/drivers/pcsp/pcsp_lib.c b/sound/drivers/pcsp/pcsp_lib.c
index e1145ac6e90..f6a2e72b8cd 100644
--- a/sound/drivers/pcsp/pcsp_lib.c
+++ b/sound/drivers/pcsp/pcsp_lib.c
@@ -65,7 +65,7 @@ static u64 pcsp_timer_update(struct snd_pcsp *chip)
 	timer_cnt = val * CUR_DIV() / 256;
 
 	if (timer_cnt && chip->enable) {
-		spin_lock_irqsave(&i8253_lock, flags);
+		raw_spin_lock_irqsave(&i8253_lock, flags);
 		if (!nforce_wa) {
 			outb_p(chip->val61, 0x61);
 			outb_p(timer_cnt, 0x42);
@@ -74,7 +74,7 @@ static u64 pcsp_timer_update(struct snd_pcsp *chip)
 			outb(chip->val61 ^ 2, 0x61);
 			chip->thalf = 1;
 		}
-		spin_unlock_irqrestore(&i8253_lock, flags);
+		raw_spin_unlock_irqrestore(&i8253_lock, flags);
 	}
 
 	chip->ns_rem = PCSP_PERIOD_NS();
@@ -158,10 +158,10 @@ static int pcsp_start_playing(struct snd_pcsp *chip)
 		return -EIO;
 	}
 
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 	chip->val61 = inb(0x61) | 0x03;
 	outb_p(0x92, 0x43);	/* binary, mode 1, LSB only, ch 2 */
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 	atomic_set(&chip->timer_active, 1);
 	chip->thalf = 0;
 
@@ -178,11 +178,11 @@ static void pcsp_stop_playing(struct snd_pcsp *chip)
 		return;
 
 	atomic_set(&chip->timer_active, 0);
-	spin_lock(&i8253_lock);
+	raw_spin_lock(&i8253_lock);
 	/* restore the timer */
 	outb_p(0xb6, 0x43);	/* binary, mode 3, LSB/MSB, ch 2 */
 	outb(chip->val61 & 0xFC, 0x61);
-	spin_unlock(&i8253_lock);
+	raw_spin_unlock(&i8253_lock);
 }
 
 /*
-- 
cgit v1.2.3


From bc078e4eab65f11bbaeed380593ab8151b30d703 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 2 Mar 2010 16:01:10 +0100
Subject: oprofile: convert oprofile from timer_hook to hrtimer

Oprofile is currently broken on systems running with NOHZ enabled.
A maximum of 1 tick is accounted via the timer_hook if a cpu sleeps
for a longer period of time. This does bad things to the percentages
in the profiler output. To solve this problem convert oprofile to
use a restarting hrtimer instead of the timer_hook.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 drivers/oprofile/oprof.c     | 12 ++++---
 drivers/oprofile/oprof.h     |  3 +-
 drivers/oprofile/timer_int.c | 78 +++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 79 insertions(+), 14 deletions(-)

diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c
index dc8a0428260..b336cd9ee7a 100644
--- a/drivers/oprofile/oprof.c
+++ b/drivers/oprofile/oprof.c
@@ -253,22 +253,26 @@ static int __init oprofile_init(void)
 	int err;
 
 	err = oprofile_arch_init(&oprofile_ops);
-
 	if (err < 0 || timer) {
 		printk(KERN_INFO "oprofile: using timer interrupt.\n");
-		oprofile_timer_init(&oprofile_ops);
+		err = oprofile_timer_init(&oprofile_ops);
+		if (err)
+			goto out_arch;
 	}
-
 	err = oprofilefs_register();
 	if (err)
-		oprofile_arch_exit();
+		goto out_arch;
+	return 0;
 
+out_arch:
+	oprofile_arch_exit();
 	return err;
 }
 
 
 static void __exit oprofile_exit(void)
 {
+	oprofile_timer_exit();
 	oprofilefs_unregister();
 	oprofile_arch_exit();
 }
diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h
index cb92f5c98c1..47e12cb4ee8 100644
--- a/drivers/oprofile/oprof.h
+++ b/drivers/oprofile/oprof.h
@@ -34,7 +34,8 @@ struct super_block;
 struct dentry;
 
 void oprofile_create_files(struct super_block *sb, struct dentry *root);
-void oprofile_timer_init(struct oprofile_operations *ops);
+int oprofile_timer_init(struct oprofile_operations *ops);
+void oprofile_timer_exit(void);
 
 int oprofile_set_backtrace(unsigned long depth);
 int oprofile_set_timeout(unsigned long time);
diff --git a/drivers/oprofile/timer_int.c b/drivers/oprofile/timer_int.c
index 333f915568c..dc0ae4d14df 100644
--- a/drivers/oprofile/timer_int.c
+++ b/drivers/oprofile/timer_int.c
@@ -13,34 +13,94 @@
 #include <linux/oprofile.h>
 #include <linux/profile.h>
 #include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/hrtimer.h>
+#include <asm/irq_regs.h>
 #include <asm/ptrace.h>
 
 #include "oprof.h"
 
-static int timer_notify(struct pt_regs *regs)
+static DEFINE_PER_CPU(struct hrtimer, oprofile_hrtimer);
+
+static enum hrtimer_restart oprofile_hrtimer_notify(struct hrtimer *hrtimer)
+{
+	oprofile_add_sample(get_irq_regs(), 0);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(TICK_NSEC));
+	return HRTIMER_RESTART;
+}
+
+static void __oprofile_hrtimer_start(void *unused)
+{
+	struct hrtimer *hrtimer = &__get_cpu_var(oprofile_hrtimer);
+
+	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer->function = oprofile_hrtimer_notify;
+
+	hrtimer_start(hrtimer, ns_to_ktime(TICK_NSEC),
+		      HRTIMER_MODE_REL_PINNED);
+}
+
+static int oprofile_hrtimer_start(void)
 {
-	oprofile_add_sample(regs, 0);
+	on_each_cpu(__oprofile_hrtimer_start, NULL, 1);
 	return 0;
 }
 
-static int timer_start(void)
+static void __oprofile_hrtimer_stop(int cpu)
 {
-	return register_timer_hook(timer_notify);
+	struct hrtimer *hrtimer = &per_cpu(oprofile_hrtimer, cpu);
+
+	hrtimer_cancel(hrtimer);
 }
 
+static void oprofile_hrtimer_stop(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		__oprofile_hrtimer_stop(cpu);
+}
 
-static void timer_stop(void)
+static int __cpuinit oprofile_cpu_notify(struct notifier_block *self,
+					 unsigned long action, void *hcpu)
 {
-	unregister_timer_hook(timer_notify);
+	long cpu = (long) hcpu;
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		smp_call_function_single(cpu, __oprofile_hrtimer_start,
+					 NULL, 1);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		__oprofile_hrtimer_stop(cpu);
+		break;
+	}
+	return NOTIFY_OK;
 }
 
+static struct notifier_block __refdata oprofile_cpu_notifier = {
+	.notifier_call = oprofile_cpu_notify,
+};
 
-void __init oprofile_timer_init(struct oprofile_operations *ops)
+int __init oprofile_timer_init(struct oprofile_operations *ops)
 {
+	int rc;
+
+	rc = register_hotcpu_notifier(&oprofile_cpu_notifier);
+	if (rc)
+		return rc;
 	ops->create_files = NULL;
 	ops->setup = NULL;
 	ops->shutdown = NULL;
-	ops->start = timer_start;
-	ops->stop = timer_stop;
+	ops->start = oprofile_hrtimer_start;
+	ops->stop = oprofile_hrtimer_stop;
 	ops->cpu_type = "timer";
+	return 0;
+}
+
+void __exit oprofile_timer_exit(void)
+{
+	unregister_hotcpu_notifier(&oprofile_cpu_notifier);
 }
-- 
cgit v1.2.3


From 4abc14a733f9002c05623db755aaafdd27fa7a91 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 20 Jan 2010 14:52:23 +0100
Subject: iommu-api: Rename ->{un}map function pointers to ->{un}map_range

The new function pointer names match better with the
top-level functions of the iommu-api which are using them.
Main intention of this change is to make the ->{un}map
pointer names free for two new mapping functions.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 4 ++--
 drivers/base/iommu.c        | 4 ++--
 drivers/pci/intel-iommu.c   | 4 ++--
 include/linux/iommu.h       | 8 ++++----
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index adb0ba02570..59cae7c4df5 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2515,8 +2515,8 @@ static struct iommu_ops amd_iommu_ops = {
 	.domain_destroy = amd_iommu_domain_destroy,
 	.attach_dev = amd_iommu_attach_device,
 	.detach_dev = amd_iommu_detach_device,
-	.map = amd_iommu_map_range,
-	.unmap = amd_iommu_unmap_range,
+	.map_range = amd_iommu_map_range,
+	.unmap_range = amd_iommu_unmap_range,
 	.iova_to_phys = amd_iommu_iova_to_phys,
 	.domain_has_cap = amd_iommu_domain_has_cap,
 };
diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c
index 8ad4ffea692..f4c86c42929 100644
--- a/drivers/base/iommu.c
+++ b/drivers/base/iommu.c
@@ -83,14 +83,14 @@ EXPORT_SYMBOL_GPL(iommu_detach_device);
 int iommu_map_range(struct iommu_domain *domain, unsigned long iova,
 		    phys_addr_t paddr, size_t size, int prot)
 {
-	return iommu_ops->map(domain, iova, paddr, size, prot);
+	return iommu_ops->map_range(domain, iova, paddr, size, prot);
 }
 EXPORT_SYMBOL_GPL(iommu_map_range);
 
 void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova,
 		      size_t size)
 {
-	iommu_ops->unmap(domain, iova, size);
+	iommu_ops->unmap_range(domain, iova, size);
 }
 EXPORT_SYMBOL_GPL(iommu_unmap_range);
 
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 417312528dd..a714e3db13c 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3714,8 +3714,8 @@ static struct iommu_ops intel_iommu_ops = {
 	.domain_destroy = intel_iommu_domain_destroy,
 	.attach_dev	= intel_iommu_attach_device,
 	.detach_dev	= intel_iommu_detach_device,
-	.map		= intel_iommu_map_range,
-	.unmap		= intel_iommu_unmap_range,
+	.map_range	= intel_iommu_map_range,
+	.unmap_range	= intel_iommu_unmap_range,
 	.iova_to_phys	= intel_iommu_iova_to_phys,
 	.domain_has_cap = intel_iommu_domain_has_cap,
 };
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3af4ffd591b..0f18f37a650 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -36,10 +36,10 @@ struct iommu_ops {
 	void (*domain_destroy)(struct iommu_domain *domain);
 	int (*attach_dev)(struct iommu_domain *domain, struct device *dev);
 	void (*detach_dev)(struct iommu_domain *domain, struct device *dev);
-	int (*map)(struct iommu_domain *domain, unsigned long iova,
-		   phys_addr_t paddr, size_t size, int prot);
-	void (*unmap)(struct iommu_domain *domain, unsigned long iova,
-		      size_t size);
+	int (*map_range)(struct iommu_domain *domain, unsigned long iova,
+			 phys_addr_t paddr, size_t size, int prot);
+	void (*unmap_range)(struct iommu_domain *domain, unsigned long iova,
+			    size_t size);
 	phys_addr_t (*iova_to_phys)(struct iommu_domain *domain,
 				    unsigned long iova);
 	int (*domain_has_cap)(struct iommu_domain *domain,
-- 
cgit v1.2.3


From cefc53c7f494240d4813c80154c7617452d1904d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 8 Jan 2010 13:35:09 +0100
Subject: iommu-api: Add iommu_map and iommu_unmap functions

These two functions provide support for mapping and
unmapping physical addresses to io virtual addresses. The
difference to the iommu_(un)map_range() is that the new
functions take a gfp_order parameter instead of a size. This
allows the IOMMU backend implementations to detect easier if
a given range can be mapped by larger page sizes.
These new functions should replace the old ones in the long
term.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/base/iommu.c  | 31 +++++++++++++++++++++++++++++++
 include/linux/iommu.h | 16 ++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c
index f4c86c42929..cf7cbec116e 100644
--- a/drivers/base/iommu.c
+++ b/drivers/base/iommu.c
@@ -107,3 +107,34 @@ int iommu_domain_has_cap(struct iommu_domain *domain,
 	return iommu_ops->domain_has_cap(domain, cap);
 }
 EXPORT_SYMBOL_GPL(iommu_domain_has_cap);
+
+int iommu_map(struct iommu_domain *domain, unsigned long iova,
+	      phys_addr_t paddr, int gfp_order, int prot)
+{
+	unsigned long invalid_mask;
+	size_t size;
+
+	size         = 0x1000UL << gfp_order;
+	invalid_mask = size - 1;
+
+	BUG_ON((iova | paddr) & invalid_mask);
+
+	return iommu_ops->map_range(domain, iova, paddr, size, prot);
+}
+EXPORT_SYMBOL_GPL(iommu_map);
+
+int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order)
+{
+	unsigned long invalid_mask;
+	size_t size;
+
+	size         = 0x1000UL << gfp_order;
+	invalid_mask = size - 1;
+
+	BUG_ON(iova & invalid_mask);
+
+	iommu_ops->unmap_range(domain, iova, size);
+
+	return gfp_order;
+}
+EXPORT_SYMBOL_GPL(iommu_unmap);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 0f18f37a650..6d0035bb1a0 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -60,6 +60,10 @@ extern int iommu_map_range(struct iommu_domain *domain, unsigned long iova,
 			   phys_addr_t paddr, size_t size, int prot);
 extern void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova,
 			      size_t size);
+extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
+		     phys_addr_t paddr, int gfp_order, int prot);
+extern int iommu_unmap(struct iommu_domain *domain, unsigned long iova,
+		       int gfp_order);
 extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
 				      unsigned long iova);
 extern int iommu_domain_has_cap(struct iommu_domain *domain,
@@ -108,6 +112,18 @@ static inline void iommu_unmap_range(struct iommu_domain *domain,
 {
 }
 
+static inline int iommu_map(struct iommu_domain *domain, unsigned long iova,
+			    phys_addr_t paddr, int gfp_order, int prot)
+{
+	return -ENODEV;
+}
+
+static inline int iommu_unmap(struct iommu_domain *domain, unsigned long iova,
+			      int gfp_order)
+{
+	return -ENODEV;
+}
+
 static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
 					     unsigned long iova)
 {
-- 
cgit v1.2.3


From 67651786948c360c3122b8a17cb1e59209d50880 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 21 Jan 2010 16:32:27 +0100
Subject: iommu-api: Add ->{un}map callbacks to iommu_ops

This patch adds new callbacks for mapping and unmapping
pages to the iommu_ops structure. These callbacks are aware
of page sizes which makes them different to the
->{un}map_range callbacks.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/base/iommu.c  | 6 ++++++
 include/linux/iommu.h | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c
index cf7cbec116e..55d37e4609e 100644
--- a/drivers/base/iommu.c
+++ b/drivers/base/iommu.c
@@ -119,6 +119,9 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova,
 
 	BUG_ON((iova | paddr) & invalid_mask);
 
+	if (iommu_ops->map)
+		return iommu_ops->map(domain, iova, paddr, gfp_order, prot);
+
 	return iommu_ops->map_range(domain, iova, paddr, size, prot);
 }
 EXPORT_SYMBOL_GPL(iommu_map);
@@ -133,6 +136,9 @@ int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order)
 
 	BUG_ON(iova & invalid_mask);
 
+	if (iommu_ops->unmap)
+		return iommu_ops->unmap(domain, iova, gfp_order);
+
 	iommu_ops->unmap_range(domain, iova, size);
 
 	return gfp_order;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 6d0035bb1a0..5a7a3d888da 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -36,6 +36,10 @@ struct iommu_ops {
 	void (*domain_destroy)(struct iommu_domain *domain);
 	int (*attach_dev)(struct iommu_domain *domain, struct device *dev);
 	void (*detach_dev)(struct iommu_domain *domain, struct device *dev);
+	int (*map)(struct iommu_domain *domain, unsigned long iova,
+		   phys_addr_t paddr, int gfp_order, int prot);
+	int (*unmap)(struct iommu_domain *domain, unsigned long iova,
+		     int gfp_order);
 	int (*map_range)(struct iommu_domain *domain, unsigned long iova,
 			 phys_addr_t paddr, size_t size, int prot);
 	void (*unmap_range)(struct iommu_domain *domain, unsigned long iova,
-- 
cgit v1.2.3


From b146a1c9f7f1feeacf840fa1ba197a99593cea15 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 20 Jan 2010 17:17:37 +0100
Subject: VT-d: Change {un}map_range functions to implement {un}map interface

This patch changes the iommu-api functions for mapping and
unmapping page ranges to use the new page-size based
interface. This allows to remove the range based functions
later.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 drivers/pci/intel-iommu.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index a714e3db13c..371dc564e2e 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -3626,14 +3626,15 @@ static void intel_iommu_detach_device(struct iommu_domain *domain,
 	domain_remove_one_dev_info(dmar_domain, pdev);
 }
 
-static int intel_iommu_map_range(struct iommu_domain *domain,
-				 unsigned long iova, phys_addr_t hpa,
-				 size_t size, int iommu_prot)
+static int intel_iommu_map(struct iommu_domain *domain,
+			   unsigned long iova, phys_addr_t hpa,
+			   int gfp_order, int iommu_prot)
 {
 	struct dmar_domain *dmar_domain = domain->priv;
 	u64 max_addr;
 	int addr_width;
 	int prot = 0;
+	size_t size;
 	int ret;
 
 	if (iommu_prot & IOMMU_READ)
@@ -3643,6 +3644,7 @@ static int intel_iommu_map_range(struct iommu_domain *domain,
 	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
 		prot |= DMA_PTE_SNP;
 
+	size     = PAGE_SIZE << gfp_order;
 	max_addr = iova + size;
 	if (dmar_domain->max_addr < max_addr) {
 		int min_agaw;
@@ -3669,19 +3671,19 @@ static int intel_iommu_map_range(struct iommu_domain *domain,
 	return ret;
 }
 
-static void intel_iommu_unmap_range(struct iommu_domain *domain,
-				    unsigned long iova, size_t size)
+static int intel_iommu_unmap(struct iommu_domain *domain,
+			     unsigned long iova, int gfp_order)
 {
 	struct dmar_domain *dmar_domain = domain->priv;
-
-	if (!size)
-		return;
+	size_t size = PAGE_SIZE << gfp_order;
 
 	dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
 			    (iova + size - 1) >> VTD_PAGE_SHIFT);
 
 	if (dmar_domain->max_addr == iova + size)
 		dmar_domain->max_addr = iova;
+
+	return gfp_order;
 }
 
 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
@@ -3714,8 +3716,8 @@ static struct iommu_ops intel_iommu_ops = {
 	.domain_destroy = intel_iommu_domain_destroy,
 	.attach_dev	= intel_iommu_attach_device,
 	.detach_dev	= intel_iommu_detach_device,
-	.map_range	= intel_iommu_map_range,
-	.unmap_range	= intel_iommu_unmap_range,
+	.map		= intel_iommu_map,
+	.unmap		= intel_iommu_unmap,
 	.iova_to_phys	= intel_iommu_iova_to_phys,
 	.domain_has_cap = intel_iommu_domain_has_cap,
 };
-- 
cgit v1.2.3


From fcd95807fb61e67d602610e7ff7129ed769e9fee Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 11 Jan 2010 16:38:18 +0100
Subject: kvm: Change kvm_iommu_map_pages to map large pages

This patch changes the implementation of of
kvm_iommu_map_pages to map the pages with the host page size
into the io virtual address space.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Acked-By: Avi Kivity <avi@redhat.com>
---
 virt/kvm/iommu.c | 113 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 91 insertions(+), 22 deletions(-)

diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 80fd3ad3b2d..11692b9e883 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -32,12 +32,30 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
 static void kvm_iommu_put_pages(struct kvm *kvm,
 				gfn_t base_gfn, unsigned long npages);
 
+static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
+			   gfn_t gfn, unsigned long size)
+{
+	gfn_t end_gfn;
+	pfn_t pfn;
+
+	pfn     = gfn_to_pfn_memslot(kvm, slot, gfn);
+	end_gfn = gfn + (size >> PAGE_SHIFT);
+	gfn    += 1;
+
+	if (is_error_pfn(pfn))
+		return pfn;
+
+	while (gfn < end_gfn)
+		gfn_to_pfn_memslot(kvm, slot, gfn++);
+
+	return pfn;
+}
+
 int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
-	gfn_t gfn = slot->base_gfn;
-	unsigned long npages = slot->npages;
+	gfn_t gfn, end_gfn;
 	pfn_t pfn;
-	int i, r = 0;
+	int r = 0;
 	struct iommu_domain *domain = kvm->arch.iommu_domain;
 	int flags;
 
@@ -45,31 +63,62 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
 	if (!domain)
 		return 0;
 
+	gfn     = slot->base_gfn;
+	end_gfn = gfn + slot->npages;
+
 	flags = IOMMU_READ | IOMMU_WRITE;
 	if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
 		flags |= IOMMU_CACHE;
 
-	for (i = 0; i < npages; i++) {
-		/* check if already mapped */
-		if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn)))
+
+	while (gfn < end_gfn) {
+		unsigned long page_size;
+
+		/* Check if already mapped */
+		if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) {
+			gfn += 1;
+			continue;
+		}
+
+		/* Get the page size we could use to map */
+		page_size = kvm_host_page_size(kvm, gfn);
+
+		/* Make sure the page_size does not exceed the memslot */
+		while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn)
+			page_size >>= 1;
+
+		/* Make sure gfn is aligned to the page size we want to map */
+		while ((gfn << PAGE_SHIFT) & (page_size - 1))
+			page_size >>= 1;
+
+		/*
+		 * Pin all pages we are about to map in memory. This is
+		 * important because we unmap and unpin in 4kb steps later.
+		 */
+		pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
+		if (is_error_pfn(pfn)) {
+			gfn += 1;
 			continue;
+		}
 
-		pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
-		r = iommu_map_range(domain,
-				    gfn_to_gpa(gfn),
-				    pfn_to_hpa(pfn),
-				    PAGE_SIZE, flags);
+		/* Map into IO address space */
+		r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
+			      get_order(page_size), flags);
 		if (r) {
 			printk(KERN_ERR "kvm_iommu_map_address:"
 			       "iommu failed to map pfn=%lx\n", pfn);
 			goto unmap_pages;
 		}
-		gfn++;
+
+		gfn += page_size >> PAGE_SHIFT;
+
+
 	}
+
 	return 0;
 
 unmap_pages:
-	kvm_iommu_put_pages(kvm, slot->base_gfn, i);
+	kvm_iommu_put_pages(kvm, slot->base_gfn, gfn);
 	return r;
 }
 
@@ -189,27 +238,47 @@ out_unmap:
 	return r;
 }
 
+static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
+{
+	unsigned long i;
+
+	for (i = 0; i < npages; ++i)
+		kvm_release_pfn_clean(pfn + i);
+}
+
 static void kvm_iommu_put_pages(struct kvm *kvm,
 				gfn_t base_gfn, unsigned long npages)
 {
-	gfn_t gfn = base_gfn;
+	struct iommu_domain *domain;
+	gfn_t end_gfn, gfn;
 	pfn_t pfn;
-	struct iommu_domain *domain = kvm->arch.iommu_domain;
-	unsigned long i;
 	u64 phys;
 
+	domain  = kvm->arch.iommu_domain;
+	end_gfn = base_gfn + npages;
+	gfn     = base_gfn;
+
 	/* check if iommu exists and in use */
 	if (!domain)
 		return;
 
-	for (i = 0; i < npages; i++) {
+	while (gfn < end_gfn) {
+		unsigned long unmap_pages;
+		int order;
+
+		/* Get physical address */
 		phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
-		pfn = phys >> PAGE_SHIFT;
-		kvm_release_pfn_clean(pfn);
-		gfn++;
-	}
+		pfn  = phys >> PAGE_SHIFT;
+
+		/* Unmap address from IO address space */
+		order       = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE);
+		unmap_pages = 1ULL << order;
 
-	iommu_unmap_range(domain, gfn_to_gpa(base_gfn), PAGE_SIZE * npages);
+		/* Unpin all pages we just unmapped to not leak any memory */
+		kvm_unpin_pages(kvm, pfn, unmap_pages);
+
+		gfn += unmap_pages;
+	}
 }
 
 static int kvm_iommu_unmap_memslots(struct kvm *kvm)
-- 
cgit v1.2.3


From cbb9d729f3433c9c2660b01dc52e6deb89488886 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 15 Jan 2010 14:41:15 +0100
Subject: x86/amd-iommu: Make iommu_map_page and alloc_pte aware of page sizes

This patch changes the old map_size parameter of alloc_pte
to a page_size parameter which can be used more easily to
alloc a pte for intermediate page sizes.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/amd_iommu_types.h | 28 ++++++++++++++++++
 arch/x86/kernel/amd_iommu.c            | 53 +++++++++++++++++++++-------------
 2 files changed, 61 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index ba19ad4c47d..5e8da56755d 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -172,6 +172,34 @@
 				(~((1ULL << (12 + ((lvl) * 9))) - 1)))
 #define PM_ALIGNED(lvl, addr)	((PM_MAP_MASK(lvl) & (addr)) == (addr))
 
+/*
+ * Returns the page table level to use for a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_LEVEL(pagesize) \
+		((__ffs(pagesize) - 12) / 9)
+/*
+ * Returns the number of ptes to use for a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_PTE_COUNT(pagesize) \
+		(1ULL << ((__ffs(pagesize) - 12) % 9))
+
+/*
+ * Aligns a given io-virtual address to a given page size
+ * Pagesize is expected to be a power-of-two
+ */
+#define PAGE_SIZE_ALIGN(address, pagesize) \
+		((address) & ~((pagesize) - 1))
+/*
+ * Creates an IOMMU PTE for an address an a given pagesize
+ * The PTE has no permission bits set
+ * Pagesize is expected to be a power-of-two larger than 4096
+ */
+#define PAGE_SIZE_PTE(address, pagesize)		\
+		(((address) | ((pagesize) - 1)) &	\
+		 (~(pagesize >> 1)) & PM_ADDR_MASK)
+
 #define IOMMU_PTE_P  (1ULL << 0)
 #define IOMMU_PTE_TV (1ULL << 1)
 #define IOMMU_PTE_U  (1ULL << 59)
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 59cae7c4df5..41700314f3e 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -730,18 +730,22 @@ static bool increase_address_space(struct protection_domain *domain,
 
 static u64 *alloc_pte(struct protection_domain *domain,
 		      unsigned long address,
-		      int end_lvl,
+		      unsigned long page_size,
 		      u64 **pte_page,
 		      gfp_t gfp)
 {
+	int level, end_lvl;
 	u64 *pte, *page;
-	int level;
+
+	BUG_ON(!is_power_of_2(page_size));
 
 	while (address > PM_LEVEL_SIZE(domain->mode))
 		increase_address_space(domain, gfp);
 
-	level =  domain->mode - 1;
-	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	level   = domain->mode - 1;
+	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	address = PAGE_SIZE_ALIGN(address, page_size);
+	end_lvl = PAGE_SIZE_LEVEL(page_size);
 
 	while (level > end_lvl) {
 		if (!IOMMU_PTE_PRESENT(*pte)) {
@@ -751,6 +755,10 @@ static u64 *alloc_pte(struct protection_domain *domain,
 			*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
 		}
 
+		/* No level skipping support yet */
+		if (PM_PTE_LEVEL(*pte) != level)
+			return NULL;
+
 		level -= 1;
 
 		pte = IOMMU_PTE_PAGE(*pte);
@@ -806,31 +814,36 @@ static int iommu_map_page(struct protection_domain *dom,
 			  unsigned long bus_addr,
 			  unsigned long phys_addr,
 			  int prot,
-			  int map_size)
+			  unsigned long page_size)
 {
 	u64 __pte, *pte;
-
-	bus_addr  = PAGE_ALIGN(bus_addr);
-	phys_addr = PAGE_ALIGN(phys_addr);
-
-	BUG_ON(!PM_ALIGNED(map_size, bus_addr));
-	BUG_ON(!PM_ALIGNED(map_size, phys_addr));
+	int i, count;
 
 	if (!(prot & IOMMU_PROT_MASK))
 		return -EINVAL;
 
-	pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
+	bus_addr  = PAGE_ALIGN(bus_addr);
+	phys_addr = PAGE_ALIGN(phys_addr);
+	count     = PAGE_SIZE_PTE_COUNT(page_size);
+	pte       = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
+
+	for (i = 0; i < count; ++i)
+		if (IOMMU_PTE_PRESENT(pte[i]))
+			return -EBUSY;
 
-	if (IOMMU_PTE_PRESENT(*pte))
-		return -EBUSY;
+	if (page_size > PAGE_SIZE) {
+		__pte = PAGE_SIZE_PTE(phys_addr, page_size);
+		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
+	} else
+		__pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
 
-	__pte = phys_addr | IOMMU_PTE_P;
 	if (prot & IOMMU_PROT_IR)
 		__pte |= IOMMU_PTE_IR;
 	if (prot & IOMMU_PROT_IW)
 		__pte |= IOMMU_PTE_IW;
 
-	*pte = __pte;
+	for (i = 0; i < count; ++i)
+		pte[i] = __pte;
 
 	update_domain(dom);
 
@@ -877,7 +890,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 	for (addr = e->address_start; addr < e->address_end;
 	     addr += PAGE_SIZE) {
 		ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
-				     PM_MAP_4k);
+				     PAGE_SIZE);
 		if (ret)
 			return ret;
 		/*
@@ -1005,7 +1018,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
 		u64 *pte, *pte_page;
 
 		for (i = 0; i < num_ptes; ++i) {
-			pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
+			pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
 					&pte_page, gfp);
 			if (!pte)
 				goto out_free;
@@ -1711,7 +1724,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
 
 	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
 	if (!pte) {
-		pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
+		pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
 				GFP_ATOMIC);
 		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
 	} else
@@ -2457,7 +2470,7 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
 	paddr &= PAGE_MASK;
 
 	for (i = 0; i < npages; ++i) {
-		ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
+		ret = iommu_map_page(domain, iova, paddr, prot, PAGE_SIZE);
 		if (ret)
 			return ret;
 
-- 
cgit v1.2.3


From 24cd772315c19e4d9409d0d21367ec1ebab3149f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 19 Jan 2010 17:27:39 +0100
Subject: x86/amd-iommu: Make iommu_unmap_page and fetch_pte aware of page
 sizes

This patch extends the functionality of iommu_unmap_page
and fetch_pte to support arbitrary page sizes.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/amd_iommu_types.h |  6 +++
 arch/x86/kernel/amd_iommu.c            | 90 +++++++++++++++++++++++++++-------
 2 files changed, 78 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 5e8da56755d..b150c74e0d4 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -200,6 +200,12 @@
 		(((address) | ((pagesize) - 1)) &	\
 		 (~(pagesize >> 1)) & PM_ADDR_MASK)
 
+/*
+ * Takes a PTE value with mode=0x07 and returns the page size it maps
+ */
+#define PTE_PAGE_SIZE(pte) \
+	(1ULL << (1 + ffz(((pte) | 0xfffULL))))
+
 #define IOMMU_PTE_P  (1ULL << 0)
 #define IOMMU_PTE_TV (1ULL << 1)
 #define IOMMU_PTE_U  (1ULL << 59)
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 41700314f3e..503d312f9d6 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -776,28 +776,47 @@ static u64 *alloc_pte(struct protection_domain *domain,
  * This function checks if there is a PTE for a given dma address. If
  * there is one, it returns the pointer to it.
  */
-static u64 *fetch_pte(struct protection_domain *domain,
-		      unsigned long address, int map_size)
+static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
 {
 	int level;
 	u64 *pte;
 
-	level =  domain->mode - 1;
-	pte   = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+	if (address > PM_LEVEL_SIZE(domain->mode))
+		return NULL;
+
+	level   =  domain->mode - 1;
+	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
 
-	while (level > map_size) {
+	while (level > 0) {
+
+		/* Not Present */
 		if (!IOMMU_PTE_PRESENT(*pte))
 			return NULL;
 
+		/* Large PTE */
+		if (PM_PTE_LEVEL(*pte) == 0x07) {
+			unsigned long pte_mask, __pte;
+
+			/*
+			 * If we have a series of large PTEs, make
+			 * sure to return a pointer to the first one.
+			 */
+			pte_mask = PTE_PAGE_SIZE(*pte);
+			pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
+			__pte    = ((unsigned long)pte) & pte_mask;
+
+			return (u64 *)__pte;
+		}
+
+		/* No level skipping support yet */
+		if (PM_PTE_LEVEL(*pte) != level)
+			return NULL;
+
 		level -= 1;
 
+		/* Walk to the next level */
 		pte = IOMMU_PTE_PAGE(*pte);
 		pte = &pte[PM_LEVEL_INDEX(level, address)];
-
-		if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
-			pte = NULL;
-			break;
-		}
 	}
 
 	return pte;
@@ -850,13 +869,48 @@ static int iommu_map_page(struct protection_domain *dom,
 	return 0;
 }
 
-static void iommu_unmap_page(struct protection_domain *dom,
-			     unsigned long bus_addr, int map_size)
+static unsigned long iommu_unmap_page(struct protection_domain *dom,
+				      unsigned long bus_addr,
+				      unsigned long page_size)
 {
-	u64 *pte = fetch_pte(dom, bus_addr, map_size);
+	unsigned long long unmap_size, unmapped;
+	u64 *pte;
+
+	BUG_ON(!is_power_of_2(page_size));
+
+	unmapped = 0;
+
+	while (unmapped < page_size) {
+
+		pte = fetch_pte(dom, bus_addr);
+
+		if (!pte) {
+			/*
+			 * No PTE for this address
+			 * move forward in 4kb steps
+			 */
+			unmap_size = PAGE_SIZE;
+		} else if (PM_PTE_LEVEL(*pte) == 0) {
+			/* 4kb PTE found for this address */
+			unmap_size = PAGE_SIZE;
+			*pte       = 0ULL;
+		} else {
+			int count, i;
+
+			/* Large PTE found which maps this address */
+			unmap_size = PTE_PAGE_SIZE(*pte);
+			count      = PAGE_SIZE_PTE_COUNT(unmap_size);
+			for (i = 0; i < count; i++)
+				pte[i] = 0ULL;
+		}
+
+		bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size;
+		unmapped += unmap_size;
+	}
+
+	BUG_ON(!is_power_of_2(unmapped));
 
-	if (pte)
-		*pte = 0;
+	return unmapped;
 }
 
 /*
@@ -1054,7 +1108,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
 	for (i = dma_dom->aperture[index]->offset;
 	     i < dma_dom->aperture_size;
 	     i += PAGE_SIZE) {
-		u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
+		u64 *pte = fetch_pte(&dma_dom->domain, i);
 		if (!pte || !IOMMU_PTE_PRESENT(*pte))
 			continue;
 
@@ -2491,7 +2545,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
 	iova  &= PAGE_MASK;
 
 	for (i = 0; i < npages; ++i) {
-		iommu_unmap_page(domain, iova, PM_MAP_4k);
+		iommu_unmap_page(domain, iova, PAGE_SIZE);
 		iova  += PAGE_SIZE;
 	}
 
@@ -2506,7 +2560,7 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 	phys_addr_t paddr;
 	u64 *pte;
 
-	pte = fetch_pte(domain, iova, PM_MAP_4k);
+	pte = fetch_pte(domain, iova);
 
 	if (!pte || !IOMMU_PTE_PRESENT(*pte))
 		return 0;
-- 
cgit v1.2.3


From f03152bb7d0a74f409ad63ed36916444a7493d72 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 21 Jan 2010 16:15:24 +0100
Subject: x86/amd-iommu: Make amd_iommu_iova_to_phys aware of multiple page
 sizes

This patch extends the amd_iommu_iova_to_phys() function to
handle different page sizes correctly. It doesn't use
fetch_pte() anymore because we don't know (or care about)
the page_size used for mapping the given iova.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 503d312f9d6..52e44af1570 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2556,17 +2556,22 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 					  unsigned long iova)
 {
 	struct protection_domain *domain = dom->priv;
-	unsigned long offset = iova & ~PAGE_MASK;
+	unsigned long offset_mask;
 	phys_addr_t paddr;
-	u64 *pte;
+	u64 *pte, __pte;
 
 	pte = fetch_pte(domain, iova);
 
 	if (!pte || !IOMMU_PTE_PRESENT(*pte))
 		return 0;
 
-	paddr  = *pte & IOMMU_PAGE_MASK;
-	paddr |= offset;
+	if (PM_PTE_LEVEL(*pte) == 0)
+		offset_mask = PAGE_SIZE - 1;
+	else
+		offset_mask = PTE_PAGE_SIZE(*pte) - 1;
+
+	__pte = *pte & PM_ADDR_MASK;
+	paddr = (__pte & ~offset_mask) | (iova & offset_mask);
 
 	return paddr;
 }
-- 
cgit v1.2.3


From 468e2366cdb80cf8a691b8bc212260cfbdbd518e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 21 Jan 2010 16:37:36 +0100
Subject: x86/amd-iommu: Implement ->{un}map callbacks for iommu-api

This patch implements the new callbacks for the IOMMU-API
with functions that can handle different page sizes in the
IOMMU page table.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 52e44af1570..0e068c9ca5f 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2552,6 +2552,33 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
 	iommu_flush_tlb_pde(domain);
 }
 
+static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
+			 phys_addr_t paddr, int gfp_order, int iommu_prot)
+{
+	unsigned long page_size = 0x1000UL << gfp_order;
+	struct protection_domain *domain = dom->priv;
+	int prot = 0;
+
+	if (iommu_prot & IOMMU_READ)
+		prot |= IOMMU_PROT_IR;
+	if (iommu_prot & IOMMU_WRITE)
+		prot |= IOMMU_PROT_IW;
+
+	return iommu_map_page(domain, iova, paddr, prot, page_size);
+}
+
+static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+			   int gfp_order)
+{
+	struct protection_domain *domain = dom->priv;
+	unsigned long page_size, unmap_size;
+
+	page_size  = 0x1000UL << gfp_order;
+	unmap_size = iommu_unmap_page(domain, iova, page_size);
+
+	return get_order(unmap_size);
+}
+
 static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 					  unsigned long iova)
 {
@@ -2587,6 +2614,8 @@ static struct iommu_ops amd_iommu_ops = {
 	.domain_destroy = amd_iommu_domain_destroy,
 	.attach_dev = amd_iommu_attach_device,
 	.detach_dev = amd_iommu_detach_device,
+	.map = amd_iommu_map,
+	.unmap = amd_iommu_unmap,
 	.map_range = amd_iommu_map_range,
 	.unmap_range = amd_iommu_unmap_range,
 	.iova_to_phys = amd_iommu_iova_to_phys,
-- 
cgit v1.2.3


From 12c7389abe5786349d3ea6da1961cf78d0c1c7cd Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 21 Jan 2010 11:50:28 +0100
Subject: iommu-api: Remove iommu_{un}map_range functions

These functions are not longer used and can be removed
savely. There functionality is now provided by the
iommu_{un}map functions which are also capable of multiple
page sizes.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 48 ---------------------------------------------
 drivers/base/iommu.c        | 26 ++----------------------
 include/linux/iommu.h       | 20 -------------------
 3 files changed, 2 insertions(+), 92 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0e068c9ca5f..d8da9988edd 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2506,52 +2506,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 	return ret;
 }
 
-static int amd_iommu_map_range(struct iommu_domain *dom,
-			       unsigned long iova, phys_addr_t paddr,
-			       size_t size, int iommu_prot)
-{
-	struct protection_domain *domain = dom->priv;
-	unsigned long i,  npages = iommu_num_pages(paddr, size, PAGE_SIZE);
-	int prot = 0;
-	int ret;
-
-	if (iommu_prot & IOMMU_READ)
-		prot |= IOMMU_PROT_IR;
-	if (iommu_prot & IOMMU_WRITE)
-		prot |= IOMMU_PROT_IW;
-
-	iova  &= PAGE_MASK;
-	paddr &= PAGE_MASK;
-
-	for (i = 0; i < npages; ++i) {
-		ret = iommu_map_page(domain, iova, paddr, prot, PAGE_SIZE);
-		if (ret)
-			return ret;
-
-		iova  += PAGE_SIZE;
-		paddr += PAGE_SIZE;
-	}
-
-	return 0;
-}
-
-static void amd_iommu_unmap_range(struct iommu_domain *dom,
-				  unsigned long iova, size_t size)
-{
-
-	struct protection_domain *domain = dom->priv;
-	unsigned long i,  npages = iommu_num_pages(iova, size, PAGE_SIZE);
-
-	iova  &= PAGE_MASK;
-
-	for (i = 0; i < npages; ++i) {
-		iommu_unmap_page(domain, iova, PAGE_SIZE);
-		iova  += PAGE_SIZE;
-	}
-
-	iommu_flush_tlb_pde(domain);
-}
-
 static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
 			 phys_addr_t paddr, int gfp_order, int iommu_prot)
 {
@@ -2616,8 +2570,6 @@ static struct iommu_ops amd_iommu_ops = {
 	.detach_dev = amd_iommu_detach_device,
 	.map = amd_iommu_map,
 	.unmap = amd_iommu_unmap,
-	.map_range = amd_iommu_map_range,
-	.unmap_range = amd_iommu_unmap_range,
 	.iova_to_phys = amd_iommu_iova_to_phys,
 	.domain_has_cap = amd_iommu_domain_has_cap,
 };
diff --git a/drivers/base/iommu.c b/drivers/base/iommu.c
index 55d37e4609e..6e6b6a11b3c 100644
--- a/drivers/base/iommu.c
+++ b/drivers/base/iommu.c
@@ -80,20 +80,6 @@ void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
 }
 EXPORT_SYMBOL_GPL(iommu_detach_device);
 
-int iommu_map_range(struct iommu_domain *domain, unsigned long iova,
-		    phys_addr_t paddr, size_t size, int prot)
-{
-	return iommu_ops->map_range(domain, iova, paddr, size, prot);
-}
-EXPORT_SYMBOL_GPL(iommu_map_range);
-
-void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova,
-		      size_t size)
-{
-	iommu_ops->unmap_range(domain, iova, size);
-}
-EXPORT_SYMBOL_GPL(iommu_unmap_range);
-
 phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain,
 			       unsigned long iova)
 {
@@ -119,10 +105,7 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova,
 
 	BUG_ON((iova | paddr) & invalid_mask);
 
-	if (iommu_ops->map)
-		return iommu_ops->map(domain, iova, paddr, gfp_order, prot);
-
-	return iommu_ops->map_range(domain, iova, paddr, size, prot);
+	return iommu_ops->map(domain, iova, paddr, gfp_order, prot);
 }
 EXPORT_SYMBOL_GPL(iommu_map);
 
@@ -136,11 +119,6 @@ int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order)
 
 	BUG_ON(iova & invalid_mask);
 
-	if (iommu_ops->unmap)
-		return iommu_ops->unmap(domain, iova, gfp_order);
-
-	iommu_ops->unmap_range(domain, iova, size);
-
-	return gfp_order;
+	return iommu_ops->unmap(domain, iova, gfp_order);
 }
 EXPORT_SYMBOL_GPL(iommu_unmap);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 5a7a3d888da..be22ad83689 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -40,10 +40,6 @@ struct iommu_ops {
 		   phys_addr_t paddr, int gfp_order, int prot);
 	int (*unmap)(struct iommu_domain *domain, unsigned long iova,
 		     int gfp_order);
-	int (*map_range)(struct iommu_domain *domain, unsigned long iova,
-			 phys_addr_t paddr, size_t size, int prot);
-	void (*unmap_range)(struct iommu_domain *domain, unsigned long iova,
-			    size_t size);
 	phys_addr_t (*iova_to_phys)(struct iommu_domain *domain,
 				    unsigned long iova);
 	int (*domain_has_cap)(struct iommu_domain *domain,
@@ -60,10 +56,6 @@ extern int iommu_attach_device(struct iommu_domain *domain,
 			       struct device *dev);
 extern void iommu_detach_device(struct iommu_domain *domain,
 				struct device *dev);
-extern int iommu_map_range(struct iommu_domain *domain, unsigned long iova,
-			   phys_addr_t paddr, size_t size, int prot);
-extern void iommu_unmap_range(struct iommu_domain *domain, unsigned long iova,
-			      size_t size);
 extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
 		     phys_addr_t paddr, int gfp_order, int prot);
 extern int iommu_unmap(struct iommu_domain *domain, unsigned long iova,
@@ -104,18 +96,6 @@ static inline void iommu_detach_device(struct iommu_domain *domain,
 {
 }
 
-static inline int iommu_map_range(struct iommu_domain *domain,
-				  unsigned long iova, phys_addr_t paddr,
-				  size_t size, int prot)
-{
-	return -ENODEV;
-}
-
-static inline void iommu_unmap_range(struct iommu_domain *domain,
-				     unsigned long iova, size_t size)
-{
-}
-
 static inline int iommu_map(struct iommu_domain *domain, unsigned long iova,
 			    phys_addr_t paddr, int gfp_order, int prot)
 {
-- 
cgit v1.2.3


From ca037701a025334e724e5c61b3b1082940c8b981 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Mar 2010 19:52:12 +0100
Subject: perf, x86: Add PEBS infrastructure

This patch implements support for Intel Precise Event Based Sampling,
which is an alternative counter mode in which the counter triggers a
hardware assist to collect information on events. The hardware assist
takes a trap like snapshot of a subset of the machine registers.

This data is written to the Intel Debug-Store, which can be programmed
with a data threshold at which to raise a PMI.

With the PEBS hardware assist being trap like, the reported IP is always
one instruction after the actual instruction that triggered the event.

This implements a simple PEBS model that always takes a single PEBS event
at a time. This is done so that the interaction with the rest of the
system is as expected (freq adjust, period randomization, lbr,
callchains, etc.).

It adds an ABI element: perf_event_attr::precise, which indicates that we
wish to use this (constrained, but precise) mode.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.392111285@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c          | 223 +++++-------
 arch/x86/kernel/cpu/perf_event_intel.c    | 150 ++------
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 557 ++++++++++++++++++++++++++++++
 include/linux/perf_event.h                |   3 +-
 4 files changed, 671 insertions(+), 262 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_ds.c

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1d665a0b202..0c03d5c1671 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,45 +31,6 @@
 
 static u64 perf_event_mask __read_mostly;
 
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS	4
-
-/* The size of a BTS record in bytes: */
-#define BTS_RECORD_SIZE		24
-
-/* The size of a per-cpu BTS buffer in bytes: */
-#define BTS_BUFFER_SIZE		(BTS_RECORD_SIZE * 2048)
-
-/* The BTS overflow threshold in bytes from the end of the buffer: */
-#define BTS_OVFL_TH		(BTS_RECORD_SIZE * 128)
-
-
-/*
- * Bits in the debugctlmsr controlling branch tracing.
- */
-#define X86_DEBUGCTL_TR			(1 << 6)
-#define X86_DEBUGCTL_BTS		(1 << 7)
-#define X86_DEBUGCTL_BTINT		(1 << 8)
-#define X86_DEBUGCTL_BTS_OFF_OS		(1 << 9)
-#define X86_DEBUGCTL_BTS_OFF_USR	(1 << 10)
-
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-	u64	bts_buffer_base;
-	u64	bts_index;
-	u64	bts_absolute_maximum;
-	u64	bts_interrupt_threshold;
-	u64	pebs_buffer_base;
-	u64	pebs_index;
-	u64	pebs_absolute_maximum;
-	u64	pebs_interrupt_threshold;
-	u64	pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
 struct event_constraint {
 	union {
 		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
@@ -88,17 +49,29 @@ struct amd_nb {
 };
 
 struct cpu_hw_events {
+	/*
+	 * Generic x86 PMC bits
+	 */
 	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
 	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
 	int			enabled;
-	struct debug_store	*ds;
 
 	int			n_events;
 	int			n_added;
 	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
 	u64			tags[X86_PMC_IDX_MAX];
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+
+	/*
+	 * Intel DebugStore bits
+	 */
+	struct debug_store	*ds;
+	u64			pebs_enabled;
+
+	/*
+	 * AMD specific bits
+	 */
 	struct amd_nb		*amd_nb;
 };
 
@@ -112,12 +85,24 @@ struct cpu_hw_events {
 #define EVENT_CONSTRAINT(c, n, m)	\
 	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
 
+/*
+ * Constraint on the Event code.
+ */
 #define INTEL_EVENT_CONSTRAINT(c, n)	\
 	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
 
+/*
+ * Constraint on the Event code + UMask + fixed-mask
+ */
 #define FIXED_EVENT_CONSTRAINT(c, n)	\
 	EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
 
+/*
+ * Constraint on the Event code + UMask
+ */
+#define PEBS_EVENT_CONSTRAINT(c, n)	\
+	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
+
 #define EVENT_CONSTRAINT_END		\
 	EVENT_CONSTRAINT(0, 0, 0)
 
@@ -128,6 +113,9 @@ struct cpu_hw_events {
  * struct x86_pmu - generic x86 pmu
  */
 struct x86_pmu {
+	/*
+	 * Generic x86 PMC bits
+	 */
 	const char	*name;
 	int		version;
 	int		(*handle_irq)(struct pt_regs *);
@@ -146,10 +134,6 @@ struct x86_pmu {
 	u64		event_mask;
 	int		apic;
 	u64		max_period;
-	u64		intel_ctrl;
-	void		(*enable_bts)(u64 config);
-	void		(*disable_bts)(void);
-
 	struct event_constraint *
 			(*get_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
@@ -162,6 +146,19 @@ struct x86_pmu {
 	void		(*cpu_starting)(int cpu);
 	void		(*cpu_dying)(int cpu);
 	void		(*cpu_dead)(int cpu);
+
+	/*
+	 * Intel Arch Perfmon v2+
+	 */
+	u64		intel_ctrl;
+
+	/*
+	 * Intel DebugStore bits
+	 */
+	int		bts, pebs;
+	int		pebs_record_size;
+	void		(*drain_pebs)(struct pt_regs *regs);
+	struct event_constraint *pebs_constraints;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -293,110 +290,14 @@ static void release_pmc_hardware(void)
 #endif
 }
 
-static inline bool bts_available(void)
-{
-	return x86_pmu.enable_bts != NULL;
-}
-
-static void init_debug_store_on_cpu(int cpu)
-{
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-	if (!ds)
-		return;
-
-	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
-		     (u32)((u64)(unsigned long)ds),
-		     (u32)((u64)(unsigned long)ds >> 32));
-}
-
-static void fini_debug_store_on_cpu(int cpu)
-{
-	if (!per_cpu(cpu_hw_events, cpu).ds)
-		return;
-
-	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
-}
-
-static void release_bts_hardware(void)
-{
-	int cpu;
-
-	if (!bts_available())
-		return;
-
-	get_online_cpus();
-
-	for_each_online_cpu(cpu)
-		fini_debug_store_on_cpu(cpu);
-
-	for_each_possible_cpu(cpu) {
-		struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-		if (!ds)
-			continue;
-
-		per_cpu(cpu_hw_events, cpu).ds = NULL;
-
-		kfree((void *)(unsigned long)ds->bts_buffer_base);
-		kfree(ds);
-	}
-
-	put_online_cpus();
-}
-
-static int reserve_bts_hardware(void)
-{
-	int cpu, err = 0;
-
-	if (!bts_available())
-		return 0;
-
-	get_online_cpus();
-
-	for_each_possible_cpu(cpu) {
-		struct debug_store *ds;
-		void *buffer;
-
-		err = -ENOMEM;
-		buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
-		if (unlikely(!buffer))
-			break;
-
-		ds = kzalloc(sizeof(*ds), GFP_KERNEL);
-		if (unlikely(!ds)) {
-			kfree(buffer);
-			break;
-		}
-
-		ds->bts_buffer_base = (u64)(unsigned long)buffer;
-		ds->bts_index = ds->bts_buffer_base;
-		ds->bts_absolute_maximum =
-			ds->bts_buffer_base + BTS_BUFFER_SIZE;
-		ds->bts_interrupt_threshold =
-			ds->bts_absolute_maximum - BTS_OVFL_TH;
-
-		per_cpu(cpu_hw_events, cpu).ds = ds;
-		err = 0;
-	}
-
-	if (err)
-		release_bts_hardware();
-	else {
-		for_each_online_cpu(cpu)
-			init_debug_store_on_cpu(cpu);
-	}
-
-	put_online_cpus();
-
-	return err;
-}
+static int reserve_ds_buffers(void);
+static void release_ds_buffers(void);
 
 static void hw_perf_event_destroy(struct perf_event *event)
 {
 	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
 		release_pmc_hardware();
-		release_bts_hardware();
+		release_ds_buffers();
 		mutex_unlock(&pmc_reserve_mutex);
 	}
 }
@@ -459,7 +360,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 			if (!reserve_pmc_hardware())
 				err = -EBUSY;
 			else
-				err = reserve_bts_hardware();
+				err = reserve_ds_buffers();
 		}
 		if (!err)
 			atomic_inc(&active_events);
@@ -537,7 +438,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
 	    (hwc->sample_period == 1)) {
 		/* BTS is not supported by this architecture. */
-		if (!bts_available())
+		if (!x86_pmu.bts)
 			return -EOPNOTSUPP;
 
 		/* BTS is currently only allowed for user-mode. */
@@ -995,6 +896,7 @@ static void x86_pmu_unthrottle(struct perf_event *event)
 void perf_event_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+	u64 pebs;
 	struct cpu_hw_events *cpuc;
 	unsigned long flags;
 	int cpu, idx;
@@ -1012,12 +914,14 @@ void perf_event_print_debug(void)
 		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
 
 		pr_info("\n");
 		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
 		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
 		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
 		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
+		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
 	}
 	pr_info("CPU#%d: active:       %016llx\n", cpu, *(u64 *)cpuc->active_mask);
 
@@ -1333,6 +1237,7 @@ undo:
 
 #include "perf_event_amd.c"
 #include "perf_event_p6.c"
+#include "perf_event_intel_ds.c"
 #include "perf_event_intel.c"
 
 static int __cpuinit
@@ -1464,6 +1369,32 @@ static const struct pmu pmu = {
 	.unthrottle	= x86_pmu_unthrottle,
 };
 
+/*
+ * validate that we can schedule this event
+ */
+static int validate_event(struct perf_event *event)
+{
+	struct cpu_hw_events *fake_cpuc;
+	struct event_constraint *c;
+	int ret = 0;
+
+	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
+	if (!fake_cpuc)
+		return -ENOMEM;
+
+	c = x86_pmu.get_event_constraints(fake_cpuc, event);
+
+	if (!c || !c->weight)
+		ret = -ENOSPC;
+
+	if (x86_pmu.put_event_constraints)
+		x86_pmu.put_event_constraints(fake_cpuc, event);
+
+	kfree(fake_cpuc);
+
+	return ret;
+}
+
 /*
  * validate a single event group
  *
@@ -1529,6 +1460,8 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
 
 		if (event->group_leader != event)
 			err = validate_group(event);
+		else
+			err = validate_event(event);
 
 		event->pmu = tmp;
 	}
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 84bfde64a33..11446412e4c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -470,42 +470,6 @@ static u64 intel_pmu_raw_event(u64 hw_event)
 	return hw_event & CORE_EVNTSEL_MASK;
 }
 
-static void intel_pmu_enable_bts(u64 config)
-{
-	unsigned long debugctlmsr;
-
-	debugctlmsr = get_debugctlmsr();
-
-	debugctlmsr |= X86_DEBUGCTL_TR;
-	debugctlmsr |= X86_DEBUGCTL_BTS;
-	debugctlmsr |= X86_DEBUGCTL_BTINT;
-
-	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
-		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
-
-	if (!(config & ARCH_PERFMON_EVENTSEL_USR))
-		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
-
-	update_debugctlmsr(debugctlmsr);
-}
-
-static void intel_pmu_disable_bts(void)
-{
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	unsigned long debugctlmsr;
-
-	if (!cpuc->ds)
-		return;
-
-	debugctlmsr = get_debugctlmsr();
-
-	debugctlmsr &=
-		~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
-		  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
-
-	update_debugctlmsr(debugctlmsr);
-}
-
 static void intel_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -514,6 +478,8 @@ static void intel_pmu_disable_all(void)
 
 	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
 		intel_pmu_disable_bts();
+
+	intel_pmu_pebs_disable_all();
 }
 
 static void intel_pmu_enable_all(void)
@@ -531,6 +497,8 @@ static void intel_pmu_enable_all(void)
 
 		intel_pmu_enable_bts(event->hw.config);
 	}
+
+	intel_pmu_pebs_enable_all();
 }
 
 static inline u64 intel_pmu_get_status(void)
@@ -547,8 +515,7 @@ static inline void intel_pmu_ack_status(u64 ack)
 	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
 
-static inline void
-intel_pmu_disable_fixed(struct hw_perf_event *hwc)
+static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 {
 	int idx = hwc->idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, mask;
@@ -560,68 +527,7 @@ intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 	(void)checking_wrmsrl(hwc->config_base, ctrl_val);
 }
 
-static void intel_pmu_drain_bts_buffer(void)
-{
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	struct debug_store *ds = cpuc->ds;
-	struct bts_record {
-		u64	from;
-		u64	to;
-		u64	flags;
-	};
-	struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
-	struct bts_record *at, *top;
-	struct perf_output_handle handle;
-	struct perf_event_header header;
-	struct perf_sample_data data;
-	struct pt_regs regs;
-
-	if (!event)
-		return;
-
-	if (!ds)
-		return;
-
-	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
-	top = (struct bts_record *)(unsigned long)ds->bts_index;
-
-	if (top <= at)
-		return;
-
-	ds->bts_index = ds->bts_buffer_base;
-
-	perf_sample_data_init(&data, 0);
-
-	data.period	= event->hw.last_period;
-	regs.ip		= 0;
-
-	/*
-	 * Prepare a generic sample, i.e. fill in the invariant fields.
-	 * We will overwrite the from and to address before we output
-	 * the sample.
-	 */
-	perf_prepare_sample(&header, &data, event, &regs);
-
-	if (perf_output_begin(&handle, event,
-			      header.size * (top - at), 1, 1))
-		return;
-
-	for (; at < top; at++) {
-		data.ip		= at->from;
-		data.addr	= at->to;
-
-		perf_output_sample(&handle, &header, &data, event);
-	}
-
-	perf_output_end(&handle);
-
-	/* There's new data available. */
-	event->hw.interrupts++;
-	event->pending_kill = POLL_IN;
-}
-
-static inline void
-intel_pmu_disable_event(struct perf_event *event)
+static void intel_pmu_disable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
@@ -637,10 +543,12 @@ intel_pmu_disable_event(struct perf_event *event)
 	}
 
 	x86_pmu_disable_event(event);
+
+	if (unlikely(event->attr.precise))
+		intel_pmu_pebs_disable(hwc);
 }
 
-static inline void
-intel_pmu_enable_fixed(struct hw_perf_event *hwc)
+static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
 {
 	int idx = hwc->idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, bits, mask;
@@ -689,6 +597,9 @@ static void intel_pmu_enable_event(struct perf_event *event)
 		return;
 	}
 
+	if (unlikely(event->attr.precise))
+		intel_pmu_pebs_enable(hwc);
+
 	__x86_pmu_enable_event(hwc);
 }
 
@@ -762,6 +673,13 @@ again:
 
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
+
+	/*
+	 * PEBS overflow sets bit 62 in the global status register
+	 */
+	if (__test_and_clear_bit(62, (unsigned long *)&status))
+		x86_pmu.drain_pebs(regs);
+
 	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
 		struct perf_event *event = cpuc->events[bit];
 
@@ -791,22 +709,18 @@ done:
 	return 1;
 }
 
-static struct event_constraint bts_constraint =
-	EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
-
 static struct event_constraint *
-intel_special_constraints(struct perf_event *event)
+intel_bts_constraints(struct perf_event *event)
 {
-	unsigned int hw_event;
-
-	hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned int hw_event, bts_event;
 
-	if (unlikely((hw_event ==
-		      x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
-		     (event->hw.sample_period == 1))) {
+	hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
+	bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
 
+	if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
 		return &bts_constraint;
-	}
+
 	return NULL;
 }
 
@@ -815,7 +729,11 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 {
 	struct event_constraint *c;
 
-	c = intel_special_constraints(event);
+	c = intel_bts_constraints(event);
+	if (c)
+		return c;
+
+	c = intel_pebs_constraints(event);
 	if (c)
 		return c;
 
@@ -864,8 +782,6 @@ static __initconst struct x86_pmu intel_pmu = {
 	 * the generic event period:
 	 */
 	.max_period		= (1ULL << 31) - 1,
-	.enable_bts		= intel_pmu_enable_bts,
-	.disable_bts		= intel_pmu_disable_bts,
 	.get_event_constraints	= intel_get_event_constraints,
 
 	.cpu_starting		= init_debug_store_on_cpu,
@@ -915,6 +831,8 @@ static __init int intel_pmu_init(void)
 	if (version > 1)
 		x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
 
+	intel_ds_init();
+
 	/*
 	 * Install the hw-cache-events table:
 	 */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
new file mode 100644
index 00000000000..0d994ef213b
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -0,0 +1,557 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS		4
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE		24
+
+#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE	PAGE_SIZE
+
+/*
+ * pebs_record_32 for p4 and core not supported
+
+struct pebs_record_32 {
+	u32 flags, ip;
+	u32 ax, bc, cx, dx;
+	u32 si, di, bp, sp;
+};
+
+ */
+
+struct pebs_record_core {
+	u64 flags, ip;
+	u64 ax, bx, cx, dx;
+	u64 si, di, bp, sp;
+	u64 r8,  r9,  r10, r11;
+	u64 r12, r13, r14, r15;
+};
+
+struct pebs_record_nhm {
+	u64 flags, ip;
+	u64 ax, bx, cx, dx;
+	u64 si, di, bp, sp;
+	u64 r8,  r9,  r10, r11;
+	u64 r12, r13, r14, r15;
+	u64 status, dla, dse, lat;
+};
+
+/*
+ * Bits in the debugctlmsr controlling branch tracing.
+ */
+#define X86_DEBUGCTL_TR			(1 << 6)
+#define X86_DEBUGCTL_BTS		(1 << 7)
+#define X86_DEBUGCTL_BTINT		(1 << 8)
+#define X86_DEBUGCTL_BTS_OFF_OS		(1 << 9)
+#define X86_DEBUGCTL_BTS_OFF_USR	(1 << 10)
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+	u64	bts_buffer_base;
+	u64	bts_index;
+	u64	bts_absolute_maximum;
+	u64	bts_interrupt_threshold;
+	u64	pebs_buffer_base;
+	u64	pebs_index;
+	u64	pebs_absolute_maximum;
+	u64	pebs_interrupt_threshold;
+	u64	pebs_event_reset[MAX_PEBS_EVENTS];
+};
+
+static void init_debug_store_on_cpu(int cpu)
+{
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+	if (!ds)
+		return;
+
+	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+		     (u32)((u64)(unsigned long)ds),
+		     (u32)((u64)(unsigned long)ds >> 32));
+}
+
+static void fini_debug_store_on_cpu(int cpu)
+{
+	if (!per_cpu(cpu_hw_events, cpu).ds)
+		return;
+
+	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static void release_ds_buffers(void)
+{
+	int cpu;
+
+	if (!x86_pmu.bts && !x86_pmu.pebs)
+		return;
+
+	get_online_cpus();
+
+	for_each_online_cpu(cpu)
+		fini_debug_store_on_cpu(cpu);
+
+	for_each_possible_cpu(cpu) {
+		struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+		if (!ds)
+			continue;
+
+		per_cpu(cpu_hw_events, cpu).ds = NULL;
+
+		kfree((void *)(unsigned long)ds->pebs_buffer_base);
+		kfree((void *)(unsigned long)ds->bts_buffer_base);
+		kfree(ds);
+	}
+
+	put_online_cpus();
+}
+
+static int reserve_ds_buffers(void)
+{
+	int cpu, err = 0;
+
+	if (!x86_pmu.bts && !x86_pmu.pebs)
+		return 0;
+
+	get_online_cpus();
+
+	for_each_possible_cpu(cpu) {
+		struct debug_store *ds;
+		void *buffer;
+		int max, thresh;
+
+		err = -ENOMEM;
+		ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+		if (unlikely(!ds)) {
+			kfree(buffer);
+			break;
+		}
+		per_cpu(cpu_hw_events, cpu).ds = ds;
+
+		if (x86_pmu.bts) {
+			buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+			if (unlikely(!buffer))
+				break;
+
+			max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+			thresh = max / 16;
+
+			ds->bts_buffer_base = (u64)(unsigned long)buffer;
+			ds->bts_index = ds->bts_buffer_base;
+			ds->bts_absolute_maximum = ds->bts_buffer_base +
+				max * BTS_RECORD_SIZE;
+			ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+				thresh * BTS_RECORD_SIZE;
+		}
+
+		if (x86_pmu.pebs) {
+			buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
+			if (unlikely(!buffer))
+				break;
+
+			max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
+
+			ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+			ds->pebs_index = ds->pebs_buffer_base;
+			ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+				max * x86_pmu.pebs_record_size;
+			/*
+			 * Always use single record PEBS
+			 */
+			ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
+				x86_pmu.pebs_record_size;
+		}
+
+		err = 0;
+	}
+
+	if (err)
+		release_ds_buffers();
+	else {
+		for_each_online_cpu(cpu)
+			init_debug_store_on_cpu(cpu);
+	}
+
+	put_online_cpus();
+
+	return err;
+}
+
+/*
+ * BTS
+ */
+
+static struct event_constraint bts_constraint =
+	EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
+
+static void intel_pmu_enable_bts(u64 config)
+{
+	unsigned long debugctlmsr;
+
+	debugctlmsr = get_debugctlmsr();
+
+	debugctlmsr |= X86_DEBUGCTL_TR;
+	debugctlmsr |= X86_DEBUGCTL_BTS;
+	debugctlmsr |= X86_DEBUGCTL_BTINT;
+
+	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+
+	if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+
+	update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_bts(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	unsigned long debugctlmsr;
+
+	if (!cpuc->ds)
+		return;
+
+	debugctlmsr = get_debugctlmsr();
+
+	debugctlmsr &=
+		~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
+		  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+
+	update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_drain_bts_buffer(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct debug_store *ds = cpuc->ds;
+	struct bts_record {
+		u64	from;
+		u64	to;
+		u64	flags;
+	};
+	struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
+	struct bts_record *at, *top;
+	struct perf_output_handle handle;
+	struct perf_event_header header;
+	struct perf_sample_data data;
+	struct pt_regs regs;
+
+	if (!event)
+		return;
+
+	if (!ds)
+		return;
+
+	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+	top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+	if (top <= at)
+		return;
+
+	ds->bts_index = ds->bts_buffer_base;
+
+	perf_sample_data_init(&data, 0);
+	data.period = event->hw.last_period;
+	regs.ip     = 0;
+
+	/*
+	 * Prepare a generic sample, i.e. fill in the invariant fields.
+	 * We will overwrite the from and to address before we output
+	 * the sample.
+	 */
+	perf_prepare_sample(&header, &data, event, &regs);
+
+	if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
+		return;
+
+	for (; at < top; at++) {
+		data.ip		= at->from;
+		data.addr	= at->to;
+
+		perf_output_sample(&handle, &header, &data, event);
+	}
+
+	perf_output_end(&handle);
+
+	/* There's new data available. */
+	event->hw.interrupts++;
+	event->pending_kill = POLL_IN;
+}
+
+/*
+ * PEBS
+ */
+
+static struct event_constraint intel_core_pebs_events[] = {
+	PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
+	PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
+	PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
+	PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
+	PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
+	PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
+	PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
+	PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
+	PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_nehalem_pebs_events[] = {
+	PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
+	PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
+	PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
+	PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */
+	PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
+	PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
+	PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
+	PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
+	PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint *
+intel_pebs_constraints(struct perf_event *event)
+{
+	struct event_constraint *c;
+
+	if (!event->attr.precise)
+		return NULL;
+
+	if (x86_pmu.pebs_constraints) {
+		for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+			if ((event->hw.config & c->cmask) == c->code)
+				return c;
+		}
+	}
+
+	return &emptyconstraint;
+}
+
+static void intel_pmu_pebs_enable(struct hw_perf_event *hwc)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	u64 val = cpuc->pebs_enabled;
+
+	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+
+	val |= 1ULL << hwc->idx;
+	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+}
+
+static void intel_pmu_pebs_disable(struct hw_perf_event *hwc)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	u64 val = cpuc->pebs_enabled;
+
+	val &= ~(1ULL << hwc->idx);
+	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+
+	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+}
+
+static void intel_pmu_pebs_enable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->pebs_enabled)
+		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+}
+
+static void intel_pmu_pebs_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->pebs_enabled)
+		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
+
+static int intel_pmu_save_and_restart(struct perf_event *event);
+static void intel_pmu_disable_event(struct perf_event *event);
+
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct debug_store *ds = cpuc->ds;
+	struct perf_event *event = cpuc->events[0]; /* PMC0 only */
+	struct pebs_record_core *at, *top;
+	struct perf_sample_data data;
+	struct pt_regs regs;
+	int n;
+
+	if (!event || !ds || !x86_pmu.pebs)
+		return;
+
+	intel_pmu_pebs_disable_all();
+
+	at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
+	top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
+
+	if (top <= at)
+		goto out;
+
+	ds->pebs_index = ds->pebs_buffer_base;
+
+	if (!intel_pmu_save_and_restart(event))
+		goto out;
+
+	perf_sample_data_init(&data, 0);
+	data.period = event->hw.last_period;
+
+	n = top - at;
+
+	/*
+	 * Should not happen, we program the threshold at 1 and do not
+	 * set a reset value.
+	 */
+	WARN_ON_ONCE(n > 1);
+
+	/*
+	 * We use the interrupt regs as a base because the PEBS record
+	 * does not contain a full regs set, specifically it seems to
+	 * lack segment descriptors, which get used by things like
+	 * user_mode().
+	 *
+	 * In the simple case fix up only the IP and BP,SP regs, for
+	 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
+	 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
+	 */
+	regs = *iregs;
+	regs.ip = at->ip;
+	regs.bp = at->bp;
+	regs.sp = at->sp;
+
+	if (perf_event_overflow(event, 1, &data, &regs))
+		intel_pmu_disable_event(event);
+
+out:
+	intel_pmu_pebs_enable_all();
+}
+
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct debug_store *ds = cpuc->ds;
+	struct pebs_record_nhm *at, *top;
+	struct perf_sample_data data;
+	struct perf_event *event = NULL;
+	struct pt_regs regs;
+	int bit, n;
+
+	if (!ds || !x86_pmu.pebs)
+		return;
+
+	intel_pmu_pebs_disable_all();
+
+	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+
+	if (top <= at)
+		goto out;
+
+	ds->pebs_index = ds->pebs_buffer_base;
+
+	n = top - at;
+
+	/*
+	 * Should not happen, we program the threshold at 1 and do not
+	 * set a reset value.
+	 */
+	WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
+
+	for ( ; at < top; at++) {
+		for_each_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
+			if (!cpuc->events[bit]->attr.precise)
+				continue;
+
+			event = cpuc->events[bit];
+		}
+
+		if (!event)
+			continue;
+
+		if (!intel_pmu_save_and_restart(event))
+			continue;
+
+		perf_sample_data_init(&data, 0);
+		data.period = event->hw.last_period;
+
+		/*
+		 * See the comment in intel_pmu_drain_pebs_core()
+		 */
+		regs = *iregs;
+		regs.ip = at->ip;
+		regs.bp = at->bp;
+		regs.sp = at->sp;
+
+		if (perf_event_overflow(event, 1, &data, &regs))
+			intel_pmu_disable_event(event);
+	}
+out:
+	intel_pmu_pebs_enable_all();
+}
+
+/*
+ * BTS, PEBS probe and setup
+ */
+
+static void intel_ds_init(void)
+{
+	/*
+	 * No support for 32bit formats
+	 */
+	if (!boot_cpu_has(X86_FEATURE_DTES64))
+		return;
+
+	x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
+	x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
+	if (x86_pmu.pebs) {
+		int format = 0;
+
+		if (x86_pmu.version > 1) {
+			u64 capabilities;
+			/*
+			 * v2+ has a PEBS format field
+			 */
+			rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+			format = (capabilities >> 8) & 0xf;
+		}
+
+		switch (format) {
+		case 0:
+			printk(KERN_CONT "PEBS v0, ");
+			x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
+			x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
+			x86_pmu.pebs_constraints = intel_core_pebs_events;
+			break;
+
+		case 1:
+			printk(KERN_CONT "PEBS v1, ");
+			x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
+			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+			x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
+			break;
+
+		default:
+			printk(KERN_CONT "PEBS unknown format: %d, ", format);
+			x86_pmu.pebs = 0;
+			break;
+		}
+	}
+}
+
+#else /* CONFIG_CPU_SUP_INTEL */
+
+static int reseve_ds_buffers(void)
+{
+	return 0;
+}
+
+static void release_ds_buffers(void)
+{
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 80acbf3d5de..42307b50c78 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -203,8 +203,9 @@ struct perf_event_attr {
 				enable_on_exec :  1, /* next exec enables     */
 				task           :  1, /* trace fork/exit       */
 				watermark      :  1, /* wakeup_watermark      */
+				precise        :  1, /* OoO invariant counter */
 
-				__reserved_1   : 49;
+				__reserved_1   : 48;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
-- 
cgit v1.2.3


From 69fef0d2e2c2c049ef4207a52e78b50d527bd85a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Mar 2010 13:57:24 +0100
Subject: perf: Add attr->precise support to raw event parsing

Minimal userspace interface to the new 'precise' events flag.

Can be used like "perf top -e r00c0p" which will use PEBS to sample
retired instructions.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.468665803@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/parse-events.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 05d0c5c2030..a2014459125 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -656,6 +656,10 @@ parse_raw_event(const char **strp, struct perf_event_attr *attr)
 		return EVT_FAILED;
 	n = hex2u64(str + 1, &config);
 	if (n > 0) {
+		if (str[n+1] == 'p') {
+			attr->precise = 1;
+			n++;
+		}
 		*strp = str + n + 1;
 		attr->type = PERF_TYPE_RAW;
 		attr->config = config;
-- 
cgit v1.2.3


From caff2befffe899e63df5cc760b7ed01cfd902685 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Mar 2010 12:02:30 +0100
Subject: perf, x86: Implement simple LBR support

Implement simple suport Intel Last-Branch-Record, it supports all
hardware that implements FREEZE_LBRS_ON_PMI, but does not (yet) implement
the LBR config register.

The Intel LBR is a FIFO of From,To addresses describing the last few
branches the hardware took.

This patch does not add perf interface to the LBR, but merely provides an
interface for internal use.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.544191154@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c           |  18 +++
 arch/x86/kernel/cpu/perf_event_intel.c     |  13 ++
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 228 +++++++++++++++++++++++++++++
 include/linux/perf_event.h                 |  11 ++
 4 files changed, 270 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_lbr.c

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0c03d5c1671..1badff6b6b2 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -48,6 +48,8 @@ struct amd_nb {
 	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 };
 
+#define MAX_LBR_ENTRIES		16
+
 struct cpu_hw_events {
 	/*
 	 * Generic x86 PMC bits
@@ -69,6 +71,14 @@ struct cpu_hw_events {
 	struct debug_store	*ds;
 	u64			pebs_enabled;
 
+	/*
+	 * Intel LBR bits
+	 */
+	int				lbr_users;
+	void				*lbr_context;
+	struct perf_branch_stack	lbr_stack;
+	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
+
 	/*
 	 * AMD specific bits
 	 */
@@ -159,6 +169,13 @@ struct x86_pmu {
 	int		pebs_record_size;
 	void		(*drain_pebs)(struct pt_regs *regs);
 	struct event_constraint *pebs_constraints;
+
+	/*
+	 * Intel LBR
+	 */
+	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
+	int		lbr_nr;			   /* hardware stack size */
+	int		lbr_format;		   /* hardware format     */
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -1237,6 +1254,7 @@ undo:
 
 #include "perf_event_amd.c"
 #include "perf_event_p6.c"
+#include "perf_event_intel_lbr.c"
 #include "perf_event_intel_ds.c"
 #include "perf_event_intel.c"
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 11446412e4c..44f6ed42a93 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -480,6 +480,7 @@ static void intel_pmu_disable_all(void)
 		intel_pmu_disable_bts();
 
 	intel_pmu_pebs_disable_all();
+	intel_pmu_lbr_disable_all();
 }
 
 static void intel_pmu_enable_all(void)
@@ -499,6 +500,7 @@ static void intel_pmu_enable_all(void)
 	}
 
 	intel_pmu_pebs_enable_all();
+	intel_pmu_lbr_enable_all();
 }
 
 static inline u64 intel_pmu_get_status(void)
@@ -674,6 +676,8 @@ again:
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
 
+	intel_pmu_lbr_read();
+
 	/*
 	 * PEBS overflow sets bit 62 in the global status register
 	 */
@@ -848,6 +852,8 @@ static __init int intel_pmu_init(void)
 		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
+		intel_pmu_lbr_init_core();
+
 		x86_pmu.event_constraints = intel_core2_event_constraints;
 		pr_cont("Core2 events, ");
 		break;
@@ -857,13 +863,18 @@ static __init int intel_pmu_init(void)
 		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
+		intel_pmu_lbr_init_nhm();
+
 		x86_pmu.event_constraints = intel_nehalem_event_constraints;
 		pr_cont("Nehalem/Corei7 events, ");
 		break;
+
 	case 28: /* Atom */
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
+		intel_pmu_lbr_init_atom();
+
 		x86_pmu.event_constraints = intel_gen_event_constraints;
 		pr_cont("Atom events, ");
 		break;
@@ -873,6 +884,8 @@ static __init int intel_pmu_init(void)
 		memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
+		intel_pmu_lbr_init_nhm();
+
 		x86_pmu.event_constraints = intel_westmere_event_constraints;
 		pr_cont("Westmere events, ");
 		break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
new file mode 100644
index 00000000000..ea3e99ed82c
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -0,0 +1,228 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+
+enum {
+	LBR_FORMAT_32		= 0x00,
+	LBR_FORMAT_LIP		= 0x01,
+	LBR_FORMAT_EIP		= 0x02,
+	LBR_FORMAT_EIP_FLAGS	= 0x03,
+};
+
+/*
+ * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
+ * otherwise it becomes near impossible to get a reliable stack.
+ */
+
+#define X86_DEBUGCTL_LBR               		(1 << 0)
+#define X86_DEBUGCTL_FREEZE_LBRS_ON_PMI		(1 << 11)
+
+static void __intel_pmu_lbr_enable(void)
+{
+	u64 debugctl;
+
+	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	debugctl |= (X86_DEBUGCTL_LBR | X86_DEBUGCTL_FREEZE_LBRS_ON_PMI);
+	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void __intel_pmu_lbr_disable(void)
+{
+	u64 debugctl;
+
+	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	debugctl &= ~(X86_DEBUGCTL_LBR | X86_DEBUGCTL_FREEZE_LBRS_ON_PMI);
+	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void intel_pmu_lbr_reset_32(void)
+{
+	int i;
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++)
+		wrmsrl(x86_pmu.lbr_from + i, 0);
+}
+
+static void intel_pmu_lbr_reset_64(void)
+{
+	int i;
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+		wrmsrl(x86_pmu.lbr_from + i, 0);
+		wrmsrl(x86_pmu.lbr_to   + i, 0);
+	}
+}
+
+static void intel_pmu_lbr_reset(void)
+{
+	if (x86_pmu.lbr_format == LBR_FORMAT_32)
+		intel_pmu_lbr_reset_32();
+	else
+		intel_pmu_lbr_reset_64();
+}
+
+static void intel_pmu_lbr_enable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (!x86_pmu.lbr_nr)
+		return;
+
+	WARN_ON(cpuc->enabled);
+
+	/*
+	 * Reset the LBR stack if this is the first LBR user or
+	 * we changed task context so as to avoid data leaks.
+	 */
+
+	if (!cpuc->lbr_users ||
+	    (event->ctx->task && cpuc->lbr_context != event->ctx)) {
+		intel_pmu_lbr_reset();
+		cpuc->lbr_context = event->ctx;
+	}
+
+	cpuc->lbr_users++;
+}
+
+static void intel_pmu_lbr_disable(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (!x86_pmu.lbr_nr)
+		return;
+
+	cpuc->lbr_users--;
+
+	BUG_ON(cpuc->lbr_users < 0);
+	WARN_ON(cpuc->enabled);
+}
+
+static void intel_pmu_lbr_enable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->lbr_users)
+		__intel_pmu_lbr_enable();
+}
+
+static void intel_pmu_lbr_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (cpuc->lbr_users)
+		__intel_pmu_lbr_disable();
+}
+
+static inline u64 intel_pmu_lbr_tos(void)
+{
+	u64 tos;
+
+	rdmsrl(x86_pmu.lbr_tos, tos);
+
+	return tos;
+}
+
+static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+{
+	unsigned long mask = x86_pmu.lbr_nr - 1;
+	u64 tos = intel_pmu_lbr_tos();
+	int i;
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++, tos--) {
+		unsigned long lbr_idx = (tos - i) & mask;
+		union {
+			struct {
+				u32 from;
+				u32 to;
+			};
+			u64     lbr;
+		} msr_lastbranch;
+
+		rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
+
+		cpuc->lbr_entries[i].from  = msr_lastbranch.from;
+		cpuc->lbr_entries[i].to    = msr_lastbranch.to;
+		cpuc->lbr_entries[i].flags = 0;
+	}
+	cpuc->lbr_stack.nr = i;
+}
+
+#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
+
+/*
+ * Due to lack of segmentation in Linux the effective address (offset)
+ * is the same as the linear address, allowing us to merge the LIP and EIP
+ * LBR formats.
+ */
+static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+{
+	unsigned long mask = x86_pmu.lbr_nr - 1;
+	u64 tos = intel_pmu_lbr_tos();
+	int i;
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++, tos--) {
+		unsigned long lbr_idx = (tos - i) & mask;
+		u64 from, to, flags = 0;
+
+		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
+		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
+
+		if (x86_pmu.lbr_format == LBR_FORMAT_EIP_FLAGS) {
+			flags = !!(from & LBR_FROM_FLAG_MISPRED);
+			from = (u64)((((s64)from) << 1) >> 1);
+		}
+
+		cpuc->lbr_entries[i].from  = from;
+		cpuc->lbr_entries[i].to    = to;
+		cpuc->lbr_entries[i].flags = flags;
+	}
+	cpuc->lbr_stack.nr = i;
+}
+
+static void intel_pmu_lbr_read(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (!cpuc->lbr_users)
+		return;
+
+	if (x86_pmu.lbr_format == LBR_FORMAT_32)
+		intel_pmu_lbr_read_32(cpuc);
+	else
+		intel_pmu_lbr_read_64(cpuc);
+}
+
+static int intel_pmu_lbr_format(void)
+{
+	u64 capabilities;
+
+	rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+	return capabilities & 0x1f;
+}
+
+static void intel_pmu_lbr_init_core(void)
+{
+	x86_pmu.lbr_format = intel_pmu_lbr_format();
+	x86_pmu.lbr_nr     = 4;
+	x86_pmu.lbr_tos    = 0x01c9;
+	x86_pmu.lbr_from   = 0x40;
+	x86_pmu.lbr_to     = 0x60;
+}
+
+static void intel_pmu_lbr_init_nhm(void)
+{
+	x86_pmu.lbr_format = intel_pmu_lbr_format();
+	x86_pmu.lbr_nr     = 16;
+	x86_pmu.lbr_tos    = 0x01c9;
+	x86_pmu.lbr_from   = 0x680;
+	x86_pmu.lbr_to     = 0x6c0;
+}
+
+static void intel_pmu_lbr_init_atom(void)
+{
+	x86_pmu.lbr_format = intel_pmu_lbr_format();
+	x86_pmu.lbr_nr	   = 8;
+	x86_pmu.lbr_tos    = 0x01c9;
+	x86_pmu.lbr_from   = 0x40;
+	x86_pmu.lbr_to     = 0x60;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 42307b50c78..ab4fd9ede26 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -467,6 +467,17 @@ struct perf_raw_record {
 	void				*data;
 };
 
+struct perf_branch_entry {
+	__u64				from;
+	__u64				to;
+	__u64				flags;
+};
+
+struct perf_branch_stack {
+	__u64				nr;
+	struct perf_branch_entry	entries[0];
+};
+
 struct task_struct;
 
 /**
-- 
cgit v1.2.3


From ef21f683a045a79b6aa86ad81e5fdfc0d5ddd250 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Mar 2010 13:12:23 +0100
Subject: perf, x86: use LBR for PEBS IP+1 fixup

Use the LBR to fix up the PEBS IP+1 issue.

As said, PEBS reports the next instruction, here we use the LBR to find
the last branch and from that construct the actual IP. If the IP matches
the LBR-TO, we use LBR-FROM, otherwise we use the LBR-TO address as the
beginning of the last basic block and decode forward.

Once we find a match to the current IP, we use the previous location.

This patch introduces a new ABI element: PERF_RECORD_MISC_EXACT, which
conveys that the reported IP (PERF_SAMPLE_IP) is the exact instruction
that caused the event (barring CPU errata).

The fixup can fail due to various reasons:

 1) LBR contains invalid data (quite possible)
 2) part of the basic block got paged out
 3) the reported IP isn't part of the basic block (see 1)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.619375431@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h         | 19 +++++++
 arch/x86/kernel/cpu/perf_event.c          | 70 +++++++++++++-------------
 arch/x86/kernel/cpu/perf_event_intel.c    |  4 +-
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 84 ++++++++++++++++++++++++++++++-
 include/linux/perf_event.h                |  6 +++
 5 files changed, 144 insertions(+), 39 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index db6109a885a..a9038c95161 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -136,6 +136,25 @@ extern void perf_events_lapic_init(void);
 
 #define PERF_EVENT_INDEX_OFFSET			0
 
+/*
+ * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups.
+ * This flag is otherwise unused and ABI specified to be 0, so nobody should
+ * care what we do with it.
+ */
+#define PERF_EFLAGS_EXACT	(1UL << 3)
+
+#define perf_misc_flags(regs)				\
+({	int misc = 0;					\
+	if (user_mode(regs))				\
+		misc |= PERF_RECORD_MISC_USER;		\
+	else						\
+		misc |= PERF_RECORD_MISC_KERNEL;	\
+	if (regs->flags & PERF_EFLAGS_EXACT)		\
+		misc |= PERF_RECORD_MISC_EXACT;		\
+	misc; })
+
+#define perf_instruction_pointer(regs)	((regs)->ip)
+
 #else
 static inline void init_hw_perf_events(void)		{ }
 static inline void perf_events_lapic_init(void)	{ }
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1badff6b6b2..5cb4e8dcee4 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -29,6 +29,41 @@
 #include <asm/stacktrace.h>
 #include <asm/nmi.h>
 
+/*
+ * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ */
+static unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+	unsigned long offset, addr = (unsigned long)from;
+	int type = in_nmi() ? KM_NMI : KM_IRQ0;
+	unsigned long size, len = 0;
+	struct page *page;
+	void *map;
+	int ret;
+
+	do {
+		ret = __get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret)
+			break;
+
+		offset = addr & (PAGE_SIZE - 1);
+		size = min(PAGE_SIZE - offset, n - len);
+
+		map = kmap_atomic(page, type);
+		memcpy(to, map+offset, size);
+		kunmap_atomic(map, type);
+		put_page(page);
+
+		len  += size;
+		to   += size;
+		addr += size;
+
+	} while (len < n);
+
+	return len;
+}
+
 static u64 perf_event_mask __read_mostly;
 
 struct event_constraint {
@@ -1550,41 +1585,6 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 	dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
 }
 
-/*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
- */
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
-	unsigned long offset, addr = (unsigned long)from;
-	int type = in_nmi() ? KM_NMI : KM_IRQ0;
-	unsigned long size, len = 0;
-	struct page *page;
-	void *map;
-	int ret;
-
-	do {
-		ret = __get_user_pages_fast(addr, 1, 0, &page);
-		if (!ret)
-			break;
-
-		offset = addr & (PAGE_SIZE - 1);
-		size = min(PAGE_SIZE - offset, n - len);
-
-		map = kmap_atomic(page, type);
-		memcpy(to, map+offset, size);
-		kunmap_atomic(map, type);
-		put_page(page);
-
-		len  += size;
-		to   += size;
-		addr += size;
-
-	} while (len < n);
-
-	return len;
-}
-
 static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
 {
 	unsigned long bytes;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 44f6ed42a93..7eb78be3b22 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -547,7 +547,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
 	x86_pmu_disable_event(event);
 
 	if (unlikely(event->attr.precise))
-		intel_pmu_pebs_disable(hwc);
+		intel_pmu_pebs_disable(event);
 }
 
 static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
@@ -600,7 +600,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
 	}
 
 	if (unlikely(event->attr.precise))
-		intel_pmu_pebs_enable(hwc);
+		intel_pmu_pebs_enable(event);
 
 	__x86_pmu_enable_event(hwc);
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 0d994ef213b..50e6ff3281f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -331,26 +331,32 @@ intel_pebs_constraints(struct perf_event *event)
 	return &emptyconstraint;
 }
 
-static void intel_pmu_pebs_enable(struct hw_perf_event *hwc)
+static void intel_pmu_pebs_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
 	u64 val = cpuc->pebs_enabled;
 
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
 	val |= 1ULL << hwc->idx;
 	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+
+	intel_pmu_lbr_enable(event);
 }
 
-static void intel_pmu_pebs_disable(struct hw_perf_event *hwc)
+static void intel_pmu_pebs_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
 	u64 val = cpuc->pebs_enabled;
 
 	val &= ~(1ULL << hwc->idx);
 	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+
+	intel_pmu_lbr_disable(event);
 }
 
 static void intel_pmu_pebs_enable_all(void)
@@ -369,6 +375,70 @@ static void intel_pmu_pebs_disable_all(void)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
 }
 
+#include <asm/insn.h>
+
+#define MAX_INSN_SIZE	16
+
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+	return ip > PAGE_OFFSET;
+#else
+	return (long)ip < 0;
+#endif
+}
+
+static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	unsigned long from = cpuc->lbr_entries[0].from;
+	unsigned long old_to, to = cpuc->lbr_entries[0].to;
+	unsigned long ip = regs->ip;
+
+	if (!cpuc->lbr_stack.nr || !from || !to)
+		return 0;
+
+	if (ip < to)
+		return 0;
+
+	/*
+	 * We sampled a branch insn, rewind using the LBR stack
+	 */
+	if (ip == to) {
+		regs->ip = from;
+		return 1;
+	}
+
+	do {
+		struct insn insn;
+		u8 buf[MAX_INSN_SIZE];
+		void *kaddr;
+
+		old_to = to;
+		if (!kernel_ip(ip)) {
+			int bytes, size = min_t(int, MAX_INSN_SIZE, ip - to);
+
+			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+			if (bytes != size)
+				return 0;
+
+			kaddr = buf;
+		} else
+			kaddr = (void *)to;
+
+		kernel_insn_init(&insn, kaddr);
+		insn_get_length(&insn);
+		to += insn.length;
+	} while (to < ip);
+
+	if (to == ip) {
+		regs->ip = old_to;
+		return 1;
+	}
+
+	return 0;
+}
+
 static int intel_pmu_save_and_restart(struct perf_event *event);
 static void intel_pmu_disable_event(struct perf_event *event);
 
@@ -424,6 +494,11 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	regs.bp = at->bp;
 	regs.sp = at->sp;
 
+	if (intel_pmu_pebs_fixup_ip(&regs))
+		regs.flags |= PERF_EFLAGS_EXACT;
+	else
+		regs.flags &= ~PERF_EFLAGS_EXACT;
+
 	if (perf_event_overflow(event, 1, &data, &regs))
 		intel_pmu_disable_event(event);
 
@@ -487,6 +562,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		regs.bp = at->bp;
 		regs.sp = at->sp;
 
+		if (intel_pmu_pebs_fixup_ip(&regs))
+			regs.flags |= PERF_EFLAGS_EXACT;
+		else
+			regs.flags &= ~PERF_EFLAGS_EXACT;
+
 		if (perf_event_overflow(event, 1, &data, &regs))
 			intel_pmu_disable_event(event);
 	}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ab4fd9ede26..be85f7c4a94 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -294,6 +294,12 @@ struct perf_event_mmap_page {
 #define PERF_RECORD_MISC_USER			(2 << 0)
 #define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
 
+#define PERF_RECORD_MISC_EXACT			(1 << 14)
+/*
+ * Reserve the last bit to indicate some extended misc field
+ */
+#define PERF_RECORD_MISC_EXT_RESERVED		(1 << 15)
+
 struct perf_event_header {
 	__u32	type;
 	__u16	misc;
-- 
cgit v1.2.3


From 1676b8a077c352085d52578fb4f29350b58b6e74 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Mar 2010 14:19:36 +0100
Subject: perf-top: Show the percentage of successfull PEBS-fixups

Use the PERF_RECORD_MISC_EXACT information to measure the success rate of
the PEBS fix-up.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.694233760@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-top.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 31f2e597800..c051833f755 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -410,6 +410,7 @@ static double sym_weight(const struct sym_entry *sym)
 
 static long			samples;
 static long			userspace_samples;
+static long			exact_samples;
 static const char		CONSOLE_CLEAR[] = "[H[2J";
 
 static void __list_insert_active_sym(struct sym_entry *syme)
@@ -450,6 +451,7 @@ static void print_sym_table(void)
 	int counter, snap = !display_weighted ? sym_counter : 0;
 	float samples_per_sec = samples/delay_secs;
 	float ksamples_per_sec = (samples-userspace_samples)/delay_secs;
+	float esamples_percent = (100.0*exact_samples)/samples;
 	float sum_ksamples = 0.0;
 	struct sym_entry *syme, *n;
 	struct rb_root tmp = RB_ROOT;
@@ -457,7 +459,7 @@ static void print_sym_table(void)
 	int sym_width = 0, dso_width = 0, max_dso_width;
 	const int win_width = winsize.ws_col - 1;
 
-	samples = userspace_samples = 0;
+	samples = userspace_samples = exact_samples = 0;
 
 	/* Sort the active symbols */
 	pthread_mutex_lock(&active_symbols_lock);
@@ -488,9 +490,10 @@ static void print_sym_table(void)
 	puts(CONSOLE_CLEAR);
 
 	printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
-	printf( "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% [",
+	printf( "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%%  exact: %4.1f%% [",
 		samples_per_sec,
-		100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)));
+		100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)),
+		esamples_percent);
 
 	if (nr_counters == 1 || !display_weighted) {
 		printf("%Ld", (u64)attrs[0].sample_period);
@@ -954,6 +957,9 @@ static void event__process_sample(const event_t *self,
 		return;
 	}
 
+	if (self->header.misc & PERF_RECORD_MISC_EXACT)
+		exact_samples++;
+
 	if (event__preprocess_sample(self, session, &al, symbol_filter) < 0 ||
 	    al.filtered)
 		return;
-- 
cgit v1.2.3


From 8db909a7e3c888b5d45aef7650d74ccebe3ce725 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Mar 2010 17:07:40 +0100
Subject: perf, x86: Clean up IA32_PERF_CAPABILITIES usage

Saner PERF_CAPABILITIES support, which also exposes pebs_trap. Use that
latter to make PEBS's use of LBR conditional since a fault-like pebs
should already report the correct IP.

( As of this writing there is no known hardware that implements
  !pebs_trap )

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.770650663@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c           | 15 +++++++++++++--
 arch/x86/kernel/cpu/perf_event_intel.c     | 10 ++++++++++
 arch/x86/kernel/cpu/perf_event_intel_ds.c  | 30 +++++++++++++++---------------
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 18 ++++--------------
 4 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5cb4e8dcee4..7b5430b2efe 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -154,6 +154,17 @@ struct cpu_hw_events {
 #define for_each_event_constraint(e, c)	\
 	for ((e) = (c); (e)->cmask; (e)++)
 
+union perf_capabilities {
+	struct {
+		u64	lbr_format    : 6;
+		u64	pebs_trap     : 1;
+		u64	pebs_arch_reg : 1;
+		u64	pebs_format   : 4;
+		u64	smm_freeze    : 1;
+	};
+	u64	capabilities;
+};
+
 /*
  * struct x86_pmu - generic x86 pmu
  */
@@ -195,7 +206,8 @@ struct x86_pmu {
 	/*
 	 * Intel Arch Perfmon v2+
 	 */
-	u64		intel_ctrl;
+	u64			intel_ctrl;
+	union perf_capabilities intel_cap;
 
 	/*
 	 * Intel DebugStore bits
@@ -210,7 +222,6 @@ struct x86_pmu {
 	 */
 	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
 	int		lbr_nr;			   /* hardware stack size */
-	int		lbr_format;		   /* hardware format     */
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 7eb78be3b22..246c0723882 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -835,6 +835,16 @@ static __init int intel_pmu_init(void)
 	if (version > 1)
 		x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
 
+	/*
+	 * v2 and above have a perf capabilities MSR
+	 */
+	if (version > 1) {
+		u64 capabilities;
+
+		rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+		x86_pmu.intel_cap.capabilities = capabilities;
+	}
+
 	intel_ds_init();
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 50e6ff3281f..5e4029441b2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -342,7 +342,8 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
 	val |= 1ULL << hwc->idx;
 	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
 
-	intel_pmu_lbr_enable(event);
+	if (x86_pmu.intel_cap.pebs_trap)
+		intel_pmu_lbr_enable(event);
 }
 
 static void intel_pmu_pebs_disable(struct perf_event *event)
@@ -356,7 +357,8 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
 
-	intel_pmu_lbr_disable(event);
+	if (x86_pmu.intel_cap.pebs_trap)
+		intel_pmu_lbr_disable(event);
 }
 
 static void intel_pmu_pebs_enable_all(void)
@@ -395,6 +397,12 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	unsigned long old_to, to = cpuc->lbr_entries[0].to;
 	unsigned long ip = regs->ip;
 
+	/*
+	 * We don't need to fixup if the PEBS assist is fault like
+	 */
+	if (!x86_pmu.intel_cap.pebs_trap)
+		return 1;
+
 	if (!cpuc->lbr_stack.nr || !from || !to)
 		return 0;
 
@@ -589,34 +597,26 @@ static void intel_ds_init(void)
 	x86_pmu.bts  = boot_cpu_has(X86_FEATURE_BTS);
 	x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
 	if (x86_pmu.pebs) {
-		int format = 0;
-
-		if (x86_pmu.version > 1) {
-			u64 capabilities;
-			/*
-			 * v2+ has a PEBS format field
-			 */
-			rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
-			format = (capabilities >> 8) & 0xf;
-		}
+		char pebs_type = x86_pmu.intel_cap.pebs_trap ?  '+' : '-';
+		int format = x86_pmu.intel_cap.pebs_format;
 
 		switch (format) {
 		case 0:
-			printk(KERN_CONT "PEBS v0, ");
+			printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
 			x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
 			x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
 			x86_pmu.pebs_constraints = intel_core_pebs_events;
 			break;
 
 		case 1:
-			printk(KERN_CONT "PEBS v1, ");
+			printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
 			x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
 			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
 			x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
 			break;
 
 		default:
-			printk(KERN_CONT "PEBS unknown format: %d, ", format);
+			printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
 			x86_pmu.pebs = 0;
 			break;
 		}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index ea3e99ed82c..4f3a124329c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -53,7 +53,7 @@ static void intel_pmu_lbr_reset_64(void)
 
 static void intel_pmu_lbr_reset(void)
 {
-	if (x86_pmu.lbr_format == LBR_FORMAT_32)
+	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
 		intel_pmu_lbr_reset_32();
 	else
 		intel_pmu_lbr_reset_64();
@@ -155,6 +155,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 {
 	unsigned long mask = x86_pmu.lbr_nr - 1;
+	int lbr_format = x86_pmu.intel_cap.lbr_format;
 	u64 tos = intel_pmu_lbr_tos();
 	int i;
 
@@ -165,7 +166,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
 		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
 
-		if (x86_pmu.lbr_format == LBR_FORMAT_EIP_FLAGS) {
+		if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
 			flags = !!(from & LBR_FROM_FLAG_MISPRED);
 			from = (u64)((((s64)from) << 1) >> 1);
 		}
@@ -184,23 +185,14 @@ static void intel_pmu_lbr_read(void)
 	if (!cpuc->lbr_users)
 		return;
 
-	if (x86_pmu.lbr_format == LBR_FORMAT_32)
+	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
 		intel_pmu_lbr_read_32(cpuc);
 	else
 		intel_pmu_lbr_read_64(cpuc);
 }
 
-static int intel_pmu_lbr_format(void)
-{
-	u64 capabilities;
-
-	rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
-	return capabilities & 0x1f;
-}
-
 static void intel_pmu_lbr_init_core(void)
 {
-	x86_pmu.lbr_format = intel_pmu_lbr_format();
 	x86_pmu.lbr_nr     = 4;
 	x86_pmu.lbr_tos    = 0x01c9;
 	x86_pmu.lbr_from   = 0x40;
@@ -209,7 +201,6 @@ static void intel_pmu_lbr_init_core(void)
 
 static void intel_pmu_lbr_init_nhm(void)
 {
-	x86_pmu.lbr_format = intel_pmu_lbr_format();
 	x86_pmu.lbr_nr     = 16;
 	x86_pmu.lbr_tos    = 0x01c9;
 	x86_pmu.lbr_from   = 0x680;
@@ -218,7 +209,6 @@ static void intel_pmu_lbr_init_nhm(void)
 
 static void intel_pmu_lbr_init_atom(void)
 {
-	x86_pmu.lbr_format = intel_pmu_lbr_format();
 	x86_pmu.lbr_nr	   = 8;
 	x86_pmu.lbr_tos    = 0x01c9;
 	x86_pmu.lbr_from   = 0x40;
-- 
cgit v1.2.3


From 7e1a40dda619b0483fbe0740494ed2c2a1f05289 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Mar 2010 12:38:03 +0100
Subject: perf, x86: Expose the full PEBS record using PERF_SAMPLE_RAW

Expose the full PEBS record using PERF_SAMPLE_RAW

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.847218224@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 5e4029441b2..ef56f053ab3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -457,6 +457,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	struct perf_event *event = cpuc->events[0]; /* PMC0 only */
 	struct pebs_record_core *at, *top;
 	struct perf_sample_data data;
+	struct perf_raw_record raw;
 	struct pt_regs regs;
 	int n;
 
@@ -479,6 +480,12 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 
+	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+		raw.size = x86_pmu.pebs_record_size;
+		raw.data = at;
+		data.raw = &raw;
+	}
+
 	n = top - at;
 
 	/*
@@ -521,6 +528,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	struct pebs_record_nhm *at, *top;
 	struct perf_sample_data data;
 	struct perf_event *event = NULL;
+	struct perf_raw_record raw;
 	struct pt_regs regs;
 	int bit, n;
 
@@ -562,6 +570,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		perf_sample_data_init(&data, 0);
 		data.period = event->hw.last_period;
 
+		if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+			raw.size = x86_pmu.pebs_record_size;
+			raw.data = at;
+			data.raw = &raw;
+		}
+
 		/*
 		 * See the comment in intel_pmu_drain_pebs_core()
 		 */
-- 
cgit v1.2.3


From 30a813ae035d3e220a89609adce878e045c49547 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Mar 2010 13:49:21 +0100
Subject: x86: Move MAX_INSN_SIZE into asm/insn.h

Since there's now two users for this, place it in a common header.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100304140100.923774125@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/insn.h               | 2 ++
 arch/x86/include/asm/kprobes.h            | 2 +-
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 2 --
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 96c2e0ad04c..88c765e1641 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -68,6 +68,8 @@ struct insn {
 	const insn_byte_t *next_byte;
 };
 
+#define MAX_INSN_SIZE	16
+
 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
 #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
 #define X86_MODRM_RM(modrm) ((modrm) & 0x07)
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4ffa345a8cc..54788253915 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -24,6 +24,7 @@
 #include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
+#include <asm/insn.h>
 
 #define  __ARCH_WANT_KPROBES_INSN_SLOT
 
@@ -36,7 +37,6 @@ typedef u8 kprobe_opcode_t;
 #define RELATIVEJUMP_SIZE 5
 #define RELATIVECALL_OPCODE 0xe8
 #define RELATIVE_ADDR_SIZE 4
-#define MAX_INSN_SIZE 16
 #define MAX_STACK_SIZE 64
 #define MIN_STACK_SIZE(ADDR)					       \
 	(((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ef56f053ab3..72453ac5fb7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -379,8 +379,6 @@ static void intel_pmu_pebs_disable_all(void)
 
 #include <asm/insn.h>
 
-#define MAX_INSN_SIZE	16
-
 static inline bool kernel_ip(unsigned long ip)
 {
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3


From 3adaebd69557615c1bf0365ce5e32d93ac7d82af Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Mar 2010 12:09:29 +0100
Subject: perf, x86: Fix silly bug in data store buffer allocation

Fix up the ds allocation error path, where we could free @buffer before
we used it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100305154128.813452402@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 72453ac5fb7..a67fff14475 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -127,10 +127,8 @@ static int reserve_ds_buffers(void)
 
 		err = -ENOMEM;
 		ds = kzalloc(sizeof(*ds), GFP_KERNEL);
-		if (unlikely(!ds)) {
-			kfree(buffer);
+		if (unlikely(!ds))
 			break;
-		}
 		per_cpu(cpu_hw_events, cpu).ds = ds;
 
 		if (x86_pmu.bts) {
-- 
cgit v1.2.3


From 3c44780b220e876b01e39d4028cd6f4205fbf5d6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Mar 2010 21:49:01 +0100
Subject: perf, x86: Disable PEBS on clovertown chips

This CPU has just too many handycaps to be really useful.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100305154128.890278662@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       |  4 ++++
 arch/x86/kernel/cpu/perf_event_intel.c | 27 +++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 7b5430b2efe..335ee1d38b7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -197,6 +197,7 @@ struct x86_pmu {
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
 	struct event_constraint *event_constraints;
+	void		(*quirks)(void);
 
 	void		(*cpu_prepare)(int cpu);
 	void		(*cpu_starting)(int cpu);
@@ -1373,6 +1374,9 @@ void __init init_hw_perf_events(void)
 
 	pr_cont("%s PMU driver.\n", x86_pmu.name);
 
+	if (x86_pmu.quirks)
+		x86_pmu.quirks();
+
 	if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
 		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
 		     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 246c0723882..224c952071f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -792,6 +792,32 @@ static __initconst struct x86_pmu intel_pmu = {
 	.cpu_dying		= fini_debug_store_on_cpu,
 };
 
+static void intel_clovertown_quirks(void)
+{
+	/*
+	 * PEBS is unreliable due to:
+	 *
+	 *   AJ67  - PEBS may experience CPL leaks
+	 *   AJ68  - PEBS PMI may be delayed by one event
+	 *   AJ69  - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
+	 *   AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
+	 *
+	 * AJ67 could be worked around by restricting the OS/USR flags.
+	 * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
+	 *
+	 * AJ106 could possibly be worked around by not allowing LBR
+	 *       usage from PEBS, including the fixup.
+	 * AJ68  could possibly be worked around by always programming
+	 * 	 a pebs_event_reset[0] value and coping with the lost events.
+	 *
+	 * But taken together it might just make sense to not enable PEBS on
+	 * these chips.
+	 */
+	printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
+	x86_pmu.pebs = 0;
+	x86_pmu.pebs_constraints = NULL;
+}
+
 static __init int intel_pmu_init(void)
 {
 	union cpuid10_edx edx;
@@ -856,6 +882,7 @@ static __init int intel_pmu_init(void)
 		break;
 
 	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+		x86_pmu.quirks = intel_clovertown_quirks;
 	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
 	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
 	case 29: /* six-core 45 nm xeon "Dunnington" */
-- 
cgit v1.2.3


From 74846d35b24b6efd61bb88a0a750b6bb257e6e78 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Mar 2010 13:49:35 +0100
Subject: perf, x86: Clear the LBRs on init

Some CPUs have errata where the LBR is not cleared on Power-On. So always
clear the LBRs before use.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100305154128.966563424@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel.c     | 18 ++++++++++++++++--
 arch/x86/kernel/cpu/perf_event_intel_lbr.c |  3 +++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 224c952071f..c135ed735b2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -767,6 +767,20 @@ static __initconst struct x86_pmu core_pmu = {
 	.event_constraints	= intel_core_event_constraints,
 };
 
+static void intel_pmu_cpu_starting(int cpu)
+{
+	init_debug_store_on_cpu(cpu);
+	/*
+	 * Deal with CPUs that don't clear their LBRs on power-up.
+	 */
+	intel_pmu_lbr_reset();
+}
+
+static void intel_pmu_cpu_dying(int cpu)
+{
+	fini_debug_store_on_cpu(cpu);
+}
+
 static __initconst struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
@@ -788,8 +802,8 @@ static __initconst struct x86_pmu intel_pmu = {
 	.max_period		= (1ULL << 31) - 1,
 	.get_event_constraints	= intel_get_event_constraints,
 
-	.cpu_starting		= init_debug_store_on_cpu,
-	.cpu_dying		= fini_debug_store_on_cpu,
+	.cpu_starting		= intel_pmu_cpu_starting,
+	.cpu_dying		= intel_pmu_cpu_dying,
 };
 
 static void intel_clovertown_quirks(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 4f3a124329c..dcec765f818 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -53,6 +53,9 @@ static void intel_pmu_lbr_reset_64(void)
 
 static void intel_pmu_lbr_reset(void)
 {
+	if (!x86_pmu.lbr_nr)
+		return;
+
 	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
 		intel_pmu_lbr_reset_32();
 	else
-- 
cgit v1.2.3


From a562b1871f7f7d2f3a835c3c1e07fa58d473cfb7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Mar 2010 16:29:14 +0100
Subject: perf, x86: Robustify PEBS fixup

It turns out the LBR is massively unreliable on certain CPUs, so code the
fixup a little more defensive to avoid crashing the kernel.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <20100305154129.042271287@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index a67fff14475..e7ac51770d4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -399,10 +399,23 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	if (!x86_pmu.intel_cap.pebs_trap)
 		return 1;
 
+	/*
+	 * No LBR entry, no basic block, no rewinding
+	 */
 	if (!cpuc->lbr_stack.nr || !from || !to)
 		return 0;
 
-	if (ip < to)
+	/*
+	 * Basic blocks should never cross user/kernel boundaries
+	 */
+	if (kernel_ip(ip) != kernel_ip(to))
+		return 0;
+
+	/*
+	 * unsigned math, either ip is before the start (impossible) or
+	 * the basic block is larger than 1 page (sanity)
+	 */
+	if ((ip - to) > PAGE_SIZE)
 		return 0;
 
 	/*
@@ -420,7 +433,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 
 		old_to = to;
 		if (!kernel_ip(ip)) {
-			int bytes, size = min_t(int, MAX_INSN_SIZE, ip - to);
+			int bytes, size = MAX_INSN_SIZE;
 
 			bytes = copy_from_user_nmi(buf, (void __user *)to, size);
 			if (bytes != size)
@@ -440,6 +453,10 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 		return 1;
 	}
 
+	/*
+	 * Even though we decoded the basic block, the instruction stream
+	 * never matched the given IP, either the TO or the IP got corrupted.
+	 */
 	return 0;
 }
 
-- 
cgit v1.2.3


From cc7f00820b2f3be656569c41158d9323e425bcfe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 8 Mar 2010 17:51:33 +0100
Subject: perf, x86: Avoid double disable on throttle vs
 ioctl(PERF_IOC_DISABLE)

Calling ioctl(PERF_EVENT_IOC_DISABLE) on a thottled counter would result
in a double disable, cure this by using x86_pmu_{start,stop} for
throttle/unthrottle and teach x86_pmu_stop() to check ->active_mask.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index e7ac51770d4..a7401e4167d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -461,7 +461,6 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 }
 
 static int intel_pmu_save_and_restart(struct perf_event *event);
-static void intel_pmu_disable_event(struct perf_event *event);
 
 static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 {
@@ -528,7 +527,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 		regs.flags &= ~PERF_EFLAGS_EXACT;
 
 	if (perf_event_overflow(event, 1, &data, &regs))
-		intel_pmu_disable_event(event);
+		x86_pmu_stop(event);
 
 out:
 	intel_pmu_pebs_enable_all();
@@ -603,7 +602,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 			regs.flags &= ~PERF_EFLAGS_EXACT;
 
 		if (perf_event_overflow(event, 1, &data, &regs))
-			intel_pmu_disable_event(event);
+			x86_pmu_stop(event);
 	}
 out:
 	intel_pmu_pebs_enable_all();
-- 
cgit v1.2.3


From 8f4aebd2be9892bf8fb79a2d8576d3f3ee7f00f6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 6 Mar 2010 13:26:11 +0100
Subject: perf, x86: Fix pebs drains

I overlooked the perf_disable()/perf_enable() calls in
intel_pmu_handle_irq(), (pointed out by Markus) so we should not
explicitly disable_all/enable_all pebs counters in the drain functions,
these are already disabled and enabling them early is confusing.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index a7401e4167d..66c6962f15f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -476,18 +476,16 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	if (!event || !ds || !x86_pmu.pebs)
 		return;
 
-	intel_pmu_pebs_disable_all();
-
 	at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
 	top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
 
 	if (top <= at)
-		goto out;
+		return;
 
 	ds->pebs_index = ds->pebs_buffer_base;
 
 	if (!intel_pmu_save_and_restart(event))
-		goto out;
+		return;
 
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
@@ -528,9 +526,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 
 	if (perf_event_overflow(event, 1, &data, &regs))
 		x86_pmu_stop(event);
-
-out:
-	intel_pmu_pebs_enable_all();
 }
 
 static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
@@ -547,13 +542,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	if (!ds || !x86_pmu.pebs)
 		return;
 
-	intel_pmu_pebs_disable_all();
-
 	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
 	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
 
 	if (top <= at)
-		goto out;
+		return;
 
 	ds->pebs_index = ds->pebs_buffer_base;
 
@@ -604,8 +597,6 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		if (perf_event_overflow(event, 1, &data, &regs))
 			x86_pmu_stop(event);
 	}
-out:
-	intel_pmu_pebs_enable_all();
 }
 
 /*
-- 
cgit v1.2.3


From 4807e3d5dc7bb7057dd6ca3abb09f3da2eb8c323 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 6 Mar 2010 13:47:07 +0100
Subject: perf, x86: Fix PEBS enable/disable vs cpuc->enabled

We should never call ->enable with the pmu enabled, and we _can_ have
->disable called with the pmu enabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 66c6962f15f..9ad0e67b9c8 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -338,7 +338,7 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
 	val |= 1ULL << hwc->idx;
-	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+	WARN_ON_ONCE(cpuc->enabled);
 
 	if (x86_pmu.intel_cap.pebs_trap)
 		intel_pmu_lbr_enable(event);
@@ -351,7 +351,8 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
 	u64 val = cpuc->pebs_enabled;
 
 	val &= ~(1ULL << hwc->idx);
-	wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+	if (cpuc->enabled)
+		wrmsrl(MSR_IA32_PEBS_ENABLE, val);
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
 
-- 
cgit v1.2.3


From 2df202bf7520eaffcbfb07e45dfa3cfb0aeee2c0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 6 Mar 2010 13:48:54 +0100
Subject: perf, x86: Fix LBR enable/disable vs cpuc->enabled

We should never call ->enable with the pmu enabled, and we _can_ have
->disable called with the pmu enabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index dcec765f818..0145f99f7a3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -69,7 +69,7 @@ static void intel_pmu_lbr_enable(struct perf_event *event)
 	if (!x86_pmu.lbr_nr)
 		return;
 
-	WARN_ON(cpuc->enabled);
+	WARN_ON_ONCE(cpuc->enabled);
 
 	/*
 	 * Reset the LBR stack if this is the first LBR user or
@@ -93,9 +93,10 @@ static void intel_pmu_lbr_disable(struct perf_event *event)
 		return;
 
 	cpuc->lbr_users--;
-
 	BUG_ON(cpuc->lbr_users < 0);
-	WARN_ON(cpuc->enabled);
+
+	if (cpuc->enabled && !cpuc->lbr_users)
+		__intel_pmu_lbr_disable();
 }
 
 static void intel_pmu_lbr_enable_all(void)
-- 
cgit v1.2.3


From d329527e47851f84b1e7944ed9601205f35f1b93 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 8 Mar 2010 13:57:14 +0100
Subject: perf, x86: Reorder intel_pmu_enable_all()

The documentation says we have to enable PEBS before we enable the PMU
proper.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index c135ed735b2..d3e2424069a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -487,6 +487,8 @@ static void intel_pmu_enable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
+	intel_pmu_pebs_enable_all();
+	intel_pmu_lbr_enable_all();
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 
 	if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
@@ -498,9 +500,6 @@ static void intel_pmu_enable_all(void)
 
 		intel_pmu_enable_bts(event->hw.config);
 	}
-
-	intel_pmu_pebs_enable_all();
-	intel_pmu_lbr_enable_all();
 }
 
 static inline u64 intel_pmu_get_status(void)
-- 
cgit v1.2.3


From 12ab854d744f04bfc5c6c4db723b7e31fc03eb29 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 6 Mar 2010 18:57:38 +0100
Subject: perf, x86: Deal with multiple state bits for pebs-fmt1

Its unclear if the PEBS state record will have only a single bit set, in
case it does not and accumulates bits, deal with that by only processing
each event once.

Also, robustify some of the code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 9ad0e67b9c8..b4680daecf1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -538,6 +538,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	struct perf_event *event = NULL;
 	struct perf_raw_record raw;
 	struct pt_regs regs;
+	u64 status = 0;
 	int bit, n;
 
 	if (!ds || !x86_pmu.pebs)
@@ -561,13 +562,22 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 
 	for ( ; at < top; at++) {
 		for_each_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
-			if (!cpuc->events[bit]->attr.precise)
+			event = cpuc->events[bit];
+			if (!test_bit(bit, cpuc->active_mask))
 				continue;
 
-			event = cpuc->events[bit];
+			WARN_ON_ONCE(!event);
+
+			if (!event->attr.precise)
+				continue;
+
+			if (__test_and_set_bit(bit, (unsigned long *)&status))
+				continue;
+
+			break;
 		}
 
-		if (!event)
+		if (!event || bit >= MAX_PEBS_EVENTS)
 			continue;
 
 		if (!intel_pmu_save_and_restart(event))
-- 
cgit v1.2.3


From ad0e6cfe2a2a61d7b5530188e571d508146cb43b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 6 Mar 2010 19:49:06 +0100
Subject: perf, x86: Fix silly bug in intel_pmu_pebs_{enable,disable}

We need to use the actual cpuc->pebs_enabled value, not a local copy for
the changes to take effect.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index b4680daecf1..242369488e7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -333,11 +333,10 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	u64 val = cpuc->pebs_enabled;
 
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
-	val |= 1ULL << hwc->idx;
+	cpuc->pebs_enabled |= 1ULL << hwc->idx;
 	WARN_ON_ONCE(cpuc->enabled);
 
 	if (x86_pmu.intel_cap.pebs_trap)
@@ -348,11 +347,10 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	u64 val = cpuc->pebs_enabled;
 
-	val &= ~(1ULL << hwc->idx);
+	cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
 	if (cpuc->enabled)
-		wrmsrl(MSR_IA32_PEBS_ENABLE, val);
+		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
 
-- 
cgit v1.2.3


From b83a46e7da4a948cc852ba7805dfb1a392dec861 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 8 Mar 2010 13:51:12 +0100
Subject: perf, x86: Don't reset the LBR as frequently

If we reset the LBR on each first counter, simple counter rotation which
first deschedules all counters and then reschedules the new ones will
lead to LBR reset, even though we're still in the same task context.

Reduce this by not flushing on the first counter but only flushing on
different task contexts.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 0145f99f7a3..f278136bf91 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -72,12 +72,11 @@ static void intel_pmu_lbr_enable(struct perf_event *event)
 	WARN_ON_ONCE(cpuc->enabled);
 
 	/*
-	 * Reset the LBR stack if this is the first LBR user or
-	 * we changed task context so as to avoid data leaks.
+	 * Reset the LBR stack if we changed task context to
+	 * avoid data leaks.
 	 */
 
-	if (!cpuc->lbr_users ||
-	    (event->ctx->task && cpuc->lbr_context != event->ctx)) {
+	if (event->ctx->task && cpuc->lbr_context != event->ctx) {
 		intel_pmu_lbr_reset();
 		cpuc->lbr_context = event->ctx;
 	}
@@ -93,7 +92,7 @@ static void intel_pmu_lbr_disable(struct perf_event *event)
 		return;
 
 	cpuc->lbr_users--;
-	BUG_ON(cpuc->lbr_users < 0);
+	WARN_ON_ONCE(cpuc->lbr_users < 0);
 
 	if (cpuc->enabled && !cpuc->lbr_users)
 		__intel_pmu_lbr_disable();
-- 
cgit v1.2.3


From 7645a24cbd01cbf4865d1273d5ddaa8d8c2ccb3a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 8 Mar 2010 13:51:31 +0100
Subject: perf, x86: Remove checking_{wr,rd}msr() usage

We don't need checking_{wr,rd}msr() calls, since we should know what cpu
we're running on and not use blindly poke at msrs.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       | 24 ++++++++++++++++++------
 arch/x86/kernel/cpu/perf_event_intel.c |  5 ++---
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 335ee1d38b7..e24f6374f9f 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -29,6 +29,17 @@
 #include <asm/stacktrace.h>
 #include <asm/nmi.h>
 
+#if 0
+#undef wrmsrl
+#define wrmsrl(msr, val) 					\
+do {								\
+	trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
+			(unsigned long)(val));			\
+	native_write_msr((msr), (u32)((u64)(val)), 		\
+			(u32)((u64)(val) >> 32));		\
+} while (0)
+#endif
+
 /*
  * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
  */
@@ -821,14 +832,15 @@ void hw_perf_enable(void)
 
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
 {
-	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
+	wrmsrl(hwc->config_base + hwc->idx,
 			      hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 
 static inline void x86_pmu_disable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	(void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);
+
+	wrmsrl(hwc->config_base + hwc->idx, hwc->config);
 }
 
 static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -843,7 +855,7 @@ x86_perf_event_set_period(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	s64 left = atomic64_read(&hwc->period_left);
 	s64 period = hwc->sample_period;
-	int err, ret = 0, idx = hwc->idx;
+	int ret = 0, idx = hwc->idx;
 
 	if (idx == X86_PMC_IDX_FIXED_BTS)
 		return 0;
@@ -881,8 +893,8 @@ x86_perf_event_set_period(struct perf_event *event)
 	 */
 	atomic64_set(&hwc->prev_count, (u64)-left);
 
-	err = checking_wrmsrl(hwc->event_base + idx,
-			     (u64)(-left) & x86_pmu.event_mask);
+	wrmsrl(hwc->event_base + idx,
+			(u64)(-left) & x86_pmu.event_mask);
 
 	perf_event_update_userpage(event);
 
@@ -987,7 +999,7 @@ void perf_event_print_debug(void)
 		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
 		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
 	}
-	pr_info("CPU#%d: active:       %016llx\n", cpu, *(u64 *)cpuc->active_mask);
+	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
 
 	for (idx = 0; idx < x86_pmu.num_events; idx++) {
 		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index d3e2424069a..971dc6e7d54 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -525,7 +525,7 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 
 	rdmsrl(hwc->config_base, ctrl_val);
 	ctrl_val &= ~mask;
-	(void)checking_wrmsrl(hwc->config_base, ctrl_val);
+	wrmsrl(hwc->config_base, ctrl_val);
 }
 
 static void intel_pmu_disable_event(struct perf_event *event)
@@ -553,7 +553,6 @@ static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
 {
 	int idx = hwc->idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, bits, mask;
-	int err;
 
 	/*
 	 * Enable IRQ generation (0x8),
@@ -578,7 +577,7 @@ static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
 	rdmsrl(hwc->config_base, ctrl_val);
 	ctrl_val &= ~mask;
 	ctrl_val |= bits;
-	err = checking_wrmsrl(hwc->config_base, ctrl_val);
+	wrmsrl(hwc->config_base, ctrl_val);
 }
 
 static void intel_pmu_enable_event(struct perf_event *event)
-- 
cgit v1.2.3


From d80c7502ff63aa0d99d8c0c5803d28bbef67a74e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 9 Mar 2010 11:41:02 +0100
Subject: perf, x86: Fixup the PEBS handler for Core2 cpus

Pull the core handler in line with the nhm one, also make sure we always
drain the buffer.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 38 +++++++++++++++++++------------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 242369488e7..1bfd59beb65 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -472,20 +472,39 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	struct pt_regs regs;
 	int n;
 
-	if (!event || !ds || !x86_pmu.pebs)
+	if (!ds || !x86_pmu.pebs)
 		return;
 
 	at  = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
 	top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
 
-	if (top <= at)
+	/*
+	 * Whatever else happens, drain the thing
+	 */
+	ds->pebs_index = ds->pebs_buffer_base;
+
+	if (!test_bit(0, cpuc->active_mask))
 		return;
 
-	ds->pebs_index = ds->pebs_buffer_base;
+	WARN_ON_ONCE(!event);
+
+	if (!event->attr.precise)
+		return;
+
+	n = top - at;
+	if (n <= 0)
+		return;
 
 	if (!intel_pmu_save_and_restart(event))
 		return;
 
+	/*
+	 * Should not happen, we program the threshold at 1 and do not
+	 * set a reset value.
+	 */
+	WARN_ON_ONCE(n > 1);
+	at += n - 1;
+
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 
@@ -495,14 +514,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 		data.raw = &raw;
 	}
 
-	n = top - at;
-
-	/*
-	 * Should not happen, we program the threshold at 1 and do not
-	 * set a reset value.
-	 */
-	WARN_ON_ONCE(n > 1);
-
 	/*
 	 * We use the interrupt regs as a base because the PEBS record
 	 * does not contain a full regs set, specifically it seems to
@@ -545,12 +556,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
 	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
 
-	if (top <= at)
-		return;
-
 	ds->pebs_index = ds->pebs_buffer_base;
 
 	n = top - at;
+	if (n <= 0)
+		return;
 
 	/*
 	 * Should not happen, we program the threshold at 1 and do not
-- 
cgit v1.2.3


From 63fb3f9b2312e131be5a0a2dddb63f2fb123db9b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 9 Mar 2010 11:51:02 +0100
Subject: perf, x86: Fix LBR read-out

Don't decrement the TOS twice...

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: paulus@samba.org
Cc: eranian@google.com
Cc: robert.richter@amd.com
Cc: fweisbec@gmail.com
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index f278136bf91..df4c98e26c5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -129,7 +129,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 	u64 tos = intel_pmu_lbr_tos();
 	int i;
 
-	for (i = 0; i < x86_pmu.lbr_nr; i++, tos--) {
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		unsigned long lbr_idx = (tos - i) & mask;
 		union {
 			struct {
@@ -162,7 +162,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 	u64 tos = intel_pmu_lbr_tos();
 	int i;
 
-	for (i = 0; i < x86_pmu.lbr_nr; i++, tos--) {
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		unsigned long lbr_idx = (tos - i) & mask;
 		u64 from, to, flags = 0;
 
-- 
cgit v1.2.3


From ba7e4d13fc7e25af1d167d40e6f028298dfc55ad Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 13:58:12 +0200
Subject: perf, x86: Add INSTRUCTION_DECODER config flag

The PEBS+LBR decoding magic needs the insn_get_length() infrastructure
to be able to decode x86 instruction length.

So split it out of KPROBES dependency and make it enabled when either
KPROBES or PERF_EVENTS is enabled.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig      | 3 +++
 arch/x86/lib/Makefile | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e9844037152..e1240f652a9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -58,6 +58,9 @@ config X86
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
 
+config INSTRUCTION_DECODER
+	def_bool (KPROBES || PERF_EVENTS)
+
 config OUTPUT_FORMAT
 	string
 	default "elf32-i386" if X86_32
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 419386c24b8..cbaf8f2b83d 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -20,7 +20,7 @@ lib-y := delay.o
 lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o getuser.o putuser.o
 lib-y += memcpy_$(BITS).o
-lib-$(CONFIG_KPROBES) += insn.o inat.o
+lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o
 
-- 
cgit v1.2.3


From caa0142d84ceb0fc83e28f0475d0a7316cb6df77 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 13:58:12 +0200
Subject: perf, x86: Fix the !CONFIG_CPU_SUP_INTEL build

Fix typo. But the modularization here is ugly and should be improved.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 1bfd59beb65..c59678a14a2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -661,7 +661,7 @@ static void intel_ds_init(void)
 
 #else /* CONFIG_CPU_SUP_INTEL */
 
-static int reseve_ds_buffers(void)
+static int reserve_ds_buffers(void)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 938179b4f8cf8a4f11234ebf2dff2eb48400acfe Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@sgi.com>
Date: Fri, 5 Mar 2010 11:42:03 -0600
Subject: x86: Improve Intel microcode loader performance

We've noticed that on large SGI UV system configurations,
running microcode.ctl can take very long periods of time.  This
is due to the large number of vmalloc/vfree calls made by the
Intel generic_load_microcode() logic.

By reusing allocated space, the following patch reduces the time
to run microcode.ctl on a 1024 cpu system from approximately 80
seconds down to 1 or 2 seconds.

Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Acked-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Bill Davidsen <davidsen@tmr.com>
LKML-Reference: <20100305174203.GA19638@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_intel.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 85a343e2893..356170262a9 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -343,10 +343,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 				int (*get_ucode_data)(void *, const void *, size_t))
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	u8 *ucode_ptr = data, *new_mc = NULL, *mc;
+	u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
 	int new_rev = uci->cpu_sig.rev;
 	unsigned int leftover = size;
 	enum ucode_state state = UCODE_OK;
+	unsigned int curr_mc_size = 0;
 
 	while (leftover) {
 		struct microcode_header_intel mc_header;
@@ -361,9 +362,15 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 			break;
 		}
 
-		mc = vmalloc(mc_size);
-		if (!mc)
-			break;
+		/* For performance reasons, reuse mc area when possible */
+		if (!mc || mc_size > curr_mc_size) {
+			if (mc)
+				vfree(mc);
+			mc = vmalloc(mc_size);
+			if (!mc)
+				break;
+			curr_mc_size = mc_size;
+		}
 
 		if (get_ucode_data(mc, ucode_ptr, mc_size) ||
 		    microcode_sanity_check(mc) < 0) {
@@ -376,13 +383,16 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 				vfree(new_mc);
 			new_rev = mc_header.rev;
 			new_mc  = mc;
-		} else
-			vfree(mc);
+			mc = NULL;	/* trigger new vmalloc */
+		}
 
 		ucode_ptr += mc_size;
 		leftover  -= mc_size;
 	}
 
+	if (mc)
+		vfree(mc);
+
 	if (leftover) {
 		if (new_mc)
 			vfree(new_mc);
-- 
cgit v1.2.3


From 6f4edd69e40aba4f45bf9558c1e9a950d79ab4e4 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Wed, 10 Mar 2010 14:44:58 -0600
Subject: x86, UV: Clean up UV headers for MMR definitions

Update UV mmr definitions header file. Eliminate definitions no
longer needed. Move 2 definitions from tlb_uv.c into the header
file where they belong.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20100310204458.GA28835@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_mmrs.h | 528 ++++++--------------------------------
 arch/x86/kernel/tlb_uv.c          |  10 +-
 2 files changed, 85 insertions(+), 453 deletions(-)

diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index 2cae46c7c8a..b2f2d2e05ce 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -1,4 +1,3 @@
-
 /*
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
@@ -14,14 +13,26 @@
 
 #define UV_MMR_ENABLE		(1UL << 63)
 
+/* ========================================================================= */
+/*                          UVH_BAU_DATA_BROADCAST                           */
+/* ========================================================================= */
+#define UVH_BAU_DATA_BROADCAST 0x61688UL
+#define UVH_BAU_DATA_BROADCAST_32 0x0440
+
+#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
+#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
+
+union uvh_bau_data_broadcast_u {
+    unsigned long	v;
+    struct uvh_bau_data_broadcast_s {
+	unsigned long	enable :  1;  /* RW */
+	unsigned long	rsvd_1_63: 63;  /*    */
+    } s;
+};
+
 /* ========================================================================= */
 /*                           UVH_BAU_DATA_CONFIG                             */
 /* ========================================================================= */
-#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
-#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
-#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
-/* 1011 timebase 7 (168millisec) * 3 ticks -> 500ms */
 #define UVH_BAU_DATA_CONFIG 0x61680UL
 #define UVH_BAU_DATA_CONFIG_32 0x0438
 
@@ -603,6 +614,68 @@ union uvh_lb_bau_intd_software_acknowledge_u {
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70
 
+/* ========================================================================= */
+/*                         UVH_LB_BAU_MISC_CONTROL                           */
+/* ========================================================================= */
+#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
+#define UVH_LB_BAU_MISC_CONTROL_32 0x00a10
+
+#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
+#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
+#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
+#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_SHFT 11
+#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
+#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
+#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
+#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
+#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
+#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48
+#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+
+union uvh_lb_bau_misc_control_u {
+    unsigned long	v;
+    struct uvh_lb_bau_misc_control_s {
+	unsigned long	rejection_delay                    :  8;  /* RW */
+	unsigned long	apic_mode                          :  1;  /* RW */
+	unsigned long	force_broadcast                    :  1;  /* RW */
+	unsigned long	force_lock_nop                     :  1;  /* RW */
+	unsigned long	csi_agent_presence_vector          :  3;  /* RW */
+	unsigned long	descriptor_fetch_mode              :  1;  /* RW */
+	unsigned long	enable_intd_soft_ack_mode          :  1;  /* RW */
+	unsigned long	intd_soft_ack_timeout_period       :  4;  /* RW */
+	unsigned long	enable_dual_mapping_mode           :  1;  /* RW */
+	unsigned long	vga_io_port_decode_enable          :  1;  /* RW */
+	unsigned long	vga_io_port_16_bit_decode          :  1;  /* RW */
+	unsigned long	suppress_dest_registration         :  1;  /* RW */
+	unsigned long	programmed_initial_priority        :  3;  /* RW */
+	unsigned long	use_incoming_priority              :  1;  /* RW */
+	unsigned long	enable_programmed_initial_priority :  1;  /* RW */
+	unsigned long	rsvd_29_47                         : 19;  /*    */
+	unsigned long	fun                                : 16;  /* RW */
+    } s;
+};
+
 /* ========================================================================= */
 /*                     UVH_LB_BAU_SB_ACTIVATION_CONTROL                      */
 /* ========================================================================= */
@@ -680,334 +753,6 @@ union uvh_lb_bau_sb_descriptor_base_u {
     } s;
 };
 
-/* ========================================================================= */
-/*                      UVH_LB_MCAST_AOERR0_RPT_ENABLE                       */
-/* ========================================================================= */
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE 0x50b20UL
-
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_SHFT 0
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_MASK 0x0000000000000001UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_SHFT 1
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_MASK 0x0000000000000002UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_SHFT 2
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_MASK 0x0000000000000004UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_SHFT 3
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_MASK 0x0000000000000008UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_SHFT 4
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_MASK 0x0000000000000010UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_SHFT 5
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_MASK 0x0000000000000020UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_SHFT 6
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_MASK 0x0000000000000040UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_SHFT 7
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_MASK 0x0000000000000080UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_SHFT 8
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_MASK 0x0000000000000100UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_SHFT 9
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_MASK 0x0000000000000200UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_SHFT 10
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_MASK 0x0000000000000400UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_SHFT 11
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_MASK 0x0000000000000800UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_SHFT 12
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_MASK 0x0000000000001000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_SHFT 13
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_MASK 0x0000000000002000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_SHFT 14
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_MASK 0x0000000000004000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_SHFT 15
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_MASK 0x0000000000008000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_SHFT 16
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_MASK 0x0000000000010000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_SHFT 17
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_MASK 0x0000000000020000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_SHFT 18
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_MASK 0x0000000000040000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_SHFT 19
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_MASK 0x0000000000080000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_SHFT 20
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_SHFT 22
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_MASK 0x0000000000400000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_SHFT 23
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_MASK 0x0000000000800000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 24
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000001000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 25
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000002000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 26
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000004000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 27
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000008000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 28
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000010000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 29
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000020000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 30
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000040000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 31
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000080000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 32
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000100000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 33
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000200000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 34
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000400000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 35
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000800000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 36
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000001000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 37
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000002000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 38
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000004000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 39
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000008000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 40
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000010000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 41
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000020000000000UL
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 42
-#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000040000000000UL
-
-union uvh_lb_mcast_aoerr0_rpt_enable_u {
-    unsigned long	v;
-    struct uvh_lb_mcast_aoerr0_rpt_enable_s {
-	unsigned long	mcast_obese_msg                         :  1;  /* RW */
-	unsigned long	mcast_data_sb_err                       :  1;  /* RW */
-	unsigned long	mcast_nack_buff_parity                  :  1;  /* RW */
-	unsigned long	mcast_timeout                           :  1;  /* RW */
-	unsigned long	mcast_inactive_reply                    :  1;  /* RW */
-	unsigned long	mcast_upgrade_error                     :  1;  /* RW */
-	unsigned long	mcast_reg_count_underflow               :  1;  /* RW */
-	unsigned long	mcast_rep_obese_msg                     :  1;  /* RW */
-	unsigned long	ucache_req_runt_msg                     :  1;  /* RW */
-	unsigned long	ucache_req_obese_msg                    :  1;  /* RW */
-	unsigned long	ucache_req_data_sb_err                  :  1;  /* RW */
-	unsigned long	ucache_rep_runt_msg                     :  1;  /* RW */
-	unsigned long	ucache_rep_obese_msg                    :  1;  /* RW */
-	unsigned long	ucache_rep_data_sb_err                  :  1;  /* RW */
-	unsigned long	ucache_rep_command_err                  :  1;  /* RW */
-	unsigned long	ucache_pend_timeout                     :  1;  /* RW */
-	unsigned long	macc_req_runt_msg                       :  1;  /* RW */
-	unsigned long	macc_req_obese_msg                      :  1;  /* RW */
-	unsigned long	macc_req_data_sb_err                    :  1;  /* RW */
-	unsigned long	macc_rep_runt_msg                       :  1;  /* RW */
-	unsigned long	macc_rep_obese_msg                      :  1;  /* RW */
-	unsigned long	macc_rep_data_sb_err                    :  1;  /* RW */
-	unsigned long	macc_amo_timeout                        :  1;  /* RW */
-	unsigned long	macc_put_timeout                        :  1;  /* RW */
-	unsigned long	macc_spurious_event                     :  1;  /* RW */
-	unsigned long	ioh_destination_table_parity            :  1;  /* RW */
-	unsigned long	get_had_error_reply                     :  1;  /* RW */
-	unsigned long	get_timeout                             :  1;  /* RW */
-	unsigned long	lock_manager_had_error_reply            :  1;  /* RW */
-	unsigned long	put_had_error_reply                     :  1;  /* RW */
-	unsigned long	put_timeout                             :  1;  /* RW */
-	unsigned long	sb_activation_overrun                   :  1;  /* RW */
-	unsigned long	completed_gb_activation_had_error_reply :  1;  /* RW */
-	unsigned long	completed_gb_activation_timeout         :  1;  /* RW */
-	unsigned long	descriptor_buffer_0_parity              :  1;  /* RW */
-	unsigned long	descriptor_buffer_1_parity              :  1;  /* RW */
-	unsigned long	socket_destination_table_parity         :  1;  /* RW */
-	unsigned long	bau_reply_payload_corruption            :  1;  /* RW */
-	unsigned long	io_port_destination_table_parity        :  1;  /* RW */
-	unsigned long	intd_soft_ack_timeout                   :  1;  /* RW */
-	unsigned long	int_rep_obese_msg                       :  1;  /* RW */
-	unsigned long	int_rep_command_err                     :  1;  /* RW */
-	unsigned long	int_timeout                             :  1;  /* RW */
-	unsigned long	rsvd_43_63                              : 21;  /*    */
-    } s;
-};
-
-/* ========================================================================= */
-/*                          UVH_LOCAL_INT0_CONFIG                            */
-/* ========================================================================= */
-#define UVH_LOCAL_INT0_CONFIG 0x61000UL
-
-#define UVH_LOCAL_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_LOCAL_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_LOCAL_INT0_CONFIG_DM_SHFT 8
-#define UVH_LOCAL_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_LOCAL_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_LOCAL_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_LOCAL_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_LOCAL_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_LOCAL_INT0_CONFIG_P_SHFT 13
-#define UVH_LOCAL_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_LOCAL_INT0_CONFIG_T_SHFT 15
-#define UVH_LOCAL_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_LOCAL_INT0_CONFIG_M_SHFT 16
-#define UVH_LOCAL_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_LOCAL_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_LOCAL_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_local_int0_config_u {
-    unsigned long	v;
-    struct uvh_local_int0_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                          UVH_LOCAL_INT0_ENABLE                            */
-/* ========================================================================= */
-#define UVH_LOCAL_INT0_ENABLE 0x65000UL
-
-#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_SHFT 0
-#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_MASK 0x0000000000000001UL
-#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_SHFT 1
-#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_MASK 0x0000000000000002UL
-#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_SHFT 2
-#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_MASK 0x0000000000000004UL
-#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_SHFT 3
-#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_MASK 0x0000000000000008UL
-#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_SHFT 4
-#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_MASK 0x0000000000000010UL
-#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_SHFT 5
-#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_MASK 0x0000000000000020UL
-#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_SHFT 6
-#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_MASK 0x0000000000000040UL
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_SHFT 7
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_MASK 0x0000000000000080UL
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_SHFT 8
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_MASK 0x0000000000000100UL
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_SHFT 9
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_MASK 0x0000000000000200UL
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_SHFT 10
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_MASK 0x0000000000000400UL
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_SHFT 11
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_MASK 0x0000000000000800UL
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_SHFT 12
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_MASK 0x0000000000001000UL
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_SHFT 13
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_MASK 0x0000000000002000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_SHFT 14
-#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_MASK 0x0000000000004000UL
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_SHFT 15
-#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_MASK 0x0000000000008000UL
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_SHFT 16
-#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_MASK 0x0000000000010000UL
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_SHFT 17
-#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_MASK 0x0000000000020000UL
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_SHFT 18
-#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_MASK 0x0000000000040000UL
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_SHFT 19
-#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_MASK 0x0000000000080000UL
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_SHFT 20
-#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_MASK 0x0000000000100000UL
-#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_SHFT 21
-#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_MASK 0x0000000000200000UL
-#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_SHFT 22
-#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_SHFT 23
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_MASK 0x0000000000800000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_SHFT 24
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_MASK 0x0000000001000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_SHFT 25
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_MASK 0x0000000002000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_SHFT 26
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_MASK 0x0000000004000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_SHFT 27
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_MASK 0x0000000008000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_SHFT 28
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_MASK 0x0000000010000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_SHFT 29
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_MASK 0x0000000020000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_SHFT 30
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_MASK 0x0000000040000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_SHFT 31
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_MASK 0x0000000080000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_SHFT 32
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_MASK 0x0000000100000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_SHFT 33
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_MASK 0x0000000200000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_SHFT 34
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_MASK 0x0000000400000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_SHFT 35
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_MASK 0x0000000800000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_SHFT 36
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_MASK 0x0000001000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_SHFT 37
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_MASK 0x0000002000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_SHFT 38
-#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_MASK 0x0000004000000000UL
-#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_SHFT 39
-#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_MASK 0x0000008000000000UL
-#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_SHFT 40
-#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_MASK 0x0000010000000000UL
-#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_SHFT 41
-#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_MASK 0x0000020000000000UL
-#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_SHFT 42
-#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_MASK 0x0000040000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LTC_INT_SHFT 43
-#define UVH_LOCAL_INT0_ENABLE_LTC_INT_MASK 0x0000080000000000UL
-#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_SHFT 44
-#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
-
-union uvh_local_int0_enable_u {
-    unsigned long	v;
-    struct uvh_local_int0_enable_s {
-	unsigned long	lb_hcerr            :  1;  /* RW */
-	unsigned long	gr0_hcerr           :  1;  /* RW */
-	unsigned long	gr1_hcerr           :  1;  /* RW */
-	unsigned long	lh_hcerr            :  1;  /* RW */
-	unsigned long	rh_hcerr            :  1;  /* RW */
-	unsigned long	xn_hcerr            :  1;  /* RW */
-	unsigned long	si_hcerr            :  1;  /* RW */
-	unsigned long	lb_aoerr0           :  1;  /* RW */
-	unsigned long	gr0_aoerr0          :  1;  /* RW */
-	unsigned long	gr1_aoerr0          :  1;  /* RW */
-	unsigned long	lh_aoerr0           :  1;  /* RW */
-	unsigned long	rh_aoerr0           :  1;  /* RW */
-	unsigned long	xn_aoerr0           :  1;  /* RW */
-	unsigned long	si_aoerr0           :  1;  /* RW */
-	unsigned long	lb_aoerr1           :  1;  /* RW */
-	unsigned long	gr0_aoerr1          :  1;  /* RW */
-	unsigned long	gr1_aoerr1          :  1;  /* RW */
-	unsigned long	lh_aoerr1           :  1;  /* RW */
-	unsigned long	rh_aoerr1           :  1;  /* RW */
-	unsigned long	xn_aoerr1           :  1;  /* RW */
-	unsigned long	si_aoerr1           :  1;  /* RW */
-	unsigned long	rh_vpi_int          :  1;  /* RW */
-	unsigned long	system_shutdown_int :  1;  /* RW */
-	unsigned long	lb_irq_int_0        :  1;  /* RW */
-	unsigned long	lb_irq_int_1        :  1;  /* RW */
-	unsigned long	lb_irq_int_2        :  1;  /* RW */
-	unsigned long	lb_irq_int_3        :  1;  /* RW */
-	unsigned long	lb_irq_int_4        :  1;  /* RW */
-	unsigned long	lb_irq_int_5        :  1;  /* RW */
-	unsigned long	lb_irq_int_6        :  1;  /* RW */
-	unsigned long	lb_irq_int_7        :  1;  /* RW */
-	unsigned long	lb_irq_int_8        :  1;  /* RW */
-	unsigned long	lb_irq_int_9        :  1;  /* RW */
-	unsigned long	lb_irq_int_10       :  1;  /* RW */
-	unsigned long	lb_irq_int_11       :  1;  /* RW */
-	unsigned long	lb_irq_int_12       :  1;  /* RW */
-	unsigned long	lb_irq_int_13       :  1;  /* RW */
-	unsigned long	lb_irq_int_14       :  1;  /* RW */
-	unsigned long	lb_irq_int_15       :  1;  /* RW */
-	unsigned long	l1_nmi_int          :  1;  /* RW */
-	unsigned long	stop_clock          :  1;  /* RW */
-	unsigned long	asic_to_l1          :  1;  /* RW */
-	unsigned long	l1_to_asic          :  1;  /* RW */
-	unsigned long	ltc_int             :  1;  /* RW */
-	unsigned long	la_seq_trigger      :  1;  /* RW */
-	unsigned long	rsvd_45_63          : 19;  /*    */
-    } s;
-};
-
 /* ========================================================================= */
 /*                               UVH_NODE_ID                                 */
 /* ========================================================================= */
@@ -1111,26 +856,6 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
     } s;
 };
 
-/* ========================================================================= */
-/*                    UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR                      */
-/* ========================================================================= */
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL
-
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_rh_gam_cfg_overlay_config_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_cfg_overlay_config_mmr_s {
-	unsigned long	rsvd_0_25: 26;  /*    */
-	unsigned long	base   : 20;  /* RW */
-	unsigned long	rsvd_46_62: 17;  /*    */
-	unsigned long	enable :  1;  /* RW */
-    } s;
-};
-
 /* ========================================================================= */
 /*                    UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR                      */
 /* ========================================================================= */
@@ -1262,101 +987,6 @@ union uvh_rtc1_int_config_u {
     } s;
 };
 
-/* ========================================================================= */
-/*                           UVH_RTC2_INT_CONFIG                             */
-/* ========================================================================= */
-#define UVH_RTC2_INT_CONFIG 0x61600UL
-
-#define UVH_RTC2_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC2_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC2_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC2_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC2_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC2_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC2_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC2_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC2_INT_CONFIG_P_SHFT 13
-#define UVH_RTC2_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC2_INT_CONFIG_T_SHFT 15
-#define UVH_RTC2_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC2_INT_CONFIG_M_SHFT 16
-#define UVH_RTC2_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC2_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC2_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_rtc2_int_config_u {
-    unsigned long	v;
-    struct uvh_rtc2_int_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                           UVH_RTC3_INT_CONFIG                             */
-/* ========================================================================= */
-#define UVH_RTC3_INT_CONFIG 0x61640UL
-
-#define UVH_RTC3_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC3_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC3_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC3_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC3_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC3_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC3_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC3_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC3_INT_CONFIG_P_SHFT 13
-#define UVH_RTC3_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC3_INT_CONFIG_T_SHFT 15
-#define UVH_RTC3_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC3_INT_CONFIG_M_SHFT 16
-#define UVH_RTC3_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC3_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC3_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
-
-union uvh_rtc3_int_config_u {
-    unsigned long	v;
-    struct uvh_rtc3_int_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
-};
-
-/* ========================================================================= */
-/*                            UVH_RTC_INC_RATIO                              */
-/* ========================================================================= */
-#define UVH_RTC_INC_RATIO 0x350000UL
-
-#define UVH_RTC_INC_RATIO_FRACTION_SHFT 0
-#define UVH_RTC_INC_RATIO_FRACTION_MASK 0x00000000000fffffUL
-#define UVH_RTC_INC_RATIO_RATIO_SHFT 20
-#define UVH_RTC_INC_RATIO_RATIO_MASK 0x0000000000700000UL
-
-union uvh_rtc_inc_ratio_u {
-    unsigned long	v;
-    struct uvh_rtc_inc_ratio_s {
-	unsigned long	fraction : 20;  /* RW */
-	unsigned long	ratio    :  3;  /* RW */
-	unsigned long	rsvd_23_63: 41;  /*    */
-    } s;
-};
-
 /* ========================================================================= */
 /*                          UVH_SI_ADDR_MAP_CONFIG                           */
 /* ========================================================================= */
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 364d015efeb..ef68ba48564 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -20,6 +20,8 @@
 #include <asm/tsc.h>
 #include <asm/irq_vectors.h>
 
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD	0x000000000bUL
+
 static struct bau_control	**uv_bau_table_bases __read_mostly;
 static int			uv_bau_retry_limit __read_mostly;
 
@@ -478,16 +480,16 @@ static void uv_enable_timeouts(void)
 		 * To program the period, the SOFT_ACK_MODE must be off.
 		 */
 		mmr_image &= ~((unsigned long)1 <<
-			       UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
 		uv_write_global_mmr64
 		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 		/*
 		 * Set the 4-bit period.
 		 */
 		mmr_image &= ~((unsigned long)0xf <<
-			UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
 		mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
-			     UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
 		uv_write_global_mmr64
 		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 		/*
@@ -496,7 +498,7 @@ static void uv_enable_timeouts(void)
 		 * indicated in bits 2:0 (7 causes all of them to timeout).
 		 */
 		mmr_image |= ((unsigned long)1 <<
-			      UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
 		uv_write_global_mmr64
 		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 	}
-- 
cgit v1.2.3


From 41acab8851a0408c1d5ad6c21a07456f88b54d40 Mon Sep 17 00:00:00 2001
From: Lucas De Marchi <lucas.de.marchi@gmail.com>
Date: Wed, 10 Mar 2010 23:37:45 -0300
Subject: sched: Implement group scheduler statistics in one struct

Put all statistic fields of sched_entity in one struct, sched_statistics,
and embed it into sched_entity.

This change allows to memset the sched_statistics to 0 when needed (for
instance when forking), avoiding bugs of non initialized fields.

Signed-off-by: Lucas De Marchi <lucas.de.marchi@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268275065-18542-1-git-send-email-lucas.de.marchi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  54 +++++++++++++--------------
 kernel/sched.c        |  47 +++++------------------
 kernel/sched_debug.c  | 101 +++++++++++++++++++-------------------------------
 kernel/sched_fair.c   |  65 ++++++++++++++++----------------
 kernel/sched_rt.c     |   2 +-
 5 files changed, 106 insertions(+), 163 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4b1753f7e48..8cc863d6647 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1127,36 +1127,8 @@ struct load_weight {
 	unsigned long weight, inv_weight;
 };
 
-/*
- * CFS stats for a schedulable entity (task, task-group etc)
- *
- * Current field usage histogram:
- *
- *     4 se->block_start
- *     4 se->run_node
- *     4 se->sleep_start
- *     6 se->load.weight
- */
-struct sched_entity {
-	struct load_weight	load;		/* for load-balancing */
-	struct rb_node		run_node;
-	struct list_head	group_node;
-	unsigned int		on_rq;
-
-	u64			exec_start;
-	u64			sum_exec_runtime;
-	u64			vruntime;
-	u64			prev_sum_exec_runtime;
-
-	u64			last_wakeup;
-	u64			avg_overlap;
-
-	u64			nr_migrations;
-
-	u64			start_runtime;
-	u64			avg_wakeup;
-
 #ifdef CONFIG_SCHEDSTATS
+struct sched_statistics {
 	u64			wait_start;
 	u64			wait_max;
 	u64			wait_count;
@@ -1188,6 +1160,30 @@ struct sched_entity {
 	u64			nr_wakeups_affine_attempts;
 	u64			nr_wakeups_passive;
 	u64			nr_wakeups_idle;
+};
+#endif
+
+struct sched_entity {
+	struct load_weight	load;		/* for load-balancing */
+	struct rb_node		run_node;
+	struct list_head	group_node;
+	unsigned int		on_rq;
+
+	u64			exec_start;
+	u64			sum_exec_runtime;
+	u64			vruntime;
+	u64			prev_sum_exec_runtime;
+
+	u64			last_wakeup;
+	u64			avg_overlap;
+
+	u64			nr_migrations;
+
+	u64			start_runtime;
+	u64			avg_wakeup;
+
+#ifdef CONFIG_SCHEDSTATS
+	struct sched_statistics statistics;
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched.c b/kernel/sched.c
index 2c1db81f80e..a4aa071f08f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2437,15 +2437,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 
 out_activate:
 #endif /* CONFIG_SMP */
-	schedstat_inc(p, se.nr_wakeups);
+	schedstat_inc(p, se.statistics.nr_wakeups);
 	if (wake_flags & WF_SYNC)
-		schedstat_inc(p, se.nr_wakeups_sync);
+		schedstat_inc(p, se.statistics.nr_wakeups_sync);
 	if (orig_cpu != cpu)
-		schedstat_inc(p, se.nr_wakeups_migrate);
+		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
 	if (cpu == this_cpu)
-		schedstat_inc(p, se.nr_wakeups_local);
+		schedstat_inc(p, se.statistics.nr_wakeups_local);
 	else
-		schedstat_inc(p, se.nr_wakeups_remote);
+		schedstat_inc(p, se.statistics.nr_wakeups_remote);
 	activate_task(rq, p, 1);
 	success = 1;
 
@@ -2532,36 +2532,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.avg_wakeup		= sysctl_sched_wakeup_granularity;
 
 #ifdef CONFIG_SCHEDSTATS
-	p->se.wait_start			= 0;
-	p->se.wait_max				= 0;
-	p->se.wait_count			= 0;
-	p->se.wait_sum				= 0;
-
-	p->se.sleep_start			= 0;
-	p->se.sleep_max				= 0;
-	p->se.sum_sleep_runtime			= 0;
-
-	p->se.block_start			= 0;
-	p->se.block_max				= 0;
-	p->se.exec_max				= 0;
-	p->se.slice_max				= 0;
-
-	p->se.nr_migrations_cold		= 0;
-	p->se.nr_failed_migrations_affine	= 0;
-	p->se.nr_failed_migrations_running	= 0;
-	p->se.nr_failed_migrations_hot		= 0;
-	p->se.nr_forced_migrations		= 0;
-
-	p->se.nr_wakeups			= 0;
-	p->se.nr_wakeups_sync			= 0;
-	p->se.nr_wakeups_migrate		= 0;
-	p->se.nr_wakeups_local			= 0;
-	p->se.nr_wakeups_remote			= 0;
-	p->se.nr_wakeups_affine			= 0;
-	p->se.nr_wakeups_affine_attempts	= 0;
-	p->se.nr_wakeups_passive		= 0;
-	p->se.nr_wakeups_idle			= 0;
-
+	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
 	INIT_LIST_HEAD(&p->rt.run_list);
@@ -7910,9 +7881,9 @@ void normalize_rt_tasks(void)
 
 		p->se.exec_start		= 0;
 #ifdef CONFIG_SCHEDSTATS
-		p->se.wait_start		= 0;
-		p->se.sleep_start		= 0;
-		p->se.block_start		= 0;
+		p->se.statistics.wait_start	= 0;
+		p->se.statistics.sleep_start	= 0;
+		p->se.statistics.block_start	= 0;
 #endif
 
 		if (!rt_task(p)) {
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 67f95aada4b..ad9df442276 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
 	PN(se->vruntime);
 	PN(se->sum_exec_runtime);
 #ifdef CONFIG_SCHEDSTATS
-	PN(se->wait_start);
-	PN(se->sleep_start);
-	PN(se->block_start);
-	PN(se->sleep_max);
-	PN(se->block_max);
-	PN(se->exec_max);
-	PN(se->slice_max);
-	PN(se->wait_max);
-	PN(se->wait_sum);
-	P(se->wait_count);
+	PN(se->statistics.wait_start);
+	PN(se->statistics.sleep_start);
+	PN(se->statistics.block_start);
+	PN(se->statistics.sleep_max);
+	PN(se->statistics.block_max);
+	PN(se->statistics.exec_max);
+	PN(se->statistics.slice_max);
+	PN(se->statistics.wait_max);
+	PN(se->statistics.wait_sum);
+	P(se->statistics.wait_count);
 #endif
 	P(se->load.weight);
 #undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
 		SPLIT_NS(p->se.vruntime),
 		SPLIT_NS(p->se.sum_exec_runtime),
-		SPLIT_NS(p->se.sum_sleep_runtime));
+		SPLIT_NS(p->se.statistics.sum_sleep_runtime));
 #else
 	SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
 		0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -413,34 +413,34 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	nr_switches = p->nvcsw + p->nivcsw;
 
 #ifdef CONFIG_SCHEDSTATS
-	PN(se.wait_start);
-	PN(se.sleep_start);
-	PN(se.block_start);
-	PN(se.sleep_max);
-	PN(se.block_max);
-	PN(se.exec_max);
-	PN(se.slice_max);
-	PN(se.wait_max);
-	PN(se.wait_sum);
-	P(se.wait_count);
-	PN(se.iowait_sum);
-	P(se.iowait_count);
+	PN(se.statistics.wait_start);
+	PN(se.statistics.sleep_start);
+	PN(se.statistics.block_start);
+	PN(se.statistics.sleep_max);
+	PN(se.statistics.block_max);
+	PN(se.statistics.exec_max);
+	PN(se.statistics.slice_max);
+	PN(se.statistics.wait_max);
+	PN(se.statistics.wait_sum);
+	P(se.statistics.wait_count);
+	PN(se.statistics.iowait_sum);
+	P(se.statistics.iowait_count);
 	P(sched_info.bkl_count);
 	P(se.nr_migrations);
-	P(se.nr_migrations_cold);
-	P(se.nr_failed_migrations_affine);
-	P(se.nr_failed_migrations_running);
-	P(se.nr_failed_migrations_hot);
-	P(se.nr_forced_migrations);
-	P(se.nr_wakeups);
-	P(se.nr_wakeups_sync);
-	P(se.nr_wakeups_migrate);
-	P(se.nr_wakeups_local);
-	P(se.nr_wakeups_remote);
-	P(se.nr_wakeups_affine);
-	P(se.nr_wakeups_affine_attempts);
-	P(se.nr_wakeups_passive);
-	P(se.nr_wakeups_idle);
+	P(se.statistics.nr_migrations_cold);
+	P(se.statistics.nr_failed_migrations_affine);
+	P(se.statistics.nr_failed_migrations_running);
+	P(se.statistics.nr_failed_migrations_hot);
+	P(se.statistics.nr_forced_migrations);
+	P(se.statistics.nr_wakeups);
+	P(se.statistics.nr_wakeups_sync);
+	P(se.statistics.nr_wakeups_migrate);
+	P(se.statistics.nr_wakeups_local);
+	P(se.statistics.nr_wakeups_remote);
+	P(se.statistics.nr_wakeups_affine);
+	P(se.statistics.nr_wakeups_affine_attempts);
+	P(se.statistics.nr_wakeups_passive);
+	P(se.statistics.nr_wakeups_idle);
 
 	{
 		u64 avg_atom, avg_per_cpu;
@@ -491,32 +491,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 void proc_sched_set_task(struct task_struct *p)
 {
 #ifdef CONFIG_SCHEDSTATS
-	p->se.wait_max				= 0;
-	p->se.wait_sum				= 0;
-	p->se.wait_count			= 0;
-	p->se.iowait_sum			= 0;
-	p->se.iowait_count			= 0;
-	p->se.sleep_max				= 0;
-	p->se.sum_sleep_runtime			= 0;
-	p->se.block_max				= 0;
-	p->se.exec_max				= 0;
-	p->se.slice_max				= 0;
-	p->se.nr_migrations			= 0;
-	p->se.nr_migrations_cold		= 0;
-	p->se.nr_failed_migrations_affine	= 0;
-	p->se.nr_failed_migrations_running	= 0;
-	p->se.nr_failed_migrations_hot		= 0;
-	p->se.nr_forced_migrations		= 0;
-	p->se.nr_wakeups			= 0;
-	p->se.nr_wakeups_sync			= 0;
-	p->se.nr_wakeups_migrate		= 0;
-	p->se.nr_wakeups_local			= 0;
-	p->se.nr_wakeups_remote			= 0;
-	p->se.nr_wakeups_affine			= 0;
-	p->se.nr_wakeups_affine_attempts	= 0;
-	p->se.nr_wakeups_passive		= 0;
-	p->se.nr_wakeups_idle			= 0;
-	p->sched_info.bkl_count			= 0;
+	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 	p->se.sum_exec_runtime			= 0;
 	p->se.prev_sum_exec_runtime		= 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3e1fd96c6cf..8ad164bbdac 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 {
 	unsigned long delta_exec_weighted;
 
-	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
+	schedstat_set(curr->statistics.exec_max,
+		      max((u64)delta_exec, curr->statistics.exec_max));
 
 	curr->sum_exec_runtime += delta_exec;
 	schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
+	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
 }
 
 /*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	schedstat_set(se->wait_max, max(se->wait_max,
-			rq_of(cfs_rq)->clock - se->wait_start));
-	schedstat_set(se->wait_count, se->wait_count + 1);
-	schedstat_set(se->wait_sum, se->wait_sum +
-			rq_of(cfs_rq)->clock - se->wait_start);
+	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
+			rq_of(cfs_rq)->clock - se->statistics.wait_start));
+	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
+	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
+			rq_of(cfs_rq)->clock - se->statistics.wait_start);
 #ifdef CONFIG_SCHEDSTATS
 	if (entity_is_task(se)) {
 		trace_sched_stat_wait(task_of(se),
-			rq_of(cfs_rq)->clock - se->wait_start);
+			rq_of(cfs_rq)->clock - se->statistics.wait_start);
 	}
 #endif
-	schedstat_set(se->wait_start, 0);
+	schedstat_set(se->statistics.wait_start, 0);
 }
 
 static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	if (entity_is_task(se))
 		tsk = task_of(se);
 
-	if (se->sleep_start) {
-		u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+	if (se->statistics.sleep_start) {
+		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
 
 		if ((s64)delta < 0)
 			delta = 0;
 
-		if (unlikely(delta > se->sleep_max))
-			se->sleep_max = delta;
+		if (unlikely(delta > se->statistics.sleep_max))
+			se->statistics.sleep_max = delta;
 
-		se->sleep_start = 0;
-		se->sum_sleep_runtime += delta;
+		se->statistics.sleep_start = 0;
+		se->statistics.sum_sleep_runtime += delta;
 
 		if (tsk) {
 			account_scheduler_latency(tsk, delta >> 10, 1);
 			trace_sched_stat_sleep(tsk, delta);
 		}
 	}
-	if (se->block_start) {
-		u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+	if (se->statistics.block_start) {
+		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
 
 		if ((s64)delta < 0)
 			delta = 0;
 
-		if (unlikely(delta > se->block_max))
-			se->block_max = delta;
+		if (unlikely(delta > se->statistics.block_max))
+			se->statistics.block_max = delta;
 
-		se->block_start = 0;
-		se->sum_sleep_runtime += delta;
+		se->statistics.block_start = 0;
+		se->statistics.sum_sleep_runtime += delta;
 
 		if (tsk) {
 			if (tsk->in_iowait) {
-				se->iowait_sum += delta;
-				se->iowait_count++;
+				se->statistics.iowait_sum += delta;
+				se->statistics.iowait_count++;
 				trace_sched_stat_iowait(tsk, delta);
 			}
 
@@ -826,9 +827,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 			struct task_struct *tsk = task_of(se);
 
 			if (tsk->state & TASK_INTERRUPTIBLE)
-				se->sleep_start = rq_of(cfs_rq)->clock;
+				se->statistics.sleep_start = rq_of(cfs_rq)->clock;
 			if (tsk->state & TASK_UNINTERRUPTIBLE)
-				se->block_start = rq_of(cfs_rq)->clock;
+				se->statistics.block_start = rq_of(cfs_rq)->clock;
 		}
 #endif
 	}
@@ -912,7 +913,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 * when there are only lesser-weight tasks around):
 	 */
 	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
-		se->slice_max = max(se->slice_max,
+		se->statistics.slice_max = max(se->statistics.slice_max,
 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
 	}
 #endif
@@ -1306,7 +1307,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	if (sync && balanced)
 		return 1;
 
-	schedstat_inc(p, se.nr_wakeups_affine_attempts);
+	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
 
 	if (balanced ||
@@ -1318,7 +1319,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 		 * there is no bad imbalance.
 		 */
 		schedstat_inc(sd, ttwu_move_affine);
-		schedstat_inc(p, se.nr_wakeups_affine);
+		schedstat_inc(p, se.statistics.nr_wakeups_affine);
 
 		return 1;
 	}
@@ -1844,13 +1845,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 3) are cache-hot on their current CPU.
 	 */
 	if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
-		schedstat_inc(p, se.nr_failed_migrations_affine);
+		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 		return 0;
 	}
 	*all_pinned = 0;
 
 	if (task_running(rq, p)) {
-		schedstat_inc(p, se.nr_failed_migrations_running);
+		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
 		return 0;
 	}
 
@@ -1866,14 +1867,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 #ifdef CONFIG_SCHEDSTATS
 		if (tsk_cache_hot) {
 			schedstat_inc(sd, lb_hot_gained[idle]);
-			schedstat_inc(p, se.nr_forced_migrations);
+			schedstat_inc(p, se.statistics.nr_forced_migrations);
 		}
 #endif
 		return 1;
 	}
 
 	if (tsk_cache_hot) {
-		schedstat_inc(p, se.nr_failed_migrations_hot);
+		schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
 		return 0;
 	}
 	return 1;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c4fb42a66ca..0335e87f520 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
 	if (unlikely((s64)delta_exec < 0))
 		delta_exec = 0;
 
-	schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
+	schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
 
 	curr->se.sum_exec_runtime += delta_exec;
 	account_group_exec_runtime(curr, delta_exec);
-- 
cgit v1.2.3


From 9b33fa6ba0e2f90fdf407501db801c2511121564 Mon Sep 17 00:00:00 2001
From: "eranian@google.com" <eranian@google.com>
Date: Wed, 10 Mar 2010 22:26:05 -0800
Subject: perf_events: Improve task_sched_in()

This patch is an optimization in perf_event_task_sched_in() to avoid
scheduling the events twice in a row.

Without it, the perf_disable()/perf_enable() pair is invoked twice,
thereby pinned events counts while scheduling flexible events and we go
throuh hw_perf_enable() twice.

By encapsulating, the whole sequence into perf_disable()/perf_enable() we
ensure, hw_perf_enable() is going to be invoked only once because of the
refcount protection.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268288765-5326-1-git-send-email-eranian@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 52c69a34d69..3853d49c7d5 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1368,6 +1368,8 @@ void perf_event_task_sched_in(struct task_struct *task)
 	if (cpuctx->task_ctx == ctx)
 		return;
 
+	perf_disable();
+
 	/*
 	 * We want to keep the following priority order:
 	 * cpu pinned (that don't need to move), task pinned,
@@ -1380,6 +1382,8 @@ void perf_event_task_sched_in(struct task_struct *task)
 	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
 
 	cpuctx->task_ctx = ctx;
+
+	perf_enable();
 }
 
 #define MAX_INTERRUPTS (~0ULL)
-- 
cgit v1.2.3


From 39c0cbe2150cbd848a25ba6cdb271d1ad46818ad Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:17:13 +0100
Subject: sched: Rate-limit nohz

Entering nohz code on every micro-idle is costing ~10% throughput for netperf
TCP_RR when scheduling cross-cpu.  Rate limiting entry fixes this, but raises
ticks a bit.  On my Q6600, an idle box goes from ~85 interrupts/sec to 128.

The higher the context switch rate, the more nohz entry costs.  With this patch
and some cycle recovery patches in my tree, max cross cpu context switch rate is
improved by ~16%, a large portion of which of which is this ratelimiting.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301003.6785.28.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h    |  6 ++++++
 kernel/sched.c           | 12 ++++++++++++
 kernel/time/tick-sched.c |  3 +++
 3 files changed, 21 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8cc863d6647..13efe7dac5f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -271,11 +271,17 @@ extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
 extern int get_nohz_load_balancer(void);
+extern int nohz_ratelimit(int cpu);
 #else
 static inline int select_nohz_load_balancer(int cpu)
 {
 	return 0;
 }
+
+static inline int nohz_ratelimit(int cpu)
+{
+	return 0;
+}
 #endif
 
 /*
diff --git a/kernel/sched.c b/kernel/sched.c
index a4aa071f08f..60b1bbe2ad1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -492,6 +492,7 @@ struct rq {
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 #ifdef CONFIG_NO_HZ
+	u64 nohz_stamp;
 	unsigned char in_nohz_recently;
 #endif
 	/* capture load from *all* tasks on this cpu: */
@@ -1228,6 +1229,17 @@ void wake_up_idle_cpu(int cpu)
 	if (!tsk_is_polling(rq->idle))
 		smp_send_reschedule(cpu);
 }
+
+int nohz_ratelimit(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	u64 diff = rq->clock - rq->nohz_stamp;
+
+	rq->nohz_stamp = rq->clock;
+
+	return diff < (NSEC_PER_SEC / HZ) >> 1;
+}
+
 #endif /* CONFIG_NO_HZ */
 
 static u64 sched_avg_period(void)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762d7f5..f25735a767a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -262,6 +262,9 @@ void tick_nohz_stop_sched_tick(int inidle)
 		goto end;
 	}
 
+	if (nohz_ratelimit(cpu))
+		goto end;
+
 	ts->idle_calls++;
 	/* Read jiffies and the time when jiffies were updated last */
 	do {
-- 
cgit v1.2.3


From b42e0c41a422a212ddea0666d5a3a0e3c35206db Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:15:38 +0100
Subject: sched: Remove avg_wakeup

Testing the load which led to this heuristic (nfs4 kbuild) shows that it has
outlived it's usefullness.  With intervening load balancing changes, I cannot
see any difference with/without, so recover there fastpath cycles.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301062.6785.29.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |  3 ---
 kernel/sched.c          | 26 ++++----------------------
 kernel/sched_debug.c    |  1 -
 kernel/sched_fair.c     | 31 -------------------------------
 kernel/sched_features.h |  6 ------
 5 files changed, 4 insertions(+), 63 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 13efe7dac5f..70c560f5ada 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1185,9 +1185,6 @@ struct sched_entity {
 
 	u64			nr_migrations;
 
-	u64			start_runtime;
-	u64			avg_wakeup;
-
 #ifdef CONFIG_SCHEDSTATS
 	struct sched_statistics statistics;
 #endif
diff --git a/kernel/sched.c b/kernel/sched.c
index 60b1bbe2ad1..35a8626ace7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1880,9 +1880,6 @@ static void update_avg(u64 *avg, u64 sample)
 static void
 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
-	if (wakeup)
-		p->se.start_runtime = p->se.sum_exec_runtime;
-
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, wakeup, head);
 	p->se.on_rq = 1;
@@ -1890,17 +1887,11 @@ enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
-	if (sleep) {
-		if (p->se.last_wakeup) {
-			update_avg(&p->se.avg_overlap,
-				p->se.sum_exec_runtime - p->se.last_wakeup);
-			p->se.last_wakeup = 0;
-		} else {
-			update_avg(&p->se.avg_wakeup,
-				sysctl_sched_wakeup_granularity);
-		}
+	if (sleep && p->se.last_wakeup) {
+		update_avg(&p->se.avg_overlap,
+			p->se.sum_exec_runtime - p->se.last_wakeup);
+		p->se.last_wakeup = 0;
 	}
-
 	sched_info_dequeued(p);
 	p->sched_class->dequeue_task(rq, p, sleep);
 	p->se.on_rq = 0;
@@ -2466,13 +2457,6 @@ out_activate:
 	 */
 	if (!in_interrupt()) {
 		struct sched_entity *se = &current->se;
-		u64 sample = se->sum_exec_runtime;
-
-		if (se->last_wakeup)
-			sample -= se->last_wakeup;
-		else
-			sample -= se->start_runtime;
-		update_avg(&se->avg_wakeup, sample);
 
 		se->last_wakeup = se->sum_exec_runtime;
 	}
@@ -2540,8 +2524,6 @@ static void __sched_fork(struct task_struct *p)
 	p->se.nr_migrations		= 0;
 	p->se.last_wakeup		= 0;
 	p->se.avg_overlap		= 0;
-	p->se.start_runtime		= 0;
-	p->se.avg_wakeup		= sysctl_sched_wakeup_granularity;
 
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index ad9df442276..20b95a420fe 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -408,7 +408,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.vruntime);
 	PN(se.sum_exec_runtime);
 	PN(se.avg_overlap);
-	PN(se.avg_wakeup);
 
 	nr_switches = p->nvcsw + p->nivcsw;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8ad164bbdac..6fc62854422 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1592,42 +1592,11 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 }
 #endif /* CONFIG_SMP */
 
-/*
- * Adaptive granularity
- *
- * se->avg_wakeup gives the average time a task runs until it does a wakeup,
- * with the limit of wakeup_gran -- when it never does a wakeup.
- *
- * So the smaller avg_wakeup is the faster we want this task to preempt,
- * but we don't want to treat the preemptee unfairly and therefore allow it
- * to run for at least the amount of time we'd like to run.
- *
- * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
- *
- * NOTE: we use *nr_running to scale with load, this nicely matches the
- *       degrading latency on load.
- */
-static unsigned long
-adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
-{
-	u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
-	u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
-	u64 gran = 0;
-
-	if (this_run < expected_wakeup)
-		gran = expected_wakeup - this_run;
-
-	return min_t(s64, gran, sysctl_sched_wakeup_granularity);
-}
-
 static unsigned long
 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
 {
 	unsigned long gran = sysctl_sched_wakeup_granularity;
 
-	if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
-		gran = adaptive_gran(curr, se);
-
 	/*
 	 * Since its curr running now, convert the gran from real-time
 	 * to virtual-time in his units.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d..96ef5dbc66e 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -30,12 +30,6 @@ SCHED_FEAT(START_DEBIT, 1)
  */
 SCHED_FEAT(WAKEUP_PREEMPT, 1)
 
-/*
- * Compute wakeup_gran based on task behaviour, clipped to
- *  [0, sched_wakeup_gran_ns]
- */
-SCHED_FEAT(ADAPTIVE_GRAN, 1)
-
 /*
  * When converting the wakeup granularity to virtual time, do it such
  * that heavier tasks preempting a lighter task have an edge.
-- 
cgit v1.2.3


From e12f31d3e5d36328c7fbd0fce40a95e70b59152c Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:15:51 +0100
Subject: sched: Remove avg_overlap

Both avg_overlap and avg_wakeup had an inherent problem in that their accuracy
was detrimentally affected by cross-cpu wakeups, this because we are missing
the necessary call to update_curr().  This can't be fixed without increasing
overhead in our already too fat fastpath.

Additionally, with recent load balancing changes making us prefer to place tasks
in an idle cache domain (which is good for compute bound loads), communicating
tasks suffer when a sync wakeup, which would enable affine placement, is turned
into a non-sync wakeup by SYNC_LESS.  With one task on the runqueue, wake_affine()
rejects the affine wakeup request, leaving the unfortunate where placed, taking
frequent cache misses.

Remove it, and recover some fastpath cycles.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301121.6785.30.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |  3 ---
 kernel/sched.c          | 33 ---------------------------------
 kernel/sched_debug.c    |  1 -
 kernel/sched_fair.c     | 18 ------------------
 kernel/sched_features.h | 16 ----------------
 5 files changed, 71 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 70c560f5ada..8604884cee8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1180,9 +1180,6 @@ struct sched_entity {
 	u64			vruntime;
 	u64			prev_sum_exec_runtime;
 
-	u64			last_wakeup;
-	u64			avg_overlap;
-
 	u64			nr_migrations;
 
 #ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/sched.c b/kernel/sched.c
index 35a8626ace7..68ed6f4f3c1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1887,11 +1887,6 @@ enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
-	if (sleep && p->se.last_wakeup) {
-		update_avg(&p->se.avg_overlap,
-			p->se.sum_exec_runtime - p->se.last_wakeup);
-		p->se.last_wakeup = 0;
-	}
 	sched_info_dequeued(p);
 	p->sched_class->dequeue_task(rq, p, sleep);
 	p->se.on_rq = 0;
@@ -2452,15 +2447,6 @@ out_activate:
 	activate_task(rq, p, 1);
 	success = 1;
 
-	/*
-	 * Only attribute actual wakeups done by this task.
-	 */
-	if (!in_interrupt()) {
-		struct sched_entity *se = &current->se;
-
-		se->last_wakeup = se->sum_exec_runtime;
-	}
-
 out_running:
 	trace_sched_wakeup(rq, p, success);
 	check_preempt_curr(rq, p, wake_flags);
@@ -2522,8 +2508,6 @@ static void __sched_fork(struct task_struct *p)
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
-	p->se.last_wakeup		= 0;
-	p->se.avg_overlap		= 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -3594,23 +3578,6 @@ static inline void schedule_debug(struct task_struct *prev)
 
 static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
-	if (prev->state == TASK_RUNNING) {
-		u64 runtime = prev->se.sum_exec_runtime;
-
-		runtime -= prev->se.prev_sum_exec_runtime;
-		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
-
-		/*
-		 * In order to avoid avg_overlap growing stale when we are
-		 * indeed overlapping and hence not getting put to sleep, grow
-		 * the avg_overlap on preemption.
-		 *
-		 * We use the average preemption runtime because that
-		 * correlates to the amount of cache footprint a task can
-		 * build up.
-		 */
-		update_avg(&prev->se.avg_overlap, runtime);
-	}
 	prev->sched_class->put_prev_task(rq, prev);
 }
 
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 20b95a420fe..8a46a719f36 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -407,7 +407,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.exec_start);
 	PN(se.vruntime);
 	PN(se.sum_exec_runtime);
-	PN(se.avg_overlap);
 
 	nr_switches = p->nvcsw + p->nivcsw;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6fc62854422..c3b69d4b5d6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1241,7 +1241,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
-	struct task_struct *curr = current;
 	unsigned long this_load, load;
 	int idx, this_cpu, prev_cpu;
 	unsigned long tl_per_task;
@@ -1256,18 +1255,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	load	  = source_load(prev_cpu, idx);
 	this_load = target_load(this_cpu, idx);
 
-	if (sync) {
-	       if (sched_feat(SYNC_LESS) &&
-		   (curr->se.avg_overlap > sysctl_sched_migration_cost ||
-		    p->se.avg_overlap > sysctl_sched_migration_cost))
-		       sync = 0;
-	} else {
-		if (sched_feat(SYNC_MORE) &&
-		    (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-		     p->se.avg_overlap < sysctl_sched_migration_cost))
-			sync = 1;
-	}
-
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
 	 * effect of the currently running task from the load
@@ -1711,11 +1698,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (sched_feat(WAKEUP_SYNC) && sync)
 		goto preempt;
 
-	if (sched_feat(WAKEUP_OVERLAP) &&
-			se->avg_overlap < sysctl_sched_migration_cost &&
-			pse->avg_overlap < sysctl_sched_migration_cost)
-		goto preempt;
-
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 96ef5dbc66e..c545e048dfe 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -41,12 +41,6 @@ SCHED_FEAT(ASYM_GRAN, 1)
  */
 SCHED_FEAT(WAKEUP_SYNC, 0)
 
-/*
- * Wakeup preempt based on task behaviour. Tasks that do not overlap
- * don't get preempted.
- */
-SCHED_FEAT(WAKEUP_OVERLAP, 0)
-
 /*
  * Use the SYNC wakeup hint, pipes and the likes use this to indicate
  * the remote end is likely to consume the data we just wrote, and
@@ -63,16 +57,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
  */
 SCHED_FEAT(AFFINE_WAKEUPS, 1)
 
-/*
- * Weaken SYNC hint based on overlap
- */
-SCHED_FEAT(SYNC_LESS, 1)
-
-/*
- * Add SYNC hint based on overlap
- */
-SCHED_FEAT(SYNC_MORE, 0)
-
 /*
  * Prefer to schedule the task we woke last (assuming it failed
  * wakeup-preemption), since its likely going to consume data we
-- 
cgit v1.2.3


From a64692a3afd85fe048551ab89142fd5ca99a0dbd Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:16:20 +0100
Subject: sched: Cleanup/optimize clock updates

Now that we no longer depend on the clock being updated prior to enqueueing
on migratory wakeup, we can clean up a bit, placing calls to update_rq_clock()
exactly where they are needed, ie on enqueue, dequeue and schedule events.

In the case of a freshly enqueued task immediately preempting, we can skip the
update during preemption, as the clock was just updated by the enqueue event.
We also save an unneeded call during a migratory wakeup by not updating the
previous runqueue, where update_curr() won't be invoked.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301199.6785.32.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 32 ++++++++++++++++----------------
 kernel/sched_fair.c |  2 --
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 68ed6f4f3c1..16559de4ede 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -495,6 +495,8 @@ struct rq {
 	u64 nohz_stamp;
 	unsigned char in_nohz_recently;
 #endif
+	unsigned int skip_clock_update;
+
 	/* capture load from *all* tasks on this cpu: */
 	struct load_weight load;
 	unsigned long nr_load_updates;
@@ -592,6 +594,13 @@ static inline
 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 {
 	rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+
+	/*
+	 * A queue event has occurred, and we're going to schedule.  In
+	 * this case, we can save a useless back to back clock update.
+	 */
+	if (test_tsk_need_resched(p))
+		rq->skip_clock_update = 1;
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -626,7 +635,8 @@ static inline int cpu_of(struct rq *rq)
 
 inline void update_rq_clock(struct rq *rq)
 {
-	rq->clock = sched_clock_cpu(cpu_of(rq));
+	if (!rq->skip_clock_update)
+		rq->clock = sched_clock_cpu(cpu_of(rq));
 }
 
 /*
@@ -1782,8 +1792,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
 			raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
 		}
 	}
-	update_rq_clock(rq1);
-	update_rq_clock(rq2);
 }
 
 /*
@@ -1880,6 +1888,7 @@ static void update_avg(u64 *avg, u64 sample)
 static void
 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
+	update_rq_clock(rq);
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, wakeup, head);
 	p->se.on_rq = 1;
@@ -1887,6 +1896,7 @@ enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
+	update_rq_clock(rq);
 	sched_info_dequeued(p);
 	p->sched_class->dequeue_task(rq, p, sleep);
 	p->se.on_rq = 0;
@@ -2366,7 +2376,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 
 	smp_wmb();
 	rq = task_rq_lock(p, &flags);
-	update_rq_clock(rq);
 	if (!(p->state & state))
 		goto out;
 
@@ -2407,7 +2416,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 
 	rq = cpu_rq(cpu);
 	raw_spin_lock(&rq->lock);
-	update_rq_clock(rq);
 
 	/*
 	 * We migrated the task without holding either rq->lock, however
@@ -2624,7 +2632,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 
 	BUG_ON(p->state != TASK_WAKING);
 	p->state = TASK_RUNNING;
-	update_rq_clock(rq);
 	activate_task(rq, p, 0);
 	trace_sched_wakeup_new(rq, p, 1);
 	check_preempt_curr(rq, p, WF_FORK);
@@ -3578,6 +3585,9 @@ static inline void schedule_debug(struct task_struct *prev)
 
 static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
+	if (prev->se.on_rq)
+		update_rq_clock(rq);
+	rq->skip_clock_update = 0;
 	prev->sched_class->put_prev_task(rq, prev);
 }
 
@@ -3640,7 +3650,6 @@ need_resched_nonpreemptible:
 		hrtick_clear(rq);
 
 	raw_spin_lock_irq(&rq->lock);
-	update_rq_clock(rq);
 	clear_tsk_need_resched(prev);
 
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -4197,7 +4206,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	BUG_ON(prio < 0 || prio > MAX_PRIO);
 
 	rq = task_rq_lock(p, &flags);
-	update_rq_clock(rq);
 
 	oldprio = p->prio;
 	prev_class = p->sched_class;
@@ -4240,7 +4248,6 @@ void set_user_nice(struct task_struct *p, long nice)
 	 * the task might be in the middle of scheduling on another CPU.
 	 */
 	rq = task_rq_lock(p, &flags);
-	update_rq_clock(rq);
 	/*
 	 * The RT priorities are set via sched_setscheduler(), but we still
 	 * allow the 'normal' nice value to be set - but as expected
@@ -4523,7 +4530,6 @@ recheck:
 		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 		goto recheck;
 	}
-	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
 	if (on_rq)
@@ -5530,7 +5536,6 @@ void sched_idle_next(void)
 
 	__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
 
-	update_rq_clock(rq);
 	activate_task(rq, p, 0);
 
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5585,7 +5590,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 	for ( ; ; ) {
 		if (!rq->nr_running)
 			break;
-		update_rq_clock(rq);
 		next = pick_next_task(rq);
 		if (!next)
 			break;
@@ -5869,7 +5873,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		rq->migration_thread = NULL;
 		/* Idle task back to normal (off runqueue, low prio) */
 		raw_spin_lock_irq(&rq->lock);
-		update_rq_clock(rq);
 		deactivate_task(rq, rq->idle, 0);
 		__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
 		rq->idle->sched_class = &idle_sched_class;
@@ -7815,7 +7818,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 {
 	int on_rq;
 
-	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
 	if (on_rq)
 		deactivate_task(rq, p, 0);
@@ -8177,8 +8179,6 @@ void sched_move_task(struct task_struct *tsk)
 
 	rq = task_rq_lock(tsk, &flags);
 
-	update_rq_clock(rq);
-
 	running = task_current(rq, tsk);
 	on_rq = tsk->se.on_rq;
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c3b69d4b5d6..69e582020ff 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3064,8 +3064,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 
 	/* move a task from busiest_rq to target_rq */
 	double_lock_balance(busiest_rq, target_rq);
-	update_rq_clock(busiest_rq);
-	update_rq_clock(target_rq);
 
 	/* Search for an sd spanning us and the target CPU. */
 	for_each_domain(target_cpu, sd) {
-- 
cgit v1.2.3


From 21406928afe43f1db6acab4931bb8c886f4d04ce Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:17:15 +0100
Subject: sched: Tweak sched_latency and min_granularity

Allow LAST_BUDDY to kick in sooner, improving cache utilization as soon as
a second buddy pair arrives on scene.  The cost is latency starting to climb
sooner, the tbenefit for tbench 8 on my Q6600 box is ~2%.  No detrimental
effects noted in normal idesktop usage.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301285.6785.34.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 69e582020ff..d19df5bccfe 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,8 +35,8 @@
  * (to see the precise effective timeslice length of your workload,
  *  run vmstat and monitor the context-switches (cs) field)
  */
-unsigned int sysctl_sched_latency = 5000000ULL;
-unsigned int normalized_sysctl_sched_latency = 5000000ULL;
+unsigned int sysctl_sched_latency = 6000000ULL;
+unsigned int normalized_sysctl_sched_latency = 6000000ULL;
 
 /*
  * The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_min_granularity = 1000000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
+unsigned int sysctl_sched_min_granularity = 2000000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  */
-static unsigned int sched_nr_latency = 5;
+static unsigned int sched_nr_latency = 3;
 
 /*
  * After fork, child runs first. If set to 0 (default) then
-- 
cgit v1.2.3


From 8b911acdf08477c059d1c36c21113ab1696c612b Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:17:16 +0100
Subject: sched: Fix select_idle_sibling()

Don't bother with selection when the current cpu is idle.  Recent load
balancing changes also make it no longer necessary to check wake_affine()
success before returning the selected sibling, so we now always use it.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301369.6785.36.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d19df5bccfe..0008cc4a119 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1439,7 +1439,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	int new_cpu = cpu;
-	int want_affine = 0;
+	int want_affine = 0, cpu_idle = !current->pid;
 	int want_sd = 1;
 	int sync = wake_flags & WF_SYNC;
 
@@ -1497,13 +1497,15 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 			 * If there's an idle sibling in this domain, make that
 			 * the wake_affine target instead of the current cpu.
 			 */
-			if (tmp->flags & SD_SHARE_PKG_RESOURCES)
+			if (!cpu_idle && tmp->flags & SD_SHARE_PKG_RESOURCES)
 				target = select_idle_sibling(p, tmp, target);
 
 			if (target >= 0) {
 				if (tmp->flags & SD_WAKE_AFFINE) {
 					affine_sd = tmp;
 					want_affine = 0;
+					if (target != cpu)
+						cpu_idle = 1;
 				}
 				cpu = target;
 			}
@@ -1519,6 +1521,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 			sd = tmp;
 	}
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	if (sched_feat(LB_SHARES_UPDATE)) {
 		/*
 		 * Pick the largest domain to update shares over
@@ -1532,9 +1535,12 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 		if (tmp)
 			update_shares(tmp);
 	}
+#endif
 
-	if (affine_sd && wake_affine(affine_sd, p, sync))
-		return cpu;
+	if (affine_sd) {
+		if (cpu_idle || cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+			return cpu;
+	}
 
 	while (sd) {
 		int load_idx = sd->forkexec_idx;
-- 
cgit v1.2.3


From 6bc6cf2b61336ed0c55a615eb4c0c8ed5daf3f08 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:17:17 +0100
Subject: sched: Remove NORMALIZED_SLEEPER

This feature hasn't been enabled in a long time, remove effectively dead code.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301447.6785.38.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 10 ----------
 kernel/sched_features.h |  7 -------
 2 files changed, 17 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0008cc4a119..de98e2e9d6e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -741,16 +741,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	if (!initial && sched_feat(FAIR_SLEEPERS)) {
 		unsigned long thresh = sysctl_sched_latency;
 
-		/*
-		 * Convert the sleeper threshold into virtual time.
-		 * SCHED_IDLE is a special sub-class.  We care about
-		 * fairness only relative to other SCHED_IDLE tasks,
-		 * all of which have the same weight.
-		 */
-		if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
-				 task_of(se)->policy != SCHED_IDLE))
-			thresh = calc_delta_fair(thresh, se);
-
 		/*
 		 * Halve their sleep time's effect, to allow
 		 * for a gentler effect of sleepers:
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index c545e048dfe..404288354ae 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,13 +12,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
  */
 SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
 
-/*
- * By not normalizing the sleep time, heavy tasks get an effective
- * longer period, and lighter task an effective shorter period they
- * are considered running.
- */
-SCHED_FEAT(NORMALIZED_SLEEPER, 0)
-
 /*
  * Place new tasks ahead so that they do not starve already running
  * tasks
-- 
cgit v1.2.3


From 5ca9880c6f4ba4c84b517bc2fed5366adf63d191 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:17:17 +0100
Subject: sched: Remove FAIR_SLEEPERS feature

Our preemption model relies too heavily on sleeper fairness to disable it
without dire consequences.  Remove the feature, and save a branch or two.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301520.6785.40.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 2 +-
 kernel/sched_features.h | 7 -------
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index de98e2e9d6e..97682f925ed 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -738,7 +738,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 		vruntime += sched_vslice(cfs_rq, se);
 
 	/* sleeps up to a single latency don't count. */
-	if (!initial && sched_feat(FAIR_SLEEPERS)) {
+	if (!initial) {
 		unsigned long thresh = sysctl_sched_latency;
 
 		/*
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 404288354ae..850f9809cf8 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,10 +1,3 @@
-/*
- * Disregards a certain amount of sleep time (sched_latency_ns) and
- * considers the task to be running during that period. This gives it
- * a service deficit on wakeup, allowing it to run sooner.
- */
-SCHED_FEAT(FAIR_SLEEPERS, 1)
-
 /*
  * Only give sleepers 50% of their service deficit. This allows
  * them to run sooner, but does not allow tons of sleepers to
-- 
cgit v1.2.3


From f2e74eeac03ffb779d64b66a643c5e598145a28b Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:17:18 +0100
Subject: sched: Remove WAKEUP_SYNC feature

This feature never earned its keep, remove it.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301591.6785.42.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 4 ----
 kernel/sched_features.h | 5 -----
 2 files changed, 9 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 97682f925ed..1d99535b092 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1658,7 +1658,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	struct task_struct *curr = rq->curr;
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-	int sync = wake_flags & WF_SYNC;
 	int scale = cfs_rq->nr_running >= sched_nr_latency;
 
 	if (unlikely(rt_prio(p->prio)))
@@ -1691,9 +1690,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (unlikely(curr->policy == SCHED_IDLE))
 		goto preempt;
 
-	if (sched_feat(WAKEUP_SYNC) && sync)
-		goto preempt;
-
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 850f9809cf8..1cb7c4701bf 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -22,11 +22,6 @@ SCHED_FEAT(WAKEUP_PREEMPT, 1)
  */
 SCHED_FEAT(ASYM_GRAN, 1)
 
-/*
- * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
- */
-SCHED_FEAT(WAKEUP_SYNC, 0)
-
 /*
  * Use the SYNC wakeup hint, pipes and the likes use this to indicate
  * the remote end is likely to consume the data we just wrote, and
-- 
cgit v1.2.3


From c6ee36c423c3ed1fb86bb3eabba9fc256a300d16 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:16:43 +0100
Subject: sched: Remove SYNC_WAKEUPS feature

Sync wakeups are critical functionality with a long history.  Remove it, we don't
need the branch or icache footprint.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301817.6785.47.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          | 3 ---
 kernel/sched_features.h | 8 --------
 2 files changed, 11 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 16559de4ede..cc6dc8caa38 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2369,9 +2369,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	unsigned long flags;
 	struct rq *rq;
 
-	if (!sched_feat(SYNC_WAKEUPS))
-		wake_flags &= ~WF_SYNC;
-
 	this_cpu = get_cpu();
 
 	smp_wmb();
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 1cb7c4701bf..f54b6f9cc3d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -22,14 +22,6 @@ SCHED_FEAT(WAKEUP_PREEMPT, 1)
  */
 SCHED_FEAT(ASYM_GRAN, 1)
 
-/*
- * Use the SYNC wakeup hint, pipes and the likes use this to indicate
- * the remote end is likely to consume the data we just wrote, and
- * therefore has cache benefit from being placed on the same cpu, see
- * also AFFINE_WAKEUPS.
- */
-SCHED_FEAT(SYNC_WAKEUPS, 1)
-
 /*
  * Based on load and program behaviour, see if it makes sense to place
  * a newly woken task on the same cpu as the task that woke it --
-- 
cgit v1.2.3


From 13814d42e45dfbe845a0bbe5184565d9236896ae Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:17:04 +0100
Subject: sched: Remove ASYM_GRAN feature

This features has been enabled for quite a while, after testing showed that
easing preemption for light tasks was harmful to high priority threads.

Remove the feature flag.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301675.6785.44.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 28 +++++++++++-----------------
 kernel/sched_features.h |  6 ------
 2 files changed, 11 insertions(+), 23 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 1d99535b092..9357ecdb7f6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1583,24 +1583,18 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
 	/*
 	 * Since its curr running now, convert the gran from real-time
 	 * to virtual-time in his units.
+	 *
+	 * By using 'se' instead of 'curr' we penalize light tasks, so
+	 * they get preempted easier. That is, if 'se' < 'curr' then
+	 * the resulting gran will be larger, therefore penalizing the
+	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
+	 * be smaller, again penalizing the lighter task.
+	 *
+	 * This is especially important for buddies when the leftmost
+	 * task is higher priority than the buddy.
 	 */
-	if (sched_feat(ASYM_GRAN)) {
-		/*
-		 * By using 'se' instead of 'curr' we penalize light tasks, so
-		 * they get preempted easier. That is, if 'se' < 'curr' then
-		 * the resulting gran will be larger, therefore penalizing the
-		 * lighter, if otoh 'se' > 'curr' then the resulting gran will
-		 * be smaller, again penalizing the lighter task.
-		 *
-		 * This is especially important for buddies when the leftmost
-		 * task is higher priority than the buddy.
-		 */
-		if (unlikely(se->load.weight != NICE_0_LOAD))
-			gran = calc_delta_fair(gran, se);
-	} else {
-		if (unlikely(curr->load.weight != NICE_0_LOAD))
-			gran = calc_delta_fair(gran, curr);
-	}
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		gran = calc_delta_fair(gran, se);
 
 	return gran;
 }
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index f54b6f9cc3d..83c66e8ad3e 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -16,12 +16,6 @@ SCHED_FEAT(START_DEBIT, 1)
  */
 SCHED_FEAT(WAKEUP_PREEMPT, 1)
 
-/*
- * When converting the wakeup granularity to virtual time, do it such
- * that heavier tasks preempting a lighter task have an edge.
- */
-SCHED_FEAT(ASYM_GRAN, 1)
-
 /*
  * Based on load and program behaviour, see if it makes sense to place
  * a newly woken task on the same cpu as the task that woke it --
-- 
cgit v1.2.3


From beac4c7e4a1cc6d57801f690e5e82fa2c9c245c8 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 11 Mar 2010 17:17:20 +0100
Subject: sched: Remove AFFINE_WAKEUPS feature

Disabling affine wakeups is too horrible to contemplate.  Remove the feature flag.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268301890.6785.50.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9357ecdb7f6..35a5c649638 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1434,8 +1434,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 	int sync = wake_flags & WF_SYNC;
 
 	if (sd_flag & SD_BALANCE_WAKE) {
-		if (sched_feat(AFFINE_WAKEUPS) &&
-		    cpumask_test_cpu(cpu, &p->cpus_allowed))
+		if (cpumask_test_cpu(cpu, &p->cpus_allowed))
 			want_affine = 1;
 		new_cpu = prev_cpu;
 	}
-- 
cgit v1.2.3


From a072738e04f0eb26370e39ec679e9a0d65e49aea Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu, 11 Mar 2010 19:54:39 +0300
Subject: perf, x86: Implement initial P4 PMU driver

The netburst PMU is way different from the "architectural
perfomance monitoring" specification that current CPUs use.
P4 uses a tuple of ESCR+CCCR+COUNTER MSR registers to handle
perfomance monitoring events.

A few implementational details:

1) We need a separate x86_pmu::hw_config helper in struct
   x86_pmu since register bit-fields are quite different from P6,
   Core and later cpu series.

2) For the same reason is a x86_pmu::schedule_events helper
   introduced.

3) hw_perf_event::config consists of packed ESCR+CCCR values.
   It's allowed since in reality both registers only use a half
   of their size. Of course before making a real write into a
   particular MSR we need to unpack the value and extend it to
   a proper size.

4) The tuple of packed ESCR+CCCR in hw_perf_event::config
   doesn't describe the memory address of ESCR MSR register
   so that we need to keep a mapping between these tuples
   used and available ESCR (various P4 events may use same
   ESCRs but not simultaneously), for this sake every active
   event has a per-cpu map of hw_perf_event::idx <--> ESCR
   addresses.

5) Since hw_perf_event::idx is an offset to counter/control register
   we need to lift X86_PMC_MAX_GENERIC up, otherwise kernel
   strips it down to 8 registers and event armed may never be turned
   off (ie the bit in active_mask is set but the loop never reaches
   this index to check), thanks to Peter Zijlstra

Restrictions:

 - No cascaded counters support (do we ever need them?)
 - No dependent events support (so PERF_COUNT_HW_INSTRUCTIONS
   doesn't work for now)
 - There are events with same counters which can't work simultaneously
   (need to use intersected ones due to broken counter 1)
 - No PERF_COUNT_HW_CACHE_ events yet

Todo:

 - Implement dependent events
 - Need proper hashing for event opcodes (no linear search, good for
   debugging stage but not in real loads)
 - Some events counted during a clock cycle -- need to set threshold
   for them and count every clock cycle just to get summary statistics
   (ie to behave the same way as other PMUs do)
 - Need to swicth to use event_constraints
 - To support RAW events we need to encode a global list of P4 events
   into p4_templates
 - Cache events need to be added

Event support status matrix:

 Event			status
 -----------------------------
 cycles			works
 cache-references	works
 cache-misses		works
 branch-misses		works
 bus-cycles		partially (does not work on 64bit cpu with HT enabled)
 instruction		doesnt work (needs dependent event [mop tagging])
 branches		doesnt work

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100311165439.GB5129@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h      |   2 +-
 arch/x86/include/asm/perf_event_p4.h   | 707 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event.c       |  46 ++-
 arch/x86/kernel/cpu/perf_event_amd.c   |   2 +
 arch/x86/kernel/cpu/perf_event_intel.c |  15 +-
 arch/x86/kernel/cpu/perf_event_p4.c    | 607 ++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event_p6.c    |   2 +
 7 files changed, 1358 insertions(+), 23 deletions(-)
 create mode 100644 arch/x86/include/asm/perf_event_p4.h
 create mode 100644 arch/x86/kernel/cpu/perf_event_p4.c

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index a9038c95161..124dddd598f 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -5,7 +5,7 @@
  * Performance event hw details:
  */
 
-#define X86_PMC_MAX_GENERIC					8
+#define X86_PMC_MAX_GENERIC				       32
 #define X86_PMC_MAX_FIXED					3
 
 #define X86_PMC_IDX_GENERIC				        0
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
new file mode 100644
index 00000000000..829f4711645
--- /dev/null
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -0,0 +1,707 @@
+/*
+ * Netburst Perfomance Events (P4, old Xeon)
+ */
+
+#ifndef PERF_EVENT_P4_H
+#define PERF_EVENT_P4_H
+
+#include <linux/cpu.h>
+#include <linux/bitops.h>
+
+/*
+ * NetBurst has perfomance MSRs shared between
+ * threads if HT is turned on, ie for both logical
+ * processors (mem: in turn in Atom with HT support
+ * perf-MSRs are not shared and every thread has its
+ * own perf-MSRs set)
+ */
+#define ARCH_P4_TOTAL_ESCR		(46)
+#define ARCH_P4_RESERVED_ESCR		(2) /* IQ_ESCR(0,1) not always present */
+#define ARCH_P4_MAX_ESCR		(ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
+#define ARCH_P4_MAX_CCCR		(18)
+#define ARCH_P4_MAX_COUNTER		(ARCH_P4_MAX_CCCR / 2)
+
+#define P4_EVNTSEL_EVENT_MASK		0x7e000000U
+#define P4_EVNTSEL_EVENT_SHIFT		25
+#define P4_EVNTSEL_EVENTMASK_MASK	0x01fffe00U
+#define P4_EVNTSEL_EVENTMASK_SHIFT	9
+#define P4_EVNTSEL_TAG_MASK		0x000001e0U
+#define P4_EVNTSEL_TAG_SHIFT		5
+#define P4_EVNTSEL_TAG_ENABLE		0x00000010U
+#define P4_EVNTSEL_T0_OS		0x00000008U
+#define P4_EVNTSEL_T0_USR		0x00000004U
+#define P4_EVNTSEL_T1_OS		0x00000002U
+#define P4_EVNTSEL_T1_USR		0x00000001U
+
+/* Non HT mask */
+#define P4_EVNTSEL_MASK				\
+	(P4_EVNTSEL_EVENT_MASK		|	\
+	P4_EVNTSEL_EVENTMASK_MASK	|	\
+	P4_EVNTSEL_TAG_MASK		|	\
+	P4_EVNTSEL_TAG_ENABLE		|	\
+	P4_EVNTSEL_T0_OS		|	\
+	P4_EVNTSEL_T0_USR)
+
+/* HT mask */
+#define P4_EVNTSEL_MASK_HT			\
+	(P4_EVNTSEL_MASK		|	\
+	P4_EVNTSEL_T1_OS		|	\
+	P4_EVNTSEL_T1_USR)
+
+#define P4_CCCR_OVF			0x80000000U
+#define P4_CCCR_CASCADE			0x40000000U
+#define P4_CCCR_OVF_PMI_T0		0x04000000U
+#define P4_CCCR_OVF_PMI_T1		0x08000000U
+#define P4_CCCR_FORCE_OVF		0x02000000U
+#define P4_CCCR_EDGE			0x01000000U
+#define P4_CCCR_THRESHOLD_MASK		0x00f00000U
+#define P4_CCCR_THRESHOLD_SHIFT		20
+#define P4_CCCR_THRESHOLD(v)		((v) << P4_CCCR_THRESHOLD_SHIFT)
+#define P4_CCCR_COMPLEMENT		0x00080000U
+#define P4_CCCR_COMPARE			0x00040000U
+#define P4_CCCR_ESCR_SELECT_MASK	0x0000e000U
+#define P4_CCCR_ESCR_SELECT_SHIFT	13
+#define P4_CCCR_ENABLE			0x00001000U
+#define P4_CCCR_THREAD_SINGLE		0x00010000U
+#define P4_CCCR_THREAD_BOTH		0x00020000U
+#define P4_CCCR_THREAD_ANY		0x00030000U
+
+/* Non HT mask */
+#define P4_CCCR_MASK				\
+	(P4_CCCR_OVF			|	\
+	P4_CCCR_CASCADE			|	\
+	P4_CCCR_OVF_PMI_T0		|	\
+	P4_CCCR_FORCE_OVF		|	\
+	P4_CCCR_EDGE			|	\
+	P4_CCCR_THRESHOLD_MASK		|	\
+	P4_CCCR_COMPLEMENT		|	\
+	P4_CCCR_COMPARE			|	\
+	P4_CCCR_ESCR_SELECT_MASK	|	\
+	P4_CCCR_ENABLE)
+
+/* HT mask */
+#define P4_CCCR_MASK_HT				\
+	(P4_CCCR_MASK			|	\
+	P4_CCCR_THREAD_ANY)
+
+/*
+ * format is 32 bit: ee ss aa aa
+ * where
+ *	ee - 8 bit event
+ *	ss - 8 bit selector
+ *	aa aa - 16 bits reserved for tags/attributes
+ */
+#define P4_EVENT_PACK(event, selector)		(((event) << 24) | ((selector) << 16))
+#define P4_EVENT_UNPACK_EVENT(packed)		(((packed) >> 24) & 0xff)
+#define P4_EVENT_UNPACK_SELECTOR(packed)	(((packed) >> 16) & 0xff)
+#define P4_EVENT_PACK_ATTR(attr)		((attr))
+#define P4_EVENT_UNPACK_ATTR(packed)		((packed) & 0xffff)
+#define P4_MAKE_EVENT_ATTR(class, name, bit)	class##_##name = (1 << bit)
+#define P4_EVENT_ATTR(class, name)		class##_##name
+#define P4_EVENT_ATTR_STR(class, name)		__stringify(class##_##name)
+
+/*
+ * config field is 64bit width and consists of
+ * HT << 63 | ESCR << 32 | CCCR
+ * where HT is HyperThreading bit (since ESCR
+ * has it reserved we may use it for own purpose)
+ *
+ * note that this is NOT the addresses of respective
+ * ESCR and CCCR but rather an only packed value should
+ * be unpacked and written to a proper addresses
+ *
+ * the base idea is to pack as much info as
+ * possible
+ */
+#define p4_config_pack_escr(v)		(((u64)(v)) << 32)
+#define p4_config_pack_cccr(v)		(((u64)(v)) & 0xffffffffULL)
+#define p4_config_unpack_escr(v)	(((u64)(v)) >> 32)
+#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xffffffffULL)
+
+#define p4_config_unpack_emask(v)			\
+	({						\
+		u32 t = p4_config_unpack_escr((v));	\
+		t  &= P4_EVNTSEL_EVENTMASK_MASK;	\
+		t >>= P4_EVNTSEL_EVENTMASK_SHIFT;	\
+		t;					\
+	})
+
+#define P4_CONFIG_HT_SHIFT		63
+#define P4_CONFIG_HT			(1ULL << P4_CONFIG_HT_SHIFT)
+
+static inline u32 p4_config_unpack_opcode(u64 config)
+{
+	u32 e, s;
+
+	/*
+	 * we don't care about HT presence here since
+	 * event opcode doesn't depend on it
+	 */
+	e = (p4_config_unpack_escr(config) & P4_EVNTSEL_EVENT_MASK) >> P4_EVNTSEL_EVENT_SHIFT;
+	s = (p4_config_unpack_cccr(config) & P4_CCCR_ESCR_SELECT_MASK) >> P4_CCCR_ESCR_SELECT_SHIFT;
+
+	return P4_EVENT_PACK(e, s);
+}
+
+static inline bool p4_is_event_cascaded(u64 config)
+{
+	u32 cccr = p4_config_unpack_cccr(config);
+	return !!(cccr & P4_CCCR_CASCADE);
+}
+
+static inline int p4_ht_config_thread(u64 config)
+{
+	return !!(config & P4_CONFIG_HT);
+}
+
+static inline u64 p4_set_ht_bit(u64 config)
+{
+	return config | P4_CONFIG_HT;
+}
+
+static inline u64 p4_clear_ht_bit(u64 config)
+{
+	return config & ~P4_CONFIG_HT;
+}
+
+static inline int p4_ht_active(void)
+{
+#ifdef CONFIG_SMP
+	return smp_num_siblings > 1;
+#endif
+	return 0;
+}
+
+static inline int p4_ht_thread(int cpu)
+{
+#ifdef CONFIG_SMP
+	if (smp_num_siblings == 2)
+		return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map));
+#endif
+	return 0;
+}
+
+static inline int p4_should_swap_ts(u64 config, int cpu)
+{
+	return p4_ht_config_thread(config) ^ p4_ht_thread(cpu);
+}
+
+static inline u32 p4_default_cccr_conf(int cpu)
+{
+	/*
+	 * Note that P4_CCCR_THREAD_ANY is "required" on
+	 * non-HT machines (on HT machines we count TS events
+	 * regardless the state of second logical processor
+	 */
+	u32 cccr = P4_CCCR_THREAD_ANY;
+
+	if (!p4_ht_thread(cpu))
+		cccr |= P4_CCCR_OVF_PMI_T0;
+	else
+		cccr |= P4_CCCR_OVF_PMI_T1;
+
+	return cccr;
+}
+
+static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
+{
+	u32 escr = 0;
+
+	if (!p4_ht_thread(cpu)) {
+		if (!exclude_os)
+			escr |= P4_EVNTSEL_T0_OS;
+		if (!exclude_usr)
+			escr |= P4_EVNTSEL_T0_USR;
+	} else {
+		if (!exclude_os)
+			escr |= P4_EVNTSEL_T1_OS;
+		if (!exclude_usr)
+			escr |= P4_EVNTSEL_T1_USR;
+	}
+
+	return escr;
+}
+
+/*
+ * Comments below the event represent ESCR restriction
+ * for this event and counter index per ESCR
+ *
+ * MSR_P4_IQ_ESCR0 and MSR_P4_IQ_ESCR1 are available only on early
+ * processor builds (family 0FH, models 01H-02H). These MSRs
+ * are not available on later versions, so that we don't use
+ * them completely
+ *
+ * Also note that CCCR1 do not have P4_CCCR_ENABLE bit properly
+ * working so that we should not use this CCCR and respective
+ * counter as result
+ */
+#define P4_TC_DELIVER_MODE		P4_EVENT_PACK(0x01, 0x01)
+	/*
+	 * MSR_P4_TC_ESCR0:	4, 5
+	 * MSR_P4_TC_ESCR1:	6, 7
+	 */
+
+#define P4_BPU_FETCH_REQUEST		P4_EVENT_PACK(0x03, 0x00)
+	/*
+	 * MSR_P4_BPU_ESCR0:	0, 1
+	 * MSR_P4_BPU_ESCR1:	2, 3
+	 */
+
+#define P4_ITLB_REFERENCE		P4_EVENT_PACK(0x18, 0x03)
+	/*
+	 * MSR_P4_ITLB_ESCR0:	0, 1
+	 * MSR_P4_ITLB_ESCR1:	2, 3
+	 */
+
+#define P4_MEMORY_CANCEL		P4_EVENT_PACK(0x02, 0x05)
+	/*
+	 * MSR_P4_DAC_ESCR0:	8, 9
+	 * MSR_P4_DAC_ESCR1:	10, 11
+	 */
+
+#define P4_MEMORY_COMPLETE		P4_EVENT_PACK(0x08, 0x02)
+	/*
+	 * MSR_P4_SAAT_ESCR0:	8, 9
+	 * MSR_P4_SAAT_ESCR1:	10, 11
+	 */
+
+#define P4_LOAD_PORT_REPLAY		P4_EVENT_PACK(0x04, 0x02)
+	/*
+	 * MSR_P4_SAAT_ESCR0:	8, 9
+	 * MSR_P4_SAAT_ESCR1:	10, 11
+	 */
+
+#define P4_STORE_PORT_REPLAY		P4_EVENT_PACK(0x05, 0x02)
+	/*
+	 * MSR_P4_SAAT_ESCR0:	8, 9
+	 * MSR_P4_SAAT_ESCR1:	10, 11
+	 */
+
+#define P4_MOB_LOAD_REPLAY		P4_EVENT_PACK(0x03, 0x02)
+	/*
+	 * MSR_P4_MOB_ESCR0:	0, 1
+	 * MSR_P4_MOB_ESCR1:	2, 3
+	 */
+
+#define P4_PAGE_WALK_TYPE		P4_EVENT_PACK(0x01, 0x04)
+	/*
+	 * MSR_P4_PMH_ESCR0:	0, 1
+	 * MSR_P4_PMH_ESCR1:	2, 3
+	 */
+
+#define P4_BSQ_CACHE_REFERENCE		P4_EVENT_PACK(0x0c, 0x07)
+	/*
+	 * MSR_P4_BSU_ESCR0:	0, 1
+	 * MSR_P4_BSU_ESCR1:	2, 3
+	 */
+
+#define P4_IOQ_ALLOCATION		P4_EVENT_PACK(0x03, 0x06)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_IOQ_ACTIVE_ENTRIES		P4_EVENT_PACK(0x1a, 0x06)
+	/*
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_FSB_DATA_ACTIVITY		P4_EVENT_PACK(0x17, 0x06)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_BSQ_ALLOCATION		P4_EVENT_PACK(0x05, 0x07)
+	/*
+	 * MSR_P4_BSU_ESCR0:	0, 1
+	 */
+
+#define P4_BSQ_ACTIVE_ENTRIES		P4_EVENT_PACK(0x06, 0x07)
+	/*
+	 * MSR_P4_BSU_ESCR1:	2, 3
+	 */
+
+#define P4_SSE_INPUT_ASSIST		P4_EVENT_PACK(0x34, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR:	8, 9
+	 * MSR_P4_FIRM_ESCR:	10, 11
+	 */
+
+#define P4_PACKED_SP_UOP		P4_EVENT_PACK(0x08, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_PACKED_DP_UOP		P4_EVENT_PACK(0x0c, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_SCALAR_SP_UOP		P4_EVENT_PACK(0x0a, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_SCALAR_DP_UOP		P4_EVENT_PACK(0x0e, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_64BIT_MMX_UOP		P4_EVENT_PACK(0x02, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_128BIT_MMX_UOP		P4_EVENT_PACK(0x1a, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_X87_FP_UOP			P4_EVENT_PACK(0x04, 0x01)
+	/*
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
+	 */
+
+#define P4_TC_MISC			P4_EVENT_PACK(0x06, 0x01)
+	/*
+	 * MSR_P4_TC_ESCR0:	4, 5
+	 * MSR_P4_TC_ESCR1:	6, 7
+	 */
+
+#define P4_GLOBAL_POWER_EVENTS		P4_EVENT_PACK(0x13, 0x06)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_TC_MS_XFER			P4_EVENT_PACK(0x05, 0x00)
+	/*
+	 * MSR_P4_MS_ESCR0:	4, 5
+	 * MSR_P4_MS_ESCR1:	6, 7
+	 */
+
+#define P4_UOP_QUEUE_WRITES		P4_EVENT_PACK(0x09, 0x00)
+	/*
+	 * MSR_P4_MS_ESCR0:	4, 5
+	 * MSR_P4_MS_ESCR1:	6, 7
+	 */
+
+#define P4_RETIRED_MISPRED_BRANCH_TYPE	P4_EVENT_PACK(0x05, 0x02)
+	/*
+	 * MSR_P4_TBPU_ESCR0:	4, 5
+	 * MSR_P4_TBPU_ESCR0:	6, 7
+	 */
+
+#define P4_RETIRED_BRANCH_TYPE		P4_EVENT_PACK(0x04, 0x02)
+	/*
+	 * MSR_P4_TBPU_ESCR0:	4, 5
+	 * MSR_P4_TBPU_ESCR0:	6, 7
+	 */
+
+#define P4_RESOURCE_STALL		P4_EVENT_PACK(0x01, 0x01)
+	/*
+	 * MSR_P4_ALF_ESCR0:	12, 13, 16
+	 * MSR_P4_ALF_ESCR1:	14, 15, 17
+	 */
+
+#define P4_WC_BUFFER			P4_EVENT_PACK(0x05, 0x05)
+	/*
+	 * MSR_P4_DAC_ESCR0:	8, 9
+	 * MSR_P4_DAC_ESCR1:	10, 11
+	 */
+
+#define P4_B2B_CYCLES			P4_EVENT_PACK(0x16, 0x03)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_BNR				P4_EVENT_PACK(0x08, 0x03)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_SNOOP			P4_EVENT_PACK(0x06, 0x03)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_RESPONSE			P4_EVENT_PACK(0x04, 0x03)
+	/*
+	 * MSR_P4_FSB_ESCR0:	0, 1
+	 * MSR_P4_FSB_ESCR1:	2, 3
+	 */
+
+#define P4_FRONT_END_EVENT		P4_EVENT_PACK(0x08, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_EXECUTION_EVENT		P4_EVENT_PACK(0x0c, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_REPLAY_EVENT			P4_EVENT_PACK(0x09, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_INSTR_RETIRED		P4_EVENT_PACK(0x02, 0x04)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_UOPS_RETIRED			P4_EVENT_PACK(0x01, 0x04)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_UOP_TYPE			P4_EVENT_PACK(0x02, 0x02)
+	/*
+	 * MSR_P4_RAT_ESCR0:	12, 13, 16
+	 * MSR_P4_RAT_ESCR1:	14, 15, 17
+	 */
+
+#define P4_BRANCH_RETIRED		P4_EVENT_PACK(0x06, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_MISPRED_BRANCH_RETIRED	P4_EVENT_PACK(0x03, 0x04)
+	/*
+	 * MSR_P4_CRU_ESCR0:	12, 13, 16
+	 * MSR_P4_CRU_ESCR1:	14, 15, 17
+	 */
+
+#define P4_X87_ASSIST			P4_EVENT_PACK(0x03, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_MACHINE_CLEAR		P4_EVENT_PACK(0x02, 0x05)
+	/*
+	 * MSR_P4_CRU_ESCR2:	12, 13, 16
+	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 */
+
+#define P4_INSTR_COMPLETED		P4_EVENT_PACK(0x07, 0x04)
+	/*
+	 * MSR_P4_CRU_ESCR0:	12, 13, 16
+	 * MSR_P4_CRU_ESCR1:	14, 15, 17
+	 */
+
+/*
+ * a caller should use P4_EVENT_ATTR helper to
+ * pick the attribute needed, for example
+ *
+ *	P4_EVENT_ATTR(P4_TC_DELIVER_MODE, DD)
+ */
+enum P4_EVENTS_ATTR {
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DD, 0),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DB, 1),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DI, 2),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BD, 3),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BB, 4),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BI, 5),
+	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, ID, 6),
+
+	P4_MAKE_EVENT_ATTR(P4_BPU_FETCH_REQUEST, TCMISS, 0),
+
+	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, HIT, 0),
+	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, MISS, 1),
+	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, HIT_UK, 2),
+
+	P4_MAKE_EVENT_ATTR(P4_MEMORY_CANCEL, ST_RB_FULL, 2),
+	P4_MAKE_EVENT_ATTR(P4_MEMORY_CANCEL, 64K_CONF, 3),
+
+	P4_MAKE_EVENT_ATTR(P4_MEMORY_COMPLETE, LSC, 0),
+	P4_MAKE_EVENT_ATTR(P4_MEMORY_COMPLETE, SSC, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_LOAD_PORT_REPLAY, SPLIT_LD, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_STORE_PORT_REPLAY, SPLIT_ST, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, NO_STA, 1),
+	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, NO_STD, 3),
+	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, PARTIAL_DATA, 4),
+	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, UNALGN_ADDR, 5),
+
+	P4_MAKE_EVENT_ATTR(P4_PAGE_WALK_TYPE, DTMISS, 0),
+	P4_MAKE_EVENT_ATTR(P4_PAGE_WALK_TYPE, ITMISS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10),
+
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, DEFAULT, 0),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, ALL_READ, 5),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, ALL_WRITE, 6),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_UC, 7),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WC, 8),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WT, 9),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WP, 10),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WB, 11),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, OWN, 13),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, OTHER, 14),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, PREFETCH, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, DEFAULT, 0),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, ALL_READ, 5),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_UC, 7),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WC, 8),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WT, 9),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WP, 10),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WB, 11),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, OWN, 13),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, OTHER, 14),
+	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, PREFETCH, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV, 0),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN, 1),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OTHER, 2),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_DRV, 3),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_OWN, 4),
+	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_OTHER, 5),
+
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_TYPE0, 0),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_TYPE1, 1),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LEN0, 2),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LEN1, 3),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_IO_TYPE, 5),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_DEM_TYPE, 9),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_ORD_TYPE, 10),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE0, 11),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE1, 12),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE2, 13),
+
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12),
+	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13),
+
+	P4_MAKE_EVENT_ATTR(P4_SSE_INPUT_ASSIST, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_PACKED_SP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_PACKED_DP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_SCALAR_SP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_SCALAR_DP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_64BIT_MMX_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_128BIT_MMX_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_X87_FP_UOP, ALL, 15),
+
+	P4_MAKE_EVENT_ATTR(P4_TC_MISC, FLUSH, 4),
+
+	P4_MAKE_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING, 0),
+
+	P4_MAKE_EVENT_ATTR(P4_TC_MS_XFER, CISC, 0),
+
+	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0),
+	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1),
+	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_ROM, 2),
+
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4),
+
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL, 1),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL, 2),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, RETURN, 3),
+	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, INDIRECT, 4),
+
+	P4_MAKE_EVENT_ATTR(P4_RESOURCE_STALL, SBFULL, 5),
+
+	P4_MAKE_EVENT_ATTR(P4_WC_BUFFER, WCB_EVICTS, 0),
+	P4_MAKE_EVENT_ATTR(P4_WC_BUFFER, WCB_FULL_EVICTS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_FRONT_END_EVENT, NBOGUS, 0),
+	P4_MAKE_EVENT_ATTR(P4_FRONT_END_EVENT, BOGUS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS0, 0),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS1, 1),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS2, 2),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS3, 3),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS0, 4),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS1, 5),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS2, 6),
+	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS3, 7),
+
+	P4_MAKE_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS, 0),
+	P4_MAKE_EVENT_ATTR(P4_REPLAY_EVENT, BOGUS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG, 0),
+	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSTAG, 1),
+	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG, 2),
+	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSTAG, 3),
+
+	P4_MAKE_EVENT_ATTR(P4_UOPS_RETIRED, NBOGUS, 0),
+	P4_MAKE_EVENT_ATTR(P4_UOPS_RETIRED, BOGUS, 1),
+
+	P4_MAKE_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS, 1),
+	P4_MAKE_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES, 2),
+
+	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMNP, 0),
+	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMNM, 1),
+	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMTP, 2),
+	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMTM, 3),
+
+	P4_MAKE_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS, 0),
+
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, FPSU, 0),
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, FPSO, 1),
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, POAO, 2),
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, POAU, 3),
+	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, PREA, 4),
+
+	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, CLEAR, 0),
+	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, MOCLEAR, 1),
+	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, SMCLEAR, 2),
+
+	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, NBOGUS, 0),
+	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, BOGUS, 1),
+};
+
+#endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e24f6374f9f..e6a3f5f81c9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -190,6 +190,8 @@ struct x86_pmu {
 	void		(*enable_all)(void);
 	void		(*enable)(struct perf_event *);
 	void		(*disable)(struct perf_event *);
+	int		(*hw_config)(struct perf_event_attr *attr, struct hw_perf_event *hwc);
+	int		(*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
 	unsigned	eventsel;
 	unsigned	perfctr;
 	u64		(*event_map)(int);
@@ -415,6 +417,25 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 	return 0;
 }
 
+static int x86_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
+{
+	/*
+	 * Generate PMC IRQs:
+	 * (keep 'enabled' bit clear for now)
+	 */
+	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+
+	/*
+	 * Count user and OS events unless requested not to
+	 */
+	if (!attr->exclude_user)
+		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
+	if (!attr->exclude_kernel)
+		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+
+	return 0;
+}
+
 /*
  * Setup the hardware configuration for a given attr_type
  */
@@ -446,23 +467,13 @@ static int __hw_perf_event_init(struct perf_event *event)
 
 	event->destroy = hw_perf_event_destroy;
 
-	/*
-	 * Generate PMC IRQs:
-	 * (keep 'enabled' bit clear for now)
-	 */
-	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
-
 	hwc->idx = -1;
 	hwc->last_cpu = -1;
 	hwc->last_tag = ~0ULL;
 
-	/*
-	 * Count user and OS events unless requested not to.
-	 */
-	if (!attr->exclude_user)
-		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
-	if (!attr->exclude_kernel)
-		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+	/* Processor specifics */
+	if (x86_pmu.hw_config(attr, hwc))
+		return -EOPNOTSUPP;
 
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
@@ -517,7 +528,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 			return -EOPNOTSUPP;
 
 		/* BTS is currently only allowed for user-mode. */
-		if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		if (!attr->exclude_kernel)
 			return -EOPNOTSUPP;
 	}
 
@@ -931,7 +942,7 @@ static int x86_pmu_enable(struct perf_event *event)
 	if (n < 0)
 		return n;
 
-	ret = x86_schedule_events(cpuc, n, assign);
+	ret = x86_pmu.schedule_events(cpuc, n, assign);
 	if (ret)
 		return ret;
 	/*
@@ -1263,7 +1274,7 @@ int hw_perf_group_sched_in(struct perf_event *leader,
 	if (n0 < 0)
 		return n0;
 
-	ret = x86_schedule_events(cpuc, n0, assign);
+	ret = x86_pmu.schedule_events(cpuc, n0, assign);
 	if (ret)
 		return ret;
 
@@ -1313,6 +1324,7 @@ undo:
 
 #include "perf_event_amd.c"
 #include "perf_event_p6.c"
+#include "perf_event_p4.c"
 #include "perf_event_intel_lbr.c"
 #include "perf_event_intel_ds.c"
 #include "perf_event_intel.c"
@@ -1515,7 +1527,7 @@ static int validate_group(struct perf_event *event)
 
 	fake_cpuc->n_events = n;
 
-	ret = x86_schedule_events(fake_cpuc, n, NULL);
+	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
 
 out_free:
 	kfree(fake_cpuc);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 573458f1caf..358a8e3d05f 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -363,6 +363,8 @@ static __initconst struct x86_pmu amd_pmu = {
 	.enable_all		= x86_pmu_enable_all,
 	.enable			= x86_pmu_enable_event,
 	.disable		= x86_pmu_disable_event,
+	.hw_config		= x86_hw_config,
+	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_K7_EVNTSEL0,
 	.perfctr		= MSR_K7_PERFCTR0,
 	.event_map		= amd_pmu_event_map,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 971dc6e7d54..044b8436b19 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -749,6 +749,8 @@ static __initconst struct x86_pmu core_pmu = {
 	.enable_all		= x86_pmu_enable_all,
 	.enable			= x86_pmu_enable_event,
 	.disable		= x86_pmu_disable_event,
+	.hw_config		= x86_hw_config,
+	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
@@ -786,6 +788,8 @@ static __initconst struct x86_pmu intel_pmu = {
 	.enable_all		= intel_pmu_enable_all,
 	.enable			= intel_pmu_enable_event,
 	.disable		= intel_pmu_disable_event,
+	.hw_config		= x86_hw_config,
+	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
@@ -839,12 +843,13 @@ static __init int intel_pmu_init(void)
 	int version;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
-		/* check for P6 processor family */
-	   if (boot_cpu_data.x86 == 6) {
-		return p6_pmu_init();
-	   } else {
+		switch (boot_cpu_data.x86) {
+		case 0x6:
+			return p6_pmu_init();
+		case 0xf:
+			return p4_pmu_init();
+		}
 		return -ENODEV;
-	   }
 	}
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
new file mode 100644
index 00000000000..381f593e829
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -0,0 +1,607 @@
+/*
+ * Netburst Perfomance Events (P4, old Xeon)
+ *
+ *  Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
+ *  Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+
+#include <asm/perf_event_p4.h>
+
+/*
+ * array indices: 0,1 - HT threads, used with HT enabled cpu
+ */
+struct p4_event_template {
+	u32 opcode;			/* ESCR event + CCCR selector */
+	u64 config;			/* packed predefined bits */
+	int dep;			/* upstream dependency event index */
+	unsigned int emask;		/* ESCR EventMask */
+	unsigned int escr_msr[2];	/* ESCR MSR for this event */
+	unsigned int cntr[2];		/* counter index (offset) */
+};
+
+struct p4_pmu_res {
+	/* maps hw_conf::idx into template for ESCR sake */
+	struct p4_event_template *tpl[ARCH_P4_MAX_CCCR];
+};
+
+static DEFINE_PER_CPU(struct p4_pmu_res, p4_pmu_config);
+
+/*
+ * WARN: CCCR1 doesn't have a working enable bit so try to not
+ * use it if possible
+ *
+ * Also as only we start to support raw events we will need to
+ * append _all_ P4_EVENT_PACK'ed events here
+ */
+struct p4_event_template p4_templates[] = {
+	[0] = {
+		.opcode	= P4_UOP_TYPE,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
+			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
+		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+		.cntr		= { 16, 17 },
+	},
+	[1] = {
+		.opcode	= P4_GLOBAL_POWER_EVENTS,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+	[2] = {
+		.opcode	= P4_INSTR_RETIRED,
+		.config	= 0,
+		.dep	= 0, /* needs front-end tagging */
+		.emask	=
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG)	|
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSTAG)	|
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG)	|
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSTAG),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { 12, 14 },
+	},
+	[3] = {
+		.opcode	= P4_BSQ_CACHE_REFERENCE,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITM),
+		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+	[4] = {
+		.opcode	= P4_BSQ_CACHE_REFERENCE,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)	|
+			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
+		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+		.cntr		= { 0, 3 },
+	},
+	[5] = {
+		.opcode	= P4_RETIRED_BRANCH_TYPE,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL)	|
+			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL)		|
+			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, RETURN)		|
+			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, INDIRECT),
+		.escr_msr	= { MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1 },
+		.cntr		= { 4, 6 },
+	},
+	[6] = {
+		.opcode	= P4_MISPRED_BRANCH_RETIRED,
+		.config	= 0,
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { 12, 14 },
+	},
+	[7] = {
+		.opcode	= P4_FSB_DATA_ACTIVITY,
+		.config	= p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
+		.dep	= -1,
+		.emask	=
+			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV)	|
+			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+};
+
+static struct p4_event_template *p4_event_map[PERF_COUNT_HW_MAX] = {
+	/* non-halted CPU clocks */
+	[PERF_COUNT_HW_CPU_CYCLES]		= &p4_templates[1],
+
+	/* retired instructions: dep on tagging the FSB */
+	[PERF_COUNT_HW_INSTRUCTIONS]		= &p4_templates[2],
+
+	/* cache hits */
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= &p4_templates[3],
+
+	/* cache misses */
+	[PERF_COUNT_HW_CACHE_MISSES]		= &p4_templates[4],
+
+	/* branch instructions retired */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= &p4_templates[5],
+
+	/* mispredicted branches retired */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= &p4_templates[6],
+
+	/* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
+	[PERF_COUNT_HW_BUS_CYCLES]		= &p4_templates[7],
+};
+
+static u64 p4_pmu_event_map(int hw_event)
+{
+	struct p4_event_template *tpl;
+	u64 config;
+
+	if (hw_event > ARRAY_SIZE(p4_event_map)) {
+		printk_once(KERN_ERR "PMU: Incorrect event index\n");
+		return 0;
+	}
+	tpl = p4_event_map[hw_event];
+
+	/*
+	 * fill config up according to
+	 * a predefined event template
+	 */
+	config  = tpl->config;
+	config |= p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(tpl->opcode) << P4_EVNTSEL_EVENT_SHIFT);
+	config |= p4_config_pack_escr(tpl->emask << P4_EVNTSEL_EVENTMASK_SHIFT);
+	config |= p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(tpl->opcode) << P4_CCCR_ESCR_SELECT_SHIFT);
+
+	/* on HT machine we need a special bit */
+	if (p4_ht_active() && p4_ht_thread(raw_smp_processor_id()))
+		config = p4_set_ht_bit(config);
+
+	return config;
+}
+
+/*
+ * Note that we still have 5 events (from global events SDM list)
+ * intersected in opcode+emask bits so we will need another
+ * scheme there do distinguish templates.
+ */
+static inline int p4_pmu_emask_match(unsigned int dst, unsigned int src)
+{
+	return dst & src;
+}
+
+static struct p4_event_template *p4_pmu_template_lookup(u64 config)
+{
+	u32 opcode = p4_config_unpack_opcode(config);
+	unsigned int emask = p4_config_unpack_emask(config);
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(p4_templates); i++) {
+		if (opcode == p4_templates[i].opcode &&
+			p4_pmu_emask_match(emask, p4_templates[i].emask))
+			return &p4_templates[i];
+	}
+
+	return NULL;
+}
+
+/*
+ * We don't control raw events so it's up to the caller
+ * to pass sane values (and we don't count the thread number
+ * on HT machine but allow HT-compatible specifics to be
+ * passed on)
+ */
+static u64 p4_pmu_raw_event(u64 hw_event)
+{
+	return hw_event &
+		(p4_config_pack_escr(P4_EVNTSEL_MASK_HT) |
+		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
+}
+
+static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
+{
+	int cpu = raw_smp_processor_id();
+
+	/*
+	 * the reason we use cpu that early is that: if we get scheduled
+	 * first time on the same cpu -- we will not need swap thread
+	 * specific flags in config (and will save some cpu cycles)
+	 */
+
+	/* CCCR by default */
+	hwc->config = p4_config_pack_cccr(p4_default_cccr_conf(cpu));
+
+	/* Count user and OS events unless not requested to */
+	hwc->config |= p4_config_pack_escr(p4_default_escr_conf(cpu, attr->exclude_kernel,
+								attr->exclude_user));
+	return 0;
+}
+
+static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+{
+	unsigned long dummy;
+
+	rdmsrl(hwc->config_base + hwc->idx, dummy);
+	if (dummy & P4_CCCR_OVF) {
+		(void)checking_wrmsrl(hwc->config_base + hwc->idx,
+			((u64)dummy) & ~P4_CCCR_OVF);
+	}
+}
+
+static inline void p4_pmu_disable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	/*
+	 * If event gets disabled while counter is in overflowed
+	 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
+	 * asserted again and again
+	 */
+	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
+		(u64)(p4_config_unpack_cccr(hwc->config)) &
+			~P4_CCCR_ENABLE & ~P4_CCCR_OVF);
+}
+
+static void p4_pmu_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int idx;
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+		struct perf_event *event = cpuc->events[idx];
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+		p4_pmu_disable_event(event);
+	}
+}
+
+static void p4_pmu_enable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	int thread = p4_ht_config_thread(hwc->config);
+	u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
+	u64 escr_base;
+	struct p4_event_template *tpl;
+	struct p4_pmu_res *c;
+
+	/*
+	 * some preparation work from per-cpu private fields
+	 * since we need to find out which ESCR to use
+	 */
+	c = &__get_cpu_var(p4_pmu_config);
+	tpl = c->tpl[hwc->idx];
+	if (!tpl) {
+		pr_crit("%s: Wrong index: %d\n", __func__, hwc->idx);
+		return;
+	}
+	escr_base = (u64)tpl->escr_msr[thread];
+
+	/*
+	 * - we dont support cascaded counters yet
+	 * - and counter 1 is broken (erratum)
+	 */
+	WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
+	WARN_ON_ONCE(hwc->idx == 1);
+
+	(void)checking_wrmsrl(escr_base, escr_conf);
+	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
+		(u64)(p4_config_unpack_cccr(hwc->config)) | P4_CCCR_ENABLE);
+}
+
+static void p4_pmu_enable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int idx;
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+		struct perf_event *event = cpuc->events[idx];
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+		p4_pmu_enable_event(event);
+	}
+}
+
+static int p4_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_events *cpuc;
+	struct perf_event *event;
+	struct hw_perf_event *hwc;
+	int idx, handled = 0;
+	u64 val;
+
+	data.addr = 0;
+	data.raw = NULL;
+
+	cpuc = &__get_cpu_var(cpu_hw_events);
+
+	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+
+		event = cpuc->events[idx];
+		hwc = &event->hw;
+
+		WARN_ON_ONCE(hwc->idx != idx);
+
+		/*
+		 * FIXME: Redundant call, actually not needed
+		 * but just to check if we're screwed
+		 */
+		p4_pmu_clear_cccr_ovf(hwc);
+
+		val = x86_perf_event_update(event);
+		if (val & (1ULL << (x86_pmu.event_bits - 1)))
+			continue;
+
+		/*
+		 * event overflow
+		 */
+		handled		= 1;
+		data.period	= event->hw.last_period;
+
+		if (!x86_perf_event_set_period(event))
+			continue;
+		if (perf_event_overflow(event, 1, &data, regs))
+			p4_pmu_disable_event(event);
+	}
+
+	if (handled) {
+		/* p4 quirk: unmask it again */
+		apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
+		inc_irq_stat(apic_perf_irqs);
+	}
+
+	return handled;
+}
+
+/*
+ * swap thread specific fields according to a thread
+ * we are going to run on
+ */
+static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
+{
+	u32 escr, cccr;
+
+	/*
+	 * we either lucky and continue on same cpu or no HT support
+	 */
+	if (!p4_should_swap_ts(hwc->config, cpu))
+		return;
+
+	/*
+	 * the event is migrated from an another logical
+	 * cpu, so we need to swap thread specific flags
+	 */
+
+	escr = p4_config_unpack_escr(hwc->config);
+	cccr = p4_config_unpack_cccr(hwc->config);
+
+	if (p4_ht_thread(cpu)) {
+		cccr &= ~P4_CCCR_OVF_PMI_T0;
+		cccr |= P4_CCCR_OVF_PMI_T1;
+		if (escr & P4_EVNTSEL_T0_OS) {
+			escr &= ~P4_EVNTSEL_T0_OS;
+			escr |= P4_EVNTSEL_T1_OS;
+		}
+		if (escr & P4_EVNTSEL_T0_USR) {
+			escr &= ~P4_EVNTSEL_T0_USR;
+			escr |= P4_EVNTSEL_T1_USR;
+		}
+		hwc->config  = p4_config_pack_escr(escr);
+		hwc->config |= p4_config_pack_cccr(cccr);
+		hwc->config |= P4_CONFIG_HT;
+	} else {
+		cccr &= ~P4_CCCR_OVF_PMI_T1;
+		cccr |= P4_CCCR_OVF_PMI_T0;
+		if (escr & P4_EVNTSEL_T1_OS) {
+			escr &= ~P4_EVNTSEL_T1_OS;
+			escr |= P4_EVNTSEL_T0_OS;
+		}
+		if (escr & P4_EVNTSEL_T1_USR) {
+			escr &= ~P4_EVNTSEL_T1_USR;
+			escr |= P4_EVNTSEL_T0_USR;
+		}
+		hwc->config  = p4_config_pack_escr(escr);
+		hwc->config |= p4_config_pack_cccr(cccr);
+		hwc->config &= ~P4_CONFIG_HT;
+	}
+}
+
+/* ESCRs are not sequential in memory so we need a map */
+static unsigned int p4_escr_map[ARCH_P4_TOTAL_ESCR] = {
+	MSR_P4_ALF_ESCR0,	/*  0 */
+	MSR_P4_ALF_ESCR1,	/*  1 */
+	MSR_P4_BPU_ESCR0,	/*  2 */
+	MSR_P4_BPU_ESCR1,	/*  3 */
+	MSR_P4_BSU_ESCR0,	/*  4 */
+	MSR_P4_BSU_ESCR1,	/*  5 */
+	MSR_P4_CRU_ESCR0,	/*  6 */
+	MSR_P4_CRU_ESCR1,	/*  7 */
+	MSR_P4_CRU_ESCR2,	/*  8 */
+	MSR_P4_CRU_ESCR3,	/*  9 */
+	MSR_P4_CRU_ESCR4,	/* 10 */
+	MSR_P4_CRU_ESCR5,	/* 11 */
+	MSR_P4_DAC_ESCR0,	/* 12 */
+	MSR_P4_DAC_ESCR1,	/* 13 */
+	MSR_P4_FIRM_ESCR0,	/* 14 */
+	MSR_P4_FIRM_ESCR1,	/* 15 */
+	MSR_P4_FLAME_ESCR0,	/* 16 */
+	MSR_P4_FLAME_ESCR1,	/* 17 */
+	MSR_P4_FSB_ESCR0,	/* 18 */
+	MSR_P4_FSB_ESCR1,	/* 19 */
+	MSR_P4_IQ_ESCR0,	/* 20 */
+	MSR_P4_IQ_ESCR1,	/* 21 */
+	MSR_P4_IS_ESCR0,	/* 22 */
+	MSR_P4_IS_ESCR1,	/* 23 */
+	MSR_P4_ITLB_ESCR0,	/* 24 */
+	MSR_P4_ITLB_ESCR1,	/* 25 */
+	MSR_P4_IX_ESCR0,	/* 26 */
+	MSR_P4_IX_ESCR1,	/* 27 */
+	MSR_P4_MOB_ESCR0,	/* 28 */
+	MSR_P4_MOB_ESCR1,	/* 29 */
+	MSR_P4_MS_ESCR0,	/* 30 */
+	MSR_P4_MS_ESCR1,	/* 31 */
+	MSR_P4_PMH_ESCR0,	/* 32 */
+	MSR_P4_PMH_ESCR1,	/* 33 */
+	MSR_P4_RAT_ESCR0,	/* 34 */
+	MSR_P4_RAT_ESCR1,	/* 35 */
+	MSR_P4_SAAT_ESCR0,	/* 36 */
+	MSR_P4_SAAT_ESCR1,	/* 37 */
+	MSR_P4_SSU_ESCR0,	/* 38 */
+	MSR_P4_SSU_ESCR1,	/* 39 */
+	MSR_P4_TBPU_ESCR0,	/* 40 */
+	MSR_P4_TBPU_ESCR1,	/* 41 */
+	MSR_P4_TC_ESCR0,	/* 42 */
+	MSR_P4_TC_ESCR1,	/* 43 */
+	MSR_P4_U2L_ESCR0,	/* 44 */
+	MSR_P4_U2L_ESCR1,	/* 45 */
+};
+
+static int p4_get_escr_idx(unsigned int addr)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(p4_escr_map); i++) {
+		if (addr == p4_escr_map[i])
+			return i;
+	}
+
+	return -1;
+}
+
+static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long escr_mask[BITS_TO_LONGS(ARCH_P4_TOTAL_ESCR)];
+
+	struct hw_perf_event *hwc;
+	struct p4_event_template *tpl;
+	struct p4_pmu_res *c;
+	int cpu = raw_smp_processor_id();
+	int escr_idx, thread, i, num;
+
+	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+	bitmap_zero(escr_mask, ARCH_P4_TOTAL_ESCR);
+
+	c = &__get_cpu_var(p4_pmu_config);
+	/*
+	 * Firstly find out which resource events are going
+	 * to use, if ESCR+CCCR tuple is already borrowed
+	 * then get out of here
+	 */
+	for (i = 0, num = n; i < n; i++, num--) {
+		hwc = &cpuc->event_list[i]->hw;
+		tpl = p4_pmu_template_lookup(hwc->config);
+		if (!tpl)
+			goto done;
+		thread = p4_ht_thread(cpu);
+		escr_idx = p4_get_escr_idx(tpl->escr_msr[thread]);
+		if (escr_idx == -1)
+			goto done;
+
+		/* already allocated and remains on the same cpu */
+		if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
+			if (assign)
+				assign[i] = hwc->idx;
+			/* upstream dependent event */
+			if (unlikely(tpl->dep != -1))
+				printk_once(KERN_WARNING "PMU: Dep events are "
+					"not implemented yet\n");
+			goto reserve;
+		}
+
+		/* it may be already borrowed */
+		if (test_bit(tpl->cntr[thread], used_mask) ||
+			test_bit(escr_idx, escr_mask))
+			goto done;
+
+		/*
+		 * ESCR+CCCR+COUNTERs are available to use lets swap
+		 * thread specific bits, push assigned bits
+		 * back and save template into per-cpu
+		 * area (which will allow us to find out the ESCR
+		 * to be used at moment of "enable event via real MSR")
+		 */
+		p4_pmu_swap_config_ts(hwc, cpu);
+		if (assign) {
+			assign[i] = tpl->cntr[thread];
+			c->tpl[assign[i]] = tpl;
+		}
+reserve:
+		set_bit(tpl->cntr[thread], used_mask);
+		set_bit(escr_idx, escr_mask);
+	}
+
+done:
+	return num ? -ENOSPC : 0;
+}
+
+static __initconst struct x86_pmu p4_pmu = {
+	.name			= "Netburst P4/Xeon",
+	.handle_irq		= p4_pmu_handle_irq,
+	.disable_all		= p4_pmu_disable_all,
+	.enable_all		= p4_pmu_enable_all,
+	.enable			= p4_pmu_enable_event,
+	.disable		= p4_pmu_disable_event,
+	.eventsel		= MSR_P4_BPU_CCCR0,
+	.perfctr		= MSR_P4_BPU_PERFCTR0,
+	.event_map		= p4_pmu_event_map,
+	.raw_event		= p4_pmu_raw_event,
+	.max_events		= ARRAY_SIZE(p4_event_map),
+	.get_event_constraints	= x86_get_event_constraints,
+	/*
+	 * IF HT disabled we may need to use all
+	 * ARCH_P4_MAX_CCCR counters simulaneously
+	 * though leave it restricted at moment assuming
+	 * HT is on
+	 */
+	.num_events		= ARCH_P4_MAX_CCCR,
+	.apic			= 1,
+	.event_bits		= 40,
+	.event_mask		= (1ULL << 40) - 1,
+	.max_period		= (1ULL << 39) - 1,
+	.hw_config		= p4_hw_config,
+	.schedule_events	= p4_pmu_schedule_events,
+};
+
+static __init int p4_pmu_init(void)
+{
+	unsigned int low, high;
+
+	/* If we get stripped -- indexig fails */
+	BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
+
+	rdmsr(MSR_IA32_MISC_ENABLE, low, high);
+	if (!(low & (1 << 7))) {
+		pr_cont("unsupported Netburst CPU model %d ",
+			boot_cpu_data.x86_model);
+		return -ENODEV;
+	}
+
+	pr_cont("Netburst events, ");
+
+	x86_pmu = p4_pmu;
+
+	return 0;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index a330485d14d..6ff4d01d880 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -109,6 +109,8 @@ static __initconst struct x86_pmu p6_pmu = {
 	.enable_all		= p6_pmu_enable_all,
 	.enable			= p6_pmu_enable_event,
 	.disable		= p6_pmu_disable_event,
+	.hw_config		= x86_hw_config,
+	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_P6_EVNTSEL0,
 	.perfctr		= MSR_P6_PERFCTR0,
 	.event_map		= p6_pmu_event_map,
-- 
cgit v1.2.3


From 6230f2c7ef01a69e2ba9370326572c287209d32a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 11 Mar 2010 15:53:12 -0300
Subject: perf record: Mention paranoid sysctl when failing to create counter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[acme@mica linux-2.6-tip]$ perf record -a -f
   Fatal: Permission error - are you root?
 	 Consider tweaking /proc/sys/kernel/perf_event_paranoid.

 [acme@mica linux-2.6-tip]$

Suggested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1268333592-30872-2-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 771533ced6a..6e4a39328b3 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -280,7 +280,8 @@ try_again:
 		int err = errno;
 
 		if (err == EPERM || err == EACCES)
-			die("Permission error - are you root?\n");
+			die("Permission error - are you root?\n"
+			    "\t Consider tweaking /proc/sys/kernel/perf_event_paranoid.\n");
 		else if (err ==  ENODEV && profile_cpu != -1)
 			die("No such device - did you specify an out-of-range profile CPU?\n");
 
-- 
cgit v1.2.3


From 0b861225a5890f22445f08ca9cc7a87cff276ff7 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Fri, 12 Mar 2010 00:50:16 +0300
Subject: x86, perf: Fix NULL deref on not assigned x86_pmu

In case of not assigned x86_pmu and software events NULL dereference may
being hit via x86_pmu::schedule_events method.

Fix it by checking if x86_pmu is initialized at all.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20100311215016.GG25162@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e6a3f5f81c9..5586a02067d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1269,6 +1269,9 @@ int hw_perf_group_sched_in(struct perf_event *leader,
 	int assign[X86_PMC_IDX_MAX];
 	int n0, n1, ret;
 
+	if (!x86_pmu_initialized())
+		return 0;
+
 	/* n0 = total number of events */
 	n0 = collect_events(cpuc, leader, true);
 	if (n0 < 0)
-- 
cgit v1.2.3


From fe2197b8bb2f318c1e0b0f589f24f781dd27d1f2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 11 Mar 2010 20:12:40 -0300
Subject: perf symbols: Bump plt synthesizing warning debug level
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1268349164-5822-1-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/symbol.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 323c0aea0a9..75cd46807c7 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -862,8 +862,8 @@ out_close:
 	if (err == 0)
 		return nr;
 out:
-	pr_warning("%s: problems reading %s PLT info.\n",
-		   __func__, self->long_name);
+	pr_debug("%s: problems reading %s PLT info.\n",
+		 __func__, self->long_name);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 895f0edc3cd04a2a3de2c608ba20821b832a8abb Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 11 Mar 2010 20:12:41 -0300
Subject: perf top: Export get_window_dimensions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Will be used by the newt code too.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1268349164-5822-2-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-top.c | 2 +-
 tools/perf/perf.h        | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index ec4822322ab..57e232f13bc 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -133,7 +133,7 @@ static inline struct symbol *sym_entry__symbol(struct sym_entry *self)
        return ((void *)self) + symbol_conf.priv_size;
 }
 
-static void get_term_dimensions(struct winsize *ws)
+void get_term_dimensions(struct winsize *ws)
 {
 	char *s = getenv("LINES");
 
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 6fb379bc1d1..aa786158b66 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -1,6 +1,10 @@
 #ifndef _PERF_PERF_H
 #define _PERF_PERF_H
 
+struct winsize;
+
+void get_term_dimensions(struct winsize *ws);
+
 #if defined(__i386__)
 #include "../../arch/x86/include/asm/unistd.h"
 #define rmb()		asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
-- 
cgit v1.2.3


From b4f5296f0eec2aa7061dfd8bb8c0744f095f9bd1 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 11 Mar 2010 20:12:42 -0300
Subject: perf tools: Use eprintf for pr_{err,warning,info} too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Just like we do for pr_debug, so that we can have a single point
where to redirect to the currently used output system, be it
stdio or newt.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1268349164-5822-3-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/debug.h                | 2 --
 tools/perf/util/include/linux/kernel.h | 9 ++++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index c6c24c522de..58720a18159 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -7,8 +7,6 @@
 extern int verbose;
 extern int dump_trace;
 
-int eprintf(int level,
-	    const char *fmt, ...) __attribute__((format(printf, 2, 3)));
 int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
 void trace_event(event_t *event);
 
diff --git a/tools/perf/util/include/linux/kernel.h b/tools/perf/util/include/linux/kernel.h
index f2611655ab5..388ab1bfd11 100644
--- a/tools/perf/util/include/linux/kernel.h
+++ b/tools/perf/util/include/linux/kernel.h
@@ -85,16 +85,19 @@ simple_strtoul(const char *nptr, char **endptr, int base)
 	return strtoul(nptr, endptr, base);
 }
 
+int eprintf(int level,
+	    const char *fmt, ...) __attribute__((format(printf, 2, 3)));
+
 #ifndef pr_fmt
 #define pr_fmt(fmt) fmt
 #endif
 
 #define pr_err(fmt, ...) \
-	do { fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__); } while (0)
+	eprintf(0, pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_warning(fmt, ...) \
-	do { fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__); } while (0)
+	eprintf(0, pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_info(fmt, ...) \
-	do { fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__); } while (0)
+	eprintf(0, pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_debug(fmt, ...) \
 	eprintf(1, pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_debugN(n, fmt, ...) \
-- 
cgit v1.2.3


From dd2ee78dd8e4c6d6f1a333fd60c3dd27d1b07042 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 11 Mar 2010 20:12:43 -0300
Subject: perf tools: Add missing bytes printed in hist_entry__fprintf
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We need those to properly size the browser widht in the newt
TUI.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1268349164-5822-4-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/hist.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index bdcfd6190b2..d43be344a88 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -455,11 +455,11 @@ static size_t hist_entry_callchain__fprintf(FILE *fp, struct hist_entry *self,
 	return ret;
 }
 
-static size_t hist_entry__fprintf(struct hist_entry *self,
-				  struct perf_session *pair_session,
-				  bool show_displacement,
-				  long displacement, FILE *fp,
-				  u64 session_total)
+size_t hist_entry__fprintf(struct hist_entry *self,
+			   struct perf_session *pair_session,
+			   bool show_displacement,
+			   long displacement, FILE *fp,
+			   u64 session_total)
 {
 	struct sort_entry *se;
 	u64 count, total;
@@ -485,9 +485,9 @@ static size_t hist_entry__fprintf(struct hist_entry *self,
 
 	if (symbol_conf.show_nr_samples) {
 		if (sep)
-			fprintf(fp, "%c%lld", *sep, count);
+			ret += fprintf(fp, "%c%lld", *sep, count);
 		else
-			fprintf(fp, "%11lld", count);
+			ret += fprintf(fp, "%11lld", count);
 	}
 
 	if (pair_session) {
@@ -518,9 +518,9 @@ static size_t hist_entry__fprintf(struct hist_entry *self,
 				snprintf(bf, sizeof(bf), " ");
 
 			if (sep)
-				fprintf(fp, "%c%s", *sep, bf);
+				ret += fprintf(fp, "%c%s", *sep, bf);
 			else
-				fprintf(fp, "%6.6s", bf);
+				ret += fprintf(fp, "%6.6s", bf);
 		}
 	}
 
@@ -528,7 +528,7 @@ static size_t hist_entry__fprintf(struct hist_entry *self,
 		if (se->elide)
 			continue;
 
-		fprintf(fp, "%s", sep ?: "  ");
+		ret += fprintf(fp, "%s", sep ?: "  ");
 		ret += se->print(fp, self, se->width ? *se->width : 0);
 	}
 
@@ -544,8 +544,8 @@ static size_t hist_entry__fprintf(struct hist_entry *self,
 			left_margin -= thread__comm_len(self->thread);
 		}
 
-		hist_entry_callchain__fprintf(fp, self, session_total,
-					      left_margin);
+		ret += hist_entry_callchain__fprintf(fp, self, session_total,
+						     left_margin);
 	}
 
 	return ret;
-- 
cgit v1.2.3


From f9224c5c944b60cf709db4adf1f5195264b8d194 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 11 Mar 2010 20:12:44 -0300
Subject: perf report: Implement initial UI using newt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Newt has widespread availability and provides a rather simple
API as can be seen by the size of this patch.

The work needed to support it will benefit other frontends too.

In this initial patch it just checks if the output is a tty, if
not it falls back to the previous behaviour, also if
newt-devel/libnewt-dev is not installed the previous behaviour
is maintaned.

Pressing enter on a symbol will annotate it, ESC in the
annotation window will return to the report symbol list.

More work will be done to remove the special casing in
color_fprintf, stop using fmemopen/FILE in the printing of
hist_entries, etc.

Also the annotation doesn't need to be done via spawning "perf
annotate" and then browsing its output, we can do better by
calling directly the builtin-annotate.c functions, that would
then be moved to tools/perf/util/annotate.c and shared with perf
top, etc

But lets go by baby steps, this patch already improves perf
usability by allowing to quickly do annotations on symbols from
the report screen and provides a first experimentation with
libnewt/TUI integration of tools.

Tested on RHEL5 and Fedora12 X86_64 and on Debian PARISC64 to
browse a perf.data file collected on a Fedora12 x86_64 box.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1268349164-5822-5-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Makefile         |   8 ++
 tools/perf/builtin-report.c |  47 ++++++-----
 tools/perf/perf.c           |   2 +
 tools/perf/util/cache.h     |  14 ++++
 tools/perf/util/color.c     |   5 +-
 tools/perf/util/debug.c     |   6 +-
 tools/perf/util/debug.h     |   1 +
 tools/perf/util/hist.h      |   5 ++
 tools/perf/util/newt.c      | 194 ++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/session.h   |   9 ++
 10 files changed, 270 insertions(+), 21 deletions(-)
 create mode 100644 tools/perf/util/newt.c

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 8a8f52db7e3..0abd25ee595 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -513,6 +513,14 @@ else
 	LIB_OBJS += util/probe-finder.o
 endif
 
+ifneq ($(shell sh -c "(echo '\#include <newt.h>'; echo 'int main(void) { newtInit(); newtCls(); return newtFinished(); }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -lnewt -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
+	msg := $(warning newt not found, disables TUI support. Please install newt-devel or libnewt-dev);
+	BASIC_CFLAGS += -DNO_NEWT_SUPPORT
+else
+	EXTLIBS += -lnewt
+	LIB_OBJS += util/newt.o
+endif
+
 ifndef NO_LIBPERL
 PERL_EMBED_LDOPTS = `perl -MExtUtils::Embed -e ldopts 2>/dev/null`
 PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f815de25d0f..1f9f8695f05 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -267,6 +267,7 @@ static int __cmd_report(void)
 	int ret = -EINVAL;
 	struct perf_session *session;
 	struct rb_node *next;
+	const char *help = "For a higher level overview, try: perf report --sort comm,dso";
 
 	session = perf_session__new(input_name, O_RDONLY, force);
 	if (session == NULL)
@@ -301,30 +302,38 @@ static int __cmd_report(void)
 		stats = rb_entry(next, struct event_stat_id, rb_node);
 		perf_session__collapse_resort(&stats->hists);
 		perf_session__output_resort(&stats->hists, stats->stats.total);
-		if (rb_first(&session->stats_by_id) ==
-		    rb_last(&session->stats_by_id))
-			fprintf(stdout, "# Samples: %Ld\n#\n",
-				stats->stats.total);
-		else
-			fprintf(stdout, "# Samples: %Ld %s\n#\n",
-				stats->stats.total,
-				__event_name(stats->type, stats->config));
 
-		perf_session__fprintf_hists(&stats->hists, NULL, false, stdout,
+		if (use_browser)
+			perf_session__browse_hists(&stats->hists,
+						   stats->stats.total, help);
+		else {
+			if (rb_first(&session->stats_by_id) ==
+			    rb_last(&session->stats_by_id))
+				fprintf(stdout, "# Samples: %Ld\n#\n",
+					stats->stats.total);
+			else
+				fprintf(stdout, "# Samples: %Ld %s\n#\n",
+					stats->stats.total,
+					__event_name(stats->type, stats->config));
+
+			perf_session__fprintf_hists(&stats->hists, NULL, false, stdout,
 					    stats->stats.total);
-		fprintf(stdout, "\n\n");
+			fprintf(stdout, "\n\n");
+		}
+
 		next = rb_next(&stats->rb_node);
 	}
 
-	if (sort_order == default_sort_order &&
-	    parent_pattern == default_parent_pattern)
-		fprintf(stdout, "#\n# (For a higher level overview, try: perf report --sort comm,dso)\n#\n");
+	if (!use_browser && sort_order == default_sort_order &&
+	    parent_pattern == default_parent_pattern) {
+		fprintf(stdout, "#\n# (%s)\n#\n", help);
 
-	if (show_threads) {
-		bool raw_printing_style = !strcmp(pretty_printing_style, "raw");
-		perf_read_values_display(stdout, &show_threads_values,
-					 raw_printing_style);
-		perf_read_values_destroy(&show_threads_values);
+		if (show_threads) {
+			bool style = !strcmp(pretty_printing_style, "raw");
+			perf_read_values_display(stdout, &show_threads_values,
+						 style);
+			perf_read_values_destroy(&show_threads_values);
+		}
 	}
 out_delete:
 	perf_session__delete(session);
@@ -447,7 +456,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 {
 	argc = parse_options(argc, argv, options, report_usage, 0);
 
-	setup_pager();
+	setup_browser();
 
 	if (symbol__init() < 0)
 		return -1;
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 57cb107c1f1..9ff186b57cb 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -265,6 +265,8 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
 	if (status)
 		return status & 0xff;
 
+	exit_browser();
+
 	/* Somebody closed stdout? */
 	if (fstat(fileno(stdout), &st))
 		return 0;
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index 918eb376abe..47b12a3d11b 100644
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -1,6 +1,7 @@
 #ifndef __PERF_CACHE_H
 #define __PERF_CACHE_H
 
+#include <stdbool.h>
 #include "util.h"
 #include "strbuf.h"
 #include "../perf.h"
@@ -69,6 +70,19 @@ extern const char *pager_program;
 extern int pager_in_use(void);
 extern int pager_use_color;
 
+extern bool use_browser;
+
+#ifdef NO_NEWT_SUPPORT
+static inline void setup_browser(void)
+{
+	setup_pager();
+}
+static inline void exit_browser(void) {}
+#else
+void setup_browser(void);
+void exit_browser(void);
+#endif
+
 extern const char *editor_program;
 extern const char *excludes_file;
 
diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c
index e88bca55a59..9da01914e0a 100644
--- a/tools/perf/util/color.c
+++ b/tools/perf/util/color.c
@@ -203,7 +203,10 @@ int color_fprintf(FILE *fp, const char *color, const char *fmt, ...)
 	int r;
 
 	va_start(args, fmt);
-	r = color_vfprintf(fp, color, fmt, args);
+	if (use_browser)
+		r = vfprintf(fp, fmt, args);
+	else
+		r = color_vfprintf(fp, color, fmt, args);
 	va_end(args);
 	return r;
 }
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index 0905600c385..033d66db863 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -6,6 +6,7 @@
 #include <stdarg.h>
 #include <stdio.h>
 
+#include "cache.h"
 #include "color.h"
 #include "event.h"
 #include "debug.h"
@@ -21,7 +22,10 @@ int eprintf(int level, const char *fmt, ...)
 
 	if (verbose >= level) {
 		va_start(args, fmt);
-		ret = vfprintf(stderr, fmt, args);
+		if (use_browser)
+			ret = browser__show_help(fmt, args);
+		else
+			ret = vfprintf(stderr, fmt, args);
 		va_end(args);
 	}
 
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index 58720a18159..03accb86799 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -9,5 +9,6 @@ extern int dump_trace;
 
 int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
 void trace_event(event_t *event);
+int browser__show_help(const char *format, va_list ap);
 
 #endif	/* __PERF_DEBUG_H */
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 16f360cce5b..fe366ce5db4 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -18,6 +18,11 @@ struct hist_entry *__perf_session__add_hist_entry(struct rb_root *hists,
 						  u64 count, bool *hit);
 extern int64_t hist_entry__cmp(struct hist_entry *, struct hist_entry *);
 extern int64_t hist_entry__collapse(struct hist_entry *, struct hist_entry *);
+size_t hist_entry__fprintf(struct hist_entry *self,
+			   struct perf_session *pair_session,
+			   bool show_displacement,
+			   long displacement, FILE *fp,
+			   u64 session_total);
 void hist_entry__free(struct hist_entry *);
 
 void perf_session__output_resort(struct rb_root *hists, u64 total_samples);
diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
new file mode 100644
index 00000000000..3d3a936acb6
--- /dev/null
+++ b/tools/perf/util/newt.c
@@ -0,0 +1,194 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#undef _GNU_SOURCE
+
+#include <stdlib.h>
+#include <newt.h>
+
+#include "cache.h"
+#include "hist.h"
+#include "session.h"
+#include "sort.h"
+#include "symbol.h"
+
+static size_t hist_entry__append_browser(struct hist_entry *self,
+					 newtComponent listbox, u64 total)
+{
+	char bf[1024];
+	size_t len;
+	FILE *fp;
+
+	if (symbol_conf.exclude_other && !self->parent)
+		return 0;
+
+	fp = fmemopen(bf, sizeof(bf), "w");
+	if (fp == NULL)
+		return 0;
+
+	len = hist_entry__fprintf(self, NULL, false, 0, fp, total);
+
+	fclose(fp);
+	newtListboxAppendEntry(listbox, bf, self);
+	return len;
+}
+
+static void hist_entry__annotate_browser(struct hist_entry *self)
+{
+	FILE *fp;
+	struct winsize ws;
+	newtComponent form, listbox;
+	struct newtExitStruct es;
+	char *str;
+	size_t line_len, max_line_len = 0;
+	size_t max_usable_width;
+	char *line = NULL;
+
+	if (self->sym == NULL)
+		return;
+
+	if (asprintf(&str, "perf annotate %s | expand", self->sym->name) < 0)
+		return;
+
+	fp = popen(str, "r");
+	if (fp == NULL)
+		goto out_free_str;
+
+	newtPushHelpLine("Press ESC to exit");
+	get_term_dimensions(&ws);
+	listbox = newtListbox(0, 0, ws.ws_row - 5, NEWT_FLAG_SCROLL);
+
+	while (!feof(fp)) {
+		if (getline(&line, &line_len, fp) < 0 || !line_len)
+			break;
+		while (line_len != 0 && isspace(line[line_len - 1]))
+			line[--line_len] = '\0';
+
+		if (line_len > max_line_len)
+			max_line_len = line_len;
+		newtListboxAppendEntry(listbox, line, NULL);
+	}
+	fclose(fp);
+	free(line);
+
+	max_usable_width = ws.ws_col - 22;
+	if (max_line_len > max_usable_width)
+		max_line_len = max_usable_width;
+
+	newtListboxSetWidth(listbox, max_line_len);
+
+	newtCenteredWindow(max_line_len + 2, ws.ws_row - 5, self->sym->name);
+	form = newtForm(NULL, NULL, 0);
+	newtFormAddHotKey(form, NEWT_KEY_ESCAPE);
+	newtFormAddComponents(form, listbox, NULL);
+
+	newtFormRun(form, &es);
+	newtFormDestroy(form);
+	newtPopWindow();
+	newtPopHelpLine();
+out_free_str:
+	free(str);
+}
+
+void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
+				const char *helpline)
+{
+	struct sort_entry *se;
+	struct rb_node *nd;
+	unsigned int width;
+	char *col_width = symbol_conf.col_width_list_str;
+	struct winsize ws;
+	size_t max_len = 0;
+	char str[1024];
+	newtComponent form, listbox;
+	struct newtExitStruct es;
+
+	snprintf(str, sizeof(str), "Samples: %Ld", session_total);
+	newtDrawRootText(0, 0, str);
+	newtPushHelpLine(helpline);
+
+	get_term_dimensions(&ws);
+
+	form = newtForm(NULL, NULL, 0);
+	newtFormAddHotKey(form, NEWT_KEY_ESCAPE);
+
+	listbox = newtListbox(1, 1, ws.ws_row - 2, (NEWT_FLAG_SCROLL |
+						    NEWT_FLAG_BORDER |
+						    NEWT_FLAG_RETURNEXIT));
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		if (se->elide)
+			continue;
+		width = strlen(se->header);
+		if (se->width) {
+			if (symbol_conf.col_width_list_str) {
+				if (col_width) {
+					*se->width = atoi(col_width);
+					col_width = strchr(col_width, ',');
+					if (col_width)
+						++col_width;
+				}
+			}
+			*se->width = max(*se->width, width);
+		}
+	}
+
+	for (nd = rb_first(hists); nd; nd = rb_next(nd)) {
+		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+		size_t len = hist_entry__append_browser(h, listbox, session_total);
+		if (len > max_len)
+			max_len = len;
+	}
+
+	newtListboxSetWidth(listbox, max_len);
+	newtFormAddComponents(form, listbox, NULL);
+
+	while (1) {
+		struct hist_entry *selection;
+
+		newtFormRun(form, &es);
+		if (es.reason == NEWT_EXIT_HOTKEY)
+			break;
+		selection = newtListboxGetCurrent(listbox);
+		hist_entry__annotate_browser(selection);
+	}
+
+	newtFormDestroy(form);
+}
+
+int browser__show_help(const char *format, va_list ap)
+{
+	int ret;
+	static int backlog;
+	static char msg[1024];
+
+        ret = vsnprintf(msg + backlog, sizeof(msg) - backlog, format, ap);
+	backlog += ret;
+
+	if (msg[backlog - 1] == '\n') {
+		newtPopHelpLine();
+		newtPushHelpLine(msg);
+		newtRefresh();
+		backlog = 0;
+	}
+
+	return ret;
+}
+
+bool use_browser;
+
+void setup_browser(void)
+{
+	if (!isatty(1))
+		return;
+
+	use_browser = true;
+	newtInit();
+	newtCls();
+	newtPushHelpLine(" ");
+}
+
+void exit_browser(void)
+{
+	if (use_browser)
+		newtFinished();
+}
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 5c33417eebb..34d73395baa 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -86,4 +86,13 @@ static inline struct map *
 {
 	return map_groups__new_module(&self->kmaps, start, filename);
 }
+
+#ifdef NO_NEWT_SUPPORT
+static inline void perf_session__browse_hists(struct rb_root *hists __used,
+					      u64 session_total __used,
+					      const char *helpline __used) {}
+#else
+void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
+				const char *helpline);
+#endif
 #endif /* __PERF_SESSION_H */
-- 
cgit v1.2.3


From 7081e087b90d4eb4348f7970bd6b266d837321ef Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 12 Mar 2010 10:48:12 -0300
Subject: perf newt: Add 'Q', 'q' and Ctrl+C as ways to exit from forms
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These are keys people expect when pressed to exit the current
widget, so have associate all of them to this semantic.

Suggested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1268401692-9361-1-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/newt.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 3d3a936acb6..2a4308a29ba 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -4,6 +4,7 @@
 
 #include <stdlib.h>
 #include <newt.h>
+#include <sys/ttydefaults.h>
 
 #include "cache.h"
 #include "hist.h"
@@ -11,6 +12,22 @@
 #include "sort.h"
 #include "symbol.h"
 
+static void newt_form__set_exit_keys(newtComponent self)
+{
+	newtFormAddHotKey(self, NEWT_KEY_ESCAPE);
+	newtFormAddHotKey(self, 'Q');
+	newtFormAddHotKey(self, 'q');
+	newtFormAddHotKey(self, CTRL('c'));
+}
+
+static newtComponent newt_form__new(void)
+{
+	newtComponent self = newtForm(NULL, NULL, 0);
+	if (self)
+		newt_form__set_exit_keys(self);
+	return self;
+}
+
 static size_t hist_entry__append_browser(struct hist_entry *self,
 					 newtComponent listbox, u64 total)
 {
@@ -77,8 +94,7 @@ static void hist_entry__annotate_browser(struct hist_entry *self)
 	newtListboxSetWidth(listbox, max_line_len);
 
 	newtCenteredWindow(max_line_len + 2, ws.ws_row - 5, self->sym->name);
-	form = newtForm(NULL, NULL, 0);
-	newtFormAddHotKey(form, NEWT_KEY_ESCAPE);
+	form = newt_form__new();
 	newtFormAddComponents(form, listbox, NULL);
 
 	newtFormRun(form, &es);
@@ -108,8 +124,7 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 
 	get_term_dimensions(&ws);
 
-	form = newtForm(NULL, NULL, 0);
-	newtFormAddHotKey(form, NEWT_KEY_ESCAPE);
+	form = newt_form__new();
 
 	listbox = newtListbox(1, 1, ws.ws_row - 2, (NEWT_FLAG_SCROLL |
 						    NEWT_FLAG_BORDER |
-- 
cgit v1.2.3


From cb7afb7092bc502b890f0a897ffd67c2b078d347 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 12 Mar 2010 12:46:47 -0300
Subject: perf newt: Use newtGetScreenSize
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For consistency, use the newt API more fully.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1268408808-13595-1-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/newt.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 2a4308a29ba..f6ec6f5c0fd 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -52,7 +52,7 @@ static size_t hist_entry__append_browser(struct hist_entry *self,
 static void hist_entry__annotate_browser(struct hist_entry *self)
 {
 	FILE *fp;
-	struct winsize ws;
+	int cols, rows;
 	newtComponent form, listbox;
 	struct newtExitStruct es;
 	char *str;
@@ -71,8 +71,8 @@ static void hist_entry__annotate_browser(struct hist_entry *self)
 		goto out_free_str;
 
 	newtPushHelpLine("Press ESC to exit");
-	get_term_dimensions(&ws);
-	listbox = newtListbox(0, 0, ws.ws_row - 5, NEWT_FLAG_SCROLL);
+	newtGetScreenSize(&cols, &rows);
+	listbox = newtListbox(0, 0, rows - 5, NEWT_FLAG_SCROLL);
 
 	while (!feof(fp)) {
 		if (getline(&line, &line_len, fp) < 0 || !line_len)
@@ -87,13 +87,13 @@ static void hist_entry__annotate_browser(struct hist_entry *self)
 	fclose(fp);
 	free(line);
 
-	max_usable_width = ws.ws_col - 22;
+	max_usable_width = cols - 22;
 	if (max_line_len > max_usable_width)
 		max_line_len = max_usable_width;
 
 	newtListboxSetWidth(listbox, max_line_len);
 
-	newtCenteredWindow(max_line_len + 2, ws.ws_row - 5, self->sym->name);
+	newtCenteredWindow(max_line_len + 2, rows - 5, self->sym->name);
 	form = newt_form__new();
 	newtFormAddComponents(form, listbox, NULL);
 
@@ -112,7 +112,7 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 	struct rb_node *nd;
 	unsigned int width;
 	char *col_width = symbol_conf.col_width_list_str;
-	struct winsize ws;
+	int rows;
 	size_t max_len = 0;
 	char str[1024];
 	newtComponent form, listbox;
@@ -122,13 +122,13 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 	newtDrawRootText(0, 0, str);
 	newtPushHelpLine(helpline);
 
-	get_term_dimensions(&ws);
+	newtGetScreenSize(NULL, &rows);
 
 	form = newt_form__new();
 
-	listbox = newtListbox(1, 1, ws.ws_row - 2, (NEWT_FLAG_SCROLL |
-						    NEWT_FLAG_BORDER |
-						    NEWT_FLAG_RETURNEXIT));
+	listbox = newtListbox(1, 1, rows - 2, (NEWT_FLAG_SCROLL |
+					       NEWT_FLAG_BORDER |
+					       NEWT_FLAG_RETURNEXIT));
 
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		if (se->elide)
-- 
cgit v1.2.3


From 3997d3776a6e89586e76a0ef355bfbbd8a76966c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 12 Mar 2010 12:46:48 -0300
Subject: perf hist: Don't fprintf the callgraph unconditionally
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[root@doppio ~]# perf report -i newt.data | head -10
  # Samples: 11999679868
  #
  # Overhead  Command                  Shared Object  Symbol
  # ........  .......  .............................  ......
  #
      63.61%     perf  libslang.so.2.1.4              [.] SLsmg_write_chars
       6.30%     perf  perf                           [.] symbols__find
       2.19%     perf  libnewt.so.0.52.10             [.] newtListboxAppendEntry
       2.08%     perf  libslang.so.2.1.4              [.] SLsmg_write_chars@plt
       1.99%     perf  libc-2.10.2.so                 [.] _IO_vfprintf_internal
  [root@doppio ~]#

Not good, the newt form for report works, but slang has to eat
the cost of the additional callgraph lines everytime it prints a
line, and the callgraph doesn't appear on the screen, so move
the callgraph printing to a separate function and don't use it
in newt.c.

Newt tree widgets are being investigated to properly support
callgraphs, but till that gets merged, lets remove this huge
overhead and show at least the symbol overheads for a callgraph
rich perf.data with good performance.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1268408808-13595-2-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/hist.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index d43be344a88..1a4e8376d84 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -532,23 +532,23 @@ size_t hist_entry__fprintf(struct hist_entry *self,
 		ret += se->print(fp, self, se->width ? *se->width : 0);
 	}
 
-	ret += fprintf(fp, "\n");
-
-	if (symbol_conf.use_callchain) {
-		int left_margin = 0;
+	return ret + fprintf(fp, "\n");
+}
 
-		if (sort__first_dimension == SORT_COMM) {
-			se = list_first_entry(&hist_entry__sort_list, typeof(*se),
-						list);
-			left_margin = se->width ? *se->width : 0;
-			left_margin -= thread__comm_len(self->thread);
-		}
+static size_t hist_entry__fprintf_callchain(struct hist_entry *self, FILE *fp,
+					    u64 session_total)
+{
+	int left_margin = 0;
 
-		ret += hist_entry_callchain__fprintf(fp, self, session_total,
-						     left_margin);
+	if (sort__first_dimension == SORT_COMM) {
+		struct sort_entry *se = list_first_entry(&hist_entry__sort_list,
+							 typeof(*se), list);
+		left_margin = se->width ? *se->width : 0;
+		left_margin -= thread__comm_len(self->thread);
 	}
 
-	return ret;
+	return hist_entry_callchain__fprintf(fp, self, session_total,
+					     left_margin);
 }
 
 size_t perf_session__fprintf_hists(struct rb_root *hists,
@@ -655,6 +655,10 @@ print_entries:
 		}
 		ret += hist_entry__fprintf(h, pair, show_displacement,
 					   displacement, fp, session_total);
+
+		if (symbol_conf.use_callchain)
+			ret += hist_entry__fprintf_callchain(h, fp, session_total);
+
 		if (h->map == NULL && verbose > 1) {
 			__map_groups__fprintf_maps(&h->thread->mg,
 						   MAP__FUNCTION, fp);
-- 
cgit v1.2.3


From 567e54790e5c07152a93b6de4d0210af8b77da87 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 12 Mar 2010 21:05:10 -0300
Subject: perf tools: Fix non-newt build
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The use_browser needs to be in a file that is always built and
also we need a browser__show_help stub in that case.

Reported-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1268438710-32697-1-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/perf.c       | 2 ++
 tools/perf/util/debug.h | 8 ++++++++
 tools/perf/util/newt.c  | 2 --
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 9ff186b57cb..0d4b9edfab1 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -16,6 +16,8 @@
 #include "util/string.h"
 #include "util/debugfs.h"
 
+bool use_browser;
+
 const char perf_usage_string[] =
 	"perf [--version] [--help] COMMAND [ARGS]";
 
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index 03accb86799..0172edf3f15 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -9,6 +9,14 @@ extern int dump_trace;
 
 int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
 void trace_event(event_t *event);
+
+#ifdef NO_NEWT_SUPPORT
+static inline int browser__show_help(const char *format __used, va_list ap __used)
+{
+	return 0;
+}
+#else
 int browser__show_help(const char *format, va_list ap);
+#endif
 
 #endif	/* __PERF_DEBUG_H */
diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index f6ec6f5c0fd..2836394459d 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -189,8 +189,6 @@ int browser__show_help(const char *format, va_list ap)
 	return ret;
 }
 
-bool use_browser;
-
 void setup_browser(void)
 {
 	if (!isatty(1))
-- 
cgit v1.2.3


From 8576e1971663ffdb6139041de97cdd2e1d4791cc Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sat, 13 Mar 2010 11:11:16 +0300
Subject: x86, perf: Unmask LVTPC only if we have APIC supported

Ingo reported:

 |
 | There's a build failure on -tip with the P4 driver, on UP 32-bit, if
 | PERF_EVENTS is enabled but UP_APIC is disabled:
 |
 | arch/x86/built-in.o: In function `p4_pmu_handle_irq':
 | perf_event.c:(.text+0xa756): undefined reference to `apic'
 | perf_event.c:(.text+0xa76e): undefined reference to `apic'
 |

So we have to unmask LVTPC only if we're configured to have one.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Lin Ming <ming.m.lin@intel.com>
CC: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20100313081116.GA5179@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 381f593e829..ef861da1c8a 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -365,8 +365,10 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 	}
 
 	if (handled) {
+#ifdef CONFIG_X86_LOCAL_APIC
 		/* p4 quirk: unmask it again */
 		apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
+#endif
 		inc_irq_stat(apic_perf_irqs);
 	}
 
-- 
cgit v1.2.3


From e4495262826d1eabca3529fa6ac22394eb348132 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon, 15 Mar 2010 12:58:22 +0800
Subject: perf, x86: Enable not tagged retired instruction counting on P4s

This should turn on instruction counting on P4s, which was missing in
the first version of the new PMU driver.

It's inaccurate for now, we still need dependant event to tag mops
before we can count them precisely. The result is that the number of
instruction may be lifted up.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1268629102.3355.11.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 8 ++++----
 arch/x86/kernel/cpu/perf_event_p4.c  | 8 +++-----
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 829f4711645..b47b9e9ac13 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -324,8 +324,8 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 
 #define P4_SSE_INPUT_ASSIST		P4_EVENT_PACK(0x34, 0x01)
 	/*
-	 * MSR_P4_FIRM_ESCR:	8, 9
-	 * MSR_P4_FIRM_ESCR:	10, 11
+	 * MSR_P4_FIRM_ESCR0:	8, 9
+	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
 #define P4_PACKED_SP_UOP		P4_EVENT_PACK(0x08, 0x01)
@@ -462,8 +462,8 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 
 #define P4_INSTR_RETIRED		P4_EVENT_PACK(0x02, 0x04)
 	/*
-	 * MSR_P4_CRU_ESCR2:	12, 13, 16
-	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 * MSR_P4_CRU_ESCR0:	12, 13, 16
+	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
 
 #define P4_UOPS_RETIRED			P4_EVENT_PACK(0x01, 0x04)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ef861da1c8a..a11ce73a93c 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -60,13 +60,11 @@ struct p4_event_template p4_templates[] = {
 	[2] = {
 		.opcode	= P4_INSTR_RETIRED,
 		.config	= 0,
-		.dep	= 0, /* needs front-end tagging */
+		.dep	= -1, /* needs front-end tagging */
 		.emask	=
 			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG)	|
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSTAG)	|
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG)	|
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSTAG),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
 		.cntr		= { 12, 14 },
 	},
 	[3] = {
-- 
cgit v1.2.3


From bedbfdea31daf3880745001d56450c683959ee7e Mon Sep 17 00:00:00 2001
From: Eric B Munson <ebmunson@us.ibm.com>
Date: Mon, 15 Mar 2010 11:46:57 -0300
Subject: perf record: Enable the enable_on_exec flag if record forks the
 target

When forking its target, perf record can capture data from
before the target application is started.  Perf stat uses the
enable_on_exec flag in the event attributes to keep from
displaying events from before the target program starts, this
patch adds the same functionality to perf record when it is will
fork the target process.

Signed-off-by: Eric B Munson <ebmunson@us.ibm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1268664418-28328-1-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index bed175d59e5..962cdbf44ae 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -225,7 +225,7 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n
 	return h_attr;
 }
 
-static void create_counter(int counter, int cpu, pid_t pid)
+static void create_counter(int counter, int cpu, pid_t pid, bool forks)
 {
 	char *filter = filters[counter];
 	struct perf_event_attr *attr = attrs + counter;
@@ -277,6 +277,9 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	attr->inherit		= inherit;
 	attr->disabled		= 1;
 
+	if (forks)
+		attr->enable_on_exec = 1;
+
 try_again:
 	fd[nr_cpu][counter] = sys_perf_event_open(attr, pid, cpu, group_fd, 0);
 
@@ -381,13 +384,13 @@ try_again:
 	ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_ENABLE);
 }
 
-static void open_counters(int cpu, pid_t pid)
+static void open_counters(int cpu, pid_t pid, bool forks)
 {
 	int counter;
 
 	group_fd = -1;
 	for (counter = 0; counter < nr_counters; counter++)
-		create_counter(counter, cpu, pid);
+		create_counter(counter, cpu, pid, forks);
 
 	nr_cpu++;
 }
@@ -547,11 +550,11 @@ static int __cmd_record(int argc, const char **argv)
 
 
 	if ((!system_wide && !inherit) || profile_cpu != -1) {
-		open_counters(profile_cpu, target_pid);
+		open_counters(profile_cpu, target_pid, forks);
 	} else {
 		nr_cpus = read_cpu_map();
 		for (i = 0; i < nr_cpus; i++)
-			open_counters(cpumap[i], target_pid);
+			open_counters(cpumap[i], target_pid, forks);
 	}
 
 	if (file_new) {
-- 
cgit v1.2.3


From b0a9ab62ab96e258a0ddd81d7fe2719c3db36006 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 15 Mar 2010 11:46:58 -0300
Subject: perf top: Properly notify the user that vmlinux is missing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before this patch this message would very briefly appear on the
screen and then the screen would get updates only on the top,
for number of interrupts received, etc, but no annotation would
be performed:

 [root@doppio linux-2.6-tip]# perf top -s n_tty_write > /tmp/bla
 objdump: '[kernel.kallsyms]': No such file

Now this is what the user gets:

 [root@doppio linux-2.6-tip]# perf top -s n_tty_write
 Can't annotate n_tty_write: No vmlinux file was found in the
 path: [0] vmlinux
 [1] /boot/vmlinux
 [2] /boot/vmlinux-2.6.33-rc5
 [3] /lib/modules/2.6.33-rc5/build/vmlinux
 [4] /usr/lib/debug/lib/modules/2.6.33-rc5/vmlinux
 [root@doppio linux-2.6-tip]#

This bug was introduced when we added automatic search for
vmlinux, before that time the user had to specify a vmlinux
file.

Reported-by: David S. Miller <davem@davemloft.net>
Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: <stable@kernel.org>
LKML-Reference: <1268664418-28328-2-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-top.c | 33 +++++++++++++++++++++++++--------
 tools/perf/util/symbol.c | 25 ++++++++++++-------------
 tools/perf/util/symbol.h | 15 +++++++++++++++
 3 files changed, 52 insertions(+), 21 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 57e232f13bc..c968bd3391e 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -169,7 +169,7 @@ static void sig_winch_handler(int sig __used)
 	update_print_entries(&winsize);
 }
 
-static void parse_source(struct sym_entry *syme)
+static int parse_source(struct sym_entry *syme)
 {
 	struct symbol *sym;
 	struct sym_entry_source *source;
@@ -180,12 +180,21 @@ static void parse_source(struct sym_entry *syme)
 	u64 len;
 
 	if (!syme)
-		return;
+		return -1;
+
+	sym = sym_entry__symbol(syme);
+	map = syme->map;
+
+	/*
+	 * We can't annotate with just /proc/kallsyms
+	 */
+	if (map->dso->origin == DSO__ORIG_KERNEL)
+		return -1;
 
 	if (syme->src == NULL) {
 		syme->src = zalloc(sizeof(*source));
 		if (syme->src == NULL)
-			return;
+			return -1;
 		pthread_mutex_init(&syme->src->lock, NULL);
 	}
 
@@ -195,9 +204,6 @@ static void parse_source(struct sym_entry *syme)
 		pthread_mutex_lock(&source->lock);
 		goto out_assign;
 	}
-
-	sym = sym_entry__symbol(syme);
-	map = syme->map;
 	path = map->dso->long_name;
 
 	len = sym->end - sym->start;
@@ -209,7 +215,7 @@ static void parse_source(struct sym_entry *syme)
 
 	file = popen(command, "r");
 	if (!file)
-		return;
+		return -1;
 
 	pthread_mutex_lock(&source->lock);
 	source->lines_tail = &source->lines;
@@ -245,6 +251,7 @@ static void parse_source(struct sym_entry *syme)
 out_assign:
 	sym_filter_entry = syme;
 	pthread_mutex_unlock(&source->lock);
+	return 0;
 }
 
 static void __zero_source_counters(struct sym_entry *syme)
@@ -991,7 +998,17 @@ static void event__process_sample(const event_t *self,
 	if (sym_filter_entry_sched) {
 		sym_filter_entry = sym_filter_entry_sched;
 		sym_filter_entry_sched = NULL;
-		parse_source(sym_filter_entry);
+		if (parse_source(sym_filter_entry) < 0) {
+			struct symbol *sym = sym_entry__symbol(sym_filter_entry);
+
+			pr_err("Can't annotate %s", sym->name);
+			if (sym_filter_entry->map->dso->origin == DSO__ORIG_KERNEL) {
+				pr_err(": No vmlinux file was found in the path:\n");
+				vmlinux_path__fprintf(stderr);
+			} else
+				pr_err(".\n");
+			exit(1);
+		}
 	}
 
 	syme = symbol__priv(al.sym);
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 75cd46807c7..292f941555a 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -18,18 +18,6 @@
 #define NT_GNU_BUILD_ID 3
 #endif
 
-enum dso_origin {
-	DSO__ORIG_KERNEL = 0,
-	DSO__ORIG_JAVA_JIT,
-	DSO__ORIG_BUILD_ID_CACHE,
-	DSO__ORIG_FEDORA,
-	DSO__ORIG_UBUNTU,
-	DSO__ORIG_BUILDID,
-	DSO__ORIG_DSO,
-	DSO__ORIG_KMODULE,
-	DSO__ORIG_NOT_FOUND,
-};
-
 static void dsos__add(struct list_head *head, struct dso *dso);
 static struct map *map__new2(u64 start, struct dso *dso, enum map_type type);
 static int dso__load_kernel_sym(struct dso *self, struct map *map,
@@ -1017,7 +1005,7 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name,
 				}
 				curr_map->map_ip = identity__map_ip;
 				curr_map->unmap_ip = identity__map_ip;
-				curr_dso->origin = DSO__ORIG_KERNEL;
+				curr_dso->origin = self->origin;
 				map_groups__insert(kmap->kmaps, curr_map);
 				dsos__add(&dsos__kernel, curr_dso);
 				dso__set_loaded(curr_dso, map->type);
@@ -1887,6 +1875,17 @@ out_fail:
 	return -1;
 }
 
+size_t vmlinux_path__fprintf(FILE *fp)
+{
+	int i;
+	size_t printed = 0;
+
+	for (i = 0; i < vmlinux_path__nr_entries; ++i)
+		printed += fprintf(fp, "[%d] %s\n", i, vmlinux_path[i]);
+
+	return printed;
+}
+
 static int setup_list(struct strlist **list, const char *list_str,
 		      const char *list_name)
 {
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 280dadd32a0..d983bbaf99c 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -149,6 +149,19 @@ size_t dsos__fprintf_buildid(FILE *fp, bool with_hits);
 
 size_t dso__fprintf_buildid(struct dso *self, FILE *fp);
 size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp);
+
+enum dso_origin {
+	DSO__ORIG_KERNEL = 0,
+	DSO__ORIG_JAVA_JIT,
+	DSO__ORIG_BUILD_ID_CACHE,
+	DSO__ORIG_FEDORA,
+	DSO__ORIG_UBUNTU,
+	DSO__ORIG_BUILDID,
+	DSO__ORIG_DSO,
+	DSO__ORIG_KMODULE,
+	DSO__ORIG_NOT_FOUND,
+};
+
 char dso__symtab_origin(const struct dso *self);
 void dso__set_long_name(struct dso *self, char *name);
 void dso__set_build_id(struct dso *self, void *build_id);
@@ -168,4 +181,6 @@ int kallsyms__parse(const char *filename, void *arg,
 int symbol__init(void);
 bool symbol_type__is_a(char symbol_type, enum map_type map_type);
 
+size_t vmlinux_path__fprintf(FILE *fp);
+
 #endif /* __PERF_SYMBOL */
-- 
cgit v1.2.3


From d06d92b7c9b99ea52bdaeb13f544675529891b8a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 15 Mar 2010 13:04:33 -0300
Subject: perf annotate: Properly notify the user that vmlinux is missing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before this patch we would not find a vmlinux, then try to pass
objdump "[kernel.kallsyms]" as the filename, it would get
confused and produce no output:

 [root@doppio ~]# perf annotate n_tty_write

 ------------------------------------------------
  Percent |      Source code & Disassembly of [kernel.kallsyms]
 ------------------------------------------------

Now we check that and emit meaningful warning:

 [root@doppio ~]# perf annotate n_tty_write
 Can't annotate n_tty_write: No vmlinux file was found in the
 path: [0] vmlinux
 [1] /boot/vmlinux
 [2] /boot/vmlinux-2.6.34-rc1-tip+
 [3] /lib/modules/2.6.34-rc1-tip+/build/vmlinux
 [4] /usr/lib/debug/lib/modules/2.6.34-rc1-tip+/vmlinux
 [root@doppio ~]#

This bug was introduced when we added automatic search for
vmlinux, before that time the user had to specify a vmlinux
file.

v2: Print the warning just for the first symbol found when no
    symbol name is specified, otherwise it will spam the screen
    repeating the warning for each symbol.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: <stable@kernel.org>
LKML-Reference: <1268669073-6856-1-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-annotate.c | 10 ++++++++++
 tools/perf/util/newt.c        |  2 +-
 tools/perf/util/symbol.h      |  1 +
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 6ad7148451c..45d14660d53 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -452,6 +452,16 @@ static void annotate_sym(struct hist_entry *he)
 	if (!filename)
 		return;
 
+	if (dso->origin == DSO__ORIG_KERNEL) {
+		if (dso->annotate_warned)
+			return;
+		dso->annotate_warned = 1;
+		pr_err("Can't annotate %s: No vmlinux file was found in the "
+		       "path:\n", sym->name);
+		vmlinux_path__fprintf(stderr);
+		return;
+	}
+
 	pr_debug("%s: filename=%s, sym=%s, start=%#Lx, end=%#Lx\n", __func__,
 		 filename, sym->name, map->unmap_ip(map, sym->start),
 		 map->unmap_ip(map, sym->end));
diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 2836394459d..2d19e7a3e6e 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -63,7 +63,7 @@ static void hist_entry__annotate_browser(struct hist_entry *self)
 	if (self->sym == NULL)
 		return;
 
-	if (asprintf(&str, "perf annotate %s | expand", self->sym->name) < 0)
+	if (asprintf(&str, "perf annotate %s 2>&1 | expand", self->sym->name) < 0)
 		return;
 
 	fp = popen(str, "r");
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index d983bbaf99c..5bd91d14270 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -106,6 +106,7 @@ struct dso {
 	u8		 has_build_id:1;
 	u8		 kernel:1;
 	u8		 hit:1;
+	u8		 annotate_warned:1;
 	unsigned char	 origin;
 	u8		 sorted_by_name;
 	u8		 loaded;
-- 
cgit v1.2.3


From 1d199b1ad606ae8b88acebd295b101c4e1cf2a57 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 16 Mar 2010 01:05:02 +0100
Subject: perf: Fix unexported generic perf_arch_fetch_caller_regs

perf_arch_fetch_caller_regs() is exported for the overriden x86
version, but not for the generic weak version.

As a general rule, weak functions should not have their symbol
exported in the same file they are defined.

So let's export it on trace_event_perf.c as it is used by trace
events only.

This fixes:

	ERROR: ".perf_arch_fetch_caller_regs" [fs/xfs/xfs.ko] undefined!
	ERROR: ".perf_arch_fetch_caller_regs" [arch/powerpc/platforms/cell/spufs/spufs.ko] undefined!

-v2: And also only build it if trace events are enabled.
-v3: Fix changelog mistake

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1268697902-9518-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 3 ++-
 kernel/perf_event.c              | 2 ++
 kernel/trace/trace_event_perf.c  | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 978d297170a..0d3466cf7f5 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1695,6 +1695,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return entry;
 }
 
+#ifdef CONFIG_EVENT_TRACING
 void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
 {
 	regs->ip = ip;
@@ -1706,4 +1707,4 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 	regs->cs = __KERNEL_CS;
 	local_save_flags(regs->flags);
 }
-EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
+#endif
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 8bf61273c58..455393e71ca 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2790,10 +2790,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return NULL;
 }
 
+#ifdef CONFIG_EVENT_TRACING
 __weak
 void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
 {
 }
+#endif
 
 /*
  * Output
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0709e4f7511..7d79a10c3cd 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -12,6 +12,8 @@
 DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
 EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
 
+EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
+
 static char *perf_trace_buf;
 static char *perf_trace_buf_nmi;
 
-- 
cgit v1.2.3


From 8ea7f544100844307072cae2f5fc108afdef999a Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Tue, 16 Mar 2010 10:12:36 +0800
Subject: x86, perf: Fix comments in Pentium-4 PMU definitions

Reported-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1268705556.3379.8.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index b47b9e9ac13..b842b3238e4 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -319,6 +319,7 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 
 #define P4_BSQ_ACTIVE_ENTRIES		P4_EVENT_PACK(0x06, 0x07)
 	/*
+	 * NOTE: no ESCR name in docs, it's guessed
 	 * MSR_P4_BSU_ESCR1:	2, 3
 	 */
 
@@ -468,8 +469,8 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 
 #define P4_UOPS_RETIRED			P4_EVENT_PACK(0x01, 0x04)
 	/*
-	 * MSR_P4_CRU_ESCR2:	12, 13, 16
-	 * MSR_P4_CRU_ESCR3:	14, 15, 17
+	 * MSR_P4_CRU_ESCR0:	12, 13, 16
+	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
 
 #define P4_UOP_TYPE			P4_EVENT_PACK(0x02, 0x02)
-- 
cgit v1.2.3


From 6427462bfa50f50dc6c088c07037264fcc73eca1 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Mon, 15 Mar 2010 11:21:48 +0300
Subject: sched: Remove some dead code

This was left over from "7c9414385e sched: Remove USER_SCHED"

Signed-off-by: Dan Carpenter <error27@gmail.com>
Acked-by: Dhaval Giani <dhaval.giani@gmail.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
LKML-Reference: <20100315082148.GD18181@bicker>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/user.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/user.c b/kernel/user.c
index 766467b3bcb..ec3b2229893 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -178,8 +178,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 
 	return up;
 
-	put_user_ns(new->user_ns);
-	kmem_cache_free(uid_cachep, new);
 out_unlock:
 	return NULL;
 }
-- 
cgit v1.2.3


From 5cc718b9dad682329a60e73547c6e708faa5bbe4 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 15 Mar 2010 13:00:54 -0400
Subject: kprobes: Hide CONFIG_OPTPROBES and set if arch supports optimized
 kprobes

Hide CONFIG_OPTPROBES and set if the arch supports optimized
kprobes (IOW, HAVE_OPTPROBES=y), since this option doesn't
change the major behavior of kprobes, and workarounds for minor
changes are documented.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Dieter Ries <mail@dieterries.net>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100315170054.31593.3153.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/kprobes.txt | 10 ++--------
 arch/Kconfig              |  9 ++-------
 2 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 2f9115c0ae6..61c291cddf1 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -165,8 +165,8 @@ the user entry_handler invocation is also skipped.
 
 1.4 How Does Jump Optimization Work?
 
-If you configured your kernel with CONFIG_OPTPROBES=y (currently
-this option is supported on x86/x86-64, non-preemptive kernel) and
+If your kernel is built with CONFIG_OPTPROBES=y (currently this flag
+is automatically set 'y' on x86/x86-64, non-preemptive kernel) and
 the "debug.kprobes_optimization" kernel parameter is set to 1 (see
 sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump
 instruction instead of a breakpoint instruction at each probepoint.
@@ -271,8 +271,6 @@ tweak the kernel's execution path, you need to suppress optimization,
 using one of the following techniques:
 - Specify an empty function for the kprobe's post_handler or break_handler.
  or
-- Config CONFIG_OPTPROBES=n.
- or
 - Execute 'sysctl -w debug.kprobes_optimization=n'
 
 2. Architectures Supported
@@ -307,10 +305,6 @@ it useful to "Compile the kernel with debug info" (CONFIG_DEBUG_INFO),
 so you can use "objdump -d -l vmlinux" to see the source-to-object
 code mapping.
 
-If you want to reduce probing overhead, set "Kprobes jump optimization
-support" (CONFIG_OPTPROBES) to "y". You can find this option under the
-"Kprobes" line.
-
 4. API Reference
 
 The Kprobes API includes a "register" function and an "unregister"
diff --git a/arch/Kconfig b/arch/Kconfig
index e5eb1337a53..f06010fb483 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -42,15 +42,10 @@ config KPROBES
 	  If in doubt, say "N".
 
 config OPTPROBES
-	bool "Kprobes jump optimization support (EXPERIMENTAL)"
-	default y
-	depends on KPROBES
+	def_bool y
+	depends on KPROBES && HAVE_OPTPROBES
 	depends on !PREEMPT
-	depends on HAVE_OPTPROBES
 	select KALLSYMS_ALL
-	help
-	  This option will allow kprobes to optimize breakpoint to
-	  a jump for reducing its overhead.
 
 config HAVE_EFFICIENT_UNALIGNED_ACCESS
 	bool
-- 
cgit v1.2.3


From 984763cb90d4b5444baa0c3e43feff7926bf1834 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 16 Mar 2010 17:07:33 +0100
Subject: perf, x86: Report error code that returned from x86_pmu.hw_config()

If x86_pmu.hw_config() fails a fixed error code (-EOPNOTSUPP) is
returned even if a different error was reported. This patch fixes
this.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Acked-by: Lin Ming <ming.m.lin@intel.com>
Cc: acme@redhat.com
Cc: eranian@google.com
Cc: gorcunov@openvz.org
Cc: peterz@infradead.org
Cc: fweisbec@gmail.com
LKML-Reference: <20100316160733.GR1585@erda.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0d3466cf7f5..5dacf63f913 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -472,8 +472,9 @@ static int __hw_perf_event_init(struct perf_event *event)
 	hwc->last_tag = ~0ULL;
 
 	/* Processor specifics */
-	if (x86_pmu.hw_config(attr, hwc))
-		return -EOPNOTSUPP;
+	err = x86_pmu.hw_config(attr, hwc);
+	if (err)
+		return err;
 
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
-- 
cgit v1.2.3


From a6b84574eed7e4fd8cb8dac2d0926fe2cf34b941 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 16 Mar 2010 01:05:02 +0100
Subject: perf: Fix unexported generic perf_arch_fetch_caller_regs

perf_arch_fetch_caller_regs() is exported for the overriden x86
version, but not for the generic weak version.

As a general rule, weak functions should not have their symbol
exported in the same file they are defined.

So let's export it on trace_event_perf.c as it is used by trace
events only.

This fixes:

	ERROR: ".perf_arch_fetch_caller_regs" [fs/xfs/xfs.ko] undefined!
	ERROR: ".perf_arch_fetch_caller_regs" [arch/powerpc/platforms/cell/spufs/spufs.ko] undefined!

-v2: And also only build it if trace events are enabled.
-v3: Fix changelog mistake

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1268697902-9518-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 3 ++-
 kernel/perf_event.c              | 2 ++
 kernel/trace/trace_event_perf.c  | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 7645faea8e8..60398a0d947 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1702,6 +1702,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return entry;
 }
 
+#ifdef CONFIG_EVENT_TRACING
 void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
 {
 	regs->ip = ip;
@@ -1713,4 +1714,4 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 	regs->cs = __KERNEL_CS;
 	local_save_flags(regs->flags);
 }
-EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
+#endif
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index fb3031cf9f1..574ee58a304 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2786,10 +2786,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return NULL;
 }
 
+#ifdef CONFIG_EVENT_TRACING
 __weak
 void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
 {
 }
+#endif
 
 /*
  * Output
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0709e4f7511..7d79a10c3cd 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -12,6 +12,8 @@
 DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
 EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
 
+EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
+
 static char *perf_trace_buf;
 static char *perf_trace_buf_nmi;
 
-- 
cgit v1.2.3


From a1d37d5285bcda07f9c0b80a2634ca20ab545297 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 16 Mar 2010 18:05:21 -0400
Subject: perf tools: Introduce xzalloc() for detecting out of memory
 conditions

Introducing xzalloc() which wrapping zalloc() for detecting out
of memory conditions.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100316220521.32050.85155.stgit@localhost6.localdomain6>
[ -v2: small cleanups in surrounding code ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/util.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 0f5b2a6f108..52701087ce0 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -295,6 +295,13 @@ extern void *xmemdupz(const void *data, size_t len);
 extern char *xstrndup(const char *str, size_t len);
 extern void *xrealloc(void *ptr, size_t size) __attribute__((weak));
 
+static inline void *xzalloc(size_t size)
+{
+	void *buf = xmalloc(size);
+
+	return memset(buf, 0, size);
+}
+
 static inline void *zalloc(size_t size)
 {
 	return calloc(1, size);
@@ -309,6 +316,7 @@ static inline int has_extension(const char *filename, const char *ext)
 {
 	size_t len = strlen(filename);
 	size_t extlen = strlen(ext);
+
 	return len > extlen && !memcmp(filename + len - extlen, ext, extlen);
 }
 
@@ -322,6 +330,7 @@ static inline int has_extension(const char *filename, const char *ext)
 #undef isalnum
 #undef tolower
 #undef toupper
+
 extern unsigned char sane_ctype[256];
 #define GIT_SPACE		0x01
 #define GIT_DIGIT		0x02
-- 
cgit v1.2.3


From 31facc5f1ac674fbcc29f212377e589396bb934c Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 16 Mar 2010 18:05:30 -0400
Subject: perf probe: Use wrapper functions

Use wrapped functions as much as possible, to check out of
memory conditions in perf probe.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100316220530.32050.53951.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-probe.c     |  4 +---
 tools/perf/util/probe-event.c  | 46 ++++++++++++++++--------------------------
 tools/perf/util/probe-finder.c | 14 ++++++-------
 3 files changed, 24 insertions(+), 40 deletions(-)

diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index 152d6c9b1fa..b6afe7b344d 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -87,9 +87,7 @@ static void parse_probe_event_argv(int argc, const char **argv)
 	len = 0;
 	for (i = 0; i < argc; i++)
 		len += strlen(argv[i]) + 1;
-	buf = zalloc(len + 1);
-	if (!buf)
-		die("Failed to allocate memory for binding arguments.");
+	buf = xzalloc(len + 1);
 	len = 0;
 	for (i = 0; i < argc; i++)
 		len += sprintf(&buf[len], "%s ", argv[i]);
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 7c004b6ef24..88a3b6d75c7 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -33,6 +33,7 @@
 #include <limits.h>
 
 #undef _GNU_SOURCE
+#include "util.h"
 #include "event.h"
 #include "string.h"
 #include "strlist.h"
@@ -90,9 +91,9 @@ void parse_line_range_desc(const char *arg, struct line_range *lr)
 		if (*tmp != '\0')
 			semantic_error("Tailing with invalid character '%d'.",
 				       *tmp);
-		tmp = strndup(arg, (ptr - arg));
+		tmp = xstrndup(arg, (ptr - arg));
 	} else
-		tmp = strdup(arg);
+		tmp = xstrdup(arg);
 
 	if (strchr(tmp, '.'))
 		lr->file = tmp;
@@ -135,7 +136,7 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp)
 		if (!check_event_name(arg))
 			semantic_error("%s is bad for event name -it must "
 				       "follow C symbol-naming rule.", arg);
-		pp->event = strdup(arg);
+		pp->event = xstrdup(arg);
 		arg = tmp;
 	}
 
@@ -147,17 +148,16 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp)
 
 	/* Check arg is function or file and copy it */
 	if (strchr(arg, '.'))	/* File */
-		pp->file = strdup(arg);
+		pp->file = xstrdup(arg);
 	else			/* Function */
-		pp->function = strdup(arg);
-	DIE_IF(pp->file == NULL && pp->function == NULL);
+		pp->function = xstrdup(arg);
 
 	/* Parse other options */
 	while (ptr) {
 		arg = ptr;
 		c = nc;
 		if (c == ';') {	/* Lazy pattern must be the last part */
-			pp->lazy_line = strdup(arg);
+			pp->lazy_line = xstrdup(arg);
 			break;
 		}
 		ptr = strpbrk(arg, ";:+@%");
@@ -181,8 +181,7 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp)
 		case '@':	/* File name */
 			if (pp->file)
 				semantic_error("SRC@SRC is not allowed.");
-			pp->file = strdup(arg);
-			DIE_IF(pp->file == NULL);
+			pp->file = xstrdup(arg);
 			break;
 		case '%':	/* Probe places */
 			if (strcmp(arg, "return") == 0) {
@@ -247,11 +246,9 @@ void parse_perf_probe_event(const char *str, struct probe_point *pp,
 
 	/* Copy arguments and ensure return probe has no C argument */
 	pp->nr_args = argc - 1;
-	pp->args = zalloc(sizeof(char *) * pp->nr_args);
+	pp->args = xzalloc(sizeof(char *) * pp->nr_args);
 	for (i = 0; i < pp->nr_args; i++) {
-		pp->args[i] = strdup(argv[i + 1]);
-		if (!pp->args[i])
-			die("Failed to copy argument.");
+		pp->args[i] = xstrdup(argv[i + 1]);
 		if (is_c_varname(pp->args[i])) {
 			if (pp->retprobe)
 				semantic_error("You can't specify local"
@@ -299,14 +296,12 @@ void parse_trace_kprobe_event(const char *str, struct probe_point *pp)
 	pp->file = NULL;
 
 	pp->nr_args = argc - 2;
-	pp->args = zalloc(sizeof(char *) * pp->nr_args);
+	pp->args = xzalloc(sizeof(char *) * pp->nr_args);
 	for (i = 0; i < pp->nr_args; i++) {
 		p = strchr(argv[i + 2], '=');
 		if (p)	/* We don't need which register is assigned. */
 			*p = '\0';
-		pp->args[i] = strdup(argv[i + 2]);
-		if (!pp->args[i])
-			die("Failed to copy argument.");
+		pp->args[i] = xstrdup(argv[i + 2]);
 	}
 
 	argv_free(argv);
@@ -319,10 +314,8 @@ int synthesize_perf_probe_point(struct probe_point *pp)
 	char offs[64] = "", line[64] = "";
 	int ret;
 
-	pp->probes[0] = buf = zalloc(MAX_CMDLEN);
+	pp->probes[0] = buf = xzalloc(MAX_CMDLEN);
 	pp->found = 1;
-	if (!buf)
-		die("Failed to allocate memory by zalloc.");
 	if (pp->offset) {
 		ret = e_snprintf(offs, 64, "+%d", pp->offset);
 		if (ret <= 0)
@@ -380,9 +373,7 @@ int synthesize_trace_kprobe_event(struct probe_point *pp)
 	char *buf;
 	int i, len, ret;
 
-	pp->probes[0] = buf = zalloc(MAX_CMDLEN);
-	if (!buf)
-		die("Failed to allocate memory by zalloc.");
+	pp->probes[0] = buf = xzalloc(MAX_CMDLEN);
 	ret = e_snprintf(buf, MAX_CMDLEN, "%s+%d", pp->function, pp->offset);
 	if (ret <= 0)
 		goto error;
@@ -612,10 +603,9 @@ void add_trace_kprobe_events(struct probe_point *probes, int nr_probes,
 	for (j = 0; j < nr_probes; j++) {
 		pp = probes + j;
 		if (!pp->event)
-			pp->event = strdup(pp->function);
+			pp->event = xstrdup(pp->function);
 		if (!pp->group)
-			pp->group = strdup(PERFPROBE_GROUP);
-		DIE_IF(!pp->event || !pp->group);
+			pp->group = xstrdup(PERFPROBE_GROUP);
 		/* If force_add is true, suffix search is allowed */
 		allow_suffix = force_add;
 		for (i = 0; i < pp->found; i++) {
@@ -709,9 +699,7 @@ void del_trace_kprobe_events(struct strlist *dellist)
 	namelist = get_perf_event_names(fd, true);
 
 	strlist__for_each(ent, dellist) {
-		str = strdup(ent->s);
-		if (!str)
-			die("Failed to copy event.");
+		str = xstrdup(ent->s);
 		pr_debug("Parsing: %s\n", str);
 		p = strchr(str, ':');
 		if (p) {
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index c171a243d05..e887bb6157c 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -125,8 +125,7 @@ static void line_list__add_line(struct list_head *head, unsigned int line)
 	p = head;
 found:
 	pr_debug("line list: add a line %u\n", line);
-	ln = zalloc(sizeof(struct line_node));
-	DIE_IF(ln == NULL);
+	ln = xzalloc(sizeof(struct line_node));
 	ln->line = line;
 	INIT_LIST_HEAD(&ln->list);
 	list_add(&ln->list, p);
@@ -416,7 +415,7 @@ static void show_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 				(unsigned long)(pf->addr - eaddr));
 		/* Copy the function name if possible */
 		if (!pp->function) {
-			pp->function = strdup(name);
+			pp->function = xstrdup(name);
 			pp->offset = (size_t)(pf->addr - eaddr);
 		}
 	} else {
@@ -425,7 +424,7 @@ static void show_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 			       (uintmax_t)pf->addr);
 		if (!pp->function) {
 			/* TODO: Use _stext */
-			pp->function = strdup("");
+			pp->function = xstrdup("");
 			pp->offset = (size_t)pf->addr;
 		}
 	}
@@ -456,7 +455,7 @@ static void show_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 	if (pp->found == MAX_PROBES)
 		die("Too many( > %d) probe point found.\n", MAX_PROBES);
 
-	pp->probes[pp->found] = strdup(tmp);
+	pp->probes[pp->found] = xstrdup(tmp);
 	pp->found++;
 }
 
@@ -506,8 +505,7 @@ static int find_lazy_match_lines(struct list_head *head,
 	if (fd < 0)
 		die("failed to open %s", fname);
 	DIE_IF(fstat(fd, &st) < 0);
-	fbuf = malloc(st.st_size + 2);
-	DIE_IF(fbuf == NULL);
+	fbuf = xmalloc(st.st_size + 2);
 	DIE_IF(read(fd, fbuf, st.st_size) < 0);
 	close(fd);
 	fbuf[st.st_size] = '\n';	/* Dummy line */
@@ -727,7 +725,7 @@ static void find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
 
 		/* Copy real path */
 		if (!lf->lr->path)
-			lf->lr->path = strdup(src);
+			lf->lr->path = xstrdup(src);
 		line_list__add_line(&lf->lr->line_list, (unsigned int)lineno);
 	}
 	/* Update status */
-- 
cgit v1.2.3


From e0faa8d35845bb1893cf9e608a5a5d92e9390bf0 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 16 Mar 2010 18:05:37 -0400
Subject: perf probe: Move add-probe routine to util/

Move add-probe routine to util/probe_event.c. This simplifies
main routine for reducing maintenance cost.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100316220537.32050.72214.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-probe.c    | 126 +-------------------------------------
 tools/perf/util/probe-event.c | 137 +++++++++++++++++++++++++++++++++++++++++-
 tools/perf/util/probe-event.h |   2 +-
 3 files changed, 139 insertions(+), 126 deletions(-)

diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index b6afe7b344d..2087034782f 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -36,11 +36,9 @@
 #include "builtin.h"
 #include "util/util.h"
 #include "util/strlist.h"
-#include "util/event.h"
+#include "util/symbol.h"
 #include "util/debug.h"
 #include "util/debugfs.h"
-#include "util/symbol.h"
-#include "util/thread.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"	/* For debugfs_path */
 #include "util/probe-finder.h"
@@ -57,8 +55,6 @@ static struct {
 	int nr_probe;
 	struct probe_point probes[MAX_PROBES];
 	struct strlist *dellist;
-	struct map_groups kmap_groups;
-	struct map *kmaps[MAP__NR_TYPES];
 	struct line_range line_range;
 } session;
 
@@ -114,29 +110,7 @@ static int opt_del_probe_event(const struct option *opt __used,
 	return 0;
 }
 
-/* Currently just checking function name from symbol map */
-static void evaluate_probe_point(struct probe_point *pp)
-{
-	struct symbol *sym;
-	sym = map__find_symbol_by_name(session.kmaps[MAP__FUNCTION],
-				       pp->function, NULL);
-	if (!sym)
-		die("Kernel symbol \'%s\' not found - probe not added.",
-		    pp->function);
-}
-
 #ifndef NO_DWARF_SUPPORT
-static int open_vmlinux(void)
-{
-	if (map__load(session.kmaps[MAP__FUNCTION], NULL) < 0) {
-		pr_debug("Failed to load kernel map.\n");
-		return -EINVAL;
-	}
-	pr_debug("Try to open %s\n",
-		 session.kmaps[MAP__FUNCTION]->dso->long_name);
-	return open(session.kmaps[MAP__FUNCTION]->dso->long_name, O_RDONLY);
-}
-
 static int opt_show_lines(const struct option *opt __used,
 			  const char *str, int unset __used)
 {
@@ -204,31 +178,8 @@ static const struct option options[] = {
 	OPT_END()
 };
 
-/* Initialize symbol maps for vmlinux */
-static void init_vmlinux(void)
-{
-	symbol_conf.sort_by_name = true;
-	if (symbol_conf.vmlinux_name == NULL)
-		symbol_conf.try_vmlinux_path = true;
-	else
-		pr_debug("Use vmlinux: %s\n", symbol_conf.vmlinux_name);
-	if (symbol__init() < 0)
-		die("Failed to init symbol map.");
-
-	map_groups__init(&session.kmap_groups);
-	if (map_groups__create_kernel_maps(&session.kmap_groups,
-					   session.kmaps) < 0)
-		die("Failed to create kernel maps.");
-}
-
 int cmd_probe(int argc, const char **argv, const char *prefix __used)
 {
-	int i, ret;
-#ifndef NO_DWARF_SUPPORT
-	int fd;
-#endif
-	struct probe_point *pp;
-
 	argc = parse_options(argc, argv, options, probe_usage,
 			     PARSE_OPT_STOP_AT_NON_OPTION);
 	if (argc > 0) {
@@ -267,14 +218,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 				   " --add/--del.\n");
 			usage_with_options(probe_usage, options);
 		}
-		init_vmlinux();
-		fd = open_vmlinux();
-		if (fd < 0)
-			die("Could not open debuginfo file.");
-		ret = find_line_range(fd, &session.line_range);
-		if (ret <= 0)
-			die("Source line is not found.\n");
-		close(fd);
+
 		show_line_range(&session.line_range);
 		return 0;
 	}
@@ -287,72 +231,8 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 			return 0;
 	}
 
-	/* Add probes */
-	init_vmlinux();
-
-	if (session.need_dwarf)
-#ifdef NO_DWARF_SUPPORT
-		die("Debuginfo-analysis is not supported");
-#else	/* !NO_DWARF_SUPPORT */
-		pr_debug("Some probes require debuginfo.\n");
-
-	fd = open_vmlinux();
-	if (fd < 0) {
-		if (session.need_dwarf)
-			die("Could not open debuginfo file.");
-
-		pr_debug("Could not open vmlinux/module file."
-			 " Try to use symbols.\n");
-		goto end_dwarf;
-	}
-
-	/* Searching probe points */
-	for (i = 0; i < session.nr_probe; i++) {
-		pp = &session.probes[i];
-		if (pp->found)
-			continue;
-
-		lseek(fd, SEEK_SET, 0);
-		ret = find_probe_point(fd, pp);
-		if (ret > 0)
-			continue;
-		if (ret == 0) {	/* No error but failed to find probe point. */
-			synthesize_perf_probe_point(pp);
-			die("Probe point '%s' not found. - probe not added.",
-			    pp->probes[0]);
-		}
-		/* Error path */
-		if (session.need_dwarf) {
-			if (ret == -ENOENT)
-				pr_warning("No dwarf info found in the vmlinux - please rebuild with CONFIG_DEBUG_INFO=y.\n");
-			die("Could not analyze debuginfo.");
-		}
-		pr_debug("An error occurred in debuginfo analysis."
-			 " Try to use symbols.\n");
-		break;
-	}
-	close(fd);
-
-end_dwarf:
-#endif /* !NO_DWARF_SUPPORT */
-
-	/* Synthesize probes without dwarf */
-	for (i = 0; i < session.nr_probe; i++) {
-		pp = &session.probes[i];
-		if (pp->found)	/* This probe is already found. */
-			continue;
-
-		evaluate_probe_point(pp);
-		ret = synthesize_trace_kprobe_event(pp);
-		if (ret == -E2BIG)
-			die("probe point definition becomes too long.");
-		else if (ret < 0)
-			die("Failed to synthesize a probe point.");
-	}
-
-	/* Settng up probe points */
 	add_trace_kprobe_events(session.probes, session.nr_probe,
-				session.force_add);
+				session.force_add, session.need_dwarf);
 	return 0;
 }
 
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 88a3b6d75c7..1e60a659578 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -40,6 +40,8 @@
 #include "debug.h"
 #include "cache.h"
 #include "color.h"
+#include "symbol.h"
+#include "thread.h"
 #include "parse-events.h"  /* For debugfs_path */
 #include "probe-event.h"
 
@@ -65,6 +67,38 @@ static int e_snprintf(char *str, size_t size, const char *format, ...)
 	return ret;
 }
 
+
+static struct map_groups kmap_groups;
+static struct map *kmaps[MAP__NR_TYPES];
+
+/* Initialize symbol maps for vmlinux */
+static void init_vmlinux(void)
+{
+	symbol_conf.sort_by_name = true;
+	if (symbol_conf.vmlinux_name == NULL)
+		symbol_conf.try_vmlinux_path = true;
+	else
+		pr_debug("Use vmlinux: %s\n", symbol_conf.vmlinux_name);
+	if (symbol__init() < 0)
+		die("Failed to init symbol map.");
+
+	map_groups__init(&kmap_groups);
+	if (map_groups__create_kernel_maps(&kmap_groups, kmaps) < 0)
+		die("Failed to create kernel maps.");
+}
+
+#ifndef NO_DWARF_SUPPORT
+static int open_vmlinux(void)
+{
+	if (map__load(kmaps[MAP__FUNCTION], NULL) < 0) {
+		pr_debug("Failed to load kernel map.\n");
+		return -EINVAL;
+	}
+	pr_debug("Try to open %s\n", kmaps[MAP__FUNCTION]->dso->long_name);
+	return open(kmaps[MAP__FUNCTION]->dso->long_name, O_RDONLY);
+}
+#endif
+
 void parse_line_range_desc(const char *arg, struct line_range *lr)
 {
 	const char *ptr;
@@ -586,8 +620,8 @@ static void get_new_event_name(char *buf, size_t len, const char *base,
 		die("Too many events are on the same function.");
 }
 
-void add_trace_kprobe_events(struct probe_point *probes, int nr_probes,
-			     bool force_add)
+static void __add_trace_kprobe_events(struct probe_point *probes,
+				      int nr_probes, bool force_add)
 {
 	int i, j, fd;
 	struct probe_point *pp;
@@ -640,6 +674,92 @@ void add_trace_kprobe_events(struct probe_point *probes, int nr_probes,
 	close(fd);
 }
 
+/* Currently just checking function name from symbol map */
+static void evaluate_probe_point(struct probe_point *pp)
+{
+	struct symbol *sym;
+	sym = map__find_symbol_by_name(kmaps[MAP__FUNCTION],
+				       pp->function, NULL);
+	if (!sym)
+		die("Kernel symbol \'%s\' not found - probe not added.",
+		    pp->function);
+}
+
+void add_trace_kprobe_events(struct probe_point *probes, int nr_probes,
+			     bool force_add, bool need_dwarf)
+{
+	int i, ret;
+	struct probe_point *pp;
+#ifndef NO_DWARF_SUPPORT
+	int fd;
+#endif
+	/* Add probes */
+	init_vmlinux();
+
+	if (need_dwarf)
+#ifdef NO_DWARF_SUPPORT
+		die("Debuginfo-analysis is not supported");
+#else	/* !NO_DWARF_SUPPORT */
+		pr_debug("Some probes require debuginfo.\n");
+
+	fd = open_vmlinux();
+	if (fd < 0) {
+		if (need_dwarf)
+			die("Could not open debuginfo file.");
+
+		pr_debug("Could not open vmlinux/module file."
+			 " Try to use symbols.\n");
+		goto end_dwarf;
+	}
+
+	/* Searching probe points */
+	for (i = 0; i < nr_probes; i++) {
+		pp = &probes[i];
+		if (pp->found)
+			continue;
+
+		lseek(fd, SEEK_SET, 0);
+		ret = find_probe_point(fd, pp);
+		if (ret > 0)
+			continue;
+		if (ret == 0) {	/* No error but failed to find probe point. */
+			synthesize_perf_probe_point(pp);
+			die("Probe point '%s' not found. - probe not added.",
+			    pp->probes[0]);
+		}
+		/* Error path */
+		if (need_dwarf) {
+			if (ret == -ENOENT)
+				pr_warning("No dwarf info found in the vmlinux - please rebuild with CONFIG_DEBUG_INFO=y.\n");
+			die("Could not analyze debuginfo.");
+		}
+		pr_debug("An error occurred in debuginfo analysis."
+			 " Try to use symbols.\n");
+		break;
+	}
+	close(fd);
+
+end_dwarf:
+#endif /* !NO_DWARF_SUPPORT */
+
+	/* Synthesize probes without dwarf */
+	for (i = 0; i < nr_probes; i++) {
+		pp = &probes[i];
+		if (pp->found)	/* This probe is already found. */
+			continue;
+
+		evaluate_probe_point(pp);
+		ret = synthesize_trace_kprobe_event(pp);
+		if (ret == -E2BIG)
+			die("probe point definition becomes too long.");
+		else if (ret < 0)
+			die("Failed to synthesize a probe point.");
+	}
+
+	/* Settng up probe points */
+	__add_trace_kprobe_events(probes, nr_probes, force_add);
+}
+
 static void __del_trace_kprobe_event(int fd, struct str_node *ent)
 {
 	char *p;
@@ -759,6 +879,17 @@ void show_line_range(struct line_range *lr)
 	unsigned int l = 1;
 	struct line_node *ln;
 	FILE *fp;
+	int fd, ret;
+
+	/* Search a line range */
+	init_vmlinux();
+	fd = open_vmlinux();
+	if (fd < 0)
+		die("Could not open debuginfo file.");
+	ret = find_line_range(fd, lr);
+	if (ret <= 0)
+		die("Source line is not found.\n");
+	close(fd);
 
 	setup_pager();
 
@@ -788,3 +919,5 @@ void show_line_range(struct line_range *lr)
 
 	fclose(fp);
 }
+
+
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 711287d4bae..3865e163bb5 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -13,7 +13,7 @@ extern int synthesize_perf_probe_event(struct probe_point *pp);
 extern void parse_trace_kprobe_event(const char *str, struct probe_point *pp);
 extern int synthesize_trace_kprobe_event(struct probe_point *pp);
 extern void add_trace_kprobe_events(struct probe_point *probes, int nr_probes,
-				    bool force_add);
+				    bool force_add, bool need_dwarf);
 extern void del_trace_kprobe_events(struct strlist *dellist);
 extern void show_perf_probe_events(void);
 extern void show_line_range(struct line_range *lr);
-- 
cgit v1.2.3


From 12a1fadb41b5a6733c36b488b881fb19a28c92d3 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 16 Mar 2010 18:05:44 -0400
Subject: perf probe: Rename session to param

Since this name 'session' conflicts with 'perf_session', and
this structure just holds parameters anymore.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100316220544.32050.8788.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-probe.c | 54 +++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index 2087034782f..f577e141036 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -56,20 +56,20 @@ static struct {
 	struct probe_point probes[MAX_PROBES];
 	struct strlist *dellist;
 	struct line_range line_range;
-} session;
+} params;
 
 
 /* Parse an event definition. Note that any error must die. */
 static void parse_probe_event(const char *str)
 {
-	struct probe_point *pp = &session.probes[session.nr_probe];
+	struct probe_point *pp = &params.probes[params.nr_probe];
 
-	pr_debug("probe-definition(%d): %s\n", session.nr_probe, str);
-	if (++session.nr_probe == MAX_PROBES)
+	pr_debug("probe-definition(%d): %s\n", params.nr_probe, str);
+	if (++params.nr_probe == MAX_PROBES)
 		die("Too many probes (> %d) are specified.", MAX_PROBES);
 
 	/* Parse perf-probe event into probe_point */
-	parse_perf_probe_event(str, pp, &session.need_dwarf);
+	parse_perf_probe_event(str, pp, &params.need_dwarf);
 
 	pr_debug("%d arguments\n", pp->nr_args);
 }
@@ -103,9 +103,9 @@ static int opt_del_probe_event(const struct option *opt __used,
 			       const char *str, int unset __used)
 {
 	if (str) {
-		if (!session.dellist)
-			session.dellist = strlist__new(true, NULL);
-		strlist__add(session.dellist, str);
+		if (!params.dellist)
+			params.dellist = strlist__new(true, NULL);
+		strlist__add(params.dellist, str);
 	}
 	return 0;
 }
@@ -115,9 +115,9 @@ static int opt_show_lines(const struct option *opt __used,
 			  const char *str, int unset __used)
 {
 	if (str)
-		parse_line_range_desc(str, &session.line_range);
-	INIT_LIST_HEAD(&session.line_range.line_list);
-	session.show_lines = true;
+		parse_line_range_desc(str, &params.line_range);
+	INIT_LIST_HEAD(&params.line_range.line_list);
+	params.show_lines = true;
 	return 0;
 }
 #endif
@@ -140,7 +140,7 @@ static const struct option options[] = {
 	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
 		   "file", "vmlinux pathname"),
 #endif
-	OPT_BOOLEAN('l', "list", &session.list_events,
+	OPT_BOOLEAN('l', "list", &params.list_events,
 		    "list up current probe events"),
 	OPT_CALLBACK('d', "del", NULL, "[GROUP:]EVENT", "delete a probe event.",
 		opt_del_probe_event),
@@ -168,7 +168,7 @@ static const struct option options[] = {
 #endif
 		"\t\t\tkprobe-tracer argument format.)\n",
 		opt_add_probe_event),
-	OPT_BOOLEAN('f', "force", &session.force_add, "forcibly add events"
+	OPT_BOOLEAN('f', "force", &params.force_add, "forcibly add events"
 		    " with existing name"),
 #ifndef NO_DWARF_SUPPORT
 	OPT_CALLBACK('L', "line", NULL,
@@ -190,20 +190,20 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 		parse_probe_event_argv(argc, argv);
 	}
 
-	if ((!session.nr_probe && !session.dellist && !session.list_events &&
-	     !session.show_lines))
+	if ((!params.nr_probe && !params.dellist && !params.list_events &&
+	     !params.show_lines))
 		usage_with_options(probe_usage, options);
 
 	if (debugfs_valid_mountpoint(debugfs_path) < 0)
 		die("Failed to find debugfs path.");
 
-	if (session.list_events) {
-		if (session.nr_probe != 0 || session.dellist) {
+	if (params.list_events) {
+		if (params.nr_probe != 0 || params.dellist) {
 			pr_warning("  Error: Don't use --list with"
 				   " --add/--del.\n");
 			usage_with_options(probe_usage, options);
 		}
-		if (session.show_lines) {
+		if (params.show_lines) {
 			pr_warning("  Error: Don't use --list with --line.\n");
 			usage_with_options(probe_usage, options);
 		}
@@ -212,27 +212,27 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 	}
 
 #ifndef NO_DWARF_SUPPORT
-	if (session.show_lines) {
-		if (session.nr_probe != 0 || session.dellist) {
+	if (params.show_lines) {
+		if (params.nr_probe != 0 || params.dellist) {
 			pr_warning("  Error: Don't use --line with"
 				   " --add/--del.\n");
 			usage_with_options(probe_usage, options);
 		}
 
-		show_line_range(&session.line_range);
+		show_line_range(&params.line_range);
 		return 0;
 	}
 #endif
 
-	if (session.dellist) {
-		del_trace_kprobe_events(session.dellist);
-		strlist__delete(session.dellist);
-		if (session.nr_probe == 0)
+	if (params.dellist) {
+		del_trace_kprobe_events(params.dellist);
+		strlist__delete(params.dellist);
+		if (params.nr_probe == 0)
 			return 0;
 	}
 
-	add_trace_kprobe_events(session.probes, session.nr_probe,
-				session.force_add, session.need_dwarf);
+	add_trace_kprobe_events(params.probes, params.nr_probe,
+				params.force_add, params.need_dwarf);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 95a3e4c4e21de1920a2ddb54bfc57c0af7e2561e Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 16 Mar 2010 18:05:51 -0400
Subject: perf probe: Rename some die_get_* functions

Rename die_get_real_subprogram and die_get_inlinefunc to
die_find_real_subprogram and die_find_inlinefunc respectively,
because these functions search its children. After that,
'die_get_' means getting a property of that die, and
'die_find_' means searching DIE-tree to get an appropriate
child die.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100316220551.32050.36181.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/probe-finder.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index e887bb6157c..c91a9605c16 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -204,8 +204,8 @@ static int __die_search_func_cb(Dwarf_Die *fn_die, void *data)
 }
 
 /* Search a real subprogram including this line, */
-static Dwarf_Die *die_get_real_subprogram(Dwarf_Die *cu_die, Dwarf_Addr addr,
-					  Dwarf_Die *die_mem)
+static Dwarf_Die *die_find_real_subprogram(Dwarf_Die *cu_die, Dwarf_Addr addr,
+					   Dwarf_Die *die_mem)
 {
 	struct __addr_die_search_param ad;
 	ad.addr = addr;
@@ -218,8 +218,8 @@ static Dwarf_Die *die_get_real_subprogram(Dwarf_Die *cu_die, Dwarf_Addr addr,
 }
 
 /* Similar to dwarf_getfuncs, but returns inlined_subroutine if exists. */
-static Dwarf_Die *die_get_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr,
-				     Dwarf_Die *die_mem)
+static Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr,
+				      Dwarf_Die *die_mem)
 {
 	Dwarf_Die child_die;
 	int ret;
@@ -233,7 +233,7 @@ static Dwarf_Die *die_get_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr,
 		    dwarf_haspc(die_mem, addr))
 			return die_mem;
 
-		if (die_get_inlinefunc(die_mem, addr, &child_die)) {
+		if (die_find_inlinefunc(die_mem, addr, &child_die)) {
 			memcpy(die_mem, &child_die, sizeof(Dwarf_Die));
 			return die_mem;
 		}
@@ -401,7 +401,7 @@ static void show_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 
 	/* If no real subprogram, find a real one */
 	if (!sp_die || dwarf_tag(sp_die) != DW_TAG_subprogram) {
-		sp_die = die_get_real_subprogram(&pf->cu_die,
+		sp_die = die_find_real_subprogram(&pf->cu_die,
 						 pf->addr, &die_mem);
 		if (!sp_die)
 			die("Probe point is not found in subprograms.");
@@ -564,7 +564,7 @@ static void find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
 			if (!dwarf_haspc(sp_die, addr))
 				continue;
 			/* Address filtering 2: No child include addr? */
-			if (die_get_inlinefunc(sp_die, addr, &die_mem))
+			if (die_find_inlinefunc(sp_die, addr, &die_mem))
 				continue;
 		}
 
@@ -714,7 +714,7 @@ static void find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
 				continue;
 
 			/* Address filtering 2: No child include addr? */
-			if (die_get_inlinefunc(sp_die, addr, &die_mem))
+			if (die_find_inlinefunc(sp_die, addr, &die_mem))
 				continue;
 		}
 
-- 
cgit v1.2.3


From 016f262e4fb10c6ecff709317098912f94a21efa Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 16 Mar 2010 18:05:58 -0400
Subject: perf probe: Introduce die_find_child() function

Introduce die_find_child() function to integrate DIE-tree
searching functions.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100316220558.32050.7905.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/probe-finder.c | 136 ++++++++++++++++++++++++-----------------
 1 file changed, 80 insertions(+), 56 deletions(-)

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index c91a9605c16..3942e14e95c 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -186,6 +186,62 @@ static const char *cu_find_realpath(Dwarf_Die *cu_die, const char *fname)
 	return src;
 }
 
+/* Compare diename and tname */
+static bool die_compare_name(Dwarf_Die *dw_die, const char *tname)
+{
+	const char *name;
+	name = dwarf_diename(dw_die);
+	DIE_IF(name == NULL);
+	return strcmp(tname, name);
+}
+
+/* Get entry pc(or low pc, 1st entry of ranges)  of the die */
+static Dwarf_Addr die_get_entrypc(Dwarf_Die *dw_die)
+{
+	Dwarf_Addr epc;
+	int ret;
+
+	ret = dwarf_entrypc(dw_die, &epc);
+	DIE_IF(ret == -1);
+	return epc;
+}
+
+/* Return values for die_find callbacks */
+enum {
+	DIE_FIND_CB_FOUND = 0,		/* End of Search */
+	DIE_FIND_CB_CHILD = 1,		/* Search only children */
+	DIE_FIND_CB_SIBLING = 2,	/* Search only siblings */
+	DIE_FIND_CB_CONTINUE = 3,	/* Search children and siblings */
+};
+
+/* Search a child die */
+static Dwarf_Die *die_find_child(Dwarf_Die *rt_die,
+				 int (*callback)(Dwarf_Die *, void *),
+				 void *data, Dwarf_Die *die_mem)
+{
+	Dwarf_Die child_die;
+	int ret;
+
+	ret = dwarf_child(rt_die, die_mem);
+	if (ret != 0)
+		return NULL;
+
+	do {
+		ret = callback(die_mem, data);
+		if (ret == DIE_FIND_CB_FOUND)
+			return die_mem;
+
+		if ((ret & DIE_FIND_CB_CHILD) &&
+		    die_find_child(die_mem, callback, data, &child_die)) {
+			memcpy(die_mem, &child_die, sizeof(Dwarf_Die));
+			return die_mem;
+		}
+	} while ((ret & DIE_FIND_CB_SIBLING) &&
+		 dwarf_siblingof(die_mem, die_mem) == 0);
+
+	return NULL;
+}
+
 struct __addr_die_search_param {
 	Dwarf_Addr	addr;
 	Dwarf_Die	*die_mem;
@@ -217,77 +273,45 @@ static Dwarf_Die *die_find_real_subprogram(Dwarf_Die *cu_die, Dwarf_Addr addr,
 		return die_mem;
 }
 
-/* Similar to dwarf_getfuncs, but returns inlined_subroutine if exists. */
-static Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr,
-				      Dwarf_Die *die_mem)
+/* die_find callback for inline function search */
+static int __die_find_inline_cb(Dwarf_Die *die_mem, void *data)
 {
-	Dwarf_Die child_die;
-	int ret;
+	Dwarf_Addr *addr = data;
 
-	ret = dwarf_child(sp_die, die_mem);
-	if (ret != 0)
-		return NULL;
-
-	do {
-		if (dwarf_tag(die_mem) == DW_TAG_inlined_subroutine &&
-		    dwarf_haspc(die_mem, addr))
-			return die_mem;
+	if (dwarf_tag(die_mem) == DW_TAG_inlined_subroutine &&
+	    dwarf_haspc(die_mem, *addr))
+		return DIE_FIND_CB_FOUND;
 
-		if (die_find_inlinefunc(die_mem, addr, &child_die)) {
-			memcpy(die_mem, &child_die, sizeof(Dwarf_Die));
-			return die_mem;
-		}
-	} while (dwarf_siblingof(die_mem, die_mem) == 0);
-
-	return NULL;
+	return DIE_FIND_CB_CONTINUE;
 }
 
-/* Compare diename and tname */
-static bool die_compare_name(Dwarf_Die *dw_die, const char *tname)
+/* Similar to dwarf_getfuncs, but returns inlined_subroutine if exists. */
+static Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr,
+				      Dwarf_Die *die_mem)
 {
-	const char *name;
-	name = dwarf_diename(dw_die);
-	DIE_IF(name == NULL);
-	return strcmp(tname, name);
+	return die_find_child(sp_die, __die_find_inline_cb, &addr, die_mem);
 }
 
-/* Get entry pc(or low pc, 1st entry of ranges)  of the die */
-static Dwarf_Addr die_get_entrypc(Dwarf_Die *dw_die)
+static int __die_find_variable_cb(Dwarf_Die *die_mem, void *data)
 {
-	Dwarf_Addr epc;
-	int ret;
+	const char *name = data;
+	int tag;
 
-	ret = dwarf_entrypc(dw_die, &epc);
-	DIE_IF(ret == -1);
-	return epc;
+	tag = dwarf_tag(die_mem);
+	if ((tag == DW_TAG_formal_parameter ||
+	     tag == DW_TAG_variable) &&
+	    (die_compare_name(die_mem, name) == 0))
+		return DIE_FIND_CB_FOUND;
+
+	return DIE_FIND_CB_CONTINUE;
 }
 
-/* Get a variable die */
+/* Find a variable called 'name' */
 static Dwarf_Die *die_find_variable(Dwarf_Die *sp_die, const char *name,
 				    Dwarf_Die *die_mem)
 {
-	Dwarf_Die child_die;
-	int tag;
-	int ret;
-
-	ret = dwarf_child(sp_die, die_mem);
-	if (ret != 0)
-		return NULL;
-
-	do {
-		tag = dwarf_tag(die_mem);
-		if ((tag == DW_TAG_formal_parameter ||
-		     tag == DW_TAG_variable) &&
-		    (die_compare_name(die_mem, name) == 0))
-			return die_mem;
-
-		if (die_find_variable(die_mem, name, &child_die)) {
-			memcpy(die_mem, &child_die, sizeof(Dwarf_Die));
-			return die_mem;
-		}
-	} while (dwarf_siblingof(die_mem, die_mem) == 0);
-
-	return NULL;
+	return die_find_child(sp_die, __die_find_variable_cb, (void *)name,
+			      die_mem);
 }
 
 /*
-- 
cgit v1.2.3


From f4d7da499e4fc1fdff8f26fdeb1a058d475a7a6c Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 16 Mar 2010 18:06:05 -0400
Subject: perf probe: Add --dry-run option

Add --dry-run option for debugging and testing.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100316220605.32050.6571.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Documentation/perf-probe.txt |  5 +++++
 tools/perf/builtin-probe.c              |  1 +
 tools/perf/util/probe-event.c           | 24 ++++++++++++++++--------
 tools/perf/util/probe-event.h           |  2 ++
 4 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 34202b1be0b..0f944b3be9e 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -57,6 +57,11 @@ OPTIONS
 --force::
 	Forcibly add events with existing name.
 
+-n::
+--dry-run::
+	Dry run. With this option, --add and --del doesn't execute actual
+	adding and removal operations.
+
 PROBE SYNTAX
 ------------
 Probe points are defined by following syntax.
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index f577e141036..a1a2891ca66 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -175,6 +175,7 @@ static const struct option options[] = {
 		     "FUNC[:RLN[+NUM|:RLN2]]|SRC:ALN[+NUM|:ALN2]",
 		     "Show source code lines.", opt_show_lines),
 #endif
+	OPT__DRY_RUN(&probe_event_dry_run),
 	OPT_END()
 };
 
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 1e60a659578..ac41578a355 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -49,6 +49,8 @@
 #define MAX_PROBE_ARGS 128
 #define PERFPROBE_GROUP "probe"
 
+bool probe_event_dry_run;	/* Dry run flag */
+
 #define semantic_error(msg ...) die("Semantic error :" msg)
 
 /* If there is no space to write, returns -E2BIG. */
@@ -430,7 +432,7 @@ error:
 	return ret;
 }
 
-static int open_kprobe_events(int flags, int mode)
+static int open_kprobe_events(bool readwrite)
 {
 	char buf[PATH_MAX];
 	int ret;
@@ -439,7 +441,11 @@ static int open_kprobe_events(int flags, int mode)
 	if (ret < 0)
 		die("Failed to make kprobe_events path.");
 
-	ret = open(buf, flags, mode);
+	if (readwrite && !probe_event_dry_run)
+		ret = open(buf, O_RDWR, O_APPEND);
+	else
+		ret = open(buf, O_RDONLY, 0);
+
 	if (ret < 0) {
 		if (errno == ENOENT)
 			die("kprobe_events file does not exist -"
@@ -535,7 +541,7 @@ void show_perf_probe_events(void)
 	setup_pager();
 	memset(&pp, 0, sizeof(pp));
 
-	fd = open_kprobe_events(O_RDONLY, 0);
+	fd = open_kprobe_events(false);
 	rawlist = get_trace_kprobe_event_rawlist(fd);
 	close(fd);
 
@@ -585,9 +591,11 @@ static void write_trace_kprobe_event(int fd, const char *buf)
 	int ret;
 
 	pr_debug("Writing event: %s\n", buf);
-	ret = write(fd, buf, strlen(buf));
-	if (ret <= 0)
-		die("Failed to write event: %s", strerror(errno));
+	if (!probe_event_dry_run) {
+		ret = write(fd, buf, strlen(buf));
+		if (ret <= 0)
+			die("Failed to write event: %s", strerror(errno));
+	}
 }
 
 static void get_new_event_name(char *buf, size_t len, const char *base,
@@ -630,7 +638,7 @@ static void __add_trace_kprobe_events(struct probe_point *probes,
 	struct strlist *namelist;
 	bool allow_suffix;
 
-	fd = open_kprobe_events(O_RDWR, O_APPEND);
+	fd = open_kprobe_events(true);
 	/* Get current event names */
 	namelist = get_perf_event_names(fd, false);
 
@@ -814,7 +822,7 @@ void del_trace_kprobe_events(struct strlist *dellist)
 	struct str_node *ent;
 	struct strlist *namelist;
 
-	fd = open_kprobe_events(O_RDWR, O_APPEND);
+	fd = open_kprobe_events(true);
 	/* Get current event names */
 	namelist = get_perf_event_names(fd, true);
 
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 3865e163bb5..703b8876dfb 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -5,6 +5,8 @@
 #include "probe-finder.h"
 #include "strlist.h"
 
+extern bool probe_event_dry_run;
+
 extern void parse_line_range_desc(const char *arg, struct line_range *lr);
 extern void parse_perf_probe_event(const char *str, struct probe_point *pp,
 				   bool *need_dwarf);
-- 
cgit v1.2.3


From 4235b0454ebeefc2295ad8417e18a8761425b19e Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 16 Mar 2010 18:06:12 -0400
Subject: perf probe: Introduce kprobe_trace_event and perf_probe_event

Introduce kprobe_trace_event and perf_probe_event and replace
old probe_point structure with it. probe_point structure is
not enough flexible nor extensible. New data structures
will help implementing further features.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100316220612.32050.33806.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-probe.c     |  30 +-
 tools/perf/util/probe-event.c  | 626 ++++++++++++++++++++++++++---------------
 tools/perf/util/probe-event.h  | 111 +++++++-
 tools/perf/util/probe-finder.c | 140 +++++----
 tools/perf/util/probe-finder.h |  58 +---
 5 files changed, 589 insertions(+), 376 deletions(-)

diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index a1a2891ca66..e0dafd9dfeb 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -48,12 +48,11 @@
 
 /* Session management structure */
 static struct {
-	bool need_dwarf;
 	bool list_events;
 	bool force_add;
 	bool show_lines;
-	int nr_probe;
-	struct probe_point probes[MAX_PROBES];
+	int nevents;
+	struct perf_probe_event events[MAX_PROBES];
 	struct strlist *dellist;
 	struct line_range line_range;
 } params;
@@ -62,16 +61,16 @@ static struct {
 /* Parse an event definition. Note that any error must die. */
 static void parse_probe_event(const char *str)
 {
-	struct probe_point *pp = &params.probes[params.nr_probe];
+	struct perf_probe_event *pev = &params.events[params.nevents];
 
-	pr_debug("probe-definition(%d): %s\n", params.nr_probe, str);
-	if (++params.nr_probe == MAX_PROBES)
+	pr_debug("probe-definition(%d): %s\n", params.nevents, str);
+	if (++params.nevents == MAX_PROBES)
 		die("Too many probes (> %d) are specified.", MAX_PROBES);
 
-	/* Parse perf-probe event into probe_point */
-	parse_perf_probe_event(str, pp, &params.need_dwarf);
+	/* Parse a perf-probe command into event */
+	parse_perf_probe_command(str, pev);
 
-	pr_debug("%d arguments\n", pp->nr_args);
+	pr_debug("%d arguments\n", pev->nargs);
 }
 
 static void parse_probe_event_argv(int argc, const char **argv)
@@ -191,7 +190,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 		parse_probe_event_argv(argc, argv);
 	}
 
-	if ((!params.nr_probe && !params.dellist && !params.list_events &&
+	if ((!params.nevents && !params.dellist && !params.list_events &&
 	     !params.show_lines))
 		usage_with_options(probe_usage, options);
 
@@ -199,7 +198,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 		die("Failed to find debugfs path.");
 
 	if (params.list_events) {
-		if (params.nr_probe != 0 || params.dellist) {
+		if (params.nevents != 0 || params.dellist) {
 			pr_warning("  Error: Don't use --list with"
 				   " --add/--del.\n");
 			usage_with_options(probe_usage, options);
@@ -214,7 +213,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 
 #ifndef NO_DWARF_SUPPORT
 	if (params.show_lines) {
-		if (params.nr_probe != 0 || params.dellist) {
+		if (params.nevents != 0 || params.dellist) {
 			pr_warning("  Error: Don't use --line with"
 				   " --add/--del.\n");
 			usage_with_options(probe_usage, options);
@@ -226,14 +225,13 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 #endif
 
 	if (params.dellist) {
-		del_trace_kprobe_events(params.dellist);
+		del_perf_probe_events(params.dellist);
 		strlist__delete(params.dellist);
-		if (params.nr_probe == 0)
+		if (params.nevents == 0)
 			return 0;
 	}
 
-	add_trace_kprobe_events(params.probes, params.nr_probe,
-				params.force_add, params.need_dwarf);
+	add_perf_probe_events(params.events, params.nevents, params.force_add);
 	return 0;
 }
 
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index ac41578a355..b44ddfb030d 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -44,6 +44,7 @@
 #include "thread.h"
 #include "parse-events.h"  /* For debugfs_path */
 #include "probe-event.h"
+#include "probe-finder.h"
 
 #define MAX_CMDLEN 256
 #define MAX_PROBE_ARGS 128
@@ -150,8 +151,9 @@ static bool check_event_name(const char *name)
 }
 
 /* Parse probepoint definition. */
-static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp)
+static void parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 {
+	struct perf_probe_point *pp = &pev->point;
 	char *ptr, *tmp;
 	char c, nc = 0;
 	/*
@@ -172,7 +174,8 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp)
 		if (!check_event_name(arg))
 			semantic_error("%s is bad for event name -it must "
 				       "follow C symbol-naming rule.", arg);
-		pp->event = xstrdup(arg);
+		pev->event = xstrdup(arg);
+		pev->group = NULL;
 		arg = tmp;
 	}
 
@@ -255,57 +258,65 @@ static void parse_perf_probe_probepoint(char *arg, struct probe_point *pp)
 		semantic_error("Offset/Line/Lazy pattern can't be used with "
 			       "return probe.");
 
-	pr_debug("symbol:%s file:%s line:%d offset:%d return:%d lazy:%s\n",
+	pr_debug("symbol:%s file:%s line:%d offset:%lu return:%d lazy:%s\n",
 		 pp->function, pp->file, pp->line, pp->offset, pp->retprobe,
 		 pp->lazy_line);
 }
 
-/* Parse perf-probe event definition */
-void parse_perf_probe_event(const char *str, struct probe_point *pp,
-			    bool *need_dwarf)
+/* Parse perf-probe event command */
+void parse_perf_probe_command(const char *cmd, struct perf_probe_event *pev)
 {
 	char **argv;
 	int argc, i;
 
-	*need_dwarf = false;
-
-	argv = argv_split(str, &argc);
+	argv = argv_split(cmd, &argc);
 	if (!argv)
 		die("argv_split failed.");
 	if (argc > MAX_PROBE_ARGS + 1)
 		semantic_error("Too many arguments");
 
 	/* Parse probe point */
-	parse_perf_probe_probepoint(argv[0], pp);
-	if (pp->file || pp->line || pp->lazy_line)
-		*need_dwarf = true;
+	parse_perf_probe_point(argv[0], pev);
 
 	/* Copy arguments and ensure return probe has no C argument */
-	pp->nr_args = argc - 1;
-	pp->args = xzalloc(sizeof(char *) * pp->nr_args);
-	for (i = 0; i < pp->nr_args; i++) {
-		pp->args[i] = xstrdup(argv[i + 1]);
-		if (is_c_varname(pp->args[i])) {
-			if (pp->retprobe)
-				semantic_error("You can't specify local"
-						" variable for kretprobe");
-			*need_dwarf = true;
-		}
+	pev->nargs = argc - 1;
+	pev->args = xzalloc(sizeof(struct perf_probe_arg) * pev->nargs);
+	for (i = 0; i < pev->nargs; i++) {
+		pev->args[i].name = xstrdup(argv[i + 1]);
+		if (is_c_varname(pev->args[i].name) && pev->point.retprobe)
+			semantic_error("You can't specify local variable for"
+				       " kretprobe");
 	}
 
 	argv_free(argv);
 }
 
+/* Return true if this perf_probe_event requires debuginfo */
+bool perf_probe_event_need_dwarf(struct perf_probe_event *pev)
+{
+	int i;
+
+	if (pev->point.file || pev->point.line || pev->point.lazy_line)
+		return true;
+
+	for (i = 0; i < pev->nargs; i++)
+		if (is_c_varname(pev->args[i].name))
+			return true;
+
+	return false;
+}
+
 /* Parse kprobe_events event into struct probe_point */
-void parse_trace_kprobe_event(const char *str, struct probe_point *pp)
+void parse_kprobe_trace_command(const char *cmd, struct kprobe_trace_event *tev)
 {
+	struct kprobe_trace_point *tp = &tev->point;
 	char pr;
 	char *p;
 	int ret, i, argc;
 	char **argv;
 
-	pr_debug("Parsing kprobe_events: %s\n", str);
-	argv = argv_split(str, &argc);
+	pr_debug("Parsing kprobe_events: %s\n", cmd);
+	argv = argv_split(cmd, &argc);
 	if (!argv)
 		die("argv_split failed.");
 	if (argc < 2)
@@ -313,47 +324,46 @@ void parse_trace_kprobe_event(const char *str, struct probe_point *pp)
 
 	/* Scan event and group name. */
 	ret = sscanf(argv[0], "%c:%a[^/ \t]/%a[^ \t]",
-		     &pr, (float *)(void *)&pp->group,
-		     (float *)(void *)&pp->event);
+		     &pr, (float *)(void *)&tev->group,
+		     (float *)(void *)&tev->event);
 	if (ret != 3)
 		semantic_error("Failed to parse event name: %s", argv[0]);
-	pr_debug("Group:%s Event:%s probe:%c\n", pp->group, pp->event, pr);
+	pr_debug("Group:%s Event:%s probe:%c\n", tev->group, tev->event, pr);
 
-	pp->retprobe = (pr == 'r');
+	tp->retprobe = (pr == 'r');
 
 	/* Scan function name and offset */
-	ret = sscanf(argv[1], "%a[^+]+%d", (float *)(void *)&pp->function,
-		     &pp->offset);
+	ret = sscanf(argv[1], "%a[^+]+%lu", (float *)(void *)&tp->symbol,
+		     &tp->offset);
 	if (ret == 1)
-		pp->offset = 0;
-
-	/* kprobe_events doesn't have this information */
-	pp->line = 0;
-	pp->file = NULL;
+		tp->offset = 0;
 
-	pp->nr_args = argc - 2;
-	pp->args = xzalloc(sizeof(char *) * pp->nr_args);
-	for (i = 0; i < pp->nr_args; i++) {
+	tev->nargs = argc - 2;
+	tev->args = xzalloc(sizeof(struct kprobe_trace_arg) * tev->nargs);
+	for (i = 0; i < tev->nargs; i++) {
 		p = strchr(argv[i + 2], '=');
 		if (p)	/* We don't need which register is assigned. */
-			*p = '\0';
-		pp->args[i] = xstrdup(argv[i + 2]);
+			*p++ = '\0';
+		else
+			p = argv[i + 2];
+		tev->args[i].name = xstrdup(argv[i + 2]);
+		/* TODO: parse regs and offset */
+		tev->args[i].value = xstrdup(p);
 	}
 
 	argv_free(argv);
 }
 
-/* Synthesize only probe point (not argument) */
-int synthesize_perf_probe_point(struct probe_point *pp)
+/* Compose only probe point (not argument) */
+static char *synthesize_perf_probe_point(struct perf_probe_point *pp)
 {
 	char *buf;
 	char offs[64] = "", line[64] = "";
 	int ret;
 
-	pp->probes[0] = buf = xzalloc(MAX_CMDLEN);
-	pp->found = 1;
+	buf = xzalloc(MAX_CMDLEN);
 	if (pp->offset) {
-		ret = e_snprintf(offs, 64, "+%d", pp->offset);
+		ret = e_snprintf(offs, 64, "+%lu", pp->offset);
 		if (ret <= 0)
 			goto error;
 	}
@@ -368,68 +378,209 @@ int synthesize_perf_probe_point(struct probe_point *pp)
 				 offs, pp->retprobe ? "%return" : "", line);
 	else
 		ret = e_snprintf(buf, MAX_CMDLEN, "%s%s", pp->file, line);
-	if (ret <= 0) {
+	if (ret <= 0)
+		goto error;
+
+	return buf;
 error:
-		free(pp->probes[0]);
-		pp->probes[0] = NULL;
-		pp->found = 0;
-	}
-	return ret;
+	die("Failed to synthesize perf probe point: %s", strerror(-ret));
 }
 
-int synthesize_perf_probe_event(struct probe_point *pp)
+#if 0
+char *synthesize_perf_probe_command(struct perf_probe_event *pev)
 {
 	char *buf;
 	int i, len, ret;
 
-	len = synthesize_perf_probe_point(pp);
-	if (len < 0)
-		return 0;
+	buf = synthesize_perf_probe_point(&pev->point);
+	if (!buf)
+		return NULL;
 
-	buf = pp->probes[0];
-	for (i = 0; i < pp->nr_args; i++) {
+	len = strlen(buf);
+	for (i = 0; i < pev->nargs; i++) {
 		ret = e_snprintf(&buf[len], MAX_CMDLEN - len, " %s",
-				 pp->args[i]);
-		if (ret <= 0)
-			goto error;
+				 pev->args[i].name);
+		if (ret <= 0) {
+			free(buf);
+			return NULL;
+		}
 		len += ret;
 	}
-	pp->found = 1;
 
-	return pp->found;
-error:
-	free(pp->probes[0]);
-	pp->probes[0] = NULL;
+	return buf;
+}
+#endif
+
+static int __synthesize_kprobe_trace_arg_ref(struct kprobe_trace_arg_ref *ref,
+					     char **buf, size_t *buflen,
+					     int depth)
+{
+	int ret;
+	if (ref->next) {
+		depth = __synthesize_kprobe_trace_arg_ref(ref->next, buf,
+							 buflen, depth + 1);
+		if (depth < 0)
+			goto out;
+	}
+
+	ret = e_snprintf(*buf, *buflen, "%+ld(", ref->offset);
+	if (ret < 0)
+		depth = ret;
+	else {
+		*buf += ret;
+		*buflen -= ret;
+	}
+out:
+	return depth;
 
-	return ret;
 }
 
-int synthesize_trace_kprobe_event(struct probe_point *pp)
+static int synthesize_kprobe_trace_arg(struct kprobe_trace_arg *arg,
+				       char *buf, size_t buflen)
 {
+	int ret, depth = 0;
+	char *tmp = buf;
+
+	/* Argument name or separator */
+	if (arg->name)
+		ret = e_snprintf(buf, buflen, " %s=", arg->name);
+	else
+		ret = e_snprintf(buf, buflen, " ");
+	if (ret < 0)
+		return ret;
+	buf += ret;
+	buflen -= ret;
+
+	/* Dereferencing arguments */
+	if (arg->ref) {
+		depth = __synthesize_kprobe_trace_arg_ref(arg->ref, &buf,
+							  &buflen, 1);
+		if (depth < 0)
+			return depth;
+	}
+
+	/* Print argument value */
+	ret = e_snprintf(buf, buflen, "%s", arg->value);
+	if (ret < 0)
+		return ret;
+	buf += ret;
+	buflen -= ret;
+
+	/* Closing */
+	while (depth--) {
+		ret = e_snprintf(buf, buflen, ")");
+		if (ret < 0)
+			return ret;
+		buf += ret;
+		buflen -= ret;
+	}
+
+	return buf - tmp;
+}
+
+char *synthesize_kprobe_trace_command(struct kprobe_trace_event *tev)
+{
+	struct kprobe_trace_point *tp = &tev->point;
 	char *buf;
 	int i, len, ret;
 
-	pp->probes[0] = buf = xzalloc(MAX_CMDLEN);
-	ret = e_snprintf(buf, MAX_CMDLEN, "%s+%d", pp->function, pp->offset);
-	if (ret <= 0)
+	buf = xzalloc(MAX_CMDLEN);
+	len = e_snprintf(buf, MAX_CMDLEN, "%c:%s/%s %s+%lu",
+			 tp->retprobe ? 'r' : 'p',
+			 tev->group, tev->event,
+			 tp->symbol, tp->offset);
+	if (len <= 0)
 		goto error;
-	len = ret;
 
-	for (i = 0; i < pp->nr_args; i++) {
-		ret = e_snprintf(&buf[len], MAX_CMDLEN - len, " %s",
-				 pp->args[i]);
+	for (i = 0; i < tev->nargs; i++) {
+		ret = synthesize_kprobe_trace_arg(&tev->args[i], buf + len,
+						  MAX_CMDLEN - len);
 		if (ret <= 0)
 			goto error;
 		len += ret;
 	}
-	pp->found = 1;
 
-	return pp->found;
+	return buf;
 error:
-	free(pp->probes[0]);
-	pp->probes[0] = NULL;
+	free(buf);
+	return NULL;
+}
 
-	return ret;
+void convert_to_perf_probe_event(struct kprobe_trace_event *tev,
+				 struct perf_probe_event *pev)
+{
+	char buf[64];
+	int i;
+
+	pev->event = xstrdup(tev->event);
+	pev->group = xstrdup(tev->group);
+
+	/* Convert trace_point to probe_point */
+	pev->point.function = xstrdup(tev->point.symbol);
+	pev->point.offset = tev->point.offset;
+	pev->point.retprobe = tev->point.retprobe;
+
+	/* Convert trace_arg to probe_arg */
+	pev->nargs = tev->nargs;
+	pev->args = xzalloc(sizeof(struct perf_probe_arg) * pev->nargs);
+	for (i = 0; i < tev->nargs; i++)
+		if (tev->args[i].name)
+			pev->args[i].name = xstrdup(tev->args[i].name);
+		else {
+			synthesize_kprobe_trace_arg(&tev->args[i], buf, 64);
+			pev->args[i].name = xstrdup(buf);
+		}
+}
+
+void clear_perf_probe_event(struct perf_probe_event *pev)
+{
+	struct perf_probe_point *pp = &pev->point;
+	int i;
+
+	if (pev->event)
+		free(pev->event);
+	if (pev->group)
+		free(pev->group);
+	if (pp->file)
+		free(pp->file);
+	if (pp->function)
+		free(pp->function);
+	if (pp->lazy_line)
+		free(pp->lazy_line);
+	for (i = 0; i < pev->nargs; i++)
+		if (pev->args[i].name)
+			free(pev->args[i].name);
+	if (pev->args)
+		free(pev->args);
+	memset(pev, 0, sizeof(*pev));
+}
+
+void clear_kprobe_trace_event(struct kprobe_trace_event *tev)
+{
+	struct kprobe_trace_arg_ref *ref, *next;
+	int i;
+
+	if (tev->event)
+		free(tev->event);
+	if (tev->group)
+		free(tev->group);
+	if (tev->point.symbol)
+		free(tev->point.symbol);
+	for (i = 0; i < tev->nargs; i++) {
+		if (tev->args[i].name)
+			free(tev->args[i].name);
+		if (tev->args[i].value)
+			free(tev->args[i].value);
+		ref = tev->args[i].ref;
+		while (ref) {
+			next = ref->next;
+			free(ref);
+			ref = next;
+		}
+	}
+	if (tev->args)
+		free(tev->args);
+	memset(tev, 0, sizeof(*tev));
 }
 
 static int open_kprobe_events(bool readwrite)
@@ -458,7 +609,7 @@ static int open_kprobe_events(bool readwrite)
 }
 
 /* Get raw string list of current kprobe_events */
-static struct strlist *get_trace_kprobe_event_rawlist(int fd)
+static struct strlist *get_kprobe_trace_command_rawlist(int fd)
 {
 	int ret, idx;
 	FILE *fp;
@@ -486,99 +637,82 @@ static struct strlist *get_trace_kprobe_event_rawlist(int fd)
 	return sl;
 }
 
-/* Free and zero clear probe_point */
-static void clear_probe_point(struct probe_point *pp)
-{
-	int i;
-
-	if (pp->event)
-		free(pp->event);
-	if (pp->group)
-		free(pp->group);
-	if (pp->function)
-		free(pp->function);
-	if (pp->file)
-		free(pp->file);
-	if (pp->lazy_line)
-		free(pp->lazy_line);
-	for (i = 0; i < pp->nr_args; i++)
-		free(pp->args[i]);
-	if (pp->args)
-		free(pp->args);
-	for (i = 0; i < pp->found; i++)
-		free(pp->probes[i]);
-	memset(pp, 0, sizeof(*pp));
-}
-
 /* Show an event */
-static void show_perf_probe_event(const char *event, const char *place,
-				  struct probe_point *pp)
+static void show_perf_probe_event(struct perf_probe_event *pev)
 {
 	int i, ret;
 	char buf[128];
+	char *place;
 
-	ret = e_snprintf(buf, 128, "%s:%s", pp->group, event);
+	/* Synthesize only event probe point */
+	place = synthesize_perf_probe_point(&pev->point);
+
+	ret = e_snprintf(buf, 128, "%s:%s", pev->group, pev->event);
 	if (ret < 0)
 		die("Failed to copy event: %s", strerror(-ret));
 	printf("  %-40s (on %s", buf, place);
 
-	if (pp->nr_args > 0) {
+	if (pev->nargs > 0) {
 		printf(" with");
-		for (i = 0; i < pp->nr_args; i++)
-			printf(" %s", pp->args[i]);
+		for (i = 0; i < pev->nargs; i++)
+			printf(" %s", pev->args[i].name);
 	}
 	printf(")\n");
+	free(place);
 }
 
 /* List up current perf-probe events */
 void show_perf_probe_events(void)
 {
 	int fd;
-	struct probe_point pp;
+	struct kprobe_trace_event tev;
+	struct perf_probe_event pev;
 	struct strlist *rawlist;
 	struct str_node *ent;
 
 	setup_pager();
-	memset(&pp, 0, sizeof(pp));
+
+	memset(&tev, 0, sizeof(tev));
+	memset(&pev, 0, sizeof(pev));
 
 	fd = open_kprobe_events(false);
-	rawlist = get_trace_kprobe_event_rawlist(fd);
+	rawlist = get_kprobe_trace_command_rawlist(fd);
 	close(fd);
 
 	strlist__for_each(ent, rawlist) {
-		parse_trace_kprobe_event(ent->s, &pp);
-		/* Synthesize only event probe point */
-		synthesize_perf_probe_point(&pp);
+		parse_kprobe_trace_command(ent->s, &tev);
+		convert_to_perf_probe_event(&tev, &pev);
 		/* Show an event */
-		show_perf_probe_event(pp.event, pp.probes[0], &pp);
-		clear_probe_point(&pp);
+		show_perf_probe_event(&pev);
+		clear_perf_probe_event(&pev);
+		clear_kprobe_trace_event(&tev);
 	}
 
 	strlist__delete(rawlist);
 }
 
 /* Get current perf-probe event names */
-static struct strlist *get_perf_event_names(int fd, bool include_group)
+static struct strlist *get_kprobe_trace_event_names(int fd, bool include_group)
 {
 	char buf[128];
 	struct strlist *sl, *rawlist;
 	struct str_node *ent;
-	struct probe_point pp;
+	struct kprobe_trace_event tev;
 
-	memset(&pp, 0, sizeof(pp));
-	rawlist = get_trace_kprobe_event_rawlist(fd);
+	memset(&tev, 0, sizeof(tev));
 
+	rawlist = get_kprobe_trace_command_rawlist(fd);
 	sl = strlist__new(true, NULL);
 	strlist__for_each(ent, rawlist) {
-		parse_trace_kprobe_event(ent->s, &pp);
+		parse_kprobe_trace_command(ent->s, &tev);
 		if (include_group) {
-			if (e_snprintf(buf, 128, "%s:%s", pp.group,
-				       pp.event) < 0)
+			if (e_snprintf(buf, 128, "%s:%s", tev.group,
+				       tev.event) < 0)
 				die("Failed to copy group:event name.");
 			strlist__add(sl, buf);
 		} else
-			strlist__add(sl, pp.event);
-		clear_probe_point(&pp);
+			strlist__add(sl, tev.event);
+		clear_kprobe_trace_event(&tev);
 	}
 
 	strlist__delete(rawlist);
@@ -586,9 +720,10 @@ static struct strlist *get_perf_event_names(int fd, bool include_group)
 	return sl;
 }
 
-static void write_trace_kprobe_event(int fd, const char *buf)
+static void write_kprobe_trace_event(int fd, struct kprobe_trace_event *tev)
 {
 	int ret;
+	char *buf = synthesize_kprobe_trace_command(tev);
 
 	pr_debug("Writing event: %s\n", buf);
 	if (!probe_event_dry_run) {
@@ -596,6 +731,7 @@ static void write_trace_kprobe_event(int fd, const char *buf)
 		if (ret <= 0)
 			die("Failed to write event: %s", strerror(errno));
 	}
+	free(buf);
 }
 
 static void get_new_event_name(char *buf, size_t len, const char *base,
@@ -628,81 +764,83 @@ static void get_new_event_name(char *buf, size_t len, const char *base,
 		die("Too many events are on the same function.");
 }
 
-static void __add_trace_kprobe_events(struct probe_point *probes,
-				      int nr_probes, bool force_add)
+static void __add_kprobe_trace_events(struct perf_probe_event *pev,
+				      struct kprobe_trace_event *tevs,
+				      int ntevs, bool allow_suffix)
 {
-	int i, j, fd;
-	struct probe_point *pp;
-	char buf[MAX_CMDLEN];
-	char event[64];
+	int i, fd;
+	struct kprobe_trace_event *tev;
+	char buf[64];
+	const char *event, *group;
 	struct strlist *namelist;
-	bool allow_suffix;
 
 	fd = open_kprobe_events(true);
 	/* Get current event names */
-	namelist = get_perf_event_names(fd, false);
-
-	for (j = 0; j < nr_probes; j++) {
-		pp = probes + j;
-		if (!pp->event)
-			pp->event = xstrdup(pp->function);
-		if (!pp->group)
-			pp->group = xstrdup(PERFPROBE_GROUP);
-		/* If force_add is true, suffix search is allowed */
-		allow_suffix = force_add;
-		for (i = 0; i < pp->found; i++) {
-			/* Get an unused new event name */
-			get_new_event_name(event, 64, pp->event, namelist,
-					   allow_suffix);
-			snprintf(buf, MAX_CMDLEN, "%c:%s/%s %s\n",
-				 pp->retprobe ? 'r' : 'p',
-				 pp->group, event,
-				 pp->probes[i]);
-			write_trace_kprobe_event(fd, buf);
-			printf("Added new event:\n");
-			/* Get the first parameter (probe-point) */
-			sscanf(pp->probes[i], "%s", buf);
-			show_perf_probe_event(event, buf, pp);
-			/* Add added event name to namelist */
-			strlist__add(namelist, event);
-			/*
-			 * Probes after the first probe which comes from same
-			 * user input are always allowed to add suffix, because
-			 * there might be several addresses corresponding to
-			 * one code line.
-			 */
-			allow_suffix = true;
-		}
+	namelist = get_kprobe_trace_event_names(fd, false);
+
+	printf("Add new event%s\n", (ntevs > 1) ? "s:" : ":");
+	for (i = 0; i < ntevs; i++) {
+		tev = &tevs[i];
+		if (pev->event)
+			event = pev->event;
+		else
+			if (pev->point.function)
+				event = pev->point.function;
+			else
+				event = tev->point.symbol;
+		if (pev->group)
+			group = pev->group;
+		else
+			group = PERFPROBE_GROUP;
+
+		/* Get an unused new event name */
+		get_new_event_name(buf, 64, event, namelist, allow_suffix);
+		event = buf;
+
+		tev->event = xstrdup(event);
+		tev->group = xstrdup(group);
+		write_kprobe_trace_event(fd, tev);
+		/* Add added event name to namelist */
+		strlist__add(namelist, event);
+
+		/* Trick here - save current event/group */
+		event = pev->event;
+		group = pev->group;
+		pev->event = tev->event;
+		pev->group = tev->group;
+		show_perf_probe_event(pev);
+		/* Trick here - restore current event/group */
+		pev->event = (char *)event;
+		pev->group = (char *)group;
+
+		/*
+		 * Probes after the first probe which comes from same
+		 * user input are always allowed to add suffix, because
+		 * there might be several addresses corresponding to
+		 * one code line.
+		 */
+		allow_suffix = true;
 	}
 	/* Show how to use the event. */
 	printf("\nYou can now use it on all perf tools, such as:\n\n");
-	printf("\tperf record -e %s:%s -a sleep 1\n\n", PERFPROBE_GROUP, event);
+	printf("\tperf record -e %s:%s -a sleep 1\n\n", tev->group, tev->event);
 
 	strlist__delete(namelist);
 	close(fd);
 }
 
-/* Currently just checking function name from symbol map */
-static void evaluate_probe_point(struct probe_point *pp)
+static int convert_to_kprobe_trace_events(struct perf_probe_event *pev,
+					  struct kprobe_trace_event **tevs)
 {
 	struct symbol *sym;
-	sym = map__find_symbol_by_name(kmaps[MAP__FUNCTION],
-				       pp->function, NULL);
-	if (!sym)
-		die("Kernel symbol \'%s\' not found - probe not added.",
-		    pp->function);
-}
-
-void add_trace_kprobe_events(struct probe_point *probes, int nr_probes,
-			     bool force_add, bool need_dwarf)
-{
-	int i, ret;
-	struct probe_point *pp;
+	bool need_dwarf;
 #ifndef NO_DWARF_SUPPORT
 	int fd;
 #endif
-	/* Add probes */
-	init_vmlinux();
+	int ntevs = 0, i;
+	struct kprobe_trace_event *tev;
+
+	need_dwarf = perf_probe_event_need_dwarf(pev);
 
 	if (need_dwarf)
 #ifdef NO_DWARF_SUPPORT
@@ -721,57 +859,90 @@ void add_trace_kprobe_events(struct probe_point *probes, int nr_probes,
 	}
 
 	/* Searching probe points */
-	for (i = 0; i < nr_probes; i++) {
-		pp = &probes[i];
-		if (pp->found)
-			continue;
-
-		lseek(fd, SEEK_SET, 0);
-		ret = find_probe_point(fd, pp);
-		if (ret > 0)
-			continue;
-		if (ret == 0) {	/* No error but failed to find probe point. */
-			synthesize_perf_probe_point(pp);
-			die("Probe point '%s' not found. - probe not added.",
-			    pp->probes[0]);
-		}
-		/* Error path */
-		if (need_dwarf) {
-			if (ret == -ENOENT)
-				pr_warning("No dwarf info found in the vmlinux - please rebuild with CONFIG_DEBUG_INFO=y.\n");
-			die("Could not analyze debuginfo.");
-		}
-		pr_debug("An error occurred in debuginfo analysis."
-			 " Try to use symbols.\n");
-		break;
+	ntevs = find_kprobe_trace_events(fd, pev, tevs);
+
+	if (ntevs > 0)	/* Found */
+		goto found;
+
+	if (ntevs == 0)	/* No error but failed to find probe point. */
+		die("Probe point '%s' not found. - probe not added.",
+		    synthesize_perf_probe_point(&pev->point));
+
+	/* Error path */
+	if (need_dwarf) {
+		if (ntevs == -ENOENT)
+			pr_warning("No dwarf info found in the vmlinux - please rebuild with CONFIG_DEBUG_INFO=y.\n");
+		die("Could not analyze debuginfo.");
 	}
-	close(fd);
+	pr_debug("An error occurred in debuginfo analysis."
+		 " Try to use symbols.\n");
 
 end_dwarf:
 #endif /* !NO_DWARF_SUPPORT */
 
-	/* Synthesize probes without dwarf */
-	for (i = 0; i < nr_probes; i++) {
-		pp = &probes[i];
-		if (pp->found)	/* This probe is already found. */
-			continue;
+	/* Allocate trace event buffer */
+	ntevs = 1;
+	tev = *tevs = xzalloc(sizeof(struct kprobe_trace_event));
+
+	/* Copy parameters */
+	tev->point.symbol = xstrdup(pev->point.function);
+	tev->point.offset = pev->point.offset;
+	tev->nargs = pev->nargs;
+	if (tev->nargs) {
+		tev->args = xzalloc(sizeof(struct kprobe_trace_arg)
+				    * tev->nargs);
+		for (i = 0; i < tev->nargs; i++)
+			tev->args[i].value = xstrdup(pev->args[i].name);
+	}
+
+	/* Currently just checking function name from symbol map */
+	sym = map__find_symbol_by_name(kmaps[MAP__FUNCTION],
+				       tev->point.symbol, NULL);
+	if (!sym)
+		die("Kernel symbol \'%s\' not found - probe not added.",
+		    tev->point.symbol);
+found:
+	close(fd);
+	return ntevs;
+}
+
+struct __event_package {
+	struct perf_probe_event		*pev;
+	struct kprobe_trace_event	*tevs;
+	int				ntevs;
+};
 
-		evaluate_probe_point(pp);
-		ret = synthesize_trace_kprobe_event(pp);
-		if (ret == -E2BIG)
-			die("probe point definition becomes too long.");
-		else if (ret < 0)
-			die("Failed to synthesize a probe point.");
+void add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
+			   bool force_add)
+{
+	int i;
+	struct __event_package *pkgs;
+
+	pkgs = xzalloc(sizeof(struct __event_package) * npevs);
+
+	/* Init vmlinux path */
+	init_vmlinux();
+
+	/* Loop 1: convert all events */
+	for (i = 0; i < npevs; i++) {
+		pkgs[i].pev = &pevs[i];
+		/* Convert with or without debuginfo */
+		pkgs[i].ntevs = convert_to_kprobe_trace_events(pkgs[i].pev,
+							       &pkgs[i].tevs);
 	}
 
-	/* Settng up probe points */
-	__add_trace_kprobe_events(probes, nr_probes, force_add);
+	/* Loop 2: add all events */
+	for (i = 0; i < npevs; i++)
+		__add_kprobe_trace_events(pkgs[i].pev, pkgs[i].tevs,
+					  pkgs[i].ntevs, force_add);
+	/* TODO: cleanup all trace events? */
 }
 
 static void __del_trace_kprobe_event(int fd, struct str_node *ent)
 {
 	char *p;
 	char buf[128];
+	int ret;
 
 	/* Convert from perf-probe event to trace-kprobe event */
 	if (e_snprintf(buf, 128, "-:%s", ent->s) < 0)
@@ -781,7 +952,10 @@ static void __del_trace_kprobe_event(int fd, struct str_node *ent)
 		die("Internal error: %s should have ':' but not.", ent->s);
 	*p = '/';
 
-	write_trace_kprobe_event(fd, buf);
+	pr_debug("Writing event: %s\n", buf);
+	ret = write(fd, buf, strlen(buf));
+	if (ret <= 0)
+		die("Failed to write event: %s", strerror(errno));
 	printf("Remove event: %s\n", ent->s);
 }
 
@@ -814,7 +988,7 @@ static void del_trace_kprobe_event(int fd, const char *group,
 		pr_info("Info: event \"%s\" does not exist, could not remove it.\n", buf);
 }
 
-void del_trace_kprobe_events(struct strlist *dellist)
+void del_perf_probe_events(struct strlist *dellist)
 {
 	int fd;
 	const char *group, *event;
@@ -824,7 +998,7 @@ void del_trace_kprobe_events(struct strlist *dellist)
 
 	fd = open_kprobe_events(true);
 	/* Get current event names */
-	namelist = get_perf_event_names(fd, true);
+	namelist = get_kprobe_trace_event_names(fd, true);
 
 	strlist__for_each(ent, dellist) {
 		str = xstrdup(ent->s);
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 703b8876dfb..2a2f0a26dc6 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -2,24 +2,113 @@
 #define _PROBE_EVENT_H
 
 #include <stdbool.h>
-#include "probe-finder.h"
 #include "strlist.h"
 
 extern bool probe_event_dry_run;
 
-extern void parse_line_range_desc(const char *arg, struct line_range *lr);
-extern void parse_perf_probe_event(const char *str, struct probe_point *pp,
-				   bool *need_dwarf);
-extern int synthesize_perf_probe_point(struct probe_point *pp);
-extern int synthesize_perf_probe_event(struct probe_point *pp);
-extern void parse_trace_kprobe_event(const char *str, struct probe_point *pp);
-extern int synthesize_trace_kprobe_event(struct probe_point *pp);
-extern void add_trace_kprobe_events(struct probe_point *probes, int nr_probes,
-				    bool force_add, bool need_dwarf);
-extern void del_trace_kprobe_events(struct strlist *dellist);
+/* kprobe-tracer tracing point */
+struct kprobe_trace_point {
+	char		*symbol;	/* Base symbol */
+	unsigned long	offset;		/* Offset from symbol */
+	bool		retprobe;	/* Return probe flag */
+};
+
+/* kprobe-tracer tracing argument referencing offset */
+struct kprobe_trace_arg_ref {
+	struct kprobe_trace_arg_ref	*next;	/* Next reference */
+	long				offset;	/* Offset value */
+};
+
+/* kprobe-tracer tracing argument */
+struct kprobe_trace_arg {
+	char				*name;	/* Argument name */
+	char				*value;	/* Base value */
+	struct kprobe_trace_arg_ref	*ref;	/* Referencing offset */
+};
+
+/* kprobe-tracer tracing event (point + arg) */
+struct kprobe_trace_event {
+	char				*event;	/* Event name */
+	char				*group;	/* Group name */
+	struct kprobe_trace_point	point;	/* Trace point */
+	int				nargs;	/* Number of args */
+	struct kprobe_trace_arg		*args;	/* Arguments */
+};
+
+/* Perf probe probing point */
+struct perf_probe_point {
+	char		*file;		/* File path */
+	char		*function;	/* Function name */
+	int		line;		/* Line number */
+	char		*lazy_line;	/* Lazy matching pattern */
+	unsigned long	offset;		/* Offset from function entry */
+	bool		retprobe;	/* Return probe flag */
+};
+
+/* Perf probe probing argument */
+struct perf_probe_arg {
+	char		*name;		/* Argument name */
+};
+
+/* Perf probe probing event (point + arg) */
+struct perf_probe_event {
+	char			*event;	/* Event name */
+	char			*group;	/* Group name */
+	struct perf_probe_point	point;	/* Probe point */
+	int			nargs;	/* Number of arguments */
+	struct perf_probe_arg	*args;	/* Arguments */
+};
+
+
+/* Line number container */
+struct line_node {
+	struct list_head	list;
+	unsigned int		line;
+};
+
+/* Line range */
+struct line_range {
+	char			*file;		/* File name */
+	char			*function;	/* Function name */
+	unsigned int		start;		/* Start line number */
+	unsigned int		end;		/* End line number */
+	int			offset;		/* Start line offset */
+	char			*path;		/* Real path name */
+	struct list_head	line_list;	/* Visible lines */
+};
+
+/* Command string to events */
+extern void parse_perf_probe_command(const char *cmd,
+				     struct perf_probe_event *pev);
+extern void parse_kprobe_trace_command(const char *cmd,
+				       struct kprobe_trace_event *tev);
+
+/* Events to command string */
+extern char *synthesize_perf_probe_command(struct perf_probe_event *pev);
+extern char *synthesize_kprobe_trace_command(struct kprobe_trace_event *tev);
+
+/* Check the perf_probe_event needs debuginfo */
+extern bool perf_probe_event_need_dwarf(struct perf_probe_event *pev);
+
+/* Convert from kprobe_trace_event to perf_probe_event */
+extern void convert_to_perf_probe_event(struct kprobe_trace_event *tev,
+					struct perf_probe_event *pev);
+
+/* Release event contents */
+extern void clear_perf_probe_event(struct perf_probe_event *pev);
+extern void clear_kprobe_trace_event(struct kprobe_trace_event *tev);
+
+/* Command string to line-range */
+extern void parse_line_range_desc(const char *cmd, struct line_range *lr);
+
+
+extern void add_perf_probe_events(struct perf_probe_event *pevs, int ntevs,
+				  bool force_add);
+extern void del_perf_probe_events(struct strlist *dellist);
 extern void show_perf_probe_events(void);
 extern void show_line_range(struct line_range *lr);
 
+
 /* Maximum index number of event-name postfix */
 #define MAX_EVENT_INDEX	1024
 
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 3942e14e95c..251b4c49653 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -319,19 +319,20 @@ static Dwarf_Die *die_find_variable(Dwarf_Die *sp_die, const char *name,
  */
 
 /* Show a location */
-static void show_location(Dwarf_Op *op, struct probe_finder *pf)
+static void convert_location(Dwarf_Op *op, struct probe_finder *pf)
 {
 	unsigned int regn;
 	Dwarf_Word offs = 0;
-	int deref = 0, ret;
+	bool ref = false;
 	const char *regs;
+	struct kprobe_trace_arg *tvar = pf->tvar;
 
 	/* TODO: support CFA */
 	/* If this is based on frame buffer, set the offset */
 	if (op->atom == DW_OP_fbreg) {
 		if (pf->fb_ops == NULL)
 			die("The attribute of frame base is not supported.\n");
-		deref = 1;
+		ref = true;
 		offs = op->number;
 		op = &pf->fb_ops[0];
 	}
@@ -339,13 +340,13 @@ static void show_location(Dwarf_Op *op, struct probe_finder *pf)
 	if (op->atom >= DW_OP_breg0 && op->atom <= DW_OP_breg31) {
 		regn = op->atom - DW_OP_breg0;
 		offs += op->number;
-		deref = 1;
+		ref = true;
 	} else if (op->atom >= DW_OP_reg0 && op->atom <= DW_OP_reg31) {
 		regn = op->atom - DW_OP_reg0;
 	} else if (op->atom == DW_OP_bregx) {
 		regn = op->number;
 		offs += op->number2;
-		deref = 1;
+		ref = true;
 	} else if (op->atom == DW_OP_regx) {
 		regn = op->number;
 	} else
@@ -355,17 +356,15 @@ static void show_location(Dwarf_Op *op, struct probe_finder *pf)
 	if (!regs)
 		die("%u exceeds max register number.", regn);
 
-	if (deref)
-		ret = snprintf(pf->buf, pf->len, " %s=%+jd(%s)",
-			       pf->var, (intmax_t)offs, regs);
-	else
-		ret = snprintf(pf->buf, pf->len, " %s=%s", pf->var, regs);
-	DIE_IF(ret < 0);
-	DIE_IF(ret >= pf->len);
+	tvar->value = xstrdup(regs);
+	if (ref) {
+		tvar->ref = xzalloc(sizeof(struct kprobe_trace_arg_ref));
+		tvar->ref->offset = (long)offs;
+	}
 }
 
 /* Show a variables in kprobe event format */
-static void show_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
+static void convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
 {
 	Dwarf_Attribute attr;
 	Dwarf_Op *expr;
@@ -379,50 +378,51 @@ static void show_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
 	if (ret <= 0 || nexpr == 0)
 		goto error;
 
-	show_location(expr, pf);
+	convert_location(expr, pf);
 	/* *expr will be cached in libdw. Don't free it. */
 	return ;
 error:
 	/* TODO: Support const_value */
 	die("Failed to find the location of %s at this address.\n"
-	    " Perhaps, it has been optimized out.", pf->var);
+	    " Perhaps, it has been optimized out.", pf->pvar->name);
 }
 
 /* Find a variable in a subprogram die */
 static void find_variable(Dwarf_Die *sp_die, struct probe_finder *pf)
 {
-	int ret;
 	Dwarf_Die vr_die;
 
 	/* TODO: Support struct members and arrays */
-	if (!is_c_varname(pf->var)) {
-		/* Output raw parameters */
-		ret = snprintf(pf->buf, pf->len, " %s", pf->var);
-		DIE_IF(ret < 0);
-		DIE_IF(ret >= pf->len);
-		return ;
+	if (!is_c_varname(pf->pvar->name)) {
+		/* Copy raw parameters */
+		pf->tvar->value = xstrdup(pf->pvar->name);
+	} else {
+		pf->tvar->name = xstrdup(pf->pvar->name);
+		pr_debug("Searching '%s' variable in context.\n",
+			 pf->pvar->name);
+		/* Search child die for local variables and parameters. */
+		if (!die_find_variable(sp_die, pf->pvar->name, &vr_die))
+			die("Failed to find '%s' in this function.",
+			    pf->pvar->name);
+		convert_variable(&vr_die, pf);
 	}
-
-	pr_debug("Searching '%s' variable in context.\n", pf->var);
-	/* Search child die for local variables and parameters. */
-	if (!die_find_variable(sp_die, pf->var, &vr_die))
-		die("Failed to find '%s' in this function.", pf->var);
-
-	show_variable(&vr_die, pf);
 }
 
 /* Show a probe point to output buffer */
-static void show_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
+static void convert_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 {
-	struct probe_point *pp = pf->pp;
+	struct kprobe_trace_event *tev;
 	Dwarf_Addr eaddr;
 	Dwarf_Die die_mem;
 	const char *name;
-	char tmp[MAX_PROBE_BUFFER];
-	int ret, i, len;
+	int ret, i;
 	Dwarf_Attribute fb_attr;
 	size_t nops;
 
+	if (pf->ntevs == MAX_PROBES)
+		die("Too many( > %d) probe point found.\n", MAX_PROBES);
+	tev = &pf->tevs[pf->ntevs++];
+
 	/* If no real subprogram, find a real one */
 	if (!sp_die || dwarf_tag(sp_die) != DW_TAG_subprogram) {
 		sp_die = die_find_real_subprogram(&pf->cu_die,
@@ -431,31 +431,18 @@ static void show_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 			die("Probe point is not found in subprograms.");
 	}
 
-	/* Output name of probe point */
+	/* Copy the name of probe point */
 	name = dwarf_diename(sp_die);
 	if (name) {
 		dwarf_entrypc(sp_die, &eaddr);
-		ret = snprintf(tmp, MAX_PROBE_BUFFER, "%s+%lu", name,
-				(unsigned long)(pf->addr - eaddr));
-		/* Copy the function name if possible */
-		if (!pp->function) {
-			pp->function = xstrdup(name);
-			pp->offset = (size_t)(pf->addr - eaddr);
-		}
-	} else {
+		tev->point.symbol = xstrdup(name);
+		tev->point.offset = (unsigned long)(pf->addr - eaddr);
+	} else
 		/* This function has no name. */
-		ret = snprintf(tmp, MAX_PROBE_BUFFER, "0x%jx",
-			       (uintmax_t)pf->addr);
-		if (!pp->function) {
-			/* TODO: Use _stext */
-			pp->function = xstrdup("");
-			pp->offset = (size_t)pf->addr;
-		}
-	}
-	DIE_IF(ret < 0);
-	DIE_IF(ret >= MAX_PROBE_BUFFER);
-	len = ret;
-	pr_debug("Probe point found: %s\n", tmp);
+		tev->point.offset = (unsigned long)pf->addr;
+
+	pr_debug("Probe point found: %s+%lu\n", tev->point.symbol,
+		 tev->point.offset);
 
 	/* Get the frame base attribute/ops */
 	dwarf_attr(sp_die, DW_AT_frame_base, &fb_attr);
@@ -465,22 +452,16 @@ static void show_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 
 	/* Find each argument */
 	/* TODO: use dwarf_cfi_addrframe */
-	for (i = 0; i < pp->nr_args; i++) {
-		pf->var = pp->args[i];
-		pf->buf = &tmp[len];
-		pf->len = MAX_PROBE_BUFFER - len;
+	tev->nargs = pf->pev->nargs;
+	tev->args = xzalloc(sizeof(struct kprobe_trace_arg) * tev->nargs);
+	for (i = 0; i < pf->pev->nargs; i++) {
+		pf->pvar = &pf->pev->args[i];
+		pf->tvar = &tev->args[i];
 		find_variable(sp_die, pf);
-		len += strlen(pf->buf);
 	}
 
 	/* *pf->fb_ops will be cached in libdw. Don't free it. */
 	pf->fb_ops = NULL;
-
-	if (pp->found == MAX_PROBES)
-		die("Too many( > %d) probe point found.\n", MAX_PROBES);
-
-	pp->probes[pp->found] = xstrdup(tmp);
-	pp->found++;
 }
 
 /* Find probe point from its line number */
@@ -512,7 +493,7 @@ static void find_probe_point_by_line(struct probe_finder *pf)
 			 (int)i, lineno, (uintmax_t)addr);
 		pf->addr = addr;
 
-		show_probe_point(NULL, pf);
+		convert_probe_point(NULL, pf);
 		/* Continuing, because target line might be inlined. */
 	}
 }
@@ -563,7 +544,7 @@ static void find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
 	if (list_empty(&pf->lcache)) {
 		/* Matching lazy line pattern */
 		ret = find_lazy_match_lines(&pf->lcache, pf->fname,
-					    pf->pp->lazy_line);
+					    pf->pev->point.lazy_line);
 		if (ret <= 0)
 			die("No matched lines found in %s.", pf->fname);
 	}
@@ -596,7 +577,7 @@ static void find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
 			 (int)i, lineno, (unsigned long long)addr);
 		pf->addr = addr;
 
-		show_probe_point(sp_die, pf);
+		convert_probe_point(sp_die, pf);
 		/* Continuing, because target line might be inlined. */
 	}
 	/* TODO: deallocate lines, but how? */
@@ -605,7 +586,7 @@ static void find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
 static int probe_point_inline_cb(Dwarf_Die *in_die, void *data)
 {
 	struct probe_finder *pf = (struct probe_finder *)data;
-	struct probe_point *pp = pf->pp;
+	struct perf_probe_point *pp = &pf->pev->point;
 
 	if (pp->lazy_line)
 		find_probe_point_lazy(in_die, pf);
@@ -616,7 +597,7 @@ static int probe_point_inline_cb(Dwarf_Die *in_die, void *data)
 		pr_debug("found inline addr: 0x%jx\n",
 			 (uintmax_t)pf->addr);
 
-		show_probe_point(in_die, pf);
+		convert_probe_point(in_die, pf);
 	}
 
 	return DWARF_CB_OK;
@@ -626,7 +607,7 @@ static int probe_point_inline_cb(Dwarf_Die *in_die, void *data)
 static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
 {
 	struct probe_finder *pf = (struct probe_finder *)data;
-	struct probe_point *pp = pf->pp;
+	struct perf_probe_point *pp = &pf->pev->point;
 
 	/* Check tag and diename */
 	if (dwarf_tag(sp_die) != DW_TAG_subprogram ||
@@ -646,7 +627,7 @@ static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
 			pf->addr = die_get_entrypc(sp_die);
 			pf->addr += pp->offset;
 			/* TODO: Check the address in this function */
-			show_probe_point(sp_die, pf);
+			convert_probe_point(sp_die, pf);
 		}
 	} else
 		/* Inlined function: search instances */
@@ -660,20 +641,25 @@ static void find_probe_point_by_func(struct probe_finder *pf)
 	dwarf_getfuncs(&pf->cu_die, probe_point_search_cb, pf, 0);
 }
 
-/* Find a probe point */
-int find_probe_point(int fd, struct probe_point *pp)
+/* Find kprobe_trace_events specified by perf_probe_event from debuginfo */
+int find_kprobe_trace_events(int fd, struct perf_probe_event *pev,
+			     struct kprobe_trace_event **tevs)
 {
-	struct probe_finder pf = {.pp = pp};
+	struct probe_finder pf = {.pev = pev};
+	struct perf_probe_point *pp = &pev->point;
 	Dwarf_Off off, noff;
 	size_t cuhl;
 	Dwarf_Die *diep;
 	Dwarf *dbg;
 
+	pf.tevs = xzalloc(sizeof(struct kprobe_trace_event) * MAX_PROBES);
+	*tevs = pf.tevs;
+	pf.ntevs = 0;
+
 	dbg = dwarf_begin(fd, DWARF_C_READ);
 	if (!dbg)
 		return -ENOENT;
 
-	pp->found = 0;
 	off = 0;
 	line_list__init(&pf.lcache);
 	/* Loop on CUs (Compilation Unit) */
@@ -704,7 +690,7 @@ int find_probe_point(int fd, struct probe_point *pp)
 	line_list__free(&pf.lcache);
 	dwarf_end(dbg);
 
-	return pp->found;
+	return pf.ntevs;
 }
 
 /* Find line range from its line number */
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index 21f7354397b..494952619b9 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -3,6 +3,7 @@
 
 #include <stdbool.h>
 #include "util.h"
+#include "probe-event.h"
 
 #define MAX_PATH_LEN		 256
 #define MAX_PROBE_BUFFER	1024
@@ -14,67 +15,32 @@ static inline int is_c_varname(const char *name)
 	return isalpha(name[0]) || name[0] == '_';
 }
 
-struct probe_point {
-	char			*event;			/* Event name */
-	char			*group;			/* Event group */
-
-	/* Inputs */
-	char			*file;			/* File name */
-	int			line;			/* Line number */
-	char			*lazy_line;		/* Lazy line pattern */
-
-	char			*function;		/* Function name */
-	int			offset;			/* Offset bytes */
-
-	int			nr_args;		/* Number of arguments */
-	char			**args;			/* Arguments */
-
-	int			retprobe;		/* Return probe */
-
-	/* Output */
-	int			found;			/* Number of found probe points */
-	char			*probes[MAX_PROBES];	/* Output buffers (will be allocated)*/
-};
-
-/* Line number container */
-struct line_node {
-	struct list_head	list;
-	unsigned int		line;
-};
-
-/* Line range */
-struct line_range {
-	char			*file;			/* File name */
-	char			*function;		/* Function name */
-	unsigned int		start;			/* Start line number */
-	unsigned int		end;			/* End line number */
-	int			offset;			/* Start line offset */
-	char			*path;			/* Real path name */
-	struct list_head	line_list;		/* Visible lines */
-};
-
 #ifndef NO_DWARF_SUPPORT
-extern int find_probe_point(int fd, struct probe_point *pp);
+/* Find kprobe_trace_events specified by perf_probe_event from debuginfo */
+extern int find_kprobe_trace_events(int fd, struct perf_probe_event *pev,
+				    struct kprobe_trace_event **tevs);
+
 extern int find_line_range(int fd, struct line_range *lr);
 
 #include <dwarf.h>
 #include <libdw.h>
 
 struct probe_finder {
-	struct probe_point	*pp;		/* Target probe point */
+	struct perf_probe_event	*pev;		/* Target probe event */
+	int			ntevs;		/* number of trace events */
+	struct kprobe_trace_event *tevs;	/* Result trace events */
 
 	/* For function searching */
 	Dwarf_Addr		addr;		/* Address */
-	const char		*fname;		/* File name */
+	const char		*fname;		/* Real file name */
 	int			lno;		/* Line number */
 	Dwarf_Die		cu_die;		/* Current CU */
+	struct list_head	lcache;		/* Line cache for lazy match */
 
 	/* For variable searching */
 	Dwarf_Op		*fb_ops;	/* Frame base attribute */
-	const char		*var;		/* Current variable name */
-	char			*buf;		/* Current output buffer */
-	int			len;		/* Length of output buffer */
-	struct list_head	lcache;		/* Line cache for lazy match */
+	struct perf_probe_arg	*pvar;		/* Current target variable */
+	struct kprobe_trace_arg	*tvar;		/* Current result variable */
 };
 
 struct line_finder {
-- 
cgit v1.2.3


From fb1587d869a399554220e166d4b90b581a8ade01 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 16 Mar 2010 18:06:19 -0400
Subject: perf probe: List probes with line number and file name

Improve --list to show current exist probes with line number and
file name. This enables user easily to check which line is
already probed.

for example:

 ./perf probe --list
 probe:vfs_read       (on vfs_read:8@linux-2.6-tip/fs/read_write.c)

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100316220619.32050.48702.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/probe-event.c  | 55 +++++++++++++++++++++++++--------
 tools/perf/util/probe-finder.c | 70 ++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/probe-finder.h |  4 +++
 3 files changed, 116 insertions(+), 13 deletions(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index b44ddfb030d..4e3c1aea789 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -70,7 +70,6 @@ static int e_snprintf(char *str, size_t size, const char *format, ...)
 	return ret;
 }
 
-
 static struct map_groups kmap_groups;
 static struct map *kmaps[MAP__NR_TYPES];
 
@@ -357,27 +356,39 @@ void parse_kprobe_trace_command(const char *cmd, struct kprobe_trace_event *tev)
 /* Compose only probe point (not argument) */
 static char *synthesize_perf_probe_point(struct perf_probe_point *pp)
 {
-	char *buf;
-	char offs[64] = "", line[64] = "";
-	int ret;
+	char *buf, *tmp;
+	char offs[32] = "", line[32] = "", file[32] = "";
+	int ret, len;
 
 	buf = xzalloc(MAX_CMDLEN);
 	if (pp->offset) {
-		ret = e_snprintf(offs, 64, "+%lu", pp->offset);
+		ret = e_snprintf(offs, 32, "+%lu", pp->offset);
 		if (ret <= 0)
 			goto error;
 	}
 	if (pp->line) {
-		ret = e_snprintf(line, 64, ":%d", pp->line);
+		ret = e_snprintf(line, 32, ":%d", pp->line);
+		if (ret <= 0)
+			goto error;
+	}
+	if (pp->file) {
+		len = strlen(pp->file) - 32;
+		if (len < 0)
+			len = 0;
+		tmp = strchr(pp->file + len, '/');
+		if (!tmp)
+			tmp = pp->file + len - 1;
+		ret = e_snprintf(file, 32, "@%s", tmp + 1);
 		if (ret <= 0)
 			goto error;
 	}
 
 	if (pp->function)
-		ret = e_snprintf(buf, MAX_CMDLEN, "%s%s%s%s", pp->function,
-				 offs, pp->retprobe ? "%return" : "", line);
+		ret = e_snprintf(buf, MAX_CMDLEN, "%s%s%s%s%s", pp->function,
+				 offs, pp->retprobe ? "%return" : "", line,
+				 file);
 	else
-		ret = e_snprintf(buf, MAX_CMDLEN, "%s%s", pp->file, line);
+		ret = e_snprintf(buf, MAX_CMDLEN, "%s%s", file, line);
 	if (ret <= 0)
 		goto error;
 
@@ -511,15 +522,32 @@ void convert_to_perf_probe_event(struct kprobe_trace_event *tev,
 {
 	char buf[64];
 	int i;
+#ifndef NO_DWARF_SUPPORT
+	struct symbol *sym;
+	int fd, ret = 0;
 
-	pev->event = xstrdup(tev->event);
-	pev->group = xstrdup(tev->group);
-
+	sym = map__find_symbol_by_name(kmaps[MAP__FUNCTION],
+				       tev->point.symbol, NULL);
+	if (sym) {
+		fd = open_vmlinux();
+		ret = find_perf_probe_point(fd, sym->start + tev->point.offset,
+					    &pev->point);
+		close(fd);
+	}
+	if (ret <= 0) {
+		pev->point.function = xstrdup(tev->point.symbol);
+		pev->point.offset = tev->point.offset;
+	}
+#else
 	/* Convert trace_point to probe_point */
 	pev->point.function = xstrdup(tev->point.symbol);
 	pev->point.offset = tev->point.offset;
+#endif
 	pev->point.retprobe = tev->point.retprobe;
 
+	pev->event = xstrdup(tev->event);
+	pev->group = xstrdup(tev->group);
+
 	/* Convert trace_arg to probe_arg */
 	pev->nargs = tev->nargs;
 	pev->args = xzalloc(sizeof(struct perf_probe_arg) * pev->nargs);
@@ -650,7 +678,7 @@ static void show_perf_probe_event(struct perf_probe_event *pev)
 	ret = e_snprintf(buf, 128, "%s:%s", pev->group, pev->event);
 	if (ret < 0)
 		die("Failed to copy event: %s", strerror(-ret));
-	printf("  %-40s (on %s", buf, place);
+	printf("  %-20s (on %s", buf, place);
 
 	if (pev->nargs > 0) {
 		printf(" with");
@@ -671,6 +699,7 @@ void show_perf_probe_events(void)
 	struct str_node *ent;
 
 	setup_pager();
+	init_vmlinux();
 
 	memset(&tev, 0, sizeof(tev));
 	memset(&pev, 0, sizeof(pev));
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 251b4c49653..e02b6077048 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -693,6 +693,76 @@ int find_kprobe_trace_events(int fd, struct perf_probe_event *pev,
 	return pf.ntevs;
 }
 
+/* Reverse search */
+int find_perf_probe_point(int fd, unsigned long addr,
+			  struct perf_probe_point *ppt)
+{
+	Dwarf_Die cudie, spdie, indie;
+	Dwarf *dbg;
+	Dwarf_Line *line;
+	Dwarf_Addr laddr, eaddr;
+	const char *tmp;
+	int lineno, ret = 0;
+
+	dbg = dwarf_begin(fd, DWARF_C_READ);
+	if (!dbg)
+		return -ENOENT;
+
+	/* Find cu die */
+	if (!dwarf_addrdie(dbg, (Dwarf_Addr)addr, &cudie))
+		return -EINVAL;
+
+	/* Find a corresponding line */
+	line = dwarf_getsrc_die(&cudie, (Dwarf_Addr)addr);
+	if (line) {
+		dwarf_lineaddr(line, &laddr);
+		if ((Dwarf_Addr)addr == laddr) {
+			dwarf_lineno(line, &lineno);
+			ppt->line = lineno;
+
+			tmp = dwarf_linesrc(line, NULL, NULL);
+			DIE_IF(!tmp);
+			ppt->file = xstrdup(tmp);
+			ret = 1;
+		}
+	}
+
+	/* Find a corresponding function */
+	if (die_find_real_subprogram(&cudie, (Dwarf_Addr)addr, &spdie)) {
+		tmp = dwarf_diename(&spdie);
+		if (!tmp)
+			goto end;
+
+		dwarf_entrypc(&spdie, &eaddr);
+		if (!lineno) {
+			/* We don't have a line number, let's use offset */
+			ppt->function = xstrdup(tmp);
+			ppt->offset = addr - (unsigned long)eaddr;
+			ret = 1;
+			goto end;
+		}
+		if (die_find_inlinefunc(&spdie, (Dwarf_Addr)addr, &indie)) {
+			/* addr in an inline function */
+			tmp = dwarf_diename(&indie);
+			if (!tmp)
+				goto end;
+			dwarf_decl_line(&indie, &lineno);
+		} else {
+			if (eaddr == addr)	/* No offset: function entry */
+				lineno = ppt->line;
+			else
+				dwarf_decl_line(&spdie, &lineno);
+		}
+		ppt->function = xstrdup(tmp);
+		ppt->line -= lineno;	/* Make a relative line number */
+	}
+
+end:
+	dwarf_end(dbg);
+	return ret;
+}
+
+
 /* Find line range from its line number */
 static void find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
 {
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index 494952619b9..2f2307d4139 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -20,6 +20,10 @@ static inline int is_c_varname(const char *name)
 extern int find_kprobe_trace_events(int fd, struct perf_probe_event *pev,
 				    struct kprobe_trace_event **tevs);
 
+/* Find a perf_probe_point from debuginfo */
+extern int find_perf_probe_point(int fd, unsigned long addr,
+				 struct perf_probe_point *ppt);
+
 extern int find_line_range(int fd, struct line_range *lr);
 
 #include <dwarf.h>
-- 
cgit v1.2.3


From 7df2f32956cf0f1a45df38cd0e0fe0c3467580e8 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 16 Mar 2010 18:06:26 -0400
Subject: perf probe: Add data structure member access support

Support accessing members in the data structures. With this,
perf-probe accepts data-structure members(IOW, it now accepts
dot '.' and arrow '->' operators) as probe arguemnts.

e.g.

 ./perf probe --add 'schedule:44 rq->curr'

 ./perf probe --add 'vfs_read file->f_op->read file->f_path.dentry'

Note that '>' can be interpreted as redirection in command-line.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100316220626.32050.57552.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/probe-event.c  |  90 +++++++++++++++++++++++++++++++++--
 tools/perf/util/probe-event.h  |  12 ++++-
 tools/perf/util/probe-finder.c | 105 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 201 insertions(+), 6 deletions(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 4e3c1aea789..64dea6c3d58 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -262,6 +262,49 @@ static void parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 		 pp->lazy_line);
 }
 
+/* Parse perf-probe event argument */
+static void parse_perf_probe_arg(const char *str, struct perf_probe_arg *arg)
+{
+	const char *tmp;
+	struct perf_probe_arg_field **fieldp;
+
+	pr_debug("parsing arg: %s into ", str);
+
+	tmp = strpbrk(str, "-.");
+	if (!is_c_varname(str) || !tmp) {
+		/* A variable, register, symbol or special value */
+		arg->name = xstrdup(str);
+		pr_debug("%s\n", arg->name);
+		return;
+	}
+
+	/* Structure fields */
+	arg->name = xstrndup(str, tmp - str);
+	pr_debug("%s, ", arg->name);
+	fieldp = &arg->field;
+
+	do {
+		*fieldp = xzalloc(sizeof(struct perf_probe_arg_field));
+		if (*tmp == '.') {
+			str = tmp + 1;
+			(*fieldp)->ref = false;
+		} else if (tmp[1] == '>') {
+			str = tmp + 2;
+			(*fieldp)->ref = true;
+		} else
+			semantic_error("Argument parse error: %s", str);
+
+		tmp = strpbrk(str, "-.");
+		if (tmp) {
+			(*fieldp)->name = xstrndup(str, tmp - str);
+			pr_debug("%s(%d), ", (*fieldp)->name, (*fieldp)->ref);
+			fieldp = &(*fieldp)->next;
+		}
+	} while (tmp);
+	(*fieldp)->name = xstrdup(str);
+	pr_debug("%s(%d)\n", (*fieldp)->name, (*fieldp)->ref);
+}
+
 /* Parse perf-probe event command */
 void parse_perf_probe_command(const char *cmd, struct perf_probe_event *pev)
 {
@@ -281,7 +324,7 @@ void parse_perf_probe_command(const char *cmd, struct perf_probe_event *pev)
 	pev->nargs = argc - 1;
 	pev->args = xzalloc(sizeof(struct perf_probe_arg) * pev->nargs);
 	for (i = 0; i < pev->nargs; i++) {
-		pev->args[i].name = xstrdup(argv[i + 1]);
+		parse_perf_probe_arg(argv[i + 1], &pev->args[i]);
 		if (is_c_varname(pev->args[i].name) && pev->point.retprobe)
 			semantic_error("You can't specify local variable for"
 				       " kretprobe");
@@ -353,6 +396,33 @@ void parse_kprobe_trace_command(const char *cmd, struct kprobe_trace_event *tev)
 	argv_free(argv);
 }
 
+/* Compose only probe arg */
+int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf, size_t len)
+{
+	struct perf_probe_arg_field *field = pa->field;
+	int ret;
+	char *tmp = buf;
+
+	ret = e_snprintf(tmp, len, "%s", pa->name);
+	if (ret <= 0)
+		goto error;
+	tmp += ret;
+	len -= ret;
+
+	while (field) {
+		ret = e_snprintf(tmp, len, "%s%s", field->ref ? "->" : ".",
+				 field->name);
+		if (ret <= 0)
+			goto error;
+		tmp += ret;
+		len -= ret;
+		field = field->next;
+	}
+	return tmp - buf;
+error:
+	die("Failed to synthesize perf probe argument: %s", strerror(-ret));
+}
+
 /* Compose only probe point (not argument) */
 static char *synthesize_perf_probe_point(struct perf_probe_point *pp)
 {
@@ -563,6 +633,7 @@ void convert_to_perf_probe_event(struct kprobe_trace_event *tev,
 void clear_perf_probe_event(struct perf_probe_event *pev)
 {
 	struct perf_probe_point *pp = &pev->point;
+	struct perf_probe_arg_field *field, *next;
 	int i;
 
 	if (pev->event)
@@ -575,9 +646,18 @@ void clear_perf_probe_event(struct perf_probe_event *pev)
 		free(pp->function);
 	if (pp->lazy_line)
 		free(pp->lazy_line);
-	for (i = 0; i < pev->nargs; i++)
+	for (i = 0; i < pev->nargs; i++) {
 		if (pev->args[i].name)
 			free(pev->args[i].name);
+		field = pev->args[i].field;
+		while (field) {
+			next = field->next;
+			if (field->name)
+				free(field->name);
+			free(field);
+			field = next;
+		}
+	}
 	if (pev->args)
 		free(pev->args);
 	memset(pev, 0, sizeof(*pev));
@@ -682,8 +762,10 @@ static void show_perf_probe_event(struct perf_probe_event *pev)
 
 	if (pev->nargs > 0) {
 		printf(" with");
-		for (i = 0; i < pev->nargs; i++)
-			printf(" %s", pev->args[i].name);
+		for (i = 0; i < pev->nargs; i++) {
+			synthesize_perf_probe_arg(&pev->args[i], buf, 128);
+			printf(" %s", buf);
+		}
 	}
 	printf(")\n");
 	free(place);
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 2a2f0a26dc6..cd308b0a4d9 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -45,9 +45,17 @@ struct perf_probe_point {
 	bool		retprobe;	/* Return probe flag */
 };
 
+/* Perf probe probing argument field chain */
+struct perf_probe_arg_field {
+	struct perf_probe_arg_field	*next;	/* Next field */
+	char				*name;	/* Name of the field */
+	bool				ref;	/* Referencing flag */
+};
+
 /* Perf probe probing argument */
 struct perf_probe_arg {
-	char		*name;		/* Argument name */
+	char				*name;	/* Argument name */
+	struct perf_probe_arg_field	*field;	/* Structure fields */
 };
 
 /* Perf probe probing event (point + arg) */
@@ -86,6 +94,8 @@ extern void parse_kprobe_trace_command(const char *cmd,
 /* Events to command string */
 extern char *synthesize_perf_probe_command(struct perf_probe_event *pev);
 extern char *synthesize_kprobe_trace_command(struct kprobe_trace_event *tev);
+extern int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf,
+				     size_t len);
 
 /* Check the perf_probe_event needs debuginfo */
 extern bool perf_probe_event_need_dwarf(struct perf_probe_event *pev);
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index e02b6077048..db52ec2e84d 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -206,6 +206,28 @@ static Dwarf_Addr die_get_entrypc(Dwarf_Die *dw_die)
 	return epc;
 }
 
+/* Get type die, but skip qualifiers and typedef */
+static Dwarf_Die *die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem)
+{
+	Dwarf_Attribute attr;
+	int tag;
+
+	do {
+		if (dwarf_attr(vr_die, DW_AT_type, &attr) == NULL ||
+		    dwarf_formref_die(&attr, die_mem) == NULL)
+			return NULL;
+
+		tag = dwarf_tag(die_mem);
+		vr_die = die_mem;
+	} while (tag == DW_TAG_const_type ||
+		 tag == DW_TAG_restrict_type ||
+		 tag == DW_TAG_volatile_type ||
+		 tag == DW_TAG_shared_type ||
+		 tag == DW_TAG_typedef);
+
+	return die_mem;
+}
+
 /* Return values for die_find callbacks */
 enum {
 	DIE_FIND_CB_FOUND = 0,		/* End of Search */
@@ -314,6 +336,25 @@ static Dwarf_Die *die_find_variable(Dwarf_Die *sp_die, const char *name,
 			      die_mem);
 }
 
+static int __die_find_member_cb(Dwarf_Die *die_mem, void *data)
+{
+	const char *name = data;
+
+	if ((dwarf_tag(die_mem) == DW_TAG_member) &&
+	    (die_compare_name(die_mem, name) == 0))
+		return DIE_FIND_CB_FOUND;
+
+	return DIE_FIND_CB_SIBLING;
+}
+
+/* Find a member called 'name' */
+static Dwarf_Die *die_find_member(Dwarf_Die *st_die, const char *name,
+				  Dwarf_Die *die_mem)
+{
+	return die_find_child(st_die, __die_find_member_cb, (void *)name,
+			      die_mem);
+}
+
 /*
  * Probe finder related functions
  */
@@ -363,6 +404,62 @@ static void convert_location(Dwarf_Op *op, struct probe_finder *pf)
 	}
 }
 
+static void convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
+				    struct perf_probe_arg_field *field,
+				    struct kprobe_trace_arg_ref **ref_ptr)
+{
+	struct kprobe_trace_arg_ref *ref = *ref_ptr;
+	Dwarf_Attribute attr;
+	Dwarf_Die member;
+	Dwarf_Die type;
+	Dwarf_Word offs;
+
+	pr_debug("converting %s in %s\n", field->name, varname);
+	if (die_get_real_type(vr_die, &type) == NULL)
+		die("Failed to get a type information of %s.", varname);
+
+	/* Check the pointer and dereference */
+	if (dwarf_tag(&type) == DW_TAG_pointer_type) {
+		if (!field->ref)
+			die("Semantic error: %s must be referred by '->'",
+			    field->name);
+		/* Get the type pointed by this pointer */
+		if (die_get_real_type(&type, &type) == NULL)
+			die("Failed to get a type information of %s.", varname);
+
+		ref = xzalloc(sizeof(struct kprobe_trace_arg_ref));
+		if (*ref_ptr)
+			(*ref_ptr)->next = ref;
+		else
+			*ref_ptr = ref;
+	} else {
+		if (field->ref)
+			die("Semantic error: %s must be referred by '.'",
+			    field->name);
+		if (!ref)
+			die("Structure on a register is not supported yet.");
+	}
+
+	/* Verify it is a data structure  */
+	if (dwarf_tag(&type) != DW_TAG_structure_type)
+		die("%s is not a data structure.", varname);
+
+	if (die_find_member(&type, field->name, &member) == NULL)
+		die("%s(tyep:%s) has no member %s.", varname,
+		    dwarf_diename(&type), field->name);
+
+	/* Get the offset of the field */
+	if (dwarf_attr(&member, DW_AT_data_member_location, &attr) == NULL ||
+	    dwarf_formudata(&attr, &offs) != 0)
+		die("Failed to get the offset of %s.", field->name);
+	ref->offset += (long)offs;
+
+	/* Converting next field */
+	if (field->next)
+		convert_variable_fields(&member, field->name, field->next,
+					&ref);
+}
+
 /* Show a variables in kprobe event format */
 static void convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
 {
@@ -379,6 +476,10 @@ static void convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
 		goto error;
 
 	convert_location(expr, pf);
+
+	if (pf->pvar->field)
+		convert_variable_fields(vr_die, pf->pvar->name,
+					pf->pvar->field, &pf->tvar->ref);
 	/* *expr will be cached in libdw. Don't free it. */
 	return ;
 error:
@@ -391,13 +492,15 @@ error:
 static void find_variable(Dwarf_Die *sp_die, struct probe_finder *pf)
 {
 	Dwarf_Die vr_die;
+	char buf[128];
 
 	/* TODO: Support struct members and arrays */
 	if (!is_c_varname(pf->pvar->name)) {
 		/* Copy raw parameters */
 		pf->tvar->value = xstrdup(pf->pvar->name);
 	} else {
-		pf->tvar->name = xstrdup(pf->pvar->name);
+		synthesize_perf_probe_arg(pf->pvar, buf, 128);
+		pf->tvar->name = xstrdup(buf);
 		pr_debug("Searching '%s' variable in context.\n",
 			 pf->pvar->name);
 		/* Search child die for local variables and parameters. */
-- 
cgit v1.2.3


From 3b0d516463f8deb897a55cb81e9dbbe58a2490ed Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Mar 2010 12:13:28 +0100
Subject: perf probe: Fix !dwarf build

Fix the !drawf build.

This uses the existing NO_DWARF_SUPPORT mechanism we use for that,
but it's really fragile and needs a cleanup. (in a separate patch)

1) Such uses:

 #ifndef NO_DWARF_SUPPORT

are double inverted logic a'la 'not not'. Instead the flag should
be called DWARF_SUPPORT.

2) Furthermore, assymetric #ifdef polluted code flow like:

        if (need_dwarf)
 #ifdef NO_DWARF_SUPPORT
                die("Debuginfo-analysis is not supported");
 #else   /* !NO_DWARF_SUPPORT */
                pr_debug("Some probes require debuginfo.\n");

        fd = open_vmlinux();

is very fragile and not acceptable. Instead of that helper functions
should be created and the dwarf/no-dwarf logic should be separated more
cleanly.

3) Local variable #ifdefs like this:

 #ifndef NO_DWARF_SUPPORT
        int fd;
 #endif

Are fragile as well and should be eliminated. Helper functions achieve
that too.

Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100316220612.32050.33806.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/probe-event.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 64dea6c3d58..f3332698058 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -1012,8 +1012,10 @@ end_dwarf:
 	if (!sym)
 		die("Kernel symbol \'%s\' not found - probe not added.",
 		    tev->point.symbol);
+#ifndef NO_DWARF_SUPPORT
 found:
 	close(fd);
+#endif
 	return ntevs;
 }
 
@@ -1172,10 +1174,13 @@ void show_line_range(struct line_range *lr)
 	unsigned int l = 1;
 	struct line_node *ln;
 	FILE *fp;
+#ifndef NO_DWARF_SUPPORT
 	int fd, ret;
+#endif
 
 	/* Search a line range */
 	init_vmlinux();
+#ifndef NO_DWARF_SUPPORT
 	fd = open_vmlinux();
 	if (fd < 0)
 		die("Could not open debuginfo file.");
@@ -1183,6 +1188,7 @@ void show_line_range(struct line_range *lr)
 	if (ret <= 0)
 		die("Source line is not found.\n");
 	close(fd);
+#endif
 
 	setup_pager();
 
-- 
cgit v1.2.3


From b27ea29c6267889be255f2217fa7a6106e6a8b04 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 17 Mar 2010 12:49:10 +0100
Subject: perf/core, x86: Reduce number of CONFIG_X86_LOCAL_APIC macros

The function reserve_pmc_hardware() and release_pmc_hardware()
were hard to read. This patch improves readability of the code by
removing most of the CONFIG_X86_LOCAL_APIC macros.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268826553-19518-2-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5dacf63f913..793e63f6c42 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -313,9 +313,10 @@ again:
 static atomic_t active_events;
 static DEFINE_MUTEX(pmc_reserve_mutex);
 
+#ifdef CONFIG_X86_LOCAL_APIC
+
 static bool reserve_pmc_hardware(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	int i;
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -330,11 +331,9 @@ static bool reserve_pmc_hardware(void)
 		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
-#endif
 
 	return true;
 
-#ifdef CONFIG_X86_LOCAL_APIC
 eventsel_fail:
 	for (i--; i >= 0; i--)
 		release_evntsel_nmi(x86_pmu.eventsel + i);
@@ -349,12 +348,10 @@ perfctr_fail:
 		enable_lapic_nmi_watchdog();
 
 	return false;
-#endif
 }
 
 static void release_pmc_hardware(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	int i;
 
 	for (i = 0; i < x86_pmu.num_events; i++) {
@@ -364,9 +361,15 @@ static void release_pmc_hardware(void)
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		enable_lapic_nmi_watchdog();
-#endif
 }
 
+#else
+
+static bool reserve_pmc_hardware(void) { return true; }
+static void release_pmc_hardware(void) {}
+
+#endif
+
 static int reserve_ds_buffers(void);
 static void release_ds_buffers(void);
 
-- 
cgit v1.2.3


From 141c4296cb630a7ed4c3730913bc3c0617ef9753 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 17 Mar 2010 12:49:11 +0100
Subject: perf/core: Correct files in MAINTAINERS entry

This corrects the file entries for perf_events. The following
files are caught now:

 $ xargs | eval ls $(cat) | sort -u
 kernel/perf_event*.c
 include/linux/perf_event.h
 arch/*/kernel/perf_event*.c
 arch/*/kernel/*/perf_event*.c
 arch/*/kernel/*/*/perf_event*.c
 arch/*/include/asm/perf_event.h
 arch/*/lib/perf_event*.c
 arch/*/kernel/perf_callchain.c

 arch/alpha/include/asm/perf_event.h
 arch/arm/include/asm/perf_event.h
 arch/arm/kernel/perf_event.c
 arch/frv/include/asm/perf_event.h
 arch/frv/lib/perf_event.c
 arch/parisc/include/asm/perf_event.h
 arch/powerpc/include/asm/perf_event.h
 arch/powerpc/kernel/perf_callchain.c
 arch/powerpc/kernel/perf_event.c
 arch/s390/include/asm/perf_event.h
 arch/sh/include/asm/perf_event.h
 arch/sh/kernel/cpu/sh4a/perf_event.c
 arch/sh/kernel/cpu/sh4/perf_event.c
 arch/sh/kernel/perf_callchain.c
 arch/sh/kernel/perf_event.c
 arch/sparc/include/asm/perf_event.h
 arch/sparc/kernel/perf_event.c
 arch/x86/include/asm/perf_event.h
 arch/x86/kernel/cpu/perf_event_amd.c
 arch/x86/kernel/cpu/perf_event.c
 arch/x86/kernel/cpu/perf_event_intel.c
 arch/x86/kernel/cpu/perf_event_p6.c
 include/linux/perf_event.h
 kernel/perf_event.c

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268826553-19518-3-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 MAINTAINERS | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 14d5c4c7091..e3aa8508a06 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4295,13 +4295,13 @@ M:	Paul Mackerras <paulus@samba.org>
 M:	Ingo Molnar <mingo@elte.hu>
 M:	Arnaldo Carvalho de Melo <acme@redhat.com>
 S:	Supported
-F:	kernel/perf_event.c
+F:	kernel/perf_event*.c
 F:	include/linux/perf_event.h
-F:	arch/*/kernel/perf_event.c
-F:	arch/*/kernel/*/perf_event.c
-F:	arch/*/kernel/*/*/perf_event.c
+F:	arch/*/kernel/perf_event*.c
+F:	arch/*/kernel/*/perf_event*.c
+F:	arch/*/kernel/*/*/perf_event*.c
 F:	arch/*/include/asm/perf_event.h
-F:	arch/*/lib/perf_event.c
+F:	arch/*/lib/perf_event*.c
 F:	arch/*/kernel/perf_callchain.c
 F:	tools/perf/
 
-- 
cgit v1.2.3


From 10f1014d86fd4fe5087080d609b51183396c5e4c Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 17 Mar 2010 12:49:12 +0100
Subject: perf/core, x86: Remove cpu_hw_events.interrupts

This member in the struct is not used anymore and can be
removed.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268826553-19518-4-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 793e63f6c42..104292a58c2 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -102,7 +102,6 @@ struct cpu_hw_events {
 	 */
 	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
 	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	unsigned long		interrupts;
 	int			enabled;
 
 	int			n_events;
-- 
cgit v1.2.3


From d6dc0b4ead6e8720096ecfa3d9e899b47ddbc8ed Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 17 Mar 2010 12:49:13 +0100
Subject: perf/core, x86: Remove duplicate perf_event_mask variable

The same information is stored also in x86_pmu.intel_ctrl. This
patch removes perf_event_mask and instead uses
x86_pmu.intel_ctrl directly.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1268826553-19518-5-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 104292a58c2..c97d5b52d12 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -75,8 +75,6 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	return len;
 }
 
-static u64 perf_event_mask __read_mostly;
-
 struct event_constraint {
 	union {
 		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
@@ -1406,7 +1404,7 @@ void __init init_hw_perf_events(void)
 		     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
 		x86_pmu.num_events = X86_PMC_MAX_GENERIC;
 	}
-	perf_event_mask = (1 << x86_pmu.num_events) - 1;
+	x86_pmu.intel_ctrl = (1 << x86_pmu.num_events) - 1;
 	perf_max_events = x86_pmu.num_events;
 
 	if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
@@ -1415,9 +1413,8 @@ void __init init_hw_perf_events(void)
 		x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
 	}
 
-	perf_event_mask |=
+	x86_pmu.intel_ctrl |=
 		((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
-	x86_pmu.intel_ctrl = perf_event_mask;
 
 	perf_events_lapic_init();
 	register_die_notifier(&perf_event_nmi_notifier);
@@ -1442,7 +1439,7 @@ void __init init_hw_perf_events(void)
 	pr_info("... value mask:             %016Lx\n", x86_pmu.event_mask);
 	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
 	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed);
-	pr_info("... event mask:             %016Lx\n", perf_event_mask);
+	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
 
 	perf_cpu_notifier(x86_pmu_notifier);
 }
-- 
cgit v1.2.3


From 2acebe9ecb2b77876e87a1480729cfb2db4570dd Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Wed, 17 Mar 2010 10:40:38 -0500
Subject: x86, UV: Delete unneeded boot messages

SGI:UV: Delete extra boot messages that describe the system
topology. These messages are no longer useful.

Signed-off-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20100317154038.GA29346@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 3740c8a4eae..9a93b07fa14 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -736,9 +736,6 @@ void __init uv_system_init(void)
 		uv_node_to_blade[nid] = blade;
 		uv_cpu_to_blade[cpu] = blade;
 		max_pnode = max(pnode, max_pnode);
-
-		printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
-			cpu, apicid, pnode, nid, lcpu, blade);
 	}
 
 	/* Add blade/pnode info for nodes without cpus */
-- 
cgit v1.2.3


From 6be2850effd6a8bae11d623c8c52e88d2fbc0e96 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Thu, 18 Mar 2010 11:36:03 -0300
Subject: perf stat: Enable counters when collecting process-wide or
 system-wide data

Command 'perf stat' doesn't enable counters when collecting an
existing (by -p) process or system-wide statistics. Fix the
issue.

Change the condition of fork/exec subcommand. If there is a
subcommand parameter, perf always forks/execs it. The usage
example is:

 # perf stat -a sleep 10

So this command could collect statistics for 10 seconds
precisely. User still could stop it by CTRL+C. Without the new
capability, user could only use CTRL+C to stop it without
precise time clock.

Another issue is 'perf stat -a' consumes 100% time of a full
single logical cpu. It has a bad impact on running workload.

Fix it by adding a sleep(1) in the while(!done) loop in function
run_perf_stat.

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Sheng Yang <sheng@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Jes Sorensen <Jes.Sorensen@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Zachary Amsden <zamsden@redhat.com>
Cc: <zhiteng.huang@intel.com>
LKML-Reference: <1268922965-14774-1-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-stat.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 95db31cff6f..5f41244cbbf 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -159,8 +159,10 @@ static void create_perf_stat_counter(int counter, int pid)
 		}
 	} else {
 		attr->inherit	     = inherit;
-		attr->disabled	     = 1;
-		attr->enable_on_exec = 1;
+		if (target_pid == -1) {
+			attr->disabled = 1;
+			attr->enable_on_exec = 1;
+		}
 
 		fd[0][counter] = sys_perf_event_open(attr, pid, -1, -1, 0);
 		if (fd[0][counter] < 0 && verbose)
@@ -251,9 +253,9 @@ static int run_perf_stat(int argc __used, const char **argv)
 	unsigned long long t0, t1;
 	int status = 0;
 	int counter;
-	int pid = target_pid;
+	int pid;
 	int child_ready_pipe[2], go_pipe[2];
-	const bool forks = (target_pid == -1 && argc > 0);
+	const bool forks = (argc > 0);
 	char buf;
 
 	if (!system_wide)
@@ -265,10 +267,10 @@ static int run_perf_stat(int argc __used, const char **argv)
 	}
 
 	if (forks) {
-		if ((pid = fork()) < 0)
+		if ((child_pid = fork()) < 0)
 			perror("failed to fork");
 
-		if (!pid) {
+		if (!child_pid) {
 			close(child_ready_pipe[0]);
 			close(go_pipe[1]);
 			fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
@@ -297,8 +299,6 @@ static int run_perf_stat(int argc __used, const char **argv)
 			exit(-1);
 		}
 
-		child_pid = pid;
-
 		/*
 		 * Wait for the child to be ready to exec.
 		 */
@@ -309,6 +309,10 @@ static int run_perf_stat(int argc __used, const char **argv)
 		close(child_ready_pipe[0]);
 	}
 
+	if (target_pid == -1)
+		pid = child_pid;
+	else
+		pid = target_pid;
 	for (counter = 0; counter < nr_counters; counter++)
 		create_perf_stat_counter(counter, pid);
 
@@ -321,7 +325,7 @@ static int run_perf_stat(int argc __used, const char **argv)
 		close(go_pipe[1]);
 		wait(&status);
 	} else {
-		while(!done);
+		while(!done) sleep(1);
 	}
 
 	t1 = rdclock();
@@ -459,7 +463,7 @@ static volatile int signr = -1;
 
 static void skip_signal(int signo)
 {
-	if(target_pid != -1)
+	if(child_pid == -1)
 		done = 1;
 
 	signr = signo;
-- 
cgit v1.2.3


From 46be604b5ba738d53e5f5314813a4e7092864baf Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Thu, 18 Mar 2010 11:36:04 -0300
Subject: perf record: Enable counters only when kernel is execing subcommand

'perf record' starts counters before subcommand is execed, so
the statistics is not precise because it includes data of some
preparation steps. I fix it with the patch.

In addition, change the condition to fork/exec subcommand. If
there is a subcommand parameter, perf always fork/exec it. The
usage example is:

 # perf record -f -a sleep 10

So this command could collect statistics for 10 seconds
precisely. User still could stop it by CTRL+C. Without the new
capability, user could only input CTRL+C to stop it without
precise time clock.

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Sheng Yang <sheng@linux.intel.com>
Cc: oerg Roedel <joro@8bytes.org>
Cc: Jes Sorensen <Jes.Sorensen@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: <zhiteng.huang@intel.com>
Cc: Zachary Amsden <zamsden@redhat.com>
LKML-Reference: <1268922965-14774-2-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 962cdbf44ae..e2b35ad82a7 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -225,7 +225,7 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n
 	return h_attr;
 }
 
-static void create_counter(int counter, int cpu, pid_t pid, bool forks)
+static void create_counter(int counter, int cpu, pid_t pid)
 {
 	char *filter = filters[counter];
 	struct perf_event_attr *attr = attrs + counter;
@@ -275,10 +275,10 @@ static void create_counter(int counter, int cpu, pid_t pid, bool forks)
 	attr->mmap		= track;
 	attr->comm		= track;
 	attr->inherit		= inherit;
-	attr->disabled		= 1;
-
-	if (forks)
+	if (target_pid == -1 && !system_wide) {
+		attr->disabled = 1;
 		attr->enable_on_exec = 1;
+	}
 
 try_again:
 	fd[nr_cpu][counter] = sys_perf_event_open(attr, pid, cpu, group_fd, 0);
@@ -380,17 +380,15 @@ try_again:
 			exit(-1);
 		}
 	}
-
-	ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_ENABLE);
 }
 
-static void open_counters(int cpu, pid_t pid, bool forks)
+static void open_counters(int cpu, pid_t pid)
 {
 	int counter;
 
 	group_fd = -1;
 	for (counter = 0; counter < nr_counters; counter++)
-		create_counter(counter, cpu, pid, forks);
+		create_counter(counter, cpu, pid);
 
 	nr_cpu++;
 }
@@ -425,7 +423,7 @@ static int __cmd_record(int argc, const char **argv)
 	int err;
 	unsigned long waking = 0;
 	int child_ready_pipe[2], go_pipe[2];
-	const bool forks = target_pid == -1 && argc > 0;
+	const bool forks = argc > 0;
 	char buf;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
@@ -496,13 +494,13 @@ static int __cmd_record(int argc, const char **argv)
 	atexit(atexit_header);
 
 	if (forks) {
-		pid = fork();
+		child_pid = fork();
 		if (pid < 0) {
 			perror("failed to fork");
 			exit(-1);
 		}
 
-		if (!pid) {
+		if (!child_pid) {
 			close(child_ready_pipe[0]);
 			close(go_pipe[1]);
 			fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
@@ -531,11 +529,6 @@ static int __cmd_record(int argc, const char **argv)
 			exit(-1);
 		}
 
-		child_pid = pid;
-
-		if (!system_wide)
-			target_pid = pid;
-
 		close(child_ready_pipe[1]);
 		close(go_pipe[0]);
 		/*
@@ -548,13 +541,17 @@ static int __cmd_record(int argc, const char **argv)
 		close(child_ready_pipe[0]);
 	}
 
+	if (forks && target_pid == -1 && !system_wide)
+		pid = child_pid;
+	else
+		pid = target_pid;
 
 	if ((!system_wide && !inherit) || profile_cpu != -1) {
-		open_counters(profile_cpu, target_pid, forks);
+		open_counters(profile_cpu, pid);
 	} else {
 		nr_cpus = read_cpu_map();
 		for (i = 0; i < nr_cpus; i++)
-			open_counters(cpumap[i], target_pid, forks);
+			open_counters(cpumap[i], pid);
 	}
 
 	if (file_new) {
-- 
cgit v1.2.3


From d6d901c23a9c4c7361aa901b5b2dda69703dd5e0 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Thu, 18 Mar 2010 11:36:05 -0300
Subject: perf events: Change perf parameter --pid to process-wide collection
 instead of thread-wide

Parameter --pid (or -p) of perf currently means a thread-wide
collection. For exmaple, if a process whose id is 8888 has 10
threads, 'perf top -p 8888' just collects the main thread
statistics. That's misleading. Users are used to attach a whole
process when debugging a process by gdb. To follow normal usage
style, the patch change --pid to process-wide collection and add
--tid (-t) to mean a thread-wide collection.

Usage example is:

 # perf top -p 8888
 # perf record -p 8888 -f sleep 10
 # perf stat -p 8888 -f sleep 10

Above commands collect the statistics of all threads of process
8888.

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Sheng Yang <sheng@linux.intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Jes Sorensen <Jes.Sorensen@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: zhiteng.huang@intel.com
Cc: Zachary Amsden <zamsden@redhat.com>
LKML-Reference: <1268922965-14774-3-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 260 +++++++++++++++++++++++++++-----------------
 tools/perf/builtin-stat.c   | 110 +++++++++++++------
 tools/perf/builtin-top.c    | 162 +++++++++++++++++----------
 tools/perf/util/thread.c    |  32 ++++++
 tools/perf/util/thread.h    |   1 +
 5 files changed, 372 insertions(+), 193 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index e2b35ad82a7..bb5b23db423 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -27,7 +27,7 @@
 #include <unistd.h>
 #include <sched.h>
 
-static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+static int			*fd[MAX_NR_CPUS][MAX_COUNTERS];
 
 static long			default_interval		=      0;
 
@@ -43,6 +43,9 @@ static int			raw_samples			=      0;
 static int			system_wide			=      0;
 static int			profile_cpu			=     -1;
 static pid_t			target_pid			=     -1;
+static pid_t			target_tid			=     -1;
+static pid_t			*all_tids			=      NULL;
+static int			thread_num			=      0;
 static pid_t			child_pid			=     -1;
 static int			inherit				=      1;
 static int			force				=      0;
@@ -60,7 +63,7 @@ static struct timeval		this_read;
 
 static u64			bytes_written			=      0;
 
-static struct pollfd		event_array[MAX_NR_CPUS * MAX_COUNTERS];
+static struct pollfd		*event_array;
 
 static int			nr_poll				=      0;
 static int			nr_cpu				=      0;
@@ -77,7 +80,7 @@ struct mmap_data {
 	unsigned int		prev;
 };
 
-static struct mmap_data		mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+static struct mmap_data		*mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 
 static unsigned long mmap_read_head(struct mmap_data *md)
 {
@@ -225,12 +228,13 @@ static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int n
 	return h_attr;
 }
 
-static void create_counter(int counter, int cpu, pid_t pid)
+static void create_counter(int counter, int cpu)
 {
 	char *filter = filters[counter];
 	struct perf_event_attr *attr = attrs + counter;
 	struct perf_header_attr *h_attr;
 	int track = !counter; /* only the first counter needs these */
+	int thread_index;
 	int ret;
 	struct {
 		u64 count;
@@ -280,115 +284,124 @@ static void create_counter(int counter, int cpu, pid_t pid)
 		attr->enable_on_exec = 1;
 	}
 
+	for (thread_index = 0; thread_index < thread_num; thread_index++) {
 try_again:
-	fd[nr_cpu][counter] = sys_perf_event_open(attr, pid, cpu, group_fd, 0);
-
-	if (fd[nr_cpu][counter] < 0) {
-		int err = errno;
-
-		if (err == EPERM || err == EACCES)
-			die("Permission error - are you root?\n"
-			    "\t Consider tweaking /proc/sys/kernel/perf_event_paranoid.\n");
-		else if (err ==  ENODEV && profile_cpu != -1)
-			die("No such device - did you specify an out-of-range profile CPU?\n");
+		fd[nr_cpu][counter][thread_index] = sys_perf_event_open(attr,
+				all_tids[thread_index], cpu, group_fd, 0);
+
+		if (fd[nr_cpu][counter][thread_index] < 0) {
+			int err = errno;
+
+			if (err == EPERM || err == EACCES)
+				die("Permission error - are you root?\n"
+					"\t Consider tweaking"
+					" /proc/sys/kernel/perf_event_paranoid.\n");
+			else if (err ==  ENODEV && profile_cpu != -1) {
+				die("No such device - did you specify"
+					" an out-of-range profile CPU?\n");
+			}
 
-		/*
-		 * If it's cycles then fall back to hrtimer
-		 * based cpu-clock-tick sw counter, which
-		 * is always available even if no PMU support:
-		 */
-		if (attr->type == PERF_TYPE_HARDWARE
-			&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
-
-			if (verbose)
-				warning(" ... trying to fall back to cpu-clock-ticks\n");
-			attr->type = PERF_TYPE_SOFTWARE;
-			attr->config = PERF_COUNT_SW_CPU_CLOCK;
-			goto try_again;
-		}
-		printf("\n");
-		error("perfcounter syscall returned with %d (%s)\n",
-			fd[nr_cpu][counter], strerror(err));
+			/*
+			 * If it's cycles then fall back to hrtimer
+			 * based cpu-clock-tick sw counter, which
+			 * is always available even if no PMU support:
+			 */
+			if (attr->type == PERF_TYPE_HARDWARE
+					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
+
+				if (verbose)
+					warning(" ... trying to fall back to cpu-clock-ticks\n");
+				attr->type = PERF_TYPE_SOFTWARE;
+				attr->config = PERF_COUNT_SW_CPU_CLOCK;
+				goto try_again;
+			}
+			printf("\n");
+			error("perfcounter syscall returned with %d (%s)\n",
+					fd[nr_cpu][counter][thread_index], strerror(err));
 
 #if defined(__i386__) || defined(__x86_64__)
-		if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
-			die("No hardware sampling interrupt available. No APIC? If so then you can boot the kernel with the \"lapic\" boot parameter to force-enable it.\n");
+			if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
+				die("No hardware sampling interrupt available."
+				    " No APIC? If so then you can boot the kernel"
+				    " with the \"lapic\" boot parameter to"
+				    " force-enable it.\n");
 #endif
 
-		die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
-		exit(-1);
-	}
+			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
+			exit(-1);
+		}
 
-	h_attr = get_header_attr(attr, counter);
-	if (h_attr == NULL)
-		die("nomem\n");
+		h_attr = get_header_attr(attr, counter);
+		if (h_attr == NULL)
+			die("nomem\n");
 
-	if (!file_new) {
-		if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
-			fprintf(stderr, "incompatible append\n");
-			exit(-1);
+		if (!file_new) {
+			if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
+				fprintf(stderr, "incompatible append\n");
+				exit(-1);
+			}
 		}
-	}
 
-	if (read(fd[nr_cpu][counter], &read_data, sizeof(read_data)) == -1) {
-		perror("Unable to read perf file descriptor\n");
-		exit(-1);
-	}
+		if (read(fd[nr_cpu][counter][thread_index], &read_data, sizeof(read_data)) == -1) {
+			perror("Unable to read perf file descriptor\n");
+			exit(-1);
+		}
 
-	if (perf_header_attr__add_id(h_attr, read_data.id) < 0) {
-		pr_warning("Not enough memory to add id\n");
-		exit(-1);
-	}
+		if (perf_header_attr__add_id(h_attr, read_data.id) < 0) {
+			pr_warning("Not enough memory to add id\n");
+			exit(-1);
+		}
 
-	assert(fd[nr_cpu][counter] >= 0);
-	fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
+		assert(fd[nr_cpu][counter][thread_index] >= 0);
+		fcntl(fd[nr_cpu][counter][thread_index], F_SETFL, O_NONBLOCK);
 
-	/*
-	 * First counter acts as the group leader:
-	 */
-	if (group && group_fd == -1)
-		group_fd = fd[nr_cpu][counter];
-	if (multiplex && multiplex_fd == -1)
-		multiplex_fd = fd[nr_cpu][counter];
+		/*
+		 * First counter acts as the group leader:
+		 */
+		if (group && group_fd == -1)
+			group_fd = fd[nr_cpu][counter][thread_index];
+		if (multiplex && multiplex_fd == -1)
+			multiplex_fd = fd[nr_cpu][counter][thread_index];
 
-	if (multiplex && fd[nr_cpu][counter] != multiplex_fd) {
+		if (multiplex && fd[nr_cpu][counter][thread_index] != multiplex_fd) {
 
-		ret = ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_SET_OUTPUT, multiplex_fd);
-		assert(ret != -1);
-	} else {
-		event_array[nr_poll].fd = fd[nr_cpu][counter];
-		event_array[nr_poll].events = POLLIN;
-		nr_poll++;
-
-		mmap_array[nr_cpu][counter].counter = counter;
-		mmap_array[nr_cpu][counter].prev = 0;
-		mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
-		mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-				PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0);
-		if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
-			error("failed to mmap with %d (%s)\n", errno, strerror(errno));
-			exit(-1);
+			ret = ioctl(fd[nr_cpu][counter][thread_index], PERF_EVENT_IOC_SET_OUTPUT, multiplex_fd);
+			assert(ret != -1);
+		} else {
+			event_array[nr_poll].fd = fd[nr_cpu][counter][thread_index];
+			event_array[nr_poll].events = POLLIN;
+			nr_poll++;
+
+			mmap_array[nr_cpu][counter][thread_index].counter = counter;
+			mmap_array[nr_cpu][counter][thread_index].prev = 0;
+			mmap_array[nr_cpu][counter][thread_index].mask = mmap_pages*page_size - 1;
+			mmap_array[nr_cpu][counter][thread_index].base = mmap(NULL, (mmap_pages+1)*page_size,
+				PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter][thread_index], 0);
+			if (mmap_array[nr_cpu][counter][thread_index].base == MAP_FAILED) {
+				error("failed to mmap with %d (%s)\n", errno, strerror(errno));
+				exit(-1);
+			}
 		}
-	}
 
-	if (filter != NULL) {
-		ret = ioctl(fd[nr_cpu][counter],
-			    PERF_EVENT_IOC_SET_FILTER, filter);
-		if (ret) {
-			error("failed to set filter with %d (%s)\n", errno,
-			      strerror(errno));
-			exit(-1);
+		if (filter != NULL) {
+			ret = ioctl(fd[nr_cpu][counter][thread_index],
+					PERF_EVENT_IOC_SET_FILTER, filter);
+			if (ret) {
+				error("failed to set filter with %d (%s)\n", errno,
+						strerror(errno));
+				exit(-1);
+			}
 		}
 	}
 }
 
-static void open_counters(int cpu, pid_t pid)
+static void open_counters(int cpu)
 {
 	int counter;
 
 	group_fd = -1;
 	for (counter = 0; counter < nr_counters; counter++)
-		create_counter(counter, cpu, pid);
+		create_counter(counter, cpu);
 
 	nr_cpu++;
 }
@@ -529,6 +542,9 @@ static int __cmd_record(int argc, const char **argv)
 			exit(-1);
 		}
 
+		if (!system_wide && target_tid == -1 && target_pid == -1)
+			all_tids[0] = child_pid;
+
 		close(child_ready_pipe[1]);
 		close(go_pipe[0]);
 		/*
@@ -541,17 +557,12 @@ static int __cmd_record(int argc, const char **argv)
 		close(child_ready_pipe[0]);
 	}
 
-	if (forks && target_pid == -1 && !system_wide)
-		pid = child_pid;
-	else
-		pid = target_pid;
-
 	if ((!system_wide && !inherit) || profile_cpu != -1) {
-		open_counters(profile_cpu, pid);
+		open_counters(profile_cpu);
 	} else {
 		nr_cpus = read_cpu_map();
 		for (i = 0; i < nr_cpus; i++)
-			open_counters(cpumap[i], pid);
+			open_counters(cpumap[i]);
 	}
 
 	if (file_new) {
@@ -576,7 +587,7 @@ static int __cmd_record(int argc, const char **argv)
 	}
 
 	if (!system_wide && profile_cpu == -1)
-		event__synthesize_thread(target_pid, process_synthesized_event,
+		event__synthesize_thread(target_tid, process_synthesized_event,
 					 session);
 	else
 		event__synthesize_threads(process_synthesized_event, session);
@@ -599,11 +610,16 @@ static int __cmd_record(int argc, const char **argv)
 
 	for (;;) {
 		int hits = samples;
+		int thread;
 
 		for (i = 0; i < nr_cpu; i++) {
 			for (counter = 0; counter < nr_counters; counter++) {
-				if (mmap_array[i][counter].base)
-					mmap_read(&mmap_array[i][counter]);
+				for (thread = 0;
+					thread < thread_num; thread++) {
+					if (mmap_array[i][counter][thread].base)
+						mmap_read(&mmap_array[i][counter][thread]);
+				}
+
 			}
 		}
 
@@ -616,8 +632,15 @@ static int __cmd_record(int argc, const char **argv)
 
 		if (done) {
 			for (i = 0; i < nr_cpu; i++) {
-				for (counter = 0; counter < nr_counters; counter++)
-					ioctl(fd[i][counter], PERF_EVENT_IOC_DISABLE);
+				for (counter = 0;
+					counter < nr_counters;
+					counter++) {
+					for (thread = 0;
+						thread < thread_num;
+						thread++)
+						ioctl(fd[i][counter][thread],
+							PERF_EVENT_IOC_DISABLE);
+				}
 			}
 		}
 	}
@@ -649,7 +672,9 @@ static const struct option options[] = {
 	OPT_CALLBACK(0, "filter", NULL, "filter",
 		     "event filter", parse_filter),
 	OPT_INTEGER('p', "pid", &target_pid,
-		    "record events on existing pid"),
+		    "record events on existing process id"),
+	OPT_INTEGER('t', "tid", &target_tid,
+		    "record events on existing thread id"),
 	OPT_INTEGER('r', "realtime", &realtime_prio,
 		    "collect data with this RT SCHED_FIFO priority"),
 	OPT_BOOLEAN('R', "raw-samples", &raw_samples,
@@ -690,10 +715,12 @@ static const struct option options[] = {
 int cmd_record(int argc, const char **argv, const char *prefix __used)
 {
 	int counter;
+	int i,j;
 
 	argc = parse_options(argc, argv, options, record_usage,
 			    PARSE_OPT_STOP_AT_NON_OPTION);
-	if (!argc && target_pid == -1 && !system_wide && profile_cpu == -1)
+	if (!argc && target_pid == -1 && target_tid == -1 &&
+		!system_wide && profile_cpu == -1)
 		usage_with_options(record_usage, options);
 
 	symbol__init();
@@ -704,6 +731,37 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 		attrs[0].config = PERF_COUNT_HW_CPU_CYCLES;
 	}
 
+	if (target_pid != -1) {
+		target_tid = target_pid;
+		thread_num = find_all_tid(target_pid, &all_tids);
+		if (thread_num <= 0) {
+			fprintf(stderr, "Can't find all threads of pid %d\n",
+					target_pid);
+			usage_with_options(record_usage, options);
+		}
+	} else {
+		all_tids=malloc(sizeof(pid_t));
+		if (!all_tids)
+			return -ENOMEM;
+
+		all_tids[0] = target_tid;
+		thread_num = 1;
+	}
+
+	for (i = 0; i < MAX_NR_CPUS; i++) {
+		for (j = 0; j < MAX_COUNTERS; j++) {
+			fd[i][j] = malloc(sizeof(int)*thread_num);
+			mmap_array[i][j] = malloc(
+				sizeof(struct mmap_data)*thread_num);
+			if (!fd[i][j] || !mmap_array[i][j])
+				return -ENOMEM;
+		}
+	}
+	event_array = malloc(
+		sizeof(struct pollfd)*MAX_NR_CPUS*MAX_COUNTERS*thread_num);
+	if (!event_array)
+		return -ENOMEM;
+
 	/*
 	 * User specified count overrides default frequency.
 	 */
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 5f41244cbbf..c92f90ff5a9 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -46,6 +46,7 @@
 #include "util/debug.h"
 #include "util/header.h"
 #include "util/cpumap.h"
+#include "util/thread.h"
 
 #include <sys/prctl.h>
 #include <math.h>
@@ -74,10 +75,13 @@ static int			run_count			=  1;
 static int			inherit				=  1;
 static int			scale				=  1;
 static pid_t			target_pid			= -1;
+static pid_t			target_tid			= -1;
+static pid_t			*all_tids			=  NULL;
+static int			thread_num			=  0;
 static pid_t			child_pid			= -1;
 static int			null_run			=  0;
 
-static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+static int			*fd[MAX_NR_CPUS][MAX_COUNTERS];
 
 static int			event_scaled[MAX_COUNTERS];
 
@@ -140,9 +144,10 @@ struct stats			runtime_branches_stats;
 #define ERR_PERF_OPEN \
 "Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n"
 
-static void create_perf_stat_counter(int counter, int pid)
+static void create_perf_stat_counter(int counter)
 {
 	struct perf_event_attr *attr = attrs + counter;
+	int thread;
 
 	if (scale)
 		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
@@ -152,10 +157,11 @@ static void create_perf_stat_counter(int counter, int pid)
 		unsigned int cpu;
 
 		for (cpu = 0; cpu < nr_cpus; cpu++) {
-			fd[cpu][counter] = sys_perf_event_open(attr, -1, cpumap[cpu], -1, 0);
-			if (fd[cpu][counter] < 0 && verbose)
+			fd[cpu][counter][0] = sys_perf_event_open(attr,
+					-1, cpumap[cpu], -1, 0);
+			if (fd[cpu][counter][0] < 0 && verbose)
 				fprintf(stderr, ERR_PERF_OPEN, counter,
-					fd[cpu][counter], strerror(errno));
+					fd[cpu][counter][0], strerror(errno));
 		}
 	} else {
 		attr->inherit	     = inherit;
@@ -163,11 +169,14 @@ static void create_perf_stat_counter(int counter, int pid)
 			attr->disabled = 1;
 			attr->enable_on_exec = 1;
 		}
-
-		fd[0][counter] = sys_perf_event_open(attr, pid, -1, -1, 0);
-		if (fd[0][counter] < 0 && verbose)
-			fprintf(stderr, ERR_PERF_OPEN, counter,
-				fd[0][counter], strerror(errno));
+		for (thread = 0; thread < thread_num; thread++) {
+			fd[0][counter][thread] = sys_perf_event_open(attr,
+				all_tids[thread], -1, -1, 0);
+			if (fd[0][counter][thread] < 0 && verbose)
+				fprintf(stderr, ERR_PERF_OPEN, counter,
+					fd[0][counter][thread],
+					strerror(errno));
+		}
 	}
 }
 
@@ -192,25 +201,28 @@ static void read_counter(int counter)
 	unsigned int cpu;
 	size_t res, nv;
 	int scaled;
-	int i;
+	int i, thread;
 
 	count[0] = count[1] = count[2] = 0;
 
 	nv = scale ? 3 : 1;
 	for (cpu = 0; cpu < nr_cpus; cpu++) {
-		if (fd[cpu][counter] < 0)
-			continue;
-
-		res = read(fd[cpu][counter], single_count, nv * sizeof(u64));
-		assert(res == nv * sizeof(u64));
-
-		close(fd[cpu][counter]);
-		fd[cpu][counter] = -1;
-
-		count[0] += single_count[0];
-		if (scale) {
-			count[1] += single_count[1];
-			count[2] += single_count[2];
+		for (thread = 0; thread < thread_num; thread++) {
+			if (fd[cpu][counter][thread] < 0)
+				continue;
+
+			res = read(fd[cpu][counter][thread],
+					single_count, nv * sizeof(u64));
+			assert(res == nv * sizeof(u64));
+
+			close(fd[cpu][counter][thread]);
+			fd[cpu][counter][thread] = -1;
+
+			count[0] += single_count[0];
+			if (scale) {
+				count[1] += single_count[1];
+				count[2] += single_count[2];
+			}
 		}
 	}
 
@@ -253,7 +265,6 @@ static int run_perf_stat(int argc __used, const char **argv)
 	unsigned long long t0, t1;
 	int status = 0;
 	int counter;
-	int pid;
 	int child_ready_pipe[2], go_pipe[2];
 	const bool forks = (argc > 0);
 	char buf;
@@ -299,6 +310,9 @@ static int run_perf_stat(int argc __used, const char **argv)
 			exit(-1);
 		}
 
+		if (target_tid == -1 && target_pid == -1 && !system_wide)
+			all_tids[0] = child_pid;
+
 		/*
 		 * Wait for the child to be ready to exec.
 		 */
@@ -309,12 +323,8 @@ static int run_perf_stat(int argc __used, const char **argv)
 		close(child_ready_pipe[0]);
 	}
 
-	if (target_pid == -1)
-		pid = child_pid;
-	else
-		pid = target_pid;
 	for (counter = 0; counter < nr_counters; counter++)
-		create_perf_stat_counter(counter, pid);
+		create_perf_stat_counter(counter);
 
 	/*
 	 * Enable counters and exec the command:
@@ -433,12 +443,14 @@ static void print_stat(int argc, const char **argv)
 
 	fprintf(stderr, "\n");
 	fprintf(stderr, " Performance counter stats for ");
-	if(target_pid == -1) {
+	if(target_pid == -1 && target_tid == -1) {
 		fprintf(stderr, "\'%s", argv[0]);
 		for (i = 1; i < argc; i++)
 			fprintf(stderr, " %s", argv[i]);
-	}else
-		fprintf(stderr, "task pid \'%d", target_pid);
+	} else if (target_pid != -1)
+		fprintf(stderr, "process id \'%d", target_pid);
+	else
+		fprintf(stderr, "thread id \'%d", target_tid);
 
 	fprintf(stderr, "\'");
 	if (run_count > 1)
@@ -493,7 +505,9 @@ static const struct option options[] = {
 	OPT_BOOLEAN('i', "inherit", &inherit,
 		    "child tasks inherit counters"),
 	OPT_INTEGER('p', "pid", &target_pid,
-		    "stat events on existing pid"),
+		    "stat events on existing process id"),
+	OPT_INTEGER('t', "tid", &target_tid,
+		    "stat events on existing thread id"),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 		    "system-wide collection from all CPUs"),
 	OPT_BOOLEAN('c', "scale", &scale,
@@ -510,10 +524,11 @@ static const struct option options[] = {
 int cmd_stat(int argc, const char **argv, const char *prefix __used)
 {
 	int status;
+	int i,j;
 
 	argc = parse_options(argc, argv, options, stat_usage,
 		PARSE_OPT_STOP_AT_NON_OPTION);
-	if (!argc && target_pid == -1)
+	if (!argc && target_pid == -1 && target_tid == -1)
 		usage_with_options(stat_usage, options);
 	if (run_count <= 0)
 		usage_with_options(stat_usage, options);
@@ -529,6 +544,31 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 	else
 		nr_cpus = 1;
 
+	if (target_pid != -1) {
+		target_tid = target_pid;
+		thread_num = find_all_tid(target_pid, &all_tids);
+		if (thread_num <= 0) {
+			fprintf(stderr, "Can't find all threads of pid %d\n",
+					target_pid);
+			usage_with_options(stat_usage, options);
+		}
+	} else {
+		all_tids=malloc(sizeof(pid_t));
+		if (!all_tids)
+			return -ENOMEM;
+
+		all_tids[0] = target_tid;
+		thread_num = 1;
+	}
+
+	for (i = 0; i < MAX_NR_CPUS; i++) {
+		for (j = 0; j < MAX_COUNTERS; j++) {
+			fd[i][j] = malloc(sizeof(int)*thread_num);
+			if (!fd[i][j])
+				return -ENOMEM;
+		}
+	}
+
 	/*
 	 * We dont want to block the signals - that would cause
 	 * child tasks to inherit that and Ctrl-C would not work.
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 887ebbf5d1f..5f3ac9ff354 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -55,7 +55,7 @@
 #include <linux/unistd.h>
 #include <linux/types.h>
 
-static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+static int			*fd[MAX_NR_CPUS][MAX_COUNTERS];
 
 static int			system_wide			=      0;
 
@@ -65,6 +65,9 @@ static int			count_filter			=      5;
 static int			print_entries;
 
 static int			target_pid			=     -1;
+static int			target_tid			=     -1;
+static pid_t			*all_tids			=      NULL;
+static int			thread_num			=      0;
 static int			inherit				=      0;
 static int			profile_cpu			=     -1;
 static int			nr_cpus				=      0;
@@ -524,13 +527,15 @@ static void print_sym_table(void)
 
 	if (target_pid != -1)
 		printf(" (target_pid: %d", target_pid);
+	else if (target_tid != -1)
+		printf(" (target_tid: %d", target_tid);
 	else
 		printf(" (all");
 
 	if (profile_cpu != -1)
 		printf(", cpu: %d)\n", profile_cpu);
 	else {
-		if (target_pid != -1)
+		if (target_tid != -1)
 			printf(")\n");
 		else
 			printf(", %d CPUs)\n", nr_cpus);
@@ -1129,16 +1134,21 @@ static void perf_session__mmap_read_counter(struct perf_session *self,
 	md->prev = old;
 }
 
-static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
-static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+static struct pollfd *event_array;
+static struct mmap_data *mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 
 static void perf_session__mmap_read(struct perf_session *self)
 {
-	int i, counter;
+	int i, counter, thread_index;
 
 	for (i = 0; i < nr_cpus; i++) {
 		for (counter = 0; counter < nr_counters; counter++)
-			perf_session__mmap_read_counter(self, &mmap_array[i][counter]);
+			for (thread_index = 0;
+				thread_index < thread_num;
+				thread_index++) {
+				perf_session__mmap_read_counter(self,
+					&mmap_array[i][counter][thread_index]);
+			}
 	}
 }
 
@@ -1149,9 +1159,10 @@ static void start_counter(int i, int counter)
 {
 	struct perf_event_attr *attr;
 	int cpu;
+	int thread_index;
 
 	cpu = profile_cpu;
-	if (target_pid == -1 && profile_cpu == -1)
+	if (target_tid == -1 && profile_cpu == -1)
 		cpu = cpumap[i];
 
 	attr = attrs + counter;
@@ -1167,55 +1178,58 @@ static void start_counter(int i, int counter)
 	attr->inherit		= (cpu < 0) && inherit;
 	attr->mmap		= 1;
 
+	for (thread_index = 0; thread_index < thread_num; thread_index++) {
 try_again:
-	fd[i][counter] = sys_perf_event_open(attr, target_pid, cpu, group_fd, 0);
-
-	if (fd[i][counter] < 0) {
-		int err = errno;
+		fd[i][counter][thread_index] = sys_perf_event_open(attr,
+				all_tids[thread_index], cpu, group_fd, 0);
+
+		if (fd[i][counter][thread_index] < 0) {
+			int err = errno;
+
+			if (err == EPERM || err == EACCES)
+				die("No permission - are you root?\n");
+			/*
+			 * If it's cycles then fall back to hrtimer
+			 * based cpu-clock-tick sw counter, which
+			 * is always available even if no PMU support:
+			 */
+			if (attr->type == PERF_TYPE_HARDWARE
+					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
+
+				if (verbose)
+					warning(" ... trying to fall back to cpu-clock-ticks\n");
+
+				attr->type = PERF_TYPE_SOFTWARE;
+				attr->config = PERF_COUNT_SW_CPU_CLOCK;
+				goto try_again;
+			}
+			printf("\n");
+			error("perfcounter syscall returned with %d (%s)\n",
+					fd[i][counter][thread_index], strerror(err));
+			die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
+			exit(-1);
+		}
+		assert(fd[i][counter][thread_index] >= 0);
+		fcntl(fd[i][counter][thread_index], F_SETFL, O_NONBLOCK);
 
-		if (err == EPERM || err == EACCES)
-			die("No permission - are you root?\n");
 		/*
-		 * If it's cycles then fall back to hrtimer
-		 * based cpu-clock-tick sw counter, which
-		 * is always available even if no PMU support:
+		 * First counter acts as the group leader:
 		 */
-		if (attr->type == PERF_TYPE_HARDWARE
-			&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
-
-			if (verbose)
-				warning(" ... trying to fall back to cpu-clock-ticks\n");
-
-			attr->type = PERF_TYPE_SOFTWARE;
-			attr->config = PERF_COUNT_SW_CPU_CLOCK;
-			goto try_again;
-		}
-		printf("\n");
-		error("perfcounter syscall returned with %d (%s)\n",
-			fd[i][counter], strerror(err));
-		die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
-		exit(-1);
+		if (group && group_fd == -1)
+			group_fd = fd[i][counter][thread_index];
+
+		event_array[nr_poll].fd = fd[i][counter][thread_index];
+		event_array[nr_poll].events = POLLIN;
+		nr_poll++;
+
+		mmap_array[i][counter][thread_index].counter = counter;
+		mmap_array[i][counter][thread_index].prev = 0;
+		mmap_array[i][counter][thread_index].mask = mmap_pages*page_size - 1;
+		mmap_array[i][counter][thread_index].base = mmap(NULL, (mmap_pages+1)*page_size,
+				PROT_READ, MAP_SHARED, fd[i][counter][thread_index], 0);
+		if (mmap_array[i][counter][thread_index].base == MAP_FAILED)
+			die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 	}
-	assert(fd[i][counter] >= 0);
-	fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
-
-	/*
-	 * First counter acts as the group leader:
-	 */
-	if (group && group_fd == -1)
-		group_fd = fd[i][counter];
-
-	event_array[nr_poll].fd = fd[i][counter];
-	event_array[nr_poll].events = POLLIN;
-	nr_poll++;
-
-	mmap_array[i][counter].counter = counter;
-	mmap_array[i][counter].prev = 0;
-	mmap_array[i][counter].mask = mmap_pages*page_size - 1;
-	mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-			PROT_READ, MAP_SHARED, fd[i][counter], 0);
-	if (mmap_array[i][counter].base == MAP_FAILED)
-		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 }
 
 static int __cmd_top(void)
@@ -1231,8 +1245,8 @@ static int __cmd_top(void)
 	if (session == NULL)
 		return -ENOMEM;
 
-	if (target_pid != -1)
-		event__synthesize_thread(target_pid, event__process, session);
+	if (target_tid != -1)
+		event__synthesize_thread(target_tid, event__process, session);
 	else
 		event__synthesize_threads(event__process, session);
 
@@ -1243,7 +1257,7 @@ static int __cmd_top(void)
 	}
 
 	/* Wait for a minimal set of events before starting the snapshot */
-	poll(event_array, nr_poll, 100);
+	poll(&event_array[0], nr_poll, 100);
 
 	perf_session__mmap_read(session);
 
@@ -1286,7 +1300,9 @@ static const struct option options[] = {
 	OPT_INTEGER('c', "count", &default_interval,
 		    "event period to sample"),
 	OPT_INTEGER('p', "pid", &target_pid,
-		    "profile events on existing pid"),
+		    "profile events on existing process id"),
+	OPT_INTEGER('t', "tid", &target_tid,
+		    "profile events on existing thread id"),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 			    "system-wide collection from all CPUs"),
 	OPT_INTEGER('C', "CPU", &profile_cpu,
@@ -1327,6 +1343,7 @@ static const struct option options[] = {
 int cmd_top(int argc, const char **argv, const char *prefix __used)
 {
 	int counter;
+	int i,j;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
 
@@ -1334,8 +1351,39 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 	if (argc)
 		usage_with_options(top_usage, options);
 
+	if (target_pid != -1) {
+		target_tid = target_pid;
+		thread_num = find_all_tid(target_pid, &all_tids);
+		if (thread_num <= 0) {
+			fprintf(stderr, "Can't find all threads of pid %d\n",
+				target_pid);
+			usage_with_options(top_usage, options);
+		}
+	} else {
+		all_tids=malloc(sizeof(pid_t));
+		if (!all_tids)
+			return -ENOMEM;
+
+		all_tids[0] = target_tid;
+		thread_num = 1;
+	}
+
+	for (i = 0; i < MAX_NR_CPUS; i++) {
+		for (j = 0; j < MAX_COUNTERS; j++) {
+			fd[i][j] = malloc(sizeof(int)*thread_num);
+			mmap_array[i][j] = malloc(
+				sizeof(struct mmap_data)*thread_num);
+			if (!fd[i][j] || !mmap_array[i][j])
+				return -ENOMEM;
+		}
+	}
+	event_array = malloc(
+		sizeof(struct pollfd)*MAX_NR_CPUS*MAX_COUNTERS*thread_num);
+	if (!event_array)
+		return -ENOMEM;
+
 	/* CPU and PID are mutually exclusive */
-	if (target_pid != -1 && profile_cpu != -1) {
+	if (target_tid > 0 && profile_cpu != -1) {
 		printf("WARNING: PID switch overriding CPU\n");
 		sleep(1);
 		profile_cpu = -1;
@@ -1376,7 +1424,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 		attrs[counter].sample_period = default_interval;
 	}
 
-	if (target_pid != -1 || profile_cpu != -1)
+	if (target_tid != -1 || profile_cpu != -1)
 		nr_cpus = 1;
 	else
 		nr_cpus = read_cpu_map();
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index fa968312ee7..ea6506234d5 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -7,6 +7,37 @@
 #include "util.h"
 #include "debug.h"
 
+int find_all_tid(int pid, pid_t ** all_tid)
+{
+	char name[256];
+	int items;
+	struct dirent **namelist = NULL;
+	int ret = 0;
+	int i;
+
+	sprintf(name, "/proc/%d/task", pid);
+	items = scandir(name, &namelist, NULL, NULL);
+	if (items <= 0)
+                return -ENOENT;
+	*all_tid = malloc(sizeof(pid_t) * items);
+	if (!*all_tid) {
+		ret = -ENOMEM;
+		goto failure;
+	}
+
+	for (i = 0; i < items; i++)
+		(*all_tid)[i] = atoi(namelist[i]->d_name);
+
+	ret = items;
+
+failure:
+	for (i=0; i<items; i++)
+		free(namelist[i]);
+	free(namelist);
+
+	return ret;
+}
+
 void map_groups__init(struct map_groups *self)
 {
 	int i;
@@ -348,3 +379,4 @@ struct symbol *map_groups__find_symbol(struct map_groups *self,
 
 	return NULL;
 }
+
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index dcf70303e58..a81426a891b 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -23,6 +23,7 @@ struct thread {
 	int			comm_len;
 };
 
+int find_all_tid(int pid, pid_t ** all_tid);
 void map_groups__init(struct map_groups *self);
 int thread__set_comm(struct thread *self, const char *comm);
 int thread__comm_len(struct thread *self);
-- 
cgit v1.2.3


From d674cd1963129b70bc5f631c51fb30fb73213fb2 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed, 17 Mar 2010 13:37:00 +0300
Subject: x86, apic: Allow to use certain functions without APIC built-in
 support

In case even if the kernel is configured so that
no APIC support is built-in we still may allow
to use certain apic functions as dummy calls.

In particular we start using it in perf-events code.

Note that this is not that same as NOOP apic driver (which
is used if APIC support is present but no physical APIC is
available), this is for the case when we don't have apic code
compiled in at all.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <20100317104356.011052632@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apic.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index b4ac2cdcb64..1fa03e04ae4 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -373,6 +373,7 @@ extern atomic_t init_deasserted;
 extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
 #endif
 
+#ifdef CONFIG_X86_LOCAL_APIC
 static inline u32 apic_read(u32 reg)
 {
 	return apic->read(reg);
@@ -403,10 +404,19 @@ static inline u32 safe_apic_wait_icr_idle(void)
 	return apic->safe_wait_icr_idle();
 }
 
+#else /* CONFIG_X86_LOCAL_APIC */
+
+static inline u32 apic_read(u32 reg) { return 0; }
+static inline void apic_write(u32 reg, u32 val) { }
+static inline u64 apic_icr_read(void) { return 0; }
+static inline void apic_icr_write(u32 low, u32 high) { }
+static inline void apic_wait_icr_idle(void) { }
+static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
+
+#endif /* CONFIG_X86_LOCAL_APIC */
 
 static inline void ack_APIC_irq(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	/*
 	 * ack_APIC_irq() actually gets compiled as a single instruction
 	 * ... yummie.
@@ -414,7 +424,6 @@ static inline void ack_APIC_irq(void)
 
 	/* Docs say use 0 for future compatibility */
 	apic_write(APIC_EOI, 0);
-#endif
 }
 
 static inline unsigned default_get_apic_id(unsigned long x)
-- 
cgit v1.2.3


From 7335f75e9ca166044e38a96abad422d8e6e364b5 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed, 17 Mar 2010 13:37:01 +0300
Subject: x86, perf: Use apic_write unconditionally

Since apic_write() maps to a plain noop in the !CONFIG_X86_LOCAL_APIC
case we're safe to remove this conditional compilation and clean up
the code a bit.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: fweisbec@gmail.com
Cc: acme@redhat.com
Cc: eranian@google.com
Cc: peterz@infradead.org
LKML-Reference: <20100317104356.232371479@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c    | 4 ----
 arch/x86/kernel/cpu/perf_event_p4.c | 2 --
 2 files changed, 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c97d5b52d12..14eca80918d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1136,7 +1136,6 @@ void set_perf_event_pending(void)
 
 void perf_events_lapic_init(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	if (!x86_pmu.apic || !x86_pmu_initialized())
 		return;
 
@@ -1144,7 +1143,6 @@ void perf_events_lapic_init(void)
 	 * Always use NMI for PMU
 	 */
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
 }
 
 static int __kprobes
@@ -1168,9 +1166,7 @@ perf_event_nmi_handler(struct notifier_block *self,
 
 	regs = args->regs;
 
-#ifdef CONFIG_X86_LOCAL_APIC
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
 	/*
 	 * Can't rely on the handled return value to say it was our NMI, two
 	 * events could trigger 'simultaneously' raising two back-to-back NMIs.
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index a11ce73a93c..0367889b4ae 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -363,10 +363,8 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 	}
 
 	if (handled) {
-#ifdef CONFIG_X86_LOCAL_APIC
 		/* p4 quirk: unmask it again */
 		apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
-#endif
 		inc_irq_stat(apic_perf_irqs);
 	}
 
-- 
cgit v1.2.3


From 55632770d7298835645489828af87f854c47749c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 18 Mar 2010 16:51:16 +0100
Subject: perf events: Fix false positive build warning with older GCC's

gcc 4.2.1 produces:

 util/probe-event.c: In function 'add_perf_probe_events':
 util/probe-event.c:883: warning: 'tev' may be used uninitialized in this function
 make: *** [util/probe-event.o] Error 1

Newer GCCs get this right.

To work it around, initialize the variable to NULL so that older GCCs see
it as initialized too.

Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100316220612.32050.33806.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/probe-event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index f3332698058..c6603f3bb43 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -880,7 +880,7 @@ static void __add_kprobe_trace_events(struct perf_probe_event *pev,
 				      int ntevs, bool allow_suffix)
 {
 	int i, fd;
-	struct kprobe_trace_event *tev;
+	struct kprobe_trace_event *tev = NULL;
 	char buf[64];
 	const char *event, *group;
 	struct strlist *namelist;
-- 
cgit v1.2.3


From f34edbc1cdb0f8f83d94e1d668dd6e41abf0defb Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Thu, 18 Mar 2010 18:33:07 +0800
Subject: perf, x86: Add a key to simplify template lookup in Pentium-4 PMU

Currently, we use opcode(Event and Event-Selector) + emask to
look up template in p4_templates.

But cache events (L1-dcache-load-misses, LLC-load-misses, etc)
use the same event(P4_REPLAY_EVENT) to do the counting, ie, they
have the same opcode and emask. So we can not use current lookup
mechanism to find the template for cache events.

This patch introduces a "key", which is the index into
p4_templates. The low 12 bits of CCCR are reserved, so we can
hide the "key" in the low 12 bits of hwc->config.

We extract the key from hwc->config and then quickly find the
template.

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1268908387.13901.127.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h |  5 ++-
 arch/x86/kernel/cpu/perf_event_p4.c  | 86 ++++++++++++++----------------------
 2 files changed, 38 insertions(+), 53 deletions(-)

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index b842b3238e4..7d3406a2773 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -65,6 +65,7 @@
 #define P4_CCCR_THREAD_SINGLE		0x00010000U
 #define P4_CCCR_THREAD_BOTH		0x00020000U
 #define P4_CCCR_THREAD_ANY		0x00030000U
+#define P4_CCCR_RESERVED		0x00000fffU
 
 /* Non HT mask */
 #define P4_CCCR_MASK				\
@@ -116,7 +117,7 @@
 #define p4_config_pack_escr(v)		(((u64)(v)) << 32)
 #define p4_config_pack_cccr(v)		(((u64)(v)) & 0xffffffffULL)
 #define p4_config_unpack_escr(v)	(((u64)(v)) >> 32)
-#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xffffffffULL)
+#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xfffff000ULL)
 
 #define p4_config_unpack_emask(v)			\
 	({						\
@@ -126,6 +127,8 @@
 		t;					\
 	})
 
+#define p4_config_unpack_key(v)		(((u64)(v)) & P4_CCCR_RESERVED)
+
 #define P4_CONFIG_HT_SHIFT		63
 #define P4_CONFIG_HT			(1ULL << P4_CONFIG_HT_SHIFT)
 
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 0367889b4ae..3e97ed3904c 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -18,6 +18,7 @@ struct p4_event_template {
 	u32 opcode;			/* ESCR event + CCCR selector */
 	u64 config;			/* packed predefined bits */
 	int dep;			/* upstream dependency event index */
+	int key;			/* index into p4_templates */
 	unsigned int emask;		/* ESCR EventMask */
 	unsigned int escr_msr[2];	/* ESCR MSR for this event */
 	unsigned int cntr[2];		/* counter index (offset) */
@@ -39,38 +40,31 @@ static DEFINE_PER_CPU(struct p4_pmu_res, p4_pmu_config);
  */
 struct p4_event_template p4_templates[] = {
 	[0] = {
-		.opcode	= P4_UOP_TYPE,
-		.config	= 0,
-		.dep	= -1,
-		.emask	=
-			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
-			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
-		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
-		.cntr		= { 16, 17 },
-	},
-	[1] = {
 		.opcode	= P4_GLOBAL_POWER_EVENTS,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 0,
 		.emask	=
 			P4_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING),
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
 		.cntr		= { 0, 2 },
 	},
-	[2] = {
+	[1] = {
 		.opcode	= P4_INSTR_RETIRED,
 		.config	= 0,
 		.dep	= -1, /* needs front-end tagging */
+		.key	= 1,
 		.emask	=
 			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG)	|
 			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG),
 		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
 		.cntr		= { 12, 14 },
 	},
-	[3] = {
+	[2] = {
 		.opcode	= P4_BSQ_CACHE_REFERENCE,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 2,
 		.emask	=
 			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)	|
 			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)	|
@@ -81,10 +75,11 @@ struct p4_event_template p4_templates[] = {
 		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
 		.cntr		= { 0, 2 },
 	},
-	[4] = {
+	[3] = {
 		.opcode	= P4_BSQ_CACHE_REFERENCE,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 3,
 		.emask	=
 			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)	|
 			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)	|
@@ -92,10 +87,11 @@ struct p4_event_template p4_templates[] = {
 		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
 		.cntr		= { 0, 3 },
 	},
-	[5] = {
+	[4] = {
 		.opcode	= P4_RETIRED_BRANCH_TYPE,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 4,
 		.emask	=
 			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL)	|
 			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL)		|
@@ -104,48 +100,38 @@ struct p4_event_template p4_templates[] = {
 		.escr_msr	= { MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1 },
 		.cntr		= { 4, 6 },
 	},
-	[6] = {
+	[5] = {
 		.opcode	= P4_MISPRED_BRANCH_RETIRED,
 		.config	= 0,
 		.dep	= -1,
+		.key	= 5,
 		.emask	=
 			P4_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS),
 		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
 		.cntr		= { 12, 14 },
 	},
-	[7] = {
+	[6] = {
 		.opcode	= P4_FSB_DATA_ACTIVITY,
 		.config	= p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
 		.dep	= -1,
+		.key	= 6,
 		.emask	=
 			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV)	|
 			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN),
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
 		.cntr		= { 0, 2 },
 	},
-};
-
-static struct p4_event_template *p4_event_map[PERF_COUNT_HW_MAX] = {
-	/* non-halted CPU clocks */
-	[PERF_COUNT_HW_CPU_CYCLES]		= &p4_templates[1],
-
-	/* retired instructions: dep on tagging the FSB */
-	[PERF_COUNT_HW_INSTRUCTIONS]		= &p4_templates[2],
-
-	/* cache hits */
-	[PERF_COUNT_HW_CACHE_REFERENCES]	= &p4_templates[3],
-
-	/* cache misses */
-	[PERF_COUNT_HW_CACHE_MISSES]		= &p4_templates[4],
-
-	/* branch instructions retired */
-	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= &p4_templates[5],
-
-	/* mispredicted branches retired */
-	[PERF_COUNT_HW_BRANCH_MISSES]		= &p4_templates[6],
-
-	/* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
-	[PERF_COUNT_HW_BUS_CYCLES]		= &p4_templates[7],
+	[7] = {
+		.opcode	= P4_UOP_TYPE,
+		.config	= 0,
+		.dep	= -1,
+		.key	= 7,
+		.emask	=
+			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
+			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
+		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+		.cntr		= { 16, 17 },
+	},
 };
 
 static u64 p4_pmu_event_map(int hw_event)
@@ -153,11 +139,11 @@ static u64 p4_pmu_event_map(int hw_event)
 	struct p4_event_template *tpl;
 	u64 config;
 
-	if (hw_event > ARRAY_SIZE(p4_event_map)) {
+	if (hw_event > ARRAY_SIZE(p4_templates)) {
 		printk_once(KERN_ERR "PMU: Incorrect event index\n");
 		return 0;
 	}
-	tpl = p4_event_map[hw_event];
+	tpl = &p4_templates[hw_event];
 
 	/*
 	 * fill config up according to
@@ -167,6 +153,7 @@ static u64 p4_pmu_event_map(int hw_event)
 	config |= p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(tpl->opcode) << P4_EVNTSEL_EVENT_SHIFT);
 	config |= p4_config_pack_escr(tpl->emask << P4_EVNTSEL_EVENTMASK_SHIFT);
 	config |= p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(tpl->opcode) << P4_CCCR_ESCR_SELECT_SHIFT);
+	config |= p4_config_pack_cccr(hw_event & P4_CCCR_RESERVED);
 
 	/* on HT machine we need a special bit */
 	if (p4_ht_active() && p4_ht_thread(raw_smp_processor_id()))
@@ -187,17 +174,12 @@ static inline int p4_pmu_emask_match(unsigned int dst, unsigned int src)
 
 static struct p4_event_template *p4_pmu_template_lookup(u64 config)
 {
-	u32 opcode = p4_config_unpack_opcode(config);
-	unsigned int emask = p4_config_unpack_emask(config);
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(p4_templates); i++) {
-		if (opcode == p4_templates[i].opcode &&
-			p4_pmu_emask_match(emask, p4_templates[i].emask))
-			return &p4_templates[i];
-	}
+	int key = p4_config_unpack_key(config);
 
-	return NULL;
+	if (key < ARRAY_SIZE(p4_templates))
+		return &p4_templates[key];
+	else
+		return NULL;
 }
 
 /*
@@ -564,7 +546,7 @@ static __initconst struct x86_pmu p4_pmu = {
 	.perfctr		= MSR_P4_BPU_PERFCTR0,
 	.event_map		= p4_pmu_event_map,
 	.raw_event		= p4_pmu_raw_event,
-	.max_events		= ARRAY_SIZE(p4_event_map),
+	.max_events		= ARRAY_SIZE(p4_templates),
 	.get_event_constraints	= x86_get_event_constraints,
 	/*
 	 * IF HT disabled we may need to use all
-- 
cgit v1.2.3


From cb7d6b5053e86598735d9af19930f5929f007b7f Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Thu, 18 Mar 2010 18:33:12 +0800
Subject: perf, x86: Add cache events for the Pentium-4 PMU

Move the HT bit setting code from p4_pmu_event_map to
p4_hw_config. So the cache events can get HT bit set correctly.

Tested on my P4 desktop, below 6 cache events work:

 L1-dcache-load-misses
 LLC-load-misses
 dTLB-load-misses
 dTLB-store-misses
 iTLB-loads
 iTLB-load-misses

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1268908392.13901.128.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/msr-index.h     |   2 +
 arch/x86/include/asm/perf_event_p4.h |  10 +++
 arch/x86/kernel/cpu/perf_event_p4.c  | 153 +++++++++++++++++++++++++++++++++--
 3 files changed, 159 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1cd58cdbc03..aef562c0a64 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -357,6 +357,8 @@
 #define MSR_P4_U2L_ESCR0		0x000003b0
 #define MSR_P4_U2L_ESCR1		0x000003b1
 
+#define MSR_P4_PEBS_MATRIX_VERT		0x000003f2
+
 /* Intel Core-based CPU performance counters */
 #define MSR_CORE_PERF_FIXED_CTR0	0x00000309
 #define MSR_CORE_PERF_FIXED_CTR1	0x0000030a
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 7d3406a2773..871249cf4d2 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -708,4 +708,14 @@ enum P4_EVENTS_ATTR {
 	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, BOGUS, 1),
 };
 
+enum {
+	KEY_P4_L1D_OP_READ_RESULT_MISS,
+	KEY_P4_LL_OP_READ_RESULT_MISS,
+	KEY_P4_DTLB_OP_READ_RESULT_MISS,
+	KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+	KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+	KEY_P4_ITLB_OP_READ_RESULT_MISS,
+	KEY_P4_UOP_TYPE,
+};
+
 #endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 3e97ed3904c..b7bf9911198 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -19,6 +19,11 @@ struct p4_event_template {
 	u64 config;			/* packed predefined bits */
 	int dep;			/* upstream dependency event index */
 	int key;			/* index into p4_templates */
+	u64 msr;			/*
+					 * the high 32 bits set into MSR_IA32_PEBS_ENABLE and
+					 * the low 32 bits set into MSR_P4_PEBS_MATRIX_VERT
+					 * for cache events
+					 */
 	unsigned int emask;		/* ESCR EventMask */
 	unsigned int escr_msr[2];	/* ESCR MSR for this event */
 	unsigned int cntr[2];		/* counter index (offset) */
@@ -31,6 +36,67 @@ struct p4_pmu_res {
 
 static DEFINE_PER_CPU(struct p4_pmu_res, p4_pmu_config);
 
+#define P4_CACHE_EVENT_CONFIG(event, bit) \
+	p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(event) << P4_EVNTSEL_EVENT_SHIFT) | \
+	p4_config_pack_escr((event##_##bit) << P4_EVNTSEL_EVENTMASK_SHIFT) | \
+	p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(event) << P4_CCCR_ESCR_SELECT_SHIFT)
+
+static __initconst u64 p4_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+					/* 1stL_cache_load_miss_retired */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+					| KEY_P4_L1D_OP_READ_RESULT_MISS,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+					/* 2ndL_cache_load_miss_retired */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+					| KEY_P4_LL_OP_READ_RESULT_MISS,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+					/* DTLB_load_miss_retired */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+					| KEY_P4_DTLB_OP_READ_RESULT_MISS,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+					/* DTLB_store_miss_retired */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
+					| KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+					/* ITLB_reference.HIT */
+		[ C(RESULT_ACCESS) ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, HIT)
+					| KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+
+					/* ITLB_reference.MISS */
+		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, MISS)
+					| KEY_P4_ITLB_OP_READ_RESULT_MISS,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
 /*
  * WARN: CCCR1 doesn't have a working enable bit so try to not
  * use it if possible
@@ -121,11 +187,77 @@ struct p4_event_template p4_templates[] = {
 		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
 		.cntr		= { 0, 2 },
 	},
-	[7] = {
+	[KEY_P4_L1D_OP_READ_RESULT_MISS] = {
+		.opcode	= P4_REPLAY_EVENT,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= (u64)(1 << 0 | 1 << 24) << 32 | (1 << 0),
+		.key	= KEY_P4_L1D_OP_READ_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+		.cntr		= { 16, 17 },
+	},
+	[KEY_P4_LL_OP_READ_RESULT_MISS] = {
+		.opcode	= P4_REPLAY_EVENT,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= (u64)(1 << 1 | 1 << 24) << 32 | (1 << 0),
+		.key	= KEY_P4_LL_OP_READ_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+		.cntr		= { 16, 17 },
+	},
+	[KEY_P4_DTLB_OP_READ_RESULT_MISS] = {
+		.opcode	= P4_REPLAY_EVENT,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= (u64)(1 << 2 | 1 << 24) << 32 | (1 << 0),
+		.key	= KEY_P4_DTLB_OP_READ_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+		.cntr		= { 16, 17 },
+	},
+	[KEY_P4_DTLB_OP_WRITE_RESULT_MISS] = {
+		.opcode	= P4_REPLAY_EVENT,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= (u64)(1 << 2 | 1 << 24) << 32 | (1 << 1),
+		.key	= KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
+		.cntr		= { 16, 17 },
+	},
+	[KEY_P4_ITLB_OP_READ_RESULT_ACCESS] = {
+		.opcode	= P4_ITLB_REFERENCE,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= 0,
+		.key	= KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
+		.emask	=
+			P4_EVENT_ATTR(P4_ITLB_REFERENCE, HIT),
+		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+	[KEY_P4_ITLB_OP_READ_RESULT_MISS] = {
+		.opcode	= P4_ITLB_REFERENCE,
+		.config	= 0,
+		.dep	= -1,
+		.msr	= 0,
+		.key	= KEY_P4_ITLB_OP_READ_RESULT_MISS,
+		.emask	=
+			P4_EVENT_ATTR(P4_ITLB_REFERENCE, MISS),
+		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+		.cntr		= { 0, 2 },
+	},
+	[KEY_P4_UOP_TYPE] = {
 		.opcode	= P4_UOP_TYPE,
 		.config	= 0,
 		.dep	= -1,
-		.key	= 7,
+		.key	= KEY_P4_UOP_TYPE,
 		.emask	=
 			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
 			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
@@ -155,10 +287,6 @@ static u64 p4_pmu_event_map(int hw_event)
 	config |= p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(tpl->opcode) << P4_CCCR_ESCR_SELECT_SHIFT);
 	config |= p4_config_pack_cccr(hw_event & P4_CCCR_RESERVED);
 
-	/* on HT machine we need a special bit */
-	if (p4_ht_active() && p4_ht_thread(raw_smp_processor_id()))
-		config = p4_set_ht_bit(config);
-
 	return config;
 }
 
@@ -211,6 +339,10 @@ static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
 	/* Count user and OS events unless not requested to */
 	hwc->config |= p4_config_pack_escr(p4_default_escr_conf(cpu, attr->exclude_kernel,
 								attr->exclude_user));
+	/* on HT machine we need a special bit */
+	if (p4_ht_active() && p4_ht_thread(cpu))
+		hwc->config = p4_set_ht_bit(hwc->config);
+
 	return 0;
 }
 
@@ -271,6 +403,12 @@ static void p4_pmu_enable_event(struct perf_event *event)
 		pr_crit("%s: Wrong index: %d\n", __func__, hwc->idx);
 		return;
 	}
+
+	if (tpl->msr) {
+		(void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, tpl->msr >> 32);
+		(void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, tpl->msr & 0xffffffff);
+	}
+
 	escr_base = (u64)tpl->escr_msr[thread];
 
 	/*
@@ -577,6 +715,9 @@ static __init int p4_pmu_init(void)
 		return -ENODEV;
 	}
 
+	memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
+	       sizeof(hw_cache_event_ids));
+
 	pr_cont("Netburst events, ");
 
 	x86_pmu = p4_pmu;
-- 
cgit v1.2.3


From 4b24a88b35e15e04bd8f2c5dda65b5dc8ebca05f Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Wed, 17 Mar 2010 23:21:01 +0200
Subject: perf_events: Fix resource leak in x86 __hw_perf_event_init()

If reserve_pmc_hardware() succeeds but reserve_ds_buffers()
fails, then we need to release_pmc_hardware. It won't be done
by the destroy() callback because we return before setting it
in case of error.

Signed-off-by: Stephane Eranian <eranian@google.com>
Cc: <stable@kernel.org>
Cc: peterz@infradead.org
Cc: paulus@samba.org
Cc: davem@davemloft.net
Cc: fweisbec@gmail.com
Cc: robert.richter@amd.com
Cc: perfmon2-devel@lists.sf.net
LKML-Reference: <4ba1568b.15185e0a.182a.7802@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--
 arch/x86/kernel/cpu/perf_event.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
---
 arch/x86/kernel/cpu/perf_event.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 14eca80918d..f571f514de2 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -455,8 +455,11 @@ static int __hw_perf_event_init(struct perf_event *event)
 		if (atomic_read(&active_events) == 0) {
 			if (!reserve_pmc_hardware())
 				err = -EBUSY;
-			else
+			else {
 				err = reserve_ds_buffers();
+				if (err)
+					release_pmc_hardware();
+			}
 		}
 		if (!err)
 			atomic_inc(&active_events);
-- 
cgit v1.2.3


From 9c8c6bad3137112d2c7bf3d215b736ee4215fa74 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Fri, 19 Mar 2010 00:12:56 +0300
Subject: x86, perf: Fix few cosmetic dabs for P4 pmu (comments and
 constantify)

- A few ESCR have escaped fixing at previous attempt.
- p4_escr_map is read only, make it const.

Nothing serious.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100318211256.GH5062@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 4 ++--
 arch/x86/kernel/cpu/perf_event_p4.c  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 871249cf4d2..2a1a57f7153 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -401,13 +401,13 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 #define P4_RETIRED_MISPRED_BRANCH_TYPE	P4_EVENT_PACK(0x05, 0x02)
 	/*
 	 * MSR_P4_TBPU_ESCR0:	4, 5
-	 * MSR_P4_TBPU_ESCR0:	6, 7
+	 * MSR_P4_TBPU_ESCR1:	6, 7
 	 */
 
 #define P4_RETIRED_BRANCH_TYPE		P4_EVENT_PACK(0x04, 0x02)
 	/*
 	 * MSR_P4_TBPU_ESCR0:	4, 5
-	 * MSR_P4_TBPU_ESCR0:	6, 7
+	 * MSR_P4_TBPU_ESCR1:	6, 7
 	 */
 
 #define P4_RESOURCE_STALL		P4_EVENT_PACK(0x01, 0x01)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index b7bf9911198..b8a811ab760 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -545,7 +545,7 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 }
 
 /* ESCRs are not sequential in memory so we need a map */
-static unsigned int p4_escr_map[ARCH_P4_TOTAL_ESCR] = {
+static const unsigned int p4_escr_map[ARCH_P4_TOTAL_ESCR] = {
 	MSR_P4_ALF_ESCR0,	/*  0 */
 	MSR_P4_ALF_ESCR1,	/*  1 */
 	MSR_P4_BPU_ESCR0,	/*  2 */
-- 
cgit v1.2.3


From 40b7e05e17eef31ff30fe08dfc2424ef653a792c Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Fri, 19 Mar 2010 15:28:58 +0800
Subject: perf, x86: Fix key indexing in Pentium-4 PMU

Index 0-6 in p4_templates are reserved for common hardware
events. So p4_templates is arranged as below:

    0  -    6:  common hardware events
    7  -    N:  cache events
  N+1  -  ...:  other raw events

Reported-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1268983738.13901.142.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 2a1a57f7153..facf96186b2 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -709,7 +709,7 @@ enum P4_EVENTS_ATTR {
 };
 
 enum {
-	KEY_P4_L1D_OP_READ_RESULT_MISS,
+	KEY_P4_L1D_OP_READ_RESULT_MISS = PERF_COUNT_HW_MAX,
 	KEY_P4_LL_OP_READ_RESULT_MISS,
 	KEY_P4_DTLB_OP_READ_RESULT_MISS,
 	KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
-- 
cgit v1.2.3


From 4bd96a7a8185755b091233b16034c7436cbf57af Mon Sep 17 00:00:00 2001
From: Shane Wang <shane.wang@intel.com>
Date: Wed, 10 Mar 2010 14:36:10 +0800
Subject: x86, tboot: Add support for S3 memory integrity protection

This patch adds support for S3 memory integrity protection within an Intel(R)
TXT launched kernel, for all kernel and userspace memory.  All RAM used by the
kernel and userspace, as indicated by memory ranges of type E820_RAM and
E820_RESERVED_KERN in the e820 table, will be integrity protected.

The MAINTAINERS file is also updated to reflect the maintainers of the
TXT-related code.

All MACing is done in tboot, based on a complexity analysis and tradeoff.

v3: Compared with v2, this patch adds a check of array size in
tboot.c, and a note to specify which c/s of tboot supports this kind
of MACing in intel_txt.txt.

Signed-off-by: Shane Wang <shane.wang@intel.com>
LKML-Reference: <4B973DDA.6050902@intel.com>
Signed-off-by: Joseph Cihula <joseph.cihula@intel.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/intel_txt.txt | 16 +++++++++-------
 MAINTAINERS                 | 11 +++++++++++
 arch/x86/include/asm/e820.h |  7 ++++++-
 arch/x86/kernel/tboot.c     | 20 +++++++++++---------
 4 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt
index f40a1f03001..87c8990dbbd 100644
--- a/Documentation/intel_txt.txt
+++ b/Documentation/intel_txt.txt
@@ -161,13 +161,15 @@ o  In order to put a system into any of the sleep states after a TXT
       has been restored, it will restore the TPM PCRs and then
       transfer control back to the kernel's S3 resume vector.
       In order to preserve system integrity across S3, the kernel
-      provides tboot with a set of memory ranges (kernel
-      code/data/bss, S3 resume code, and AP trampoline) that tboot
-      will calculate a MAC (message authentication code) over and then
-      seal with the TPM.  On resume and once the measured environment
-      has been re-established, tboot will re-calculate the MAC and
-      verify it against the sealed value.  Tboot's policy determines
-      what happens if the verification fails.
+      provides tboot with a set of memory ranges (RAM and RESERVED_KERN
+      in the e820 table, but not any memory that BIOS might alter over
+      the S3 transition) that tboot will calculate a MAC (message
+      authentication code) over and then seal with the TPM. On resume
+      and once the measured environment has been re-established, tboot
+      will re-calculate the MAC and verify it against the sealed value.
+      Tboot's policy determines what happens if the verification fails.
+      Note that the c/s 194 of tboot which has the new MAC code supports
+      this.
 
 That's pretty much it for TXT support.
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 47cc449d89d..d3072cb8805 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2940,6 +2940,17 @@ S:	Odd Fixes
 F:	Documentation/networking/README.ipw2200
 F:	drivers/net/wireless/ipw2x00/ipw2200.*
 
+INTEL(R) TRUSTED EXECUTION TECHNOLOGY (TXT)
+M:	Joseph Cihula <joseph.cihula@intel.com>
+M:	Shane Wang <shane.wang@intel.com>
+L:	tboot-devel@lists.sourceforge.net
+W:	http://tboot.sourceforge.net
+T:	Mercurial http://www.bughost.org/repos.hg/tboot.hg
+S:	Supported
+F:	Documentation/intel_txt.txt
+F:	include/linux/tboot.h
+F:	arch/x86/kernel/tboot.c
+
 INTEL WIRELESS WIMAX CONNECTION 2400
 M:	Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>
 M:	linux-wimax@intel.com
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 0e22296790d..ec8a52d14ab 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -45,7 +45,12 @@
 #define E820_NVS	4
 #define E820_UNUSABLE	5
 
-/* reserved RAM used by kernel itself */
+/*
+ * reserved RAM used by kernel itself
+ * if CONFIG_INTEL_TXT is enabled, memory of this type will be
+ * included in the S3 integrity calculation and so should not include
+ * any memory that BIOS might alter over the S3 transition
+ */
 #define E820_RESERVED_KERN        128
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 86c9f91b48a..cc2c60474fd 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -175,6 +175,9 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
 	struct tboot_mac_region *mr;
 	phys_addr_t end = start + size;
 
+	if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS)
+		panic("tboot: Too many MAC regions\n");
+
 	if (start && size) {
 		mr = &tboot->mac_regions[tboot->num_mac_regions++];
 		mr->start = round_down(start, PAGE_SIZE);
@@ -184,18 +187,17 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
 
 static int tboot_setup_sleep(void)
 {
+	int i;
+
 	tboot->num_mac_regions = 0;
 
-	/* S3 resume code */
-	add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);
+	for (i = 0; i < e820.nr_map; i++) {
+		if ((e820.map[i].type != E820_RAM)
+		 && (e820.map[i].type != E820_RESERVED_KERN))
+			continue;
 
-#ifdef CONFIG_X86_TRAMPOLINE
-	/* AP trampoline code */
-	add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
-#endif
-
-	/* kernel code + data + bss */
-	add_mac_region(virt_to_phys(_text), _end - _text);
+		add_mac_region(e820.map[i].addr, e820.map[i].size);
+	}
 
 	tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
 
-- 
cgit v1.2.3


From 301fde27c7fcd0380b02b175d547e894ff65d78a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 22 Mar 2010 13:09:33 -0300
Subject: perf: Fix orphan callchain branches

Callchains have markers inside their capture to tell we
enter a context (kernel, user, ...).

Those are not displayed in the callchains but they are
incidentally an active part of the radix tree where
callchains are stored, just like any other address.

If we have the two following callchains:

addr1 -> addr2 -> user context -> addr3
addr1 -> addr2 -> user context -> addr4
addr1 -> addr2 -> addr 5

This is pretty common if addr1 and addr2 are part of an
interrupt path, addr3 and addr4 are user addresses and
addr5 is a kernel non interrupt path.

This will be stored as follows in the tree:

                   addr1
                   addr2
                   /   \
                  /     addr5
            user context
               /    \
             addr3  addr4

But we ignore the context markers in the report, hence
the addr3 and addr4 will appear as orphan branches:

    |--28.30%-- hrtimer_interrupt
    |          smp_apic_timer_interrupt
    |          apic_timer_interrupt
    |          |           <------------- here, no parent!
    |          |          |
    |          |          |--11.11%-- 0x7fae7bccb875
    |          |          |
    |          |          |--11.11%-- 0xffffffffff60013b
    |          |          |
    |          |          |--11.11%-- __pthread_mutex_lock_internal
    |          |          |
    |          |          |--11.11%-- __errno_location

Fix this by removing the context markers when we process the
callchains to the tree.

Reported-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1269274173-20328-1-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c |   6 ++-
 tools/perf/util/callchain.c | 109 ++++++++++++++++++++++++++++++++------------
 tools/perf/util/callchain.h |   4 +-
 tools/perf/util/hist.c      |   5 --
 4 files changed, 88 insertions(+), 36 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 1f9f8695f05..d609afbd1a3 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -83,6 +83,7 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 {
 	struct symbol **syms = NULL, *parent = NULL;
 	bool hit;
+	int err;
 	struct hist_entry *he;
 	struct event_stat_id *stats;
 	struct perf_event_attr *attr;
@@ -109,8 +110,11 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 	if (symbol_conf.use_callchain) {
 		if (!hit)
 			callchain_init(&he->callchain);
-		append_chain(&he->callchain, data->callchain, syms);
+		err = append_chain(&he->callchain, data->callchain, syms);
 		free(syms);
+
+		if (err)
+			return err;
 	}
 
 	return 0;
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index b3b71258272..883844eb4b0 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com>
+ * Copyright (C) 2009-2010, Frederic Weisbecker <fweisbec@gmail.com>
  *
  * Handle the callchains from the stream in an ad-hoc radix tree and then
  * sort them in an rbtree.
@@ -183,12 +183,23 @@ create_child(struct callchain_node *parent, bool inherit_children)
 	return new;
 }
 
+
+struct resolved_ip {
+	u64		ip;
+	struct symbol	*sym;
+};
+
+struct resolved_chain {
+	u64			nr;
+	struct resolved_ip	ips[0];
+};
+
+
 /*
  * Fill the node with callchain values
  */
 static void
-fill_node(struct callchain_node *node, struct ip_callchain *chain,
-	  int start, struct symbol **syms)
+fill_node(struct callchain_node *node, struct resolved_chain *chain, int start)
 {
 	unsigned int i;
 
@@ -200,8 +211,8 @@ fill_node(struct callchain_node *node, struct ip_callchain *chain,
 			perror("not enough memory for the code path tree");
 			return;
 		}
-		call->ip = chain->ips[i];
-		call->sym = syms[i];
+		call->ip = chain->ips[i].ip;
+		call->sym = chain->ips[i].sym;
 		list_add_tail(&call->list, &node->val);
 	}
 	node->val_nr = chain->nr - start;
@@ -210,13 +221,13 @@ fill_node(struct callchain_node *node, struct ip_callchain *chain,
 }
 
 static void
-add_child(struct callchain_node *parent, struct ip_callchain *chain,
-	  int start, struct symbol **syms)
+add_child(struct callchain_node *parent, struct resolved_chain *chain,
+	  int start)
 {
 	struct callchain_node *new;
 
 	new = create_child(parent, false);
-	fill_node(new, chain, start, syms);
+	fill_node(new, chain, start);
 
 	new->children_hit = 0;
 	new->hit = 1;
@@ -228,9 +239,8 @@ add_child(struct callchain_node *parent, struct ip_callchain *chain,
  * Then create another child to host the given callchain of new branch
  */
 static void
-split_add_child(struct callchain_node *parent, struct ip_callchain *chain,
-		struct callchain_list *to_split, int idx_parents, int idx_local,
-		struct symbol **syms)
+split_add_child(struct callchain_node *parent, struct resolved_chain *chain,
+		struct callchain_list *to_split, int idx_parents, int idx_local)
 {
 	struct callchain_node *new;
 	struct list_head *old_tail;
@@ -257,7 +267,7 @@ split_add_child(struct callchain_node *parent, struct ip_callchain *chain,
 	/* create a new child for the new branch if any */
 	if (idx_total < chain->nr) {
 		parent->hit = 0;
-		add_child(parent, chain, idx_total, syms);
+		add_child(parent, chain, idx_total);
 		parent->children_hit++;
 	} else {
 		parent->hit = 1;
@@ -265,32 +275,33 @@ split_add_child(struct callchain_node *parent, struct ip_callchain *chain,
 }
 
 static int
-__append_chain(struct callchain_node *root, struct ip_callchain *chain,
-	       unsigned int start, struct symbol **syms);
+__append_chain(struct callchain_node *root, struct resolved_chain *chain,
+	       unsigned int start);
 
 static void
-__append_chain_children(struct callchain_node *root, struct ip_callchain *chain,
-			struct symbol **syms, unsigned int start)
+__append_chain_children(struct callchain_node *root,
+			struct resolved_chain *chain,
+			unsigned int start)
 {
 	struct callchain_node *rnode;
 
 	/* lookup in childrens */
 	chain_for_each_child(rnode, root) {
-		unsigned int ret = __append_chain(rnode, chain, start, syms);
+		unsigned int ret = __append_chain(rnode, chain, start);
 
 		if (!ret)
 			goto inc_children_hit;
 	}
 	/* nothing in children, add to the current node */
-	add_child(root, chain, start, syms);
+	add_child(root, chain, start);
 
 inc_children_hit:
 	root->children_hit++;
 }
 
 static int
-__append_chain(struct callchain_node *root, struct ip_callchain *chain,
-	       unsigned int start, struct symbol **syms)
+__append_chain(struct callchain_node *root, struct resolved_chain *chain,
+	       unsigned int start)
 {
 	struct callchain_list *cnode;
 	unsigned int i = start;
@@ -302,13 +313,19 @@ __append_chain(struct callchain_node *root, struct ip_callchain *chain,
 	 * anywhere inside a function.
 	 */
 	list_for_each_entry(cnode, &root->val, list) {
+		struct symbol *sym;
+
 		if (i == chain->nr)
 			break;
-		if (cnode->sym && syms[i]) {
-			if (cnode->sym->start != syms[i]->start)
+
+		sym = chain->ips[i].sym;
+
+		if (cnode->sym && sym) {
+			if (cnode->sym->start != sym->start)
 				break;
-		} else if (cnode->ip != chain->ips[i])
+		} else if (cnode->ip != chain->ips[i].ip)
 			break;
+
 		if (!found)
 			found = true;
 		i++;
@@ -320,7 +337,7 @@ __append_chain(struct callchain_node *root, struct ip_callchain *chain,
 
 	/* we match only a part of the node. Split it and add the new chain */
 	if (i - start < root->val_nr) {
-		split_add_child(root, chain, cnode, start, i - start, syms);
+		split_add_child(root, chain, cnode, start, i - start);
 		return 0;
 	}
 
@@ -331,15 +348,51 @@ __append_chain(struct callchain_node *root, struct ip_callchain *chain,
 	}
 
 	/* We match the node and still have a part remaining */
-	__append_chain_children(root, chain, syms, i);
+	__append_chain_children(root, chain, i);
 
 	return 0;
 }
 
-void append_chain(struct callchain_node *root, struct ip_callchain *chain,
+static void
+filter_context(struct ip_callchain *old, struct resolved_chain *new,
+	       struct symbol **syms)
+{
+	int i, j = 0;
+
+	for (i = 0; i < (int)old->nr; i++) {
+		if (old->ips[i] >= PERF_CONTEXT_MAX)
+			continue;
+
+		new->ips[j].ip = old->ips[i];
+		new->ips[j].sym = syms[i];
+		j++;
+	}
+
+	new->nr = j;
+}
+
+
+int append_chain(struct callchain_node *root, struct ip_callchain *chain,
 		  struct symbol **syms)
 {
+	struct resolved_chain *filtered;
+
 	if (!chain->nr)
-		return;
-	__append_chain_children(root, chain, syms, 0);
+		return 0;
+
+	filtered = malloc(sizeof(*filtered) +
+			  chain->nr * sizeof(struct resolved_ip));
+	if (!filtered)
+		return -ENOMEM;
+
+	filter_context(chain, filtered, syms);
+
+	if (!filtered->nr)
+		goto end;
+
+	__append_chain_children(root, filtered, 0);
+end:
+	free(filtered);
+
+	return 0;
 }
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index ad4626de4c2..bbd76da27f2 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -56,6 +56,6 @@ static inline u64 cumul_hits(struct callchain_node *node)
 }
 
 int register_callchain_param(struct callchain_param *param);
-void append_chain(struct callchain_node *root, struct ip_callchain *chain,
-		  struct symbol **syms);
+int append_chain(struct callchain_node *root, struct ip_callchain *chain,
+		 struct symbol **syms);
 #endif	/* __PERF_CALLCHAIN_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index c37da8b8857..5843a9c572a 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -328,8 +328,6 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 						   left_margin);
 		i = 0;
 		list_for_each_entry(chain, &child->val, list) {
-			if (chain->ip >= PERF_CONTEXT_MAX)
-				continue;
 			ret += ipchain__fprintf_graph(fp, chain, depth,
 						      new_depth_mask, i++,
 						      new_total,
@@ -368,9 +366,6 @@ static size_t callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 	int ret = 0;
 
 	list_for_each_entry(chain, &self->val, list) {
-		if (chain->ip >= PERF_CONTEXT_MAX)
-			continue;
-
 		if (!i++ && sort__first_dimension == SORT_SYM)
 			continue;
 
-- 
cgit v1.2.3


From f3a1f0ea9432ec395cd112f42201e8e523c07bc5 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 22 Mar 2010 13:10:25 -0300
Subject: perf newt: Properly restore the screen when error exiting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Show an OK message box with the last message sent via pr_err,
etc.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269274229-20442-1-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/perf.c       |  4 ++--
 tools/perf/util/cache.h |  4 ++--
 tools/perf/util/newt.c  | 19 +++++++++++++------
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index d2de8393a33..2826e702986 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -264,11 +264,11 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
 	set_debugfs_path();
 
 	status = p->fn(argc, argv, prefix);
+	exit_browser(status);
+
 	if (status)
 		return status & 0xff;
 
-	exit_browser();
-
 	/* Somebody closed stdout? */
 	if (fstat(fileno(stdout), &st))
 		return 0;
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index 47b12a3d11b..4b9aab7f040 100644
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -77,10 +77,10 @@ static inline void setup_browser(void)
 {
 	setup_pager();
 }
-static inline void exit_browser(void) {}
+static inline void exit_browser(bool wait_for_ok __used) {}
 #else
 void setup_browser(void);
-void exit_browser(void);
+void exit_browser(bool wait_for_ok);
 #endif
 
 extern const char *editor_program;
diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 2d19e7a3e6e..3c2ef95d940 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -170,18 +170,20 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 	newtFormDestroy(form);
 }
 
+static char browser__last_msg[1024];
+
 int browser__show_help(const char *format, va_list ap)
 {
 	int ret;
 	static int backlog;
-	static char msg[1024];
 
-        ret = vsnprintf(msg + backlog, sizeof(msg) - backlog, format, ap);
+        ret = vsnprintf(browser__last_msg + backlog,
+			sizeof(browser__last_msg) - backlog, format, ap);
 	backlog += ret;
 
-	if (msg[backlog - 1] == '\n') {
+	if (browser__last_msg[backlog - 1] == '\n') {
 		newtPopHelpLine();
-		newtPushHelpLine(msg);
+		newtPushHelpLine(browser__last_msg);
 		newtRefresh();
 		backlog = 0;
 	}
@@ -200,8 +202,13 @@ void setup_browser(void)
 	newtPushHelpLine(" ");
 }
 
-void exit_browser(void)
+void exit_browser(bool wait_for_ok)
 {
-	if (use_browser)
+	if (use_browser) {
+		if (wait_for_ok) {
+			char title[] = "Fatal Error", ok[] = "Ok";
+			newtWinMessage(title, ok, browser__last_msg);
+		}
 		newtFinished();
+	}
 }
-- 
cgit v1.2.3


From 4b4da7f76660ea8b5aa45615165c48f62167ffa8 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 22 Mar 2010 13:10:26 -0300
Subject: perf probe: Cleanup debuginfo related code

Cleanup debuginfo related code to eliminate fragile code which
pointed by Ingo (Thanks!).
1) Invert logic of NO_DWARF_SUPPORT to DWARF_SUPPORT.
2) For removing assymetric/local variable ifdefs, introduce
  more helper functions.
3) Change options order to reduce the number of ifdefs.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1269274229-20442-2-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Makefile            |   3 +-
 tools/perf/builtin-probe.c     |  28 ++--
 tools/perf/util/probe-event.c  | 343 ++++++++++++++++++++++-------------------
 tools/perf/util/probe-finder.h |   4 +-
 4 files changed, 200 insertions(+), 178 deletions(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 0abd25ee595..c9918182715 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -506,9 +506,8 @@ endif
 
 ifneq ($(shell sh -c "(echo '\#include <dwarf.h>'; echo '\#include <libdw.h>'; echo 'int main(void) { Dwarf *dbg; dbg = dwarf_begin(0, DWARF_C_READ); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -I/usr/include/elfutils -ldw -lelf -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
 	msg := $(warning No libdw.h found or old libdw.h found, disables dwarf support. Please install elfutils-devel/elfutils-dev);
-	BASIC_CFLAGS += -DNO_DWARF_SUPPORT
 else
-	BASIC_CFLAGS += -I/usr/include/elfutils
+	BASIC_CFLAGS += -I/usr/include/elfutils -DDWARF_SUPPORT
 	EXTLIBS += -lelf -ldw
 	LIB_OBJS += util/probe-finder.o
 endif
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index e0dafd9dfeb..cf2ffa5a384 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -109,7 +109,7 @@ static int opt_del_probe_event(const struct option *opt __used,
 	return 0;
 }
 
-#ifndef NO_DWARF_SUPPORT
+#ifdef DWARF_SUPPORT
 static int opt_show_lines(const struct option *opt __used,
 			  const char *str, int unset __used)
 {
@@ -126,7 +126,7 @@ static const char * const probe_usage[] = {
 	"perf probe [<options>] --add 'PROBEDEF' [--add 'PROBEDEF' ...]",
 	"perf probe [<options>] --del '[GROUP:]EVENT' ...",
 	"perf probe --list",
-#ifndef NO_DWARF_SUPPORT
+#ifdef DWARF_SUPPORT
 	"perf probe --line 'LINEDESC'",
 #endif
 	NULL
@@ -135,20 +135,16 @@ static const char * const probe_usage[] = {
 static const struct option options[] = {
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show parsed arguments, etc)"),
-#ifndef NO_DWARF_SUPPORT
-	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
-		   "file", "vmlinux pathname"),
-#endif
 	OPT_BOOLEAN('l', "list", &params.list_events,
 		    "list up current probe events"),
 	OPT_CALLBACK('d', "del", NULL, "[GROUP:]EVENT", "delete a probe event.",
 		opt_del_probe_event),
 	OPT_CALLBACK('a', "add", NULL,
-#ifdef NO_DWARF_SUPPORT
-		"[EVENT=]FUNC[+OFF|%return] [ARG ...]",
-#else
+#ifdef DWARF_SUPPORT
 		"[EVENT=]FUNC[@SRC][+OFF|%return|:RL|;PT]|SRC:AL|SRC;PT"
 		" [ARG ...]",
+#else
+		"[EVENT=]FUNC[+OFF|%return] [ARG ...]",
 #endif
 		"probe point definition, where\n"
 		"\t\tGROUP:\tGroup name (optional)\n"
@@ -156,23 +152,25 @@ static const struct option options[] = {
 		"\t\tFUNC:\tFunction name\n"
 		"\t\tOFF:\tOffset from function entry (in byte)\n"
 		"\t\t%return:\tPut the probe at function return\n"
-#ifdef NO_DWARF_SUPPORT
-		"\t\tARG:\tProbe argument (only \n"
-#else
+#ifdef DWARF_SUPPORT
 		"\t\tSRC:\tSource code path\n"
 		"\t\tRL:\tRelative line number from function entry.\n"
 		"\t\tAL:\tAbsolute line number in file.\n"
 		"\t\tPT:\tLazy expression of line code.\n"
 		"\t\tARG:\tProbe argument (local variable name or\n"
-#endif
 		"\t\t\tkprobe-tracer argument format.)\n",
+#else
+		"\t\tARG:\tProbe argument (kprobe-tracer argument format.)\n",
+#endif
 		opt_add_probe_event),
 	OPT_BOOLEAN('f', "force", &params.force_add, "forcibly add events"
 		    " with existing name"),
-#ifndef NO_DWARF_SUPPORT
+#ifdef DWARF_SUPPORT
 	OPT_CALLBACK('L', "line", NULL,
 		     "FUNC[:RLN[+NUM|:RLN2]]|SRC:ALN[+NUM|:ALN2]",
 		     "Show source code lines.", opt_show_lines),
+	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
+		   "file", "vmlinux pathname"),
 #endif
 	OPT__DRY_RUN(&probe_event_dry_run),
 	OPT_END()
@@ -211,7 +209,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 		return 0;
 	}
 
-#ifndef NO_DWARF_SUPPORT
+#ifdef DWARF_SUPPORT
 	if (params.show_lines) {
 		if (params.nevents != 0 || params.dellist) {
 			pr_warning("  Error: Don't use --line with"
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index c6603f3bb43..3fc0be741b8 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -42,7 +42,8 @@
 #include "color.h"
 #include "symbol.h"
 #include "thread.h"
-#include "parse-events.h"  /* For debugfs_path */
+#include "trace-event.h"	/* For __unused */
+#include "parse-events.h"	/* For debugfs_path */
 #include "probe-event.h"
 #include "probe-finder.h"
 
@@ -70,10 +71,11 @@ static int e_snprintf(char *str, size_t size, const char *format, ...)
 	return ret;
 }
 
+static char *synthesize_perf_probe_point(struct perf_probe_point *pp);
 static struct map_groups kmap_groups;
 static struct map *kmaps[MAP__NR_TYPES];
 
-/* Initialize symbol maps for vmlinux */
+/* Initialize symbol maps and path of vmlinux */
 static void init_vmlinux(void)
 {
 	symbol_conf.sort_by_name = true;
@@ -89,7 +91,7 @@ static void init_vmlinux(void)
 		die("Failed to create kernel maps.");
 }
 
-#ifndef NO_DWARF_SUPPORT
+#ifdef DWARF_SUPPORT
 static int open_vmlinux(void)
 {
 	if (map__load(kmaps[MAP__FUNCTION], NULL) < 0) {
@@ -99,6 +101,176 @@ static int open_vmlinux(void)
 	pr_debug("Try to open %s\n", kmaps[MAP__FUNCTION]->dso->long_name);
 	return open(kmaps[MAP__FUNCTION]->dso->long_name, O_RDONLY);
 }
+
+static void convert_to_perf_probe_point(struct kprobe_trace_point *tp,
+					struct perf_probe_point *pp)
+{
+	struct symbol *sym;
+	int fd, ret = 0;
+
+	sym = map__find_symbol_by_name(kmaps[MAP__FUNCTION],
+				       tp->symbol, NULL);
+	if (sym) {
+		fd = open_vmlinux();
+		ret = find_perf_probe_point(fd, sym->start + tp->offset, pp);
+		close(fd);
+	}
+	if (ret <= 0) {
+		pp->function = xstrdup(tp->symbol);
+		pp->offset = tp->offset;
+	}
+	pp->retprobe = tp->retprobe;
+}
+
+/* Try to find perf_probe_event with debuginfo */
+static int try_to_find_kprobe_trace_events(struct perf_probe_event *pev,
+					   struct kprobe_trace_event **tevs)
+{
+	bool need_dwarf = perf_probe_event_need_dwarf(pev);
+	int fd, ntevs;
+
+	fd = open_vmlinux();
+	if (fd < 0) {
+		if (need_dwarf)
+			die("Could not open debuginfo file.");
+
+		pr_debug("Could not open vmlinux. Try to use symbols.\n");
+		return 0;
+	}
+
+	/* Searching trace events corresponding to probe event */
+	ntevs = find_kprobe_trace_events(fd, pev, tevs);
+	close(fd);
+
+	if (ntevs > 0)	/* Succeeded to find trace events */
+		return ntevs;
+
+	if (ntevs == 0)	/* No error but failed to find probe point. */
+		die("Probe point '%s' not found. - probe not added.",
+		    synthesize_perf_probe_point(&pev->point));
+
+	/* Error path */
+	if (need_dwarf) {
+		if (ntevs == -ENOENT)
+			pr_warning("No dwarf info found in the vmlinux - "
+				"please rebuild with CONFIG_DEBUG_INFO=y.\n");
+		die("Could not analyze debuginfo.");
+	}
+	pr_debug("An error occurred in debuginfo analysis."
+		 " Try to use symbols.\n");
+	return 0;
+
+}
+
+#define LINEBUF_SIZE 256
+#define NR_ADDITIONAL_LINES 2
+
+static void show_one_line(FILE *fp, unsigned int l, bool skip, bool show_num)
+{
+	char buf[LINEBUF_SIZE];
+	const char *color = PERF_COLOR_BLUE;
+
+	if (fgets(buf, LINEBUF_SIZE, fp) == NULL)
+		goto error;
+	if (!skip) {
+		if (show_num)
+			fprintf(stdout, "%7u  %s", l, buf);
+		else
+			color_fprintf(stdout, color, "         %s", buf);
+	}
+
+	while (strlen(buf) == LINEBUF_SIZE - 1 &&
+	       buf[LINEBUF_SIZE - 2] != '\n') {
+		if (fgets(buf, LINEBUF_SIZE, fp) == NULL)
+			goto error;
+		if (!skip) {
+			if (show_num)
+				fprintf(stdout, "%s", buf);
+			else
+				color_fprintf(stdout, color, "%s", buf);
+		}
+	}
+	return;
+error:
+	if (feof(fp))
+		die("Source file is shorter than expected.");
+	else
+		die("File read error: %s", strerror(errno));
+}
+
+/*
+ * Show line-range always requires debuginfo to find source file and
+ * line number.
+ */
+void show_line_range(struct line_range *lr)
+{
+	unsigned int l = 1;
+	struct line_node *ln;
+	FILE *fp;
+	int fd, ret;
+
+	/* Search a line range */
+	init_vmlinux();
+	fd = open_vmlinux();
+	if (fd < 0)
+		die("Could not open debuginfo file.");
+	ret = find_line_range(fd, lr);
+	if (ret <= 0)
+		die("Source line is not found.\n");
+	close(fd);
+
+	setup_pager();
+
+	if (lr->function)
+		fprintf(stdout, "<%s:%d>\n", lr->function,
+			lr->start - lr->offset);
+	else
+		fprintf(stdout, "<%s:%d>\n", lr->file, lr->start);
+
+	fp = fopen(lr->path, "r");
+	if (fp == NULL)
+		die("Failed to open %s: %s", lr->path, strerror(errno));
+	/* Skip to starting line number */
+	while (l < lr->start)
+		show_one_line(fp, l++, true, false);
+
+	list_for_each_entry(ln, &lr->line_list, list) {
+		while (ln->line > l)
+			show_one_line(fp, (l++) - lr->offset, false, false);
+		show_one_line(fp, (l++) - lr->offset, false, true);
+	}
+
+	if (lr->end == INT_MAX)
+		lr->end = l + NR_ADDITIONAL_LINES;
+	while (l < lr->end && !feof(fp))
+		show_one_line(fp, (l++) - lr->offset, false, false);
+
+	fclose(fp);
+}
+
+#else	/* !DWARF_SUPPORT */
+
+static void convert_to_perf_probe_point(struct kprobe_trace_point *tp,
+					struct perf_probe_point *pp)
+{
+	pp->function = xstrdup(tp->symbol);
+	pp->offset = tp->offset;
+	pp->retprobe = tp->retprobe;
+}
+
+static int try_to_find_kprobe_trace_events(struct perf_probe_event *pev,
+				struct kprobe_trace_event **tevs __unused)
+{
+	if (perf_probe_event_need_dwarf(pev))
+		die("Debuginfo-analysis is not supported");
+	return 0;
+}
+
+void show_line_range(struct line_range *lr __unused)
+{
+	die("Debuginfo-analysis is not supported");
+}
+
 #endif
 
 void parse_line_range_desc(const char *arg, struct line_range *lr)
@@ -592,32 +764,14 @@ void convert_to_perf_probe_event(struct kprobe_trace_event *tev,
 {
 	char buf[64];
 	int i;
-#ifndef NO_DWARF_SUPPORT
-	struct symbol *sym;
-	int fd, ret = 0;
-
-	sym = map__find_symbol_by_name(kmaps[MAP__FUNCTION],
-				       tev->point.symbol, NULL);
-	if (sym) {
-		fd = open_vmlinux();
-		ret = find_perf_probe_point(fd, sym->start + tev->point.offset,
-					    &pev->point);
-		close(fd);
-	}
-	if (ret <= 0) {
-		pev->point.function = xstrdup(tev->point.symbol);
-		pev->point.offset = tev->point.offset;
-	}
-#else
-	/* Convert trace_point to probe_point */
-	pev->point.function = xstrdup(tev->point.symbol);
-	pev->point.offset = tev->point.offset;
-#endif
-	pev->point.retprobe = tev->point.retprobe;
 
+	/* Convert event/group name */
 	pev->event = xstrdup(tev->event);
 	pev->group = xstrdup(tev->group);
 
+	/* Convert trace_point to probe_point */
+	convert_to_perf_probe_point(&tev->point, &pev->point);
+
 	/* Convert trace_arg to probe_arg */
 	pev->nargs = tev->nargs;
 	pev->args = xzalloc(sizeof(struct perf_probe_arg) * pev->nargs);
@@ -944,52 +1098,13 @@ static int convert_to_kprobe_trace_events(struct perf_probe_event *pev,
 					  struct kprobe_trace_event **tevs)
 {
 	struct symbol *sym;
-	bool need_dwarf;
-#ifndef NO_DWARF_SUPPORT
-	int fd;
-#endif
 	int ntevs = 0, i;
 	struct kprobe_trace_event *tev;
 
-	need_dwarf = perf_probe_event_need_dwarf(pev);
-
-	if (need_dwarf)
-#ifdef NO_DWARF_SUPPORT
-		die("Debuginfo-analysis is not supported");
-#else	/* !NO_DWARF_SUPPORT */
-		pr_debug("Some probes require debuginfo.\n");
-
-	fd = open_vmlinux();
-	if (fd < 0) {
-		if (need_dwarf)
-			die("Could not open debuginfo file.");
-
-		pr_debug("Could not open vmlinux/module file."
-			 " Try to use symbols.\n");
-		goto end_dwarf;
-	}
-
-	/* Searching probe points */
-	ntevs = find_kprobe_trace_events(fd, pev, tevs);
-
-	if (ntevs > 0)	/* Found */
-		goto found;
-
-	if (ntevs == 0)	/* No error but failed to find probe point. */
-		die("Probe point '%s' not found. - probe not added.",
-		    synthesize_perf_probe_point(&pev->point));
-
-	/* Error path */
-	if (need_dwarf) {
-		if (ntevs == -ENOENT)
-			pr_warning("No dwarf info found in the vmlinux - please rebuild with CONFIG_DEBUG_INFO=y.\n");
-		die("Could not analyze debuginfo.");
-	}
-	pr_debug("An error occurred in debuginfo analysis."
-		 " Try to use symbols.\n");
-
-end_dwarf:
-#endif /* !NO_DWARF_SUPPORT */
+	/* Convert perf_probe_event with debuginfo */
+	ntevs = try_to_find_kprobe_trace_events(pev, tevs);
+	if (ntevs > 0)
+		return ntevs;
 
 	/* Allocate trace event buffer */
 	ntevs = 1;
@@ -1012,10 +1127,7 @@ end_dwarf:
 	if (!sym)
 		die("Kernel symbol \'%s\' not found - probe not added.",
 		    tev->point.symbol);
-#ifndef NO_DWARF_SUPPORT
-found:
-	close(fd);
-#endif
+
 	return ntevs;
 }
 
@@ -1133,90 +1245,3 @@ void del_perf_probe_events(struct strlist *dellist)
 	close(fd);
 }
 
-#define LINEBUF_SIZE 256
-#define NR_ADDITIONAL_LINES 2
-
-static void show_one_line(FILE *fp, unsigned int l, bool skip, bool show_num)
-{
-	char buf[LINEBUF_SIZE];
-	const char *color = PERF_COLOR_BLUE;
-
-	if (fgets(buf, LINEBUF_SIZE, fp) == NULL)
-		goto error;
-	if (!skip) {
-		if (show_num)
-			fprintf(stdout, "%7u  %s", l, buf);
-		else
-			color_fprintf(stdout, color, "         %s", buf);
-	}
-
-	while (strlen(buf) == LINEBUF_SIZE - 1 &&
-	       buf[LINEBUF_SIZE - 2] != '\n') {
-		if (fgets(buf, LINEBUF_SIZE, fp) == NULL)
-			goto error;
-		if (!skip) {
-			if (show_num)
-				fprintf(stdout, "%s", buf);
-			else
-				color_fprintf(stdout, color, "%s", buf);
-		}
-	}
-	return;
-error:
-	if (feof(fp))
-		die("Source file is shorter than expected.");
-	else
-		die("File read error: %s", strerror(errno));
-}
-
-void show_line_range(struct line_range *lr)
-{
-	unsigned int l = 1;
-	struct line_node *ln;
-	FILE *fp;
-#ifndef NO_DWARF_SUPPORT
-	int fd, ret;
-#endif
-
-	/* Search a line range */
-	init_vmlinux();
-#ifndef NO_DWARF_SUPPORT
-	fd = open_vmlinux();
-	if (fd < 0)
-		die("Could not open debuginfo file.");
-	ret = find_line_range(fd, lr);
-	if (ret <= 0)
-		die("Source line is not found.\n");
-	close(fd);
-#endif
-
-	setup_pager();
-
-	if (lr->function)
-		fprintf(stdout, "<%s:%d>\n", lr->function,
-			lr->start - lr->offset);
-	else
-		fprintf(stdout, "<%s:%d>\n", lr->file, lr->start);
-
-	fp = fopen(lr->path, "r");
-	if (fp == NULL)
-		die("Failed to open %s: %s", lr->path, strerror(errno));
-	/* Skip to starting line number */
-	while (l < lr->start)
-		show_one_line(fp, l++, true, false);
-
-	list_for_each_entry(ln, &lr->line_list, list) {
-		while (ln->line > l)
-			show_one_line(fp, (l++) - lr->offset, false, false);
-		show_one_line(fp, (l++) - lr->offset, false, true);
-	}
-
-	if (lr->end == INT_MAX)
-		lr->end = l + NR_ADDITIONAL_LINES;
-	while (l < lr->end && !feof(fp))
-		show_one_line(fp, (l++) - lr->offset, false, false);
-
-	fclose(fp);
-}
-
-
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index 2f2307d4139..3564f22954f 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -15,7 +15,7 @@ static inline int is_c_varname(const char *name)
 	return isalpha(name[0]) || name[0] == '_';
 }
 
-#ifndef NO_DWARF_SUPPORT
+#ifdef DWARF_SUPPORT
 /* Find kprobe_trace_events specified by perf_probe_event from debuginfo */
 extern int find_kprobe_trace_events(int fd, struct perf_probe_event *pev,
 				    struct kprobe_trace_event **tevs);
@@ -57,6 +57,6 @@ struct line_finder {
 	int			found;
 };
 
-#endif /* NO_DWARF_SUPPORT */
+#endif /* DWARF_SUPPORT */
 
 #endif /*_PROBE_FINDER_H */
-- 
cgit v1.2.3


From ca721e45b39209415d2288dbac3667b26d9d1def Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 22 Mar 2010 13:10:27 -0300
Subject: perf probe: Add NO_DWARF make option

Add NO_DWARF make option for testing build without libdw.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1269274229-20442-3-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index c9918182715..69036457761 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -150,6 +150,8 @@ all::
 # Define LDFLAGS=-static to build a static binary.
 #
 # Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds.
+#
+# Define NO_DWARF if you do not want debug-info analysis feature at all.
 
 PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
 	@$(SHELL_PATH) util/PERF-VERSION-GEN
@@ -507,10 +509,12 @@ endif
 ifneq ($(shell sh -c "(echo '\#include <dwarf.h>'; echo '\#include <libdw.h>'; echo 'int main(void) { Dwarf *dbg; dbg = dwarf_begin(0, DWARF_C_READ); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -I/usr/include/elfutils -ldw -lelf -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
 	msg := $(warning No libdw.h found or old libdw.h found, disables dwarf support. Please install elfutils-devel/elfutils-dev);
 else
+ifndef NO_DWARF
 	BASIC_CFLAGS += -I/usr/include/elfutils -DDWARF_SUPPORT
 	EXTLIBS += -lelf -ldw
 	LIB_OBJS += util/probe-finder.o
 endif
+endif
 
 ifneq ($(shell sh -c "(echo '\#include <newt.h>'; echo 'int main(void) { newtInit(); newtCls(); return newtFinished(); }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -lnewt -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
 	msg := $(warning newt not found, disables TUI support. Please install newt-devel or libnewt-dev);
-- 
cgit v1.2.3


From 084ab9f862416b2ddb4bb9804884de19bf09774d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 22 Mar 2010 13:10:28 -0300
Subject: perf stat: Better report failure to collect system wide stats
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before:

[acme@doppio linux-2.6-tip]$ perf stat -a sleep 1s

 Performance counter stats for 'sleep 1s':

  <not counted>  task-clock-msecs
  <not counted>  context-switches
  <not counted>  CPU-migrations
  <not counted>  page-faults
  <not counted>  cycles
  <not counted>  instructions
  <not counted>  branches
  <not counted>  branch-misses
  <not counted>  cache-references
  <not counted>  cache-misses

    1.016998463  seconds time elapsed

[acme@doppio linux-2.6-tip]$

Now:

[acme@doppio linux-2.6-tip]$ perf stat -a sleep 1s
No permission to collect system-wide stats.
Consider tweaking /proc/sys/kernel/perf_event_paranoid.
[acme@doppio linux-2.6-tip]$

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269274229-20442-4-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-stat.c | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index c92f90ff5a9..1036ca739e6 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -144,10 +144,11 @@ struct stats			runtime_branches_stats;
 #define ERR_PERF_OPEN \
 "Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n"
 
-static void create_perf_stat_counter(int counter)
+static int create_perf_stat_counter(int counter)
 {
 	struct perf_event_attr *attr = attrs + counter;
 	int thread;
+	int ncreated = 0;
 
 	if (scale)
 		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
@@ -159,9 +160,11 @@ static void create_perf_stat_counter(int counter)
 		for (cpu = 0; cpu < nr_cpus; cpu++) {
 			fd[cpu][counter][0] = sys_perf_event_open(attr,
 					-1, cpumap[cpu], -1, 0);
-			if (fd[cpu][counter][0] < 0 && verbose)
-				fprintf(stderr, ERR_PERF_OPEN, counter,
-					fd[cpu][counter][0], strerror(errno));
+			if (fd[cpu][counter][0] < 0)
+				pr_debug(ERR_PERF_OPEN, counter,
+					 fd[cpu][counter][0], strerror(errno));
+			else
+				++ncreated;
 		}
 	} else {
 		attr->inherit	     = inherit;
@@ -172,12 +175,16 @@ static void create_perf_stat_counter(int counter)
 		for (thread = 0; thread < thread_num; thread++) {
 			fd[0][counter][thread] = sys_perf_event_open(attr,
 				all_tids[thread], -1, -1, 0);
-			if (fd[0][counter][thread] < 0 && verbose)
-				fprintf(stderr, ERR_PERF_OPEN, counter,
-					fd[0][counter][thread],
-					strerror(errno));
+			if (fd[0][counter][thread] < 0)
+				pr_debug(ERR_PERF_OPEN, counter,
+					 fd[0][counter][thread],
+					 strerror(errno));
+			else
+				++ncreated;
 		}
 	}
+
+	return ncreated;
 }
 
 /*
@@ -264,7 +271,7 @@ static int run_perf_stat(int argc __used, const char **argv)
 {
 	unsigned long long t0, t1;
 	int status = 0;
-	int counter;
+	int counter, ncreated = 0;
 	int child_ready_pipe[2], go_pipe[2];
 	const bool forks = (argc > 0);
 	char buf;
@@ -324,7 +331,16 @@ static int run_perf_stat(int argc __used, const char **argv)
 	}
 
 	for (counter = 0; counter < nr_counters; counter++)
-		create_perf_stat_counter(counter);
+		ncreated += create_perf_stat_counter(counter);
+
+	if (ncreated == 0) {
+		pr_err("No permission to collect %sstats.\n"
+		       "Consider tweaking /proc/sys/kernel/perf_event_paranoid.\n",
+		       system_wide ? "system-wide " : "");
+		if (child_pid != -1)
+			kill(child_pid, SIGTERM);
+		return -1;
+	}
 
 	/*
 	 * Enable counters and exec the command:
@@ -587,7 +603,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 		status = run_perf_stat(argc, argv);
 	}
 
-	print_stat(argc, argv);
+	if (status != -1)
+		print_stat(argc, argv);
 
 	return status;
 }
-- 
cgit v1.2.3


From 478b0973bf8c90db3677fbb8d812e2bdefc43d9b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 22 Mar 2010 13:10:29 -0300
Subject: perf tools: Exit browser before printing usage when unkown option
 passed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If not the screen will get garbled when using newt.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269274229-20442-5-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/parse-options.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c
index efebd5b476b..79dfa0c34b3 100644
--- a/tools/perf/util/parse-options.c
+++ b/tools/perf/util/parse-options.c
@@ -500,6 +500,7 @@ int usage_with_options_internal(const char * const *usagestr,
 void usage_with_options(const char * const *usagestr,
 			const struct option *opts)
 {
+	exit_browser(false);
 	usage_with_options_internal(usagestr, opts, 0);
 	exit(129);
 }
-- 
cgit v1.2.3


From 4ded2b250f1fbba4e414d17dc55ee513485c0aa1 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 22 Mar 2010 17:52:49 -0300
Subject: perf report: Implement Newt callgraphs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Starts collapsed, allows annotating by pressing 'A' or 'a' on
the symbol, be it the top level one or any of the symbols in the
chains.

It (ab)uses the only tree widget in newt, that is actually a
checkbox tree that we use with just one option ('.'), end result
is usable but we really need to create a custom widget tree so
that we can use the data structures we have (hist_entry rb_tree
+ callchain rb_tree + lists), so that we reduce the memory
footprint by not creating a mirror set of data structures in the
newtCheckboxTree widget.

Thanks to Frédéric Weisbacker for fixing the orphanage problem
in 301fde2, without that we were tripping a newt bug (fix
already sent to newt's maintainer).

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269291169-29820-1-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/newt.c | 291 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 259 insertions(+), 32 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 3c2ef95d940..12b229bb9dc 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -28,11 +28,197 @@ static newtComponent newt_form__new(void)
 	return self;
 }
 
+/*
+ * When debugging newt problems it was useful to be able to "unroll"
+ * the calls to newtCheckBoxTreeAdd{Array,Item}, so that we can generate
+ * a source file with the sequence of calls to these methods, to then
+ * tweak the arrays to get the intended results, so I'm keeping this code
+ * here, may be useful again in the future.
+ */
+#undef NEWT_DEBUG
+
+static void newt_checkbox_tree__add(newtComponent tree, const char *str,
+				    void *priv, int *indexes)
+{
+#ifdef NEWT_DEBUG
+	/* Print the newtCheckboxTreeAddArray to tinker with its index arrays */
+	int i = 0, len = 40 - strlen(str);
+
+	fprintf(stderr,
+		"\tnewtCheckboxTreeAddItem(tree, %*.*s\"%s\", (void *)%p, 0, ",
+		len, len, " ", str, priv);
+	while (indexes[i] != NEWT_ARG_LAST) {
+		if (indexes[i] != NEWT_ARG_APPEND)
+			fprintf(stderr, " %d,", indexes[i]);
+		else
+			fprintf(stderr, " %s,", "NEWT_ARG_APPEND");
+		++i;
+	}
+	fprintf(stderr, " %s", " NEWT_ARG_LAST);\n");
+	fflush(stderr);
+#endif
+	newtCheckboxTreeAddArray(tree, str, priv, 0, indexes);
+}
+
+static char *callchain_list__sym_name(struct callchain_list *self,
+				      char *bf, size_t bfsize)
+{
+	if (self->sym)
+		return self->sym->name;
+
+	snprintf(bf, bfsize, "%#Lx", self->ip);
+	return bf;
+}
+
+static void __callchain__append_graph_browser(struct callchain_node *self,
+					      newtComponent tree, u64 total,
+					      int *indexes, int depth)
+{
+	struct rb_node *node;
+	u64 new_total, remaining;
+	int idx = 0;
+
+	if (callchain_param.mode == CHAIN_GRAPH_REL)
+		new_total = self->children_hit;
+	else
+		new_total = total;
+
+	remaining = new_total;
+	node = rb_first(&self->rb_root);
+	while (node) {
+		struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node);
+		struct rb_node *next = rb_next(node);
+		u64 cumul = cumul_hits(child);
+		struct callchain_list *chain;
+		int first = true, printed = 0;
+		int chain_idx = -1;
+		remaining -= cumul;
+
+		indexes[depth] = NEWT_ARG_APPEND;
+		indexes[depth + 1] = NEWT_ARG_LAST;
+
+		list_for_each_entry(chain, &child->val, list) {
+			char ipstr[BITS_PER_LONG / 4 + 1],
+			     *alloc_str = NULL;
+			const char *str = callchain_list__sym_name(chain, ipstr, sizeof(ipstr));
+
+			if (first) {
+				double percent = cumul * 100.0 / new_total;
+
+				first = false;
+				if (asprintf(&alloc_str, "%2.2f%% %s", percent, str) < 0)
+					str = "Not enough memory!";
+				else
+					str = alloc_str;
+			} else {
+				indexes[depth] = idx;
+				indexes[depth + 1] = NEWT_ARG_APPEND;
+				indexes[depth + 2] = NEWT_ARG_LAST;
+				++chain_idx;
+			}
+			newt_checkbox_tree__add(tree, str, chain->sym, indexes);
+			free(alloc_str);
+			++printed;
+		}
+
+		indexes[depth] = idx;
+		if (chain_idx != -1)
+			indexes[depth + 1] = chain_idx;
+		if (printed != 0)
+			++idx;
+		__callchain__append_graph_browser(child, tree, new_total, indexes,
+						  depth + (chain_idx != -1 ? 2 : 1));
+		node = next;
+	}
+}
+
+static void callchain__append_graph_browser(struct callchain_node *self,
+					    newtComponent tree, u64 total,
+					    int *indexes, int parent_idx)
+{
+	struct callchain_list *chain;
+	int i = 0;
+
+	indexes[1] = NEWT_ARG_APPEND;
+	indexes[2] = NEWT_ARG_LAST;
+
+	list_for_each_entry(chain, &self->val, list) {
+		char ipstr[BITS_PER_LONG / 4 + 1], *str;
+
+		if (chain->ip >= PERF_CONTEXT_MAX)
+			continue;
+
+		if (!i++ && sort__first_dimension == SORT_SYM)
+			continue;
+
+		str = callchain_list__sym_name(chain, ipstr, sizeof(ipstr));
+		newt_checkbox_tree__add(tree, str, chain->sym, indexes);
+	}
+
+	indexes[1] = parent_idx;
+	indexes[2] = NEWT_ARG_APPEND;
+	indexes[3] = NEWT_ARG_LAST;
+	__callchain__append_graph_browser(self, tree, total, indexes, 2);
+}
+
+static void hist_entry__append_callchain_browser(struct hist_entry *self,
+						 newtComponent tree, u64 total, int parent_idx)
+{
+	struct rb_node *rb_node;
+	int indexes[1024] = { [0] = parent_idx, };
+	int idx = 0;
+	struct callchain_node *chain;
+
+	rb_node = rb_first(&self->sorted_chain);
+	while (rb_node) {
+		chain = rb_entry(rb_node, struct callchain_node, rb_node);
+		switch (callchain_param.mode) {
+		case CHAIN_FLAT:
+			break;
+		case CHAIN_GRAPH_ABS: /* falldown */
+		case CHAIN_GRAPH_REL:
+			callchain__append_graph_browser(chain, tree, total, indexes, idx++);
+			break;
+		case CHAIN_NONE:
+		default:
+			break;
+		}
+		rb_node = rb_next(rb_node);
+	}
+}
+
+/*
+ * FIXME: get lib/string.c linked with perf somehow
+ */
+static char *skip_spaces(const char *str)
+{
+	while (isspace(*str))
+		++str;
+	return (char *)str;
+}
+
+static char *strim(char *s)
+{
+	size_t size;
+	char *end;
+
+	s = skip_spaces(s);
+	size = strlen(s);
+	if (!size)
+		return s;
+
+	end = s + size - 1;
+	while (end >= s && isspace(*end))
+		end--;
+	*(end + 1) = '\0';
+
+	return s;
+}
+
 static size_t hist_entry__append_browser(struct hist_entry *self,
-					 newtComponent listbox, u64 total)
+					 newtComponent tree, u64 total)
 {
-	char bf[1024];
-	size_t len;
+	char bf[1024], *s;
 	FILE *fp;
 
 	if (symbol_conf.exclude_other && !self->parent)
@@ -42,28 +228,46 @@ static size_t hist_entry__append_browser(struct hist_entry *self,
 	if (fp == NULL)
 		return 0;
 
-	len = hist_entry__fprintf(self, NULL, false, 0, fp, total);
-
+	hist_entry__fprintf(self, NULL, false, 0, fp, total);
 	fclose(fp);
-	newtListboxAppendEntry(listbox, bf, self);
-	return len;
+
+	/*
+	 * FIXME: We shouldn't need to trim, as the printing routines shouldn't
+	 * add spaces it in the first place, the stdio output routines should
+	 * call a __snprintf method instead of the current __print (that
+	 * actually is a __fprintf) one, but get the raw string and _then_ add
+	 * the newline, as this is a detail of stdio printing, not needed in
+	 * other UIs, e.g. newt.
+	 */
+	s = strim(bf);
+
+	if (symbol_conf.use_callchain) {
+		int indexes[2];
+
+		indexes[0] = NEWT_ARG_APPEND;
+		indexes[1] = NEWT_ARG_LAST;
+		newt_checkbox_tree__add(tree, s, self->sym, indexes);
+	} else
+		newtListboxAppendEntry(tree, s, self->sym);
+
+	return strlen(s);
 }
 
-static void hist_entry__annotate_browser(struct hist_entry *self)
+static void symbol__annotate_browser(const struct symbol *self)
 {
 	FILE *fp;
 	int cols, rows;
-	newtComponent form, listbox;
+	newtComponent form, tree;
 	struct newtExitStruct es;
 	char *str;
 	size_t line_len, max_line_len = 0;
 	size_t max_usable_width;
 	char *line = NULL;
 
-	if (self->sym == NULL)
+	if (self == NULL)
 		return;
 
-	if (asprintf(&str, "perf annotate %s 2>&1 | expand", self->sym->name) < 0)
+	if (asprintf(&str, "perf annotate %s 2>&1 | expand", self->name) < 0)
 		return;
 
 	fp = popen(str, "r");
@@ -72,7 +276,7 @@ static void hist_entry__annotate_browser(struct hist_entry *self)
 
 	newtPushHelpLine("Press ESC to exit");
 	newtGetScreenSize(&cols, &rows);
-	listbox = newtListbox(0, 0, rows - 5, NEWT_FLAG_SCROLL);
+	tree = newtListbox(0, 0, rows - 5, NEWT_FLAG_SCROLL);
 
 	while (!feof(fp)) {
 		if (getline(&line, &line_len, fp) < 0 || !line_len)
@@ -82,7 +286,7 @@ static void hist_entry__annotate_browser(struct hist_entry *self)
 
 		if (line_len > max_line_len)
 			max_line_len = line_len;
-		newtListboxAppendEntry(listbox, line, NULL);
+		newtListboxAppendEntry(tree, line, NULL);
 	}
 	fclose(fp);
 	free(line);
@@ -91,11 +295,11 @@ static void hist_entry__annotate_browser(struct hist_entry *self)
 	if (max_line_len > max_usable_width)
 		max_line_len = max_usable_width;
 
-	newtListboxSetWidth(listbox, max_line_len);
+	newtListboxSetWidth(tree, max_line_len);
 
-	newtCenteredWindow(max_line_len + 2, rows - 5, self->sym->name);
+	newtCenteredWindow(max_line_len + 2, rows - 5, self->name);
 	form = newt_form__new();
-	newtFormAddComponents(form, listbox, NULL);
+	newtFormAddComponents(form, tree, NULL);
 
 	newtFormRun(form, &es);
 	newtFormDestroy(form);
@@ -110,25 +314,27 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 {
 	struct sort_entry *se;
 	struct rb_node *nd;
+	char seq[] = ".";
 	unsigned int width;
 	char *col_width = symbol_conf.col_width_list_str;
-	int rows;
-	size_t max_len = 0;
+	int rows, cols, idx;
+	int max_len = 0;
 	char str[1024];
-	newtComponent form, listbox;
+	newtComponent form, tree;
 	struct newtExitStruct es;
 
 	snprintf(str, sizeof(str), "Samples: %Ld", session_total);
 	newtDrawRootText(0, 0, str);
 	newtPushHelpLine(helpline);
 
-	newtGetScreenSize(NULL, &rows);
-
-	form = newt_form__new();
+	newtGetScreenSize(&cols, &rows);
 
-	listbox = newtListbox(1, 1, rows - 2, (NEWT_FLAG_SCROLL |
-					       NEWT_FLAG_BORDER |
-					       NEWT_FLAG_RETURNEXIT));
+	if (symbol_conf.use_callchain)
+		tree = newtCheckboxTreeMulti(0, 0, rows - 5, seq,
+						NEWT_FLAG_SCROLL);
+	else
+		tree = newtListbox(0, 0, rows - 5, (NEWT_FLAG_SCROLL |
+						       NEWT_FLAG_RETURNEXIT));
 
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		if (se->elide)
@@ -147,27 +353,48 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 		}
 	}
 
+	idx = 0;
 	for (nd = rb_first(hists); nd; nd = rb_next(nd)) {
 		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
-		size_t len = hist_entry__append_browser(h, listbox, session_total);
+		int len = hist_entry__append_browser(h, tree, session_total);
 		if (len > max_len)
 			max_len = len;
+		if (symbol_conf.use_callchain) {
+			hist_entry__append_callchain_browser(h, tree, session_total, idx++);
+			if (idx > 3300)
+				break;
+		}
 	}
 
-	newtListboxSetWidth(listbox, max_len);
-	newtFormAddComponents(form, listbox, NULL);
+	if (max_len > cols)
+		max_len = cols - 3;
+
+	if (!symbol_conf.use_callchain)
+		newtListboxSetWidth(tree, max_len);
+
+	newtCenteredWindow(max_len + (symbol_conf.use_callchain ? 5 : 0),
+			   rows - 5, "Report");
+	form = newt_form__new();
+	newtFormAddHotKey(form, 'A');
+	newtFormAddHotKey(form, 'a');
+	newtFormAddComponents(form, tree, NULL);
 
 	while (1) {
-		struct hist_entry *selection;
+		const struct symbol *selection;
 
 		newtFormRun(form, &es);
-		if (es.reason == NEWT_EXIT_HOTKEY)
+		if (es.reason == NEWT_EXIT_HOTKEY &&
+		    toupper(es.u.key) != 'A')
 			break;
-		selection = newtListboxGetCurrent(listbox);
-		hist_entry__annotate_browser(selection);
+		if (!symbol_conf.use_callchain)
+			selection = newtListboxGetCurrent(tree);
+		else
+			selection = newtCheckboxTreeGetCurrent(tree);
+		symbol__annotate_browser(selection);
 	}
 
 	newtFormDestroy(form);
+	newtPopWindow();
 }
 
 static char browser__last_msg[1024];
-- 
cgit v1.2.3


From 88978e562302c836c1c4597700c79d971e93abc0 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 23 Mar 2010 14:33:58 -0300
Subject: perf archive: Explain how to use the generated tarball
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[root@doppio ~]# perf archive
 Now please run:

 $ tar xvf perf.data.tar.bz2 -C ~/.debug

 wherever you need to run 'perf report' on.
 [root@doppio ~]#

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269365638-10223-1-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/perf-archive.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/perf/perf-archive.sh b/tools/perf/perf-archive.sh
index 910468e6e01..2e7a4f417e2 100644
--- a/tools/perf/perf-archive.sh
+++ b/tools/perf/perf-archive.sh
@@ -30,4 +30,7 @@ done
 
 tar cfj $PERF_DATA.tar.bz2 -C $DEBUGDIR -T $MANIFEST
 rm -f $MANIFEST $BUILDIDS
+echo -e "Now please run:\n"
+echo -e "$ tar xvf $PERF_DATA.tar.bz2 -C ~/.debug\n"
+echo "wherever you need to run 'perf report' on."
 exit 0
-- 
cgit v1.2.3


From d814f30105798b6677ecb73ed61d691ff96dada9 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed, 24 Mar 2010 12:09:26 +0800
Subject: x86, perf: Add raw events support for the P4 PMU

The adding of raw event support lead to complete code
refactoring. I hope is became more readable then it was.

The list of changes:

1)  The 64bit config field is enough to hold all information we need
    to track event details. To achieve it we used *own* enum for
    events selection in ESCR register and map this key into proper
    value at moment of event enabling.

    For the same reason we use 12LSB bits in CCCR register -- to track
    which exactly cache trace event was requested. And we cear this bits
    at real 'write' moment.

2)  There is no per-cpu area reserved for P4 PMU anymore. We
    don't need it. All is held by config.

3)  Now we may use any available counter, ie we try to grab any
    possible counter.

v2:
  - Lin Ming reported the lack of ESCR selector in CCCR for cache events

v3:
  - Don't loose cache event codes at config unpacking procedure, we may
    need it one day so no obscure hack behind our back, better to clear
    reserved bits explicitly when needed (thanks Ming for pointing out)

  - Lin Ming fixed misplaced opcodes in cache events

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Tested-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1269403766.3409.6.camel@minggr.sh.intel.com>
[ v4: did a few whitespace fixlets ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 691 +++++++++++++++++---------------
 arch/x86/kernel/cpu/perf_event_p4.c  | 746 ++++++++++++++++++++---------------
 2 files changed, 806 insertions(+), 631 deletions(-)

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index facf96186b2..b05400a542f 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -15,38 +15,40 @@
  * perf-MSRs are not shared and every thread has its
  * own perf-MSRs set)
  */
-#define ARCH_P4_TOTAL_ESCR		(46)
-#define ARCH_P4_RESERVED_ESCR		(2) /* IQ_ESCR(0,1) not always present */
-#define ARCH_P4_MAX_ESCR		(ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
-#define ARCH_P4_MAX_CCCR		(18)
-#define ARCH_P4_MAX_COUNTER		(ARCH_P4_MAX_CCCR / 2)
-
-#define P4_EVNTSEL_EVENT_MASK		0x7e000000U
-#define P4_EVNTSEL_EVENT_SHIFT		25
-#define P4_EVNTSEL_EVENTMASK_MASK	0x01fffe00U
-#define P4_EVNTSEL_EVENTMASK_SHIFT	9
-#define P4_EVNTSEL_TAG_MASK		0x000001e0U
-#define P4_EVNTSEL_TAG_SHIFT		5
-#define P4_EVNTSEL_TAG_ENABLE		0x00000010U
-#define P4_EVNTSEL_T0_OS		0x00000008U
-#define P4_EVNTSEL_T0_USR		0x00000004U
-#define P4_EVNTSEL_T1_OS		0x00000002U
-#define P4_EVNTSEL_T1_USR		0x00000001U
+#define ARCH_P4_TOTAL_ESCR	(46)
+#define ARCH_P4_RESERVED_ESCR	(2) /* IQ_ESCR(0,1) not always present */
+#define ARCH_P4_MAX_ESCR	(ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
+#define ARCH_P4_MAX_CCCR	(18)
+#define ARCH_P4_MAX_COUNTER	(ARCH_P4_MAX_CCCR / 2)
+
+#define P4_ESCR_EVENT_MASK	0x7e000000U
+#define P4_ESCR_EVENT_SHIFT	25
+#define P4_ESCR_EVENTMASK_MASK	0x01fffe00U
+#define P4_ESCR_EVENTMASK_SHIFT	9
+#define P4_ESCR_TAG_MASK	0x000001e0U
+#define P4_ESCR_TAG_SHIFT	5
+#define P4_ESCR_TAG_ENABLE	0x00000010U
+#define P4_ESCR_T0_OS		0x00000008U
+#define P4_ESCR_T0_USR		0x00000004U
+#define P4_ESCR_T1_OS		0x00000002U
+#define P4_ESCR_T1_USR		0x00000001U
+
+#define P4_ESCR_EVENT(v)	((v) << P4_ESCR_EVENT_SHIFT)
+#define P4_ESCR_EMASK(v)	((v) << P4_ESCR_EVENTMASK_SHIFT)
+#define P4_ESCR_TAG(v)		((v) << P4_ESCR_TAG_SHIFT)
 
 /* Non HT mask */
-#define P4_EVNTSEL_MASK				\
-	(P4_EVNTSEL_EVENT_MASK		|	\
-	P4_EVNTSEL_EVENTMASK_MASK	|	\
-	P4_EVNTSEL_TAG_MASK		|	\
-	P4_EVNTSEL_TAG_ENABLE		|	\
-	P4_EVNTSEL_T0_OS		|	\
-	P4_EVNTSEL_T0_USR)
+#define P4_ESCR_MASK			\
+	(P4_ESCR_EVENT_MASK	|	\
+	P4_ESCR_EVENTMASK_MASK	|	\
+	P4_ESCR_TAG_MASK	|	\
+	P4_ESCR_TAG_ENABLE	|	\
+	P4_ESCR_T0_OS		|	\
+	P4_ESCR_T0_USR)
 
 /* HT mask */
-#define P4_EVNTSEL_MASK_HT			\
-	(P4_EVNTSEL_MASK		|	\
-	P4_EVNTSEL_T1_OS		|	\
-	P4_EVNTSEL_T1_USR)
+#define P4_ESCR_MASK_HT			\
+	(P4_ESCR_MASK |	P4_ESCR_T1_OS | P4_ESCR_T1_USR)
 
 #define P4_CCCR_OVF			0x80000000U
 #define P4_CCCR_CASCADE			0x40000000U
@@ -56,7 +58,6 @@
 #define P4_CCCR_EDGE			0x01000000U
 #define P4_CCCR_THRESHOLD_MASK		0x00f00000U
 #define P4_CCCR_THRESHOLD_SHIFT		20
-#define P4_CCCR_THRESHOLD(v)		((v) << P4_CCCR_THRESHOLD_SHIFT)
 #define P4_CCCR_COMPLEMENT		0x00080000U
 #define P4_CCCR_COMPARE			0x00040000U
 #define P4_CCCR_ESCR_SELECT_MASK	0x0000e000U
@@ -67,6 +68,13 @@
 #define P4_CCCR_THREAD_ANY		0x00030000U
 #define P4_CCCR_RESERVED		0x00000fffU
 
+#define P4_CCCR_THRESHOLD(v)		((v) << P4_CCCR_THRESHOLD_SHIFT)
+#define P4_CCCR_ESEL(v)			((v) << P4_CCCR_ESCR_SELECT_SHIFT)
+
+/* Custom bits in reerved CCCR area */
+#define P4_CCCR_CACHE_OPS_MASK		0x0000003fU
+
+
 /* Non HT mask */
 #define P4_CCCR_MASK				\
 	(P4_CCCR_OVF			|	\
@@ -81,25 +89,11 @@
 	P4_CCCR_ENABLE)
 
 /* HT mask */
-#define P4_CCCR_MASK_HT				\
-	(P4_CCCR_MASK			|	\
-	P4_CCCR_THREAD_ANY)
+#define P4_CCCR_MASK_HT	(P4_CCCR_MASK | P4_CCCR_THREAD_ANY)
 
-/*
- * format is 32 bit: ee ss aa aa
- * where
- *	ee - 8 bit event
- *	ss - 8 bit selector
- *	aa aa - 16 bits reserved for tags/attributes
- */
-#define P4_EVENT_PACK(event, selector)		(((event) << 24) | ((selector) << 16))
-#define P4_EVENT_UNPACK_EVENT(packed)		(((packed) >> 24) & 0xff)
-#define P4_EVENT_UNPACK_SELECTOR(packed)	(((packed) >> 16) & 0xff)
-#define P4_EVENT_PACK_ATTR(attr)		((attr))
-#define P4_EVENT_UNPACK_ATTR(packed)		((packed) & 0xffff)
-#define P4_MAKE_EVENT_ATTR(class, name, bit)	class##_##name = (1 << bit)
-#define P4_EVENT_ATTR(class, name)		class##_##name
-#define P4_EVENT_ATTR_STR(class, name)		__stringify(class##_##name)
+#define P4_GEN_ESCR_EMASK(class, name, bit)	\
+	class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
+#define P4_ESCR_EMASK_BIT(class, name)		class##__##name
 
 /*
  * config field is 64bit width and consists of
@@ -117,35 +111,29 @@
 #define p4_config_pack_escr(v)		(((u64)(v)) << 32)
 #define p4_config_pack_cccr(v)		(((u64)(v)) & 0xffffffffULL)
 #define p4_config_unpack_escr(v)	(((u64)(v)) >> 32)
-#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xfffff000ULL)
+#define p4_config_unpack_cccr(v)	(((u64)(v)) & 0xffffffffULL)
 
 #define p4_config_unpack_emask(v)			\
 	({						\
 		u32 t = p4_config_unpack_escr((v));	\
-		t  &= P4_EVNTSEL_EVENTMASK_MASK;	\
-		t >>= P4_EVNTSEL_EVENTMASK_SHIFT;	\
+		t = t &  P4_ESCR_EVENTMASK_MASK;	\
+		t = t >> P4_ESCR_EVENTMASK_SHIFT;	\
+		t;					\
+	})
+
+#define p4_config_unpack_event(v)			\
+	({						\
+		u32 t = p4_config_unpack_escr((v));	\
+		t = t &  P4_ESCR_EVENT_MASK;		\
+		t = t >> P4_ESCR_EVENT_SHIFT;		\
 		t;					\
 	})
 
-#define p4_config_unpack_key(v)		(((u64)(v)) & P4_CCCR_RESERVED)
+#define p4_config_unpack_cache_event(v)	(((u64)(v)) & P4_CCCR_CACHE_OPS_MASK)
 
 #define P4_CONFIG_HT_SHIFT		63
 #define P4_CONFIG_HT			(1ULL << P4_CONFIG_HT_SHIFT)
 
-static inline u32 p4_config_unpack_opcode(u64 config)
-{
-	u32 e, s;
-
-	/*
-	 * we don't care about HT presence here since
-	 * event opcode doesn't depend on it
-	 */
-	e = (p4_config_unpack_escr(config) & P4_EVNTSEL_EVENT_MASK) >> P4_EVNTSEL_EVENT_SHIFT;
-	s = (p4_config_unpack_cccr(config) & P4_CCCR_ESCR_SELECT_MASK) >> P4_CCCR_ESCR_SELECT_SHIFT;
-
-	return P4_EVENT_PACK(e, s);
-}
-
 static inline bool p4_is_event_cascaded(u64 config)
 {
 	u32 cccr = p4_config_unpack_cccr(config);
@@ -212,19 +200,73 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
 
 	if (!p4_ht_thread(cpu)) {
 		if (!exclude_os)
-			escr |= P4_EVNTSEL_T0_OS;
+			escr |= P4_ESCR_T0_OS;
 		if (!exclude_usr)
-			escr |= P4_EVNTSEL_T0_USR;
+			escr |= P4_ESCR_T0_USR;
 	} else {
 		if (!exclude_os)
-			escr |= P4_EVNTSEL_T1_OS;
+			escr |= P4_ESCR_T1_OS;
 		if (!exclude_usr)
-			escr |= P4_EVNTSEL_T1_USR;
+			escr |= P4_ESCR_T1_USR;
 	}
 
 	return escr;
 }
 
+enum P4_EVENTS {
+	P4_EVENT_TC_DELIVER_MODE,
+	P4_EVENT_BPU_FETCH_REQUEST,
+	P4_EVENT_ITLB_REFERENCE,
+	P4_EVENT_MEMORY_CANCEL,
+	P4_EVENT_MEMORY_COMPLETE,
+	P4_EVENT_LOAD_PORT_REPLAY,
+	P4_EVENT_STORE_PORT_REPLAY,
+	P4_EVENT_MOB_LOAD_REPLAY,
+	P4_EVENT_PAGE_WALK_TYPE,
+	P4_EVENT_BSQ_CACHE_REFERENCE,
+	P4_EVENT_IOQ_ALLOCATION,
+	P4_EVENT_IOQ_ACTIVE_ENTRIES,
+	P4_EVENT_FSB_DATA_ACTIVITY,
+	P4_EVENT_BSQ_ALLOCATION,
+	P4_EVENT_BSQ_ACTIVE_ENTRIES,
+	P4_EVENT_SSE_INPUT_ASSIST,
+	P4_EVENT_PACKED_SP_UOP,
+	P4_EVENT_PACKED_DP_UOP,
+	P4_EVENT_SCALAR_SP_UOP,
+	P4_EVENT_SCALAR_DP_UOP,
+	P4_EVENT_64BIT_MMX_UOP,
+	P4_EVENT_128BIT_MMX_UOP,
+	P4_EVENT_X87_FP_UOP,
+	P4_EVENT_TC_MISC,
+	P4_EVENT_GLOBAL_POWER_EVENTS,
+	P4_EVENT_TC_MS_XFER,
+	P4_EVENT_UOP_QUEUE_WRITES,
+	P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE,
+	P4_EVENT_RETIRED_BRANCH_TYPE,
+	P4_EVENT_RESOURCE_STALL,
+	P4_EVENT_WC_BUFFER,
+	P4_EVENT_B2B_CYCLES,
+	P4_EVENT_BNR,
+	P4_EVENT_SNOOP,
+	P4_EVENT_RESPONSE,
+	P4_EVENT_FRONT_END_EVENT,
+	P4_EVENT_EXECUTION_EVENT,
+	P4_EVENT_REPLAY_EVENT,
+	P4_EVENT_INSTR_RETIRED,
+	P4_EVENT_UOPS_RETIRED,
+	P4_EVENT_UOP_TYPE,
+	P4_EVENT_BRANCH_RETIRED,
+	P4_EVENT_MISPRED_BRANCH_RETIRED,
+	P4_EVENT_X87_ASSIST,
+	P4_EVENT_MACHINE_CLEAR,
+	P4_EVENT_INSTR_COMPLETED,
+};
+
+#define P4_OPCODE(event)		event##_OPCODE
+#define P4_OPCODE_ESEL(opcode)		((opcode & 0x00ff) >> 0)
+#define P4_OPCODE_EVNT(opcode)		((opcode & 0xff00) >> 8)
+#define P4_OPCODE_PACK(event, sel)	(((event) << 8) | sel)
+
 /*
  * Comments below the event represent ESCR restriction
  * for this event and counter index per ESCR
@@ -238,484 +280,515 @@ static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr)
  * working so that we should not use this CCCR and respective
  * counter as result
  */
-#define P4_TC_DELIVER_MODE		P4_EVENT_PACK(0x01, 0x01)
+enum P4_EVENT_OPCODES {
+	P4_OPCODE(P4_EVENT_TC_DELIVER_MODE)		= P4_OPCODE_PACK(0x01, 0x01),
 	/*
 	 * MSR_P4_TC_ESCR0:	4, 5
 	 * MSR_P4_TC_ESCR1:	6, 7
 	 */
 
-#define P4_BPU_FETCH_REQUEST		P4_EVENT_PACK(0x03, 0x00)
+	P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST)		= P4_OPCODE_PACK(0x03, 0x00),
 	/*
 	 * MSR_P4_BPU_ESCR0:	0, 1
 	 * MSR_P4_BPU_ESCR1:	2, 3
 	 */
 
-#define P4_ITLB_REFERENCE		P4_EVENT_PACK(0x18, 0x03)
+	P4_OPCODE(P4_EVENT_ITLB_REFERENCE)		= P4_OPCODE_PACK(0x18, 0x03),
 	/*
 	 * MSR_P4_ITLB_ESCR0:	0, 1
 	 * MSR_P4_ITLB_ESCR1:	2, 3
 	 */
 
-#define P4_MEMORY_CANCEL		P4_EVENT_PACK(0x02, 0x05)
+	P4_OPCODE(P4_EVENT_MEMORY_CANCEL)		= P4_OPCODE_PACK(0x02, 0x05),
 	/*
 	 * MSR_P4_DAC_ESCR0:	8, 9
 	 * MSR_P4_DAC_ESCR1:	10, 11
 	 */
 
-#define P4_MEMORY_COMPLETE		P4_EVENT_PACK(0x08, 0x02)
+	P4_OPCODE(P4_EVENT_MEMORY_COMPLETE)		= P4_OPCODE_PACK(0x08, 0x02),
 	/*
 	 * MSR_P4_SAAT_ESCR0:	8, 9
 	 * MSR_P4_SAAT_ESCR1:	10, 11
 	 */
 
-#define P4_LOAD_PORT_REPLAY		P4_EVENT_PACK(0x04, 0x02)
+	P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY)		= P4_OPCODE_PACK(0x04, 0x02),
 	/*
 	 * MSR_P4_SAAT_ESCR0:	8, 9
 	 * MSR_P4_SAAT_ESCR1:	10, 11
 	 */
 
-#define P4_STORE_PORT_REPLAY		P4_EVENT_PACK(0x05, 0x02)
+	P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY)		= P4_OPCODE_PACK(0x05, 0x02),
 	/*
 	 * MSR_P4_SAAT_ESCR0:	8, 9
 	 * MSR_P4_SAAT_ESCR1:	10, 11
 	 */
 
-#define P4_MOB_LOAD_REPLAY		P4_EVENT_PACK(0x03, 0x02)
+	P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY)		= P4_OPCODE_PACK(0x03, 0x02),
 	/*
 	 * MSR_P4_MOB_ESCR0:	0, 1
 	 * MSR_P4_MOB_ESCR1:	2, 3
 	 */
 
-#define P4_PAGE_WALK_TYPE		P4_EVENT_PACK(0x01, 0x04)
+	P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE)		= P4_OPCODE_PACK(0x01, 0x04),
 	/*
 	 * MSR_P4_PMH_ESCR0:	0, 1
 	 * MSR_P4_PMH_ESCR1:	2, 3
 	 */
 
-#define P4_BSQ_CACHE_REFERENCE		P4_EVENT_PACK(0x0c, 0x07)
+	P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE)		= P4_OPCODE_PACK(0x0c, 0x07),
 	/*
 	 * MSR_P4_BSU_ESCR0:	0, 1
 	 * MSR_P4_BSU_ESCR1:	2, 3
 	 */
 
-#define P4_IOQ_ALLOCATION		P4_EVENT_PACK(0x03, 0x06)
+	P4_OPCODE(P4_EVENT_IOQ_ALLOCATION)		= P4_OPCODE_PACK(0x03, 0x06),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_IOQ_ACTIVE_ENTRIES		P4_EVENT_PACK(0x1a, 0x06)
+	P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES)		= P4_OPCODE_PACK(0x1a, 0x06),
 	/*
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_FSB_DATA_ACTIVITY		P4_EVENT_PACK(0x17, 0x06)
+	P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY)		= P4_OPCODE_PACK(0x17, 0x06),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_BSQ_ALLOCATION		P4_EVENT_PACK(0x05, 0x07)
+	P4_OPCODE(P4_EVENT_BSQ_ALLOCATION)		= P4_OPCODE_PACK(0x05, 0x07),
 	/*
 	 * MSR_P4_BSU_ESCR0:	0, 1
 	 */
 
-#define P4_BSQ_ACTIVE_ENTRIES		P4_EVENT_PACK(0x06, 0x07)
+	P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES)		= P4_OPCODE_PACK(0x06, 0x07),
 	/*
 	 * NOTE: no ESCR name in docs, it's guessed
 	 * MSR_P4_BSU_ESCR1:	2, 3
 	 */
 
-#define P4_SSE_INPUT_ASSIST		P4_EVENT_PACK(0x34, 0x01)
+	P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST)		= P4_OPCODE_PACK(0x34, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_PACKED_SP_UOP		P4_EVENT_PACK(0x08, 0x01)
+	P4_OPCODE(P4_EVENT_PACKED_SP_UOP)		= P4_OPCODE_PACK(0x08, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_PACKED_DP_UOP		P4_EVENT_PACK(0x0c, 0x01)
+	P4_OPCODE(P4_EVENT_PACKED_DP_UOP)		= P4_OPCODE_PACK(0x0c, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_SCALAR_SP_UOP		P4_EVENT_PACK(0x0a, 0x01)
+	P4_OPCODE(P4_EVENT_SCALAR_SP_UOP)		= P4_OPCODE_PACK(0x0a, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_SCALAR_DP_UOP		P4_EVENT_PACK(0x0e, 0x01)
+	P4_OPCODE(P4_EVENT_SCALAR_DP_UOP)		= P4_OPCODE_PACK(0x0e, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_64BIT_MMX_UOP		P4_EVENT_PACK(0x02, 0x01)
+	P4_OPCODE(P4_EVENT_64BIT_MMX_UOP)		= P4_OPCODE_PACK(0x02, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_128BIT_MMX_UOP		P4_EVENT_PACK(0x1a, 0x01)
+	P4_OPCODE(P4_EVENT_128BIT_MMX_UOP)		= P4_OPCODE_PACK(0x1a, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_X87_FP_UOP			P4_EVENT_PACK(0x04, 0x01)
+	P4_OPCODE(P4_EVENT_X87_FP_UOP)			= P4_OPCODE_PACK(0x04, 0x01),
 	/*
 	 * MSR_P4_FIRM_ESCR0:	8, 9
 	 * MSR_P4_FIRM_ESCR1:	10, 11
 	 */
 
-#define P4_TC_MISC			P4_EVENT_PACK(0x06, 0x01)
+	P4_OPCODE(P4_EVENT_TC_MISC)			= P4_OPCODE_PACK(0x06, 0x01),
 	/*
 	 * MSR_P4_TC_ESCR0:	4, 5
 	 * MSR_P4_TC_ESCR1:	6, 7
 	 */
 
-#define P4_GLOBAL_POWER_EVENTS		P4_EVENT_PACK(0x13, 0x06)
+	P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS)		= P4_OPCODE_PACK(0x13, 0x06),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_TC_MS_XFER			P4_EVENT_PACK(0x05, 0x00)
+	P4_OPCODE(P4_EVENT_TC_MS_XFER)			= P4_OPCODE_PACK(0x05, 0x00),
 	/*
 	 * MSR_P4_MS_ESCR0:	4, 5
 	 * MSR_P4_MS_ESCR1:	6, 7
 	 */
 
-#define P4_UOP_QUEUE_WRITES		P4_EVENT_PACK(0x09, 0x00)
+	P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES)		= P4_OPCODE_PACK(0x09, 0x00),
 	/*
 	 * MSR_P4_MS_ESCR0:	4, 5
 	 * MSR_P4_MS_ESCR1:	6, 7
 	 */
 
-#define P4_RETIRED_MISPRED_BRANCH_TYPE	P4_EVENT_PACK(0x05, 0x02)
+	P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE)	= P4_OPCODE_PACK(0x05, 0x02),
 	/*
 	 * MSR_P4_TBPU_ESCR0:	4, 5
 	 * MSR_P4_TBPU_ESCR1:	6, 7
 	 */
 
-#define P4_RETIRED_BRANCH_TYPE		P4_EVENT_PACK(0x04, 0x02)
+	P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE)		= P4_OPCODE_PACK(0x04, 0x02),
 	/*
 	 * MSR_P4_TBPU_ESCR0:	4, 5
 	 * MSR_P4_TBPU_ESCR1:	6, 7
 	 */
 
-#define P4_RESOURCE_STALL		P4_EVENT_PACK(0x01, 0x01)
+	P4_OPCODE(P4_EVENT_RESOURCE_STALL)		= P4_OPCODE_PACK(0x01, 0x01),
 	/*
 	 * MSR_P4_ALF_ESCR0:	12, 13, 16
 	 * MSR_P4_ALF_ESCR1:	14, 15, 17
 	 */
 
-#define P4_WC_BUFFER			P4_EVENT_PACK(0x05, 0x05)
+	P4_OPCODE(P4_EVENT_WC_BUFFER)			= P4_OPCODE_PACK(0x05, 0x05),
 	/*
 	 * MSR_P4_DAC_ESCR0:	8, 9
 	 * MSR_P4_DAC_ESCR1:	10, 11
 	 */
 
-#define P4_B2B_CYCLES			P4_EVENT_PACK(0x16, 0x03)
+	P4_OPCODE(P4_EVENT_B2B_CYCLES)			= P4_OPCODE_PACK(0x16, 0x03),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_BNR				P4_EVENT_PACK(0x08, 0x03)
+	P4_OPCODE(P4_EVENT_BNR)				= P4_OPCODE_PACK(0x08, 0x03),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_SNOOP			P4_EVENT_PACK(0x06, 0x03)
+	P4_OPCODE(P4_EVENT_SNOOP)			= P4_OPCODE_PACK(0x06, 0x03),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_RESPONSE			P4_EVENT_PACK(0x04, 0x03)
+	P4_OPCODE(P4_EVENT_RESPONSE)			= P4_OPCODE_PACK(0x04, 0x03),
 	/*
 	 * MSR_P4_FSB_ESCR0:	0, 1
 	 * MSR_P4_FSB_ESCR1:	2, 3
 	 */
 
-#define P4_FRONT_END_EVENT		P4_EVENT_PACK(0x08, 0x05)
+	P4_OPCODE(P4_EVENT_FRONT_END_EVENT)		= P4_OPCODE_PACK(0x08, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_EXECUTION_EVENT		P4_EVENT_PACK(0x0c, 0x05)
+	P4_OPCODE(P4_EVENT_EXECUTION_EVENT)		= P4_OPCODE_PACK(0x0c, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_REPLAY_EVENT			P4_EVENT_PACK(0x09, 0x05)
+	P4_OPCODE(P4_EVENT_REPLAY_EVENT)		= P4_OPCODE_PACK(0x09, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_INSTR_RETIRED		P4_EVENT_PACK(0x02, 0x04)
+	P4_OPCODE(P4_EVENT_INSTR_RETIRED)		= P4_OPCODE_PACK(0x02, 0x04),
 	/*
 	 * MSR_P4_CRU_ESCR0:	12, 13, 16
 	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
 
-#define P4_UOPS_RETIRED			P4_EVENT_PACK(0x01, 0x04)
+	P4_OPCODE(P4_EVENT_UOPS_RETIRED)		= P4_OPCODE_PACK(0x01, 0x04),
 	/*
 	 * MSR_P4_CRU_ESCR0:	12, 13, 16
 	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
 
-#define P4_UOP_TYPE			P4_EVENT_PACK(0x02, 0x02)
+	P4_OPCODE(P4_EVENT_UOP_TYPE)			= P4_OPCODE_PACK(0x02, 0x02),
 	/*
 	 * MSR_P4_RAT_ESCR0:	12, 13, 16
 	 * MSR_P4_RAT_ESCR1:	14, 15, 17
 	 */
 
-#define P4_BRANCH_RETIRED		P4_EVENT_PACK(0x06, 0x05)
+	P4_OPCODE(P4_EVENT_BRANCH_RETIRED)		= P4_OPCODE_PACK(0x06, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_MISPRED_BRANCH_RETIRED	P4_EVENT_PACK(0x03, 0x04)
+	P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED)	= P4_OPCODE_PACK(0x03, 0x04),
 	/*
 	 * MSR_P4_CRU_ESCR0:	12, 13, 16
 	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
 
-#define P4_X87_ASSIST			P4_EVENT_PACK(0x03, 0x05)
+	P4_OPCODE(P4_EVENT_X87_ASSIST)			= P4_OPCODE_PACK(0x03, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_MACHINE_CLEAR		P4_EVENT_PACK(0x02, 0x05)
+	P4_OPCODE(P4_EVENT_MACHINE_CLEAR)		= P4_OPCODE_PACK(0x02, 0x05),
 	/*
 	 * MSR_P4_CRU_ESCR2:	12, 13, 16
 	 * MSR_P4_CRU_ESCR3:	14, 15, 17
 	 */
 
-#define P4_INSTR_COMPLETED		P4_EVENT_PACK(0x07, 0x04)
+	P4_OPCODE(P4_EVENT_INSTR_COMPLETED)		= P4_OPCODE_PACK(0x07, 0x04),
 	/*
 	 * MSR_P4_CRU_ESCR0:	12, 13, 16
 	 * MSR_P4_CRU_ESCR1:	14, 15, 17
 	 */
+};
 
 /*
- * a caller should use P4_EVENT_ATTR helper to
- * pick the attribute needed, for example
+ * a caller should use P4_ESCR_EMASK_NAME helper to
+ * pick the EventMask needed, for example
  *
- *	P4_EVENT_ATTR(P4_TC_DELIVER_MODE, DD)
+ *	P4_ESCR_EMASK_NAME(P4_EVENT_TC_DELIVER_MODE, DD)
  */
-enum P4_EVENTS_ATTR {
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DD, 0),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DB, 1),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, DI, 2),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BD, 3),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BB, 4),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, BI, 5),
-	P4_MAKE_EVENT_ATTR(P4_TC_DELIVER_MODE, ID, 6),
-
-	P4_MAKE_EVENT_ATTR(P4_BPU_FETCH_REQUEST, TCMISS, 0),
-
-	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, HIT, 0),
-	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, MISS, 1),
-	P4_MAKE_EVENT_ATTR(P4_ITLB_REFERENCE, HIT_UK, 2),
-
-	P4_MAKE_EVENT_ATTR(P4_MEMORY_CANCEL, ST_RB_FULL, 2),
-	P4_MAKE_EVENT_ATTR(P4_MEMORY_CANCEL, 64K_CONF, 3),
-
-	P4_MAKE_EVENT_ATTR(P4_MEMORY_COMPLETE, LSC, 0),
-	P4_MAKE_EVENT_ATTR(P4_MEMORY_COMPLETE, SSC, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_LOAD_PORT_REPLAY, SPLIT_LD, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_STORE_PORT_REPLAY, SPLIT_ST, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, NO_STA, 1),
-	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, NO_STD, 3),
-	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, PARTIAL_DATA, 4),
-	P4_MAKE_EVENT_ATTR(P4_MOB_LOAD_REPLAY, UNALGN_ADDR, 5),
-
-	P4_MAKE_EVENT_ATTR(P4_PAGE_WALK_TYPE, DTMISS, 0),
-	P4_MAKE_EVENT_ATTR(P4_PAGE_WALK_TYPE, ITMISS, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10),
-
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, DEFAULT, 0),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, ALL_READ, 5),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, ALL_WRITE, 6),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_UC, 7),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WC, 8),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WT, 9),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WP, 10),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, MEM_WB, 11),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, OWN, 13),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, OTHER, 14),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ALLOCATION, PREFETCH, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, DEFAULT, 0),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, ALL_READ, 5),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_UC, 7),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WC, 8),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WT, 9),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WP, 10),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, MEM_WB, 11),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, OWN, 13),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, OTHER, 14),
-	P4_MAKE_EVENT_ATTR(P4_IOQ_ACTIVE_ENTRIES, PREFETCH, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV, 0),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN, 1),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OTHER, 2),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_DRV, 3),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_OWN, 4),
-	P4_MAKE_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DBSY_OTHER, 5),
-
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_TYPE0, 0),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_TYPE1, 1),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LEN0, 2),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LEN1, 3),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_IO_TYPE, 5),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_DEM_TYPE, 9),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, REQ_ORD_TYPE, 10),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE0, 11),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE1, 12),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ALLOCATION, MEM_TYPE2, 13),
-
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12),
-	P4_MAKE_EVENT_ATTR(P4_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13),
-
-	P4_MAKE_EVENT_ATTR(P4_SSE_INPUT_ASSIST, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_PACKED_SP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_PACKED_DP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_SCALAR_SP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_SCALAR_DP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_64BIT_MMX_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_128BIT_MMX_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_X87_FP_UOP, ALL, 15),
-
-	P4_MAKE_EVENT_ATTR(P4_TC_MISC, FLUSH, 4),
-
-	P4_MAKE_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING, 0),
-
-	P4_MAKE_EVENT_ATTR(P4_TC_MS_XFER, CISC, 0),
-
-	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0),
-	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1),
-	P4_MAKE_EVENT_ATTR(P4_UOP_QUEUE_WRITES, FROM_ROM, 2),
-
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4),
-
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL, 1),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL, 2),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, RETURN, 3),
-	P4_MAKE_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, INDIRECT, 4),
-
-	P4_MAKE_EVENT_ATTR(P4_RESOURCE_STALL, SBFULL, 5),
-
-	P4_MAKE_EVENT_ATTR(P4_WC_BUFFER, WCB_EVICTS, 0),
-	P4_MAKE_EVENT_ATTR(P4_WC_BUFFER, WCB_FULL_EVICTS, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_FRONT_END_EVENT, NBOGUS, 0),
-	P4_MAKE_EVENT_ATTR(P4_FRONT_END_EVENT, BOGUS, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS0, 0),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS1, 1),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS2, 2),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, NBOGUS3, 3),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS0, 4),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS1, 5),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS2, 6),
-	P4_MAKE_EVENT_ATTR(P4_EXECUTION_EVENT, BOGUS3, 7),
-
-	P4_MAKE_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS, 0),
-	P4_MAKE_EVENT_ATTR(P4_REPLAY_EVENT, BOGUS, 1),
-
-	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG, 0),
-	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSTAG, 1),
-	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG, 2),
-	P4_MAKE_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSTAG, 3),
-
-	P4_MAKE_EVENT_ATTR(P4_UOPS_RETIRED, NBOGUS, 0),
-	P4_MAKE_EVENT_ATTR(P4_UOPS_RETIRED, BOGUS, 1),
+enum P4_ESCR_EMASKS {
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DB, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DI, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BD, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BB, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BI, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, ID, 6),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BPU_FETCH_REQUEST, TCMISS, 0),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, MISS, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT_UK, 2),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, 64K_CONF, 3),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, LSC, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, SSC, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STA, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STD, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR, 5),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, DTMISS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, ITMISS, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, DEFAULT, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_READ, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_UC, 7),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WC, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WT, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WP, 10),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WB, 11),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OWN, 13),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OTHER, 14),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, PREFETCH, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC, 7),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP, 10),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB, 11),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN, 13),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER, 14),
+	P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER, 5),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE, 10),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0, 11),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1, 12),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2, 13),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_SSE_INPUT_ASSIST, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_SP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_DP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_SP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_DP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_64BIT_MMX_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_128BIT_MMX_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_FP_UOP, ALL, 15),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_MISC, FLUSH, 4),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING, 0),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_TC_MS_XFER, CISC, 0),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM, 2),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CALL, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT, 4),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_RESOURCE_STALL, SBFULL, 5),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_EVICTS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, BOGUS, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS0, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS1, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS2, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS3, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS0, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS1, 5),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS2, 6),
+	P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS3, 7),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, BOGUS, 1),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSTAG, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSNTAG, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSTAG, 3),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, BOGUS, 1),
 
-	P4_MAKE_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS, 1),
-	P4_MAKE_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES, 2),
-
-	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMNP, 0),
-	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMNM, 1),
-	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMTP, 2),
-	P4_MAKE_EVENT_ATTR(P4_BRANCH_RETIRED, MMTM, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGLOADS, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGSTORES, 2),
+
+	P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNP, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNM, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTP, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTM, 3),
 
-	P4_MAKE_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS, 0),
 
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, FPSU, 0),
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, FPSO, 1),
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, POAO, 2),
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, POAU, 3),
-	P4_MAKE_EVENT_ATTR(P4_X87_ASSIST, PREA, 4),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSU, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSO, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAO, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAU, 3),
+	P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, PREA, 4),
 
-	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, CLEAR, 0),
-	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, MOCLEAR, 1),
-	P4_MAKE_EVENT_ATTR(P4_MACHINE_CLEAR, SMCLEAR, 2),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, CLEAR, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, MOCLEAR, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, SMCLEAR, 2),
 
-	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, NBOGUS, 0),
-	P4_MAKE_EVENT_ATTR(P4_INSTR_COMPLETED, BOGUS, 1),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, NBOGUS, 0),
+	P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1),
 };
 
-enum {
-	KEY_P4_L1D_OP_READ_RESULT_MISS = PERF_COUNT_HW_MAX,
-	KEY_P4_LL_OP_READ_RESULT_MISS,
-	KEY_P4_DTLB_OP_READ_RESULT_MISS,
-	KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
-	KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
-	KEY_P4_ITLB_OP_READ_RESULT_MISS,
-	KEY_P4_UOP_TYPE,
+/* P4 PEBS: stale for a while */
+#define P4_PEBS_METRIC_MASK	0x00001fffU
+#define P4_PEBS_UOB_TAG		0x01000000U
+#define P4_PEBS_ENABLE		0x02000000U
+
+/* Replay metrics for MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT */
+#define P4_PEBS__1stl_cache_load_miss_retired	0x3000001
+#define P4_PEBS__2ndl_cache_load_miss_retired	0x3000002
+#define P4_PEBS__dtlb_load_miss_retired		0x3000004
+#define P4_PEBS__dtlb_store_miss_retired	0x3000004
+#define P4_PEBS__dtlb_all_miss_retired		0x3000004
+#define P4_PEBS__tagged_mispred_branch		0x3018000
+#define P4_PEBS__mob_load_replay_retired	0x3000200
+#define P4_PEBS__split_load_retired		0x3000400
+#define P4_PEBS__split_store_retired		0x3000400
+
+#define P4_VERT__1stl_cache_load_miss_retired	0x0000001
+#define P4_VERT__2ndl_cache_load_miss_retired	0x0000001
+#define P4_VERT__dtlb_load_miss_retired		0x0000001
+#define P4_VERT__dtlb_store_miss_retired	0x0000002
+#define P4_VERT__dtlb_all_miss_retired		0x0000003
+#define P4_VERT__tagged_mispred_branch		0x0000010
+#define P4_VERT__mob_load_replay_retired	0x0000001
+#define P4_VERT__split_load_retired		0x0000001
+#define P4_VERT__split_store_retired		0x0000002
+
+enum P4_CACHE_EVENTS {
+	P4_CACHE__NONE,
+
+	P4_CACHE__1stl_cache_load_miss_retired,
+	P4_CACHE__2ndl_cache_load_miss_retired,
+	P4_CACHE__dtlb_load_miss_retired,
+	P4_CACHE__dtlb_store_miss_retired,
+	P4_CACHE__itlb_reference_hit,
+	P4_CACHE__itlb_reference_miss,
+
+	P4_CACHE__MAX
 };
 
 #endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index b8a811ab760..f8fe069f14e 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -11,35 +11,281 @@
 
 #include <asm/perf_event_p4.h>
 
+#define P4_CNTR_LIMIT 3
 /*
  * array indices: 0,1 - HT threads, used with HT enabled cpu
  */
-struct p4_event_template {
-	u32 opcode;			/* ESCR event + CCCR selector */
-	u64 config;			/* packed predefined bits */
-	int dep;			/* upstream dependency event index */
-	int key;			/* index into p4_templates */
-	u64 msr;			/*
-					 * the high 32 bits set into MSR_IA32_PEBS_ENABLE and
-					 * the low 32 bits set into MSR_P4_PEBS_MATRIX_VERT
-					 * for cache events
-					 */
-	unsigned int emask;		/* ESCR EventMask */
-	unsigned int escr_msr[2];	/* ESCR MSR for this event */
-	unsigned int cntr[2];		/* counter index (offset) */
+struct p4_event_bind {
+	unsigned int opcode;			/* Event code and ESCR selector */
+	unsigned int escr_msr[2];		/* ESCR MSR for this event */
+	unsigned char cntr[2][P4_CNTR_LIMIT];	/* counter index (offset), -1 on abscence */
 };
 
-struct p4_pmu_res {
-	/* maps hw_conf::idx into template for ESCR sake */
-	struct p4_event_template *tpl[ARCH_P4_MAX_CCCR];
+struct p4_cache_event_bind {
+	unsigned int metric_pebs;
+	unsigned int metric_vert;
 };
 
-static DEFINE_PER_CPU(struct p4_pmu_res, p4_pmu_config);
+#define P4_GEN_CACHE_EVENT_BIND(name)		\
+	[P4_CACHE__##name] = {			\
+		.metric_pebs = P4_PEBS__##name,	\
+		.metric_vert = P4_VERT__##name,	\
+	}
+
+static struct p4_cache_event_bind p4_cache_event_bind_map[] = {
+	P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired),
+	P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired),
+	P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired),
+	P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired),
+};
+
+/*
+ * Note that we don't use CCCR1 here, there is an
+ * exception for P4_BSQ_ALLOCATION but we just have
+ * no workaround
+ *
+ * consider this binding as resources which particular
+ * event may borrow, it doesn't contain EventMask,
+ * Tags and friends -- they are left to a caller
+ */
+static struct p4_event_bind p4_event_bind_map[] = {
+	[P4_EVENT_TC_DELIVER_MODE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
+		.escr_msr	= { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_BPU_FETCH_REQUEST] = {
+		.opcode		= P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
+		.escr_msr	= { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_ITLB_REFERENCE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
+		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_MEMORY_CANCEL] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
+		.escr_msr	= { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_MEMORY_COMPLETE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
+		.escr_msr	= { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_LOAD_PORT_REPLAY] = {
+		.opcode		= P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
+		.escr_msr	= { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_STORE_PORT_REPLAY] = {
+		.opcode		= P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
+		.escr_msr	= { MSR_P4_SAAT_ESCR0 ,  MSR_P4_SAAT_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_MOB_LOAD_REPLAY] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
+		.escr_msr	= { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_PAGE_WALK_TYPE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
+		.escr_msr	= { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_BSQ_CACHE_REFERENCE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
+		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_IOQ_ALLOCATION] = {
+		.opcode		= P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_IOQ_ACTIVE_ENTRIES] = {	/* shared ESCR */
+		.opcode		= P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
+		.escr_msr	= { MSR_P4_FSB_ESCR1,  MSR_P4_FSB_ESCR1 },
+		.cntr		= { {2, -1, -1}, {3, -1, -1} },
+	},
+	[P4_EVENT_FSB_DATA_ACTIVITY] = {
+		.opcode		= P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_BSQ_ALLOCATION] = {		/* shared ESCR, broken CCCR1 */
+		.opcode		= P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
+		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+		.cntr		= { {0, -1, -1}, {1, -1, -1} },
+	},
+	[P4_EVENT_BSQ_ACTIVE_ENTRIES] = {	/* shared ESCR */
+		.opcode		= P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
+		.escr_msr	= { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+		.cntr		= { {2, -1, -1}, {3, -1, -1} },
+	},
+	[P4_EVENT_SSE_INPUT_ASSIST] = {
+		.opcode		= P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_PACKED_SP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_PACKED_DP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_SCALAR_SP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_SCALAR_DP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_64BIT_MMX_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_128BIT_MMX_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_X87_FP_UOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_X87_FP_UOP),
+		.escr_msr	= { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_TC_MISC] = {
+		.opcode		= P4_OPCODE(P4_EVENT_TC_MISC),
+		.escr_msr	= { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_GLOBAL_POWER_EVENTS] = {
+		.opcode		= P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_TC_MS_XFER] = {
+		.opcode		= P4_OPCODE(P4_EVENT_TC_MS_XFER),
+		.escr_msr	= { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_UOP_QUEUE_WRITES] = {
+		.opcode		= P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
+		.escr_msr	= { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
+		.escr_msr	= { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_RETIRED_BRANCH_TYPE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
+		.escr_msr	= { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+		.cntr		= { {4, 5, -1}, {6, 7, -1} },
+	},
+	[P4_EVENT_RESOURCE_STALL] = {
+		.opcode		= P4_OPCODE(P4_EVENT_RESOURCE_STALL),
+		.escr_msr	= { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_WC_BUFFER] = {
+		.opcode		= P4_OPCODE(P4_EVENT_WC_BUFFER),
+		.escr_msr	= { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+		.cntr		= { {8, 9, -1}, {10, 11, -1} },
+	},
+	[P4_EVENT_B2B_CYCLES] = {
+		.opcode		= P4_OPCODE(P4_EVENT_B2B_CYCLES),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_BNR] = {
+		.opcode		= P4_OPCODE(P4_EVENT_BNR),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_SNOOP] = {
+		.opcode		= P4_OPCODE(P4_EVENT_SNOOP),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_RESPONSE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_RESPONSE),
+		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+		.cntr		= { {0, -1, -1}, {2, -1, -1} },
+	},
+	[P4_EVENT_FRONT_END_EVENT] = {
+		.opcode		= P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_EXECUTION_EVENT] = {
+		.opcode		= P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_REPLAY_EVENT] = {
+		.opcode		= P4_OPCODE(P4_EVENT_REPLAY_EVENT),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_INSTR_RETIRED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_INSTR_RETIRED),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_UOPS_RETIRED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_UOPS_RETIRED),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_UOP_TYPE] = {
+		.opcode		= P4_OPCODE(P4_EVENT_UOP_TYPE),
+		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_BRANCH_RETIRED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_MISPRED_BRANCH_RETIRED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_X87_ASSIST] = {
+		.opcode		= P4_OPCODE(P4_EVENT_X87_ASSIST),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_MACHINE_CLEAR] = {
+		.opcode		= P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
+		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+	[P4_EVENT_INSTR_COMPLETED] = {
+		.opcode		= P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
+		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+		.cntr		= { {12, 13, 16}, {14, 15, 17} },
+	},
+};
 
-#define P4_CACHE_EVENT_CONFIG(event, bit) \
-	p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(event) << P4_EVNTSEL_EVENT_SHIFT) | \
-	p4_config_pack_escr((event##_##bit) << P4_EVNTSEL_EVENTMASK_SHIFT) | \
-	p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(event) << P4_CCCR_ESCR_SELECT_SHIFT)
+#define P4_GEN_CACHE_EVENT(event, bit, cache_event)			  \
+	p4_config_pack_escr(P4_ESCR_EVENT(event)			| \
+			    P4_ESCR_EMASK_BIT(event, bit))		| \
+	p4_config_pack_cccr(cache_event					| \
+			    P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
 
 static __initconst u64 p4_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
@@ -49,42 +295,35 @@ static __initconst u64 p4_hw_cache_event_ids
  [ C(L1D ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
-					/* 1stL_cache_load_miss_retired */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
-					| KEY_P4_L1D_OP_READ_RESULT_MISS,
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+						P4_CACHE__1stl_cache_load_miss_retired),
 	},
  },
  [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
-					/* 2ndL_cache_load_miss_retired */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
-					| KEY_P4_LL_OP_READ_RESULT_MISS,
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+						P4_CACHE__2ndl_cache_load_miss_retired),
 	},
- },
+},
  [ C(DTLB) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
-					/* DTLB_load_miss_retired */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
-					| KEY_P4_DTLB_OP_READ_RESULT_MISS,
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+						P4_CACHE__dtlb_load_miss_retired),
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0,
-					/* DTLB_store_miss_retired */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_REPLAY_EVENT, NBOGUS)
-					| KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+						P4_CACHE__dtlb_store_miss_retired),
 	},
  },
  [ C(ITLB) ] = {
 	[ C(OP_READ) ] = {
-					/* ITLB_reference.HIT */
-		[ C(RESULT_ACCESS) ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, HIT)
-					| KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
-
-					/* ITLB_reference.MISS */
-		[ C(RESULT_MISS)   ] = P4_CACHE_EVENT_CONFIG(P4_ITLB_REFERENCE, MISS)
-					| KEY_P4_ITLB_OP_READ_RESULT_MISS,
+		[ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
+						P4_CACHE__itlb_reference_hit),
+		[ C(RESULT_MISS)   ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
+						P4_CACHE__itlb_reference_miss),
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = -1,
@@ -97,219 +336,89 @@ static __initconst u64 p4_hw_cache_event_ids
  },
 };
 
-/*
- * WARN: CCCR1 doesn't have a working enable bit so try to not
- * use it if possible
- *
- * Also as only we start to support raw events we will need to
- * append _all_ P4_EVENT_PACK'ed events here
- */
-struct p4_event_template p4_templates[] = {
-	[0] = {
-		.opcode	= P4_GLOBAL_POWER_EVENTS,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 0,
-		.emask	=
-			P4_EVENT_ATTR(P4_GLOBAL_POWER_EVENTS, RUNNING),
-		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[1] = {
-		.opcode	= P4_INSTR_RETIRED,
-		.config	= 0,
-		.dep	= -1, /* needs front-end tagging */
-		.key	= 1,
-		.emask	=
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, NBOGUSNTAG)	|
-			P4_EVENT_ATTR(P4_INSTR_RETIRED, BOGUSNTAG),
-		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-		.cntr		= { 12, 14 },
-	},
-	[2] = {
-		.opcode	= P4_BSQ_CACHE_REFERENCE,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 2,
-		.emask	=
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_HITM),
-		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[3] = {
-		.opcode	= P4_BSQ_CACHE_REFERENCE,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 3,
-		.emask	=
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)	|
-			P4_EVENT_ATTR(P4_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
-		.escr_msr	= { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
-		.cntr		= { 0, 3 },
-	},
-	[4] = {
-		.opcode	= P4_RETIRED_BRANCH_TYPE,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 4,
-		.emask	=
-			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CONDITIONAL)	|
-			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, CALL)		|
-			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, RETURN)		|
-			P4_EVENT_ATTR(P4_RETIRED_BRANCH_TYPE, INDIRECT),
-		.escr_msr	= { MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1 },
-		.cntr		= { 4, 6 },
-	},
-	[5] = {
-		.opcode	= P4_MISPRED_BRANCH_RETIRED,
-		.config	= 0,
-		.dep	= -1,
-		.key	= 5,
-		.emask	=
-			P4_EVENT_ATTR(P4_MISPRED_BRANCH_RETIRED, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
-		.cntr		= { 12, 14 },
-	},
-	[6] = {
-		.opcode	= P4_FSB_DATA_ACTIVITY,
-		.config	= p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
-		.dep	= -1,
-		.key	= 6,
-		.emask	=
-			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_DRV)	|
-			P4_EVENT_ATTR(P4_FSB_DATA_ACTIVITY, DRDY_OWN),
-		.escr_msr	= { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[KEY_P4_L1D_OP_READ_RESULT_MISS] = {
-		.opcode	= P4_REPLAY_EVENT,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= (u64)(1 << 0 | 1 << 24) << 32 | (1 << 0),
-		.key	= KEY_P4_L1D_OP_READ_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
-		.cntr		= { 16, 17 },
-	},
-	[KEY_P4_LL_OP_READ_RESULT_MISS] = {
-		.opcode	= P4_REPLAY_EVENT,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= (u64)(1 << 1 | 1 << 24) << 32 | (1 << 0),
-		.key	= KEY_P4_LL_OP_READ_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
-		.cntr		= { 16, 17 },
-	},
-	[KEY_P4_DTLB_OP_READ_RESULT_MISS] = {
-		.opcode	= P4_REPLAY_EVENT,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= (u64)(1 << 2 | 1 << 24) << 32 | (1 << 0),
-		.key	= KEY_P4_DTLB_OP_READ_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
-		.cntr		= { 16, 17 },
-	},
-	[KEY_P4_DTLB_OP_WRITE_RESULT_MISS] = {
-		.opcode	= P4_REPLAY_EVENT,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= (u64)(1 << 2 | 1 << 24) << 32 | (1 << 1),
-		.key	= KEY_P4_DTLB_OP_WRITE_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_REPLAY_EVENT, NBOGUS),
-		.escr_msr	= { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR2 },
-		.cntr		= { 16, 17 },
-	},
-	[KEY_P4_ITLB_OP_READ_RESULT_ACCESS] = {
-		.opcode	= P4_ITLB_REFERENCE,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= 0,
-		.key	= KEY_P4_ITLB_OP_READ_RESULT_ACCESS,
-		.emask	=
-			P4_EVENT_ATTR(P4_ITLB_REFERENCE, HIT),
-		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[KEY_P4_ITLB_OP_READ_RESULT_MISS] = {
-		.opcode	= P4_ITLB_REFERENCE,
-		.config	= 0,
-		.dep	= -1,
-		.msr	= 0,
-		.key	= KEY_P4_ITLB_OP_READ_RESULT_MISS,
-		.emask	=
-			P4_EVENT_ATTR(P4_ITLB_REFERENCE, MISS),
-		.escr_msr	= { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
-		.cntr		= { 0, 2 },
-	},
-	[KEY_P4_UOP_TYPE] = {
-		.opcode	= P4_UOP_TYPE,
-		.config	= 0,
-		.dep	= -1,
-		.key	= KEY_P4_UOP_TYPE,
-		.emask	=
-			P4_EVENT_ATTR(P4_UOP_TYPE, TAGLOADS)	|
-			P4_EVENT_ATTR(P4_UOP_TYPE, TAGSTORES),
-		.escr_msr	= { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
-		.cntr		= { 16, 17 },
-	},
+static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
+  /* non-halted CPU clocks */
+  [PERF_COUNT_HW_CPU_CYCLES] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+
+  /*
+   * retired instructions
+   * in a sake of simplicity we don't use the FSB tagging
+   */
+  [PERF_COUNT_HW_INSTRUCTIONS] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
+
+  /* cache hits */
+  [PERF_COUNT_HW_CACHE_REFERENCES] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
+
+  /* cache misses */
+  [PERF_COUNT_HW_CACHE_MISSES] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
+
+  /* branch instructions retired */
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
+
+  /* mispredicted branches retired */
+  [PERF_COUNT_HW_BRANCH_MISSES]	=
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED)	|
+		P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
+
+  /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN):  */
+  [PERF_COUNT_HW_BUS_CYCLES] =
+	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV)		|
+		P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN))	|
+	p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
 };
 
+static struct p4_event_bind *p4_config_get_bind(u64 config)
+{
+	unsigned int evnt = p4_config_unpack_event(config);
+	struct p4_event_bind *bind = NULL;
+
+	if (evnt < ARRAY_SIZE(p4_event_bind_map))
+		bind = &p4_event_bind_map[evnt];
+
+	return bind;
+}
+
 static u64 p4_pmu_event_map(int hw_event)
 {
-	struct p4_event_template *tpl;
+	struct p4_event_bind *bind;
+	unsigned int esel;
 	u64 config;
 
-	if (hw_event > ARRAY_SIZE(p4_templates)) {
-		printk_once(KERN_ERR "PMU: Incorrect event index\n");
+	if (hw_event > ARRAY_SIZE(p4_general_events)) {
+		printk_once(KERN_ERR "P4 PMU: Bad index: %i\n", hw_event);
 		return 0;
 	}
-	tpl = &p4_templates[hw_event];
 
-	/*
-	 * fill config up according to
-	 * a predefined event template
-	 */
-	config  = tpl->config;
-	config |= p4_config_pack_escr(P4_EVENT_UNPACK_EVENT(tpl->opcode) << P4_EVNTSEL_EVENT_SHIFT);
-	config |= p4_config_pack_escr(tpl->emask << P4_EVNTSEL_EVENTMASK_SHIFT);
-	config |= p4_config_pack_cccr(P4_EVENT_UNPACK_SELECTOR(tpl->opcode) << P4_CCCR_ESCR_SELECT_SHIFT);
-	config |= p4_config_pack_cccr(hw_event & P4_CCCR_RESERVED);
+	config = p4_general_events[hw_event];
+	bind = p4_config_get_bind(config);
+	esel = P4_OPCODE_ESEL(bind->opcode);
+	config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
 
 	return config;
 }
 
-/*
- * Note that we still have 5 events (from global events SDM list)
- * intersected in opcode+emask bits so we will need another
- * scheme there do distinguish templates.
- */
-static inline int p4_pmu_emask_match(unsigned int dst, unsigned int src)
-{
-	return dst & src;
-}
-
-static struct p4_event_template *p4_pmu_template_lookup(u64 config)
-{
-	int key = p4_config_unpack_key(config);
-
-	if (key < ARRAY_SIZE(p4_templates))
-		return &p4_templates[key];
-	else
-		return NULL;
-}
-
 /*
  * We don't control raw events so it's up to the caller
  * to pass sane values (and we don't count the thread number
@@ -319,13 +428,14 @@ static struct p4_event_template *p4_pmu_template_lookup(u64 config)
 static u64 p4_pmu_raw_event(u64 hw_event)
 {
 	return hw_event &
-		(p4_config_pack_escr(P4_EVNTSEL_MASK_HT) |
+		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
 		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
 }
 
 static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
 {
 	int cpu = raw_smp_processor_id();
+	u32 escr, cccr;
 
 	/*
 	 * the reason we use cpu that early is that: if we get scheduled
@@ -333,13 +443,10 @@ static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
 	 * specific flags in config (and will save some cpu cycles)
 	 */
 
-	/* CCCR by default */
-	hwc->config = p4_config_pack_cccr(p4_default_cccr_conf(cpu));
+	cccr = p4_default_cccr_conf(cpu);
+	escr = p4_default_escr_conf(cpu, attr->exclude_kernel, attr->exclude_user);
+	hwc->config = p4_config_pack_escr(escr) | p4_config_pack_cccr(cccr);
 
-	/* Count user and OS events unless not requested to */
-	hwc->config |= p4_config_pack_escr(p4_default_escr_conf(cpu, attr->exclude_kernel,
-								attr->exclude_user));
-	/* on HT machine we need a special bit */
 	if (p4_ht_active() && p4_ht_thread(cpu))
 		hwc->config = p4_set_ht_bit(hwc->config);
 
@@ -368,7 +475,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
 	 */
 	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
 		(u64)(p4_config_unpack_cccr(hwc->config)) &
-			~P4_CCCR_ENABLE & ~P4_CCCR_OVF);
+			~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
 }
 
 static void p4_pmu_disable_all(void)
@@ -389,27 +496,14 @@ static void p4_pmu_enable_event(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	int thread = p4_ht_config_thread(hwc->config);
 	u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
-	u64 escr_base;
-	struct p4_event_template *tpl;
-	struct p4_pmu_res *c;
+	unsigned int idx = p4_config_unpack_event(hwc->config);
+	unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
+	struct p4_event_bind *bind;
+	struct p4_cache_event_bind *bind_cache;
+	u64 escr_addr, cccr;
 
-	/*
-	 * some preparation work from per-cpu private fields
-	 * since we need to find out which ESCR to use
-	 */
-	c = &__get_cpu_var(p4_pmu_config);
-	tpl = c->tpl[hwc->idx];
-	if (!tpl) {
-		pr_crit("%s: Wrong index: %d\n", __func__, hwc->idx);
-		return;
-	}
-
-	if (tpl->msr) {
-		(void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, tpl->msr >> 32);
-		(void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, tpl->msr & 0xffffffff);
-	}
-
-	escr_base = (u64)tpl->escr_msr[thread];
+	bind = &p4_event_bind_map[idx];
+	escr_addr = (u64)bind->escr_msr[thread];
 
 	/*
 	 * - we dont support cascaded counters yet
@@ -418,9 +512,27 @@ static void p4_pmu_enable_event(struct perf_event *event)
 	WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
 	WARN_ON_ONCE(hwc->idx == 1);
 
-	(void)checking_wrmsrl(escr_base, escr_conf);
+	/* we need a real Event value */
+	escr_conf &= ~P4_ESCR_EVENT_MASK;
+	escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
+
+	cccr = p4_config_unpack_cccr(hwc->config);
+
+	/*
+	 * it could be Cache event so that we need to
+	 * set metrics into additional MSRs
+	 */
+	BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK);
+	if (idx_cache > P4_CACHE__NONE &&
+		idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
+		bind_cache = &p4_cache_event_bind_map[idx_cache];
+		(void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
+		(void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
+	}
+
+	(void)checking_wrmsrl(escr_addr, escr_conf);
 	(void)checking_wrmsrl(hwc->config_base + hwc->idx,
-		(u64)(p4_config_unpack_cccr(hwc->config)) | P4_CCCR_ENABLE);
+				(cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
 }
 
 static void p4_pmu_enable_all(void)
@@ -516,13 +628,13 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 	if (p4_ht_thread(cpu)) {
 		cccr &= ~P4_CCCR_OVF_PMI_T0;
 		cccr |= P4_CCCR_OVF_PMI_T1;
-		if (escr & P4_EVNTSEL_T0_OS) {
-			escr &= ~P4_EVNTSEL_T0_OS;
-			escr |= P4_EVNTSEL_T1_OS;
+		if (escr & P4_ESCR_T0_OS) {
+			escr &= ~P4_ESCR_T0_OS;
+			escr |= P4_ESCR_T1_OS;
 		}
-		if (escr & P4_EVNTSEL_T0_USR) {
-			escr &= ~P4_EVNTSEL_T0_USR;
-			escr |= P4_EVNTSEL_T1_USR;
+		if (escr & P4_ESCR_T0_USR) {
+			escr &= ~P4_ESCR_T0_USR;
+			escr |= P4_ESCR_T1_USR;
 		}
 		hwc->config  = p4_config_pack_escr(escr);
 		hwc->config |= p4_config_pack_cccr(cccr);
@@ -530,13 +642,13 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 	} else {
 		cccr &= ~P4_CCCR_OVF_PMI_T1;
 		cccr |= P4_CCCR_OVF_PMI_T0;
-		if (escr & P4_EVNTSEL_T1_OS) {
-			escr &= ~P4_EVNTSEL_T1_OS;
-			escr |= P4_EVNTSEL_T0_OS;
+		if (escr & P4_ESCR_T1_OS) {
+			escr &= ~P4_ESCR_T1_OS;
+			escr |= P4_ESCR_T0_OS;
 		}
-		if (escr & P4_EVNTSEL_T1_USR) {
-			escr &= ~P4_EVNTSEL_T1_USR;
-			escr |= P4_EVNTSEL_T0_USR;
+		if (escr & P4_ESCR_T1_USR) {
+			escr &= ~P4_ESCR_T1_USR;
+			escr |= P4_ESCR_T0_USR;
 		}
 		hwc->config  = p4_config_pack_escr(escr);
 		hwc->config |= p4_config_pack_cccr(cccr);
@@ -606,66 +718,56 @@ static int p4_get_escr_idx(unsigned int addr)
 	return -1;
 }
 
+static int p4_next_cntr(int thread, unsigned long *used_mask,
+			struct p4_event_bind *bind)
+{
+	int i = 0, j;
+
+	for (i = 0; i < P4_CNTR_LIMIT; i++) {
+		j = bind->cntr[thread][i++];
+		if (j == -1 || !test_bit(j, used_mask))
+			return j;
+	}
+
+	return -1;
+}
+
 static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 {
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long escr_mask[BITS_TO_LONGS(ARCH_P4_TOTAL_ESCR)];
-
-	struct hw_perf_event *hwc;
-	struct p4_event_template *tpl;
-	struct p4_pmu_res *c;
 	int cpu = raw_smp_processor_id();
-	int escr_idx, thread, i, num;
+	struct hw_perf_event *hwc;
+	struct p4_event_bind *bind;
+	unsigned int i, thread, num;
+	int cntr_idx, escr_idx;
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 	bitmap_zero(escr_mask, ARCH_P4_TOTAL_ESCR);
 
-	c = &__get_cpu_var(p4_pmu_config);
-	/*
-	 * Firstly find out which resource events are going
-	 * to use, if ESCR+CCCR tuple is already borrowed
-	 * then get out of here
-	 */
 	for (i = 0, num = n; i < n; i++, num--) {
+
 		hwc = &cpuc->event_list[i]->hw;
-		tpl = p4_pmu_template_lookup(hwc->config);
-		if (!tpl)
-			goto done;
 		thread = p4_ht_thread(cpu);
-		escr_idx = p4_get_escr_idx(tpl->escr_msr[thread]);
-		if (escr_idx == -1)
-			goto done;
+		bind = p4_config_get_bind(hwc->config);
+		escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
 
-		/* already allocated and remains on the same cpu */
 		if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
+			cntr_idx = hwc->idx;
 			if (assign)
 				assign[i] = hwc->idx;
-			/* upstream dependent event */
-			if (unlikely(tpl->dep != -1))
-				printk_once(KERN_WARNING "PMU: Dep events are "
-					"not implemented yet\n");
 			goto reserve;
 		}
 
-		/* it may be already borrowed */
-		if (test_bit(tpl->cntr[thread], used_mask) ||
-			test_bit(escr_idx, escr_mask))
+		cntr_idx = p4_next_cntr(thread, used_mask, bind);
+		if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
 			goto done;
 
-		/*
-		 * ESCR+CCCR+COUNTERs are available to use lets swap
-		 * thread specific bits, push assigned bits
-		 * back and save template into per-cpu
-		 * area (which will allow us to find out the ESCR
-		 * to be used at moment of "enable event via real MSR")
-		 */
 		p4_pmu_swap_config_ts(hwc, cpu);
-		if (assign) {
-			assign[i] = tpl->cntr[thread];
-			c->tpl[assign[i]] = tpl;
-		}
+		if (assign)
+			assign[i] = cntr_idx;
 reserve:
-		set_bit(tpl->cntr[thread], used_mask);
+		set_bit(cntr_idx, used_mask);
 		set_bit(escr_idx, escr_mask);
 	}
 
@@ -684,7 +786,7 @@ static __initconst struct x86_pmu p4_pmu = {
 	.perfctr		= MSR_P4_BPU_PERFCTR0,
 	.event_map		= p4_pmu_event_map,
 	.raw_event		= p4_pmu_raw_event,
-	.max_events		= ARRAY_SIZE(p4_templates),
+	.max_events		= ARRAY_SIZE(p4_general_events),
 	.get_event_constraints	= x86_get_event_constraints,
 	/*
 	 * IF HT disabled we may need to use all
@@ -716,7 +818,7 @@ static __init int p4_pmu_init(void)
 	}
 
 	memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
-	       sizeof(hw_cache_event_ids));
+		sizeof(hw_cache_event_ids));
 
 	pr_cont("Netburst events, ");
 
-- 
cgit v1.2.3


From 53c540195724b52422da067a31ef6916d2c70202 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 24 Mar 2010 16:40:14 -0300
Subject: perf report: Add a popup menu to ask what operation is to be
 performed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Right now it presents a menu with these options:

 +------------------------------+
 | Annotate CURRENT_SYMBOL_NAME |
 | Exit                         |
 +------------------------------+

If the highlighted (current) symbol is not annotatable only the
"Exit" option will appear.

Also add a confirmation dialog when ESC is pressed on the top
level to avoid exiting the application by pressing one too many
ESC key.

To get to the menu just press the -> (Right navigation key), to
exit just press ESC.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269459619-982-1-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/newt.c | 96 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 88 insertions(+), 8 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 12b229bb9dc..a3465a0ad70 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -28,6 +28,47 @@ static newtComponent newt_form__new(void)
 	return self;
 }
 
+static int popup_menu(int argc, const char *argv[])
+{
+	struct newtExitStruct es;
+	int i, rc = -1, max_len = 5;
+	newtComponent listbox, form = newt_form__new();
+
+	if (form == NULL)
+		return -1;
+
+	listbox = newtListbox(0, 0, argc, NEWT_FLAG_RETURNEXIT);
+	if (listbox == NULL)
+		goto out_destroy_form;
+
+	newtFormAddComponents(form, listbox, NULL);
+
+	for (i = 0; i < argc; ++i) {
+		int len = strlen(argv[i]);
+		if (len > max_len)
+			max_len = len;
+		if (newtListboxAddEntry(listbox, argv[i], (void *)(long)i))
+			goto out_destroy_form;
+	}
+
+	newtCenteredWindow(max_len, argc, NULL);
+	newtFormRun(form, &es);
+	rc = newtListboxGetCurrent(listbox) - NULL;
+	if (es.reason == NEWT_EXIT_HOTKEY)
+		rc = -1;
+	newtPopWindow();
+out_destroy_form:
+	newtFormDestroy(form);
+	return rc;
+}
+
+static bool dialog_yesno(const char *msg)
+{
+	/* newtWinChoice should really be accepting const char pointers... */
+	char yes[] = "Yes", no[] = "No";
+	return newtWinChoice(NULL, no, yes, (char *)msg) == 2;
+}
+
 /*
  * When debugging newt problems it was useful to be able to "unroll"
  * the calls to newtCheckBoxTreeAdd{Array,Item}, so that we can generate
@@ -309,6 +350,19 @@ out_free_str:
 	free(str);
 }
 
+static const void *newt__symbol_tree_get_current(newtComponent self)
+{
+	if (symbol_conf.use_callchain)
+		return newtCheckboxTreeGetCurrent(self);
+	return newtListboxGetCurrent(self);
+}
+
+static void perf_session__selection(newtComponent self, void *data)
+{
+	const struct symbol **symbol_ptr = data;
+	*symbol_ptr = newt__symbol_tree_get_current(self);
+}
+
 void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 				const char *helpline)
 {
@@ -322,6 +376,7 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 	char str[1024];
 	newtComponent form, tree;
 	struct newtExitStruct es;
+	const struct symbol *selection;
 
 	snprintf(str, sizeof(str), "Samples: %Ld", session_total);
 	newtDrawRootText(0, 0, str);
@@ -336,6 +391,8 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 		tree = newtListbox(0, 0, rows - 5, (NEWT_FLAG_SCROLL |
 						       NEWT_FLAG_RETURNEXIT));
 
+	newtComponentAddCallback(tree, perf_session__selection, &selection);
+
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		if (se->elide)
 			continue;
@@ -377,20 +434,43 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 	form = newt_form__new();
 	newtFormAddHotKey(form, 'A');
 	newtFormAddHotKey(form, 'a');
+	newtFormAddHotKey(form, NEWT_KEY_RIGHT);
 	newtFormAddComponents(form, tree, NULL);
+	selection = newt__symbol_tree_get_current(tree);
 
 	while (1) {
-		const struct symbol *selection;
+		char annotate[512];
+		const char *options[2];
+		int nr_options = 0, choice;
 
 		newtFormRun(form, &es);
-		if (es.reason == NEWT_EXIT_HOTKEY &&
-		    toupper(es.u.key) != 'A')
+		if (es.reason == NEWT_EXIT_HOTKEY) {
+			if (toupper(es.u.key) == 'A') {
+				symbol__annotate_browser(selection);
+				continue;
+			}
+			if (es.u.key == NEWT_KEY_ESCAPE ||
+			    toupper(es.u.key) == 'Q' ||
+			    es.u.key == CTRL('c')) {
+				if (dialog_yesno("Do you really want to exit?"))
+					break;
+				else
+					continue;
+			}
+		}
+
+		if (selection != NULL) {
+			snprintf(annotate, sizeof(annotate),
+				 "Annotate %s", selection->name);
+			options[nr_options++] = annotate;
+		}
+
+		options[nr_options++] = "Exit";
+		choice = popup_menu(nr_options, options);
+		if (choice == nr_options - 1)
 			break;
-		if (!symbol_conf.use_callchain)
-			selection = newtListboxGetCurrent(tree);
-		else
-			selection = newtCheckboxTreeGetCurrent(tree);
-		symbol__annotate_browser(selection);
+		else if (selection != NULL && choice >= 0)
+			symbol__annotate_browser(selection);
 	}
 
 	newtFormDestroy(form);
-- 
cgit v1.2.3


From 96415e4d3f5fdf9cdb12eedfcbc58152b1e1458c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 24 Mar 2010 16:40:15 -0300
Subject: perf symbols: Avoid unnecessary symbol loading when dso list is
 specified
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We were performing the full thread__find_addr_location
operation, i.e. resolving to a map/dso _and_ loading its symbols
when we can optimize it by first calling thread__find_addr_map
to find just the map/dso, check if it is one that we are
interested in (passed via --dsos/-d in 'perf annotate', 'perf
report', etc) and if not avoid loading the symtab.

Nice speedup when we know which DSO we're interested in.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269459619-982-2-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/event.c | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 705ec63548b..c2808ad3b76 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -513,24 +513,32 @@ int event__preprocess_sample(const event_t *self, struct perf_session *session,
 
 	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
 
-	thread__find_addr_location(thread, session, cpumode, MAP__FUNCTION,
-				   self->ip.ip, al, filter);
+	thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION,
+			      self->ip.ip, al);
 	dump_printf(" ...... dso: %s\n",
 		    al->map ? al->map->dso->long_name :
 			al->level == 'H' ? "[hypervisor]" : "<not found>");
-	/*
-	 * We have to do this here as we may have a dso with no symbol hit that
-	 * has a name longer than the ones with symbols sampled.
-	 */
-	if (al->map && !sort_dso.elide && !al->map->dso->slen_calculated)
-		dso__calc_col_width(al->map->dso);
-
-	if (symbol_conf.dso_list &&
-	    (!al->map || !al->map->dso ||
-	     !(strlist__has_entry(symbol_conf.dso_list, al->map->dso->short_name) ||
-	       (al->map->dso->short_name != al->map->dso->long_name &&
-		strlist__has_entry(symbol_conf.dso_list, al->map->dso->long_name)))))
-		goto out_filtered;
+	al->sym = NULL;
+
+	if (al->map) {
+		if (symbol_conf.dso_list &&
+		    (!al->map || !al->map->dso ||
+		     !(strlist__has_entry(symbol_conf.dso_list,
+					  al->map->dso->short_name) ||
+		       (al->map->dso->short_name != al->map->dso->long_name &&
+			strlist__has_entry(symbol_conf.dso_list,
+					   al->map->dso->long_name)))))
+			goto out_filtered;
+		/*
+		 * We have to do this here as we may have a dso with no symbol
+		 * hit that has a name longer than the ones with symbols
+		 * sampled.
+		 */
+		if (!sort_dso.elide && !al->map->dso->slen_calculated)
+			dso__calc_col_width(al->map->dso);
+
+		al->sym = map__find_symbol(al->map, al->addr, filter);
+	}
 
 	if (symbol_conf.sym_list && al->sym &&
 	    !strlist__has_entry(symbol_conf.sym_list, al->sym->name))
-- 
cgit v1.2.3


From ac73c5a9c1767b2771e6d2b5accafdef89db04c2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 24 Mar 2010 16:40:16 -0300
Subject: perf annotate: Allow specifying DSOs where to look for symbol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using the same parameter as in 'perf report', allowing to
specify just one and disambiguate between DSOs that have the
symbol of interest.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269459619-982-3-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-annotate.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 45d14660d53..ce9b1ef784b 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -591,6 +591,8 @@ static const char * const annotate_usage[] = {
 static const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
+	OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
+		   "only consider symbols in these dsos"),
 	OPT_STRING('s', "symbol", &sym_hist_filter, "symbol",
 		    "symbol to annotate"),
 	OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
-- 
cgit v1.2.3


From 59fd53062f71011a68d03f4cd0ba93d822ac3249 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 24 Mar 2010 16:40:17 -0300
Subject: perf tools: Introduce struct map_symbol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

That will be in both struct hist_entry and struct
callchain_list, so that the TUI can store a pointer to the pair
(map, symbol) in the trees where hist_entries and
callchain_lists are present, to allow precise annotation instead
of looking for the first symbol with the selected name.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269459619-982-4-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-annotate.c | 34 +++++++++++++++++-----------------
 tools/perf/util/hist.c        |  8 +++++---
 tools/perf/util/newt.c        |  4 ++--
 tools/perf/util/sort.c        | 22 +++++++++++-----------
 tools/perf/util/sort.h        |  3 +--
 tools/perf/util/symbol.h      |  5 +++++
 6 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index ce9b1ef784b..887e8e04a6f 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -69,13 +69,13 @@ static int sym__alloc_hist(struct symbol *self)
 static int annotate__hist_hit(struct hist_entry *he, u64 ip)
 {
 	unsigned int sym_size, offset;
-	struct symbol *sym = he->sym;
+	struct symbol *sym = he->ms.sym;
 	struct sym_priv *priv;
 	struct sym_hist *h;
 
 	he->count++;
 
-	if (!sym || !he->map)
+	if (!sym || !he->ms.map)
 		return 0;
 
 	priv = symbol__priv(sym);
@@ -85,7 +85,7 @@ static int annotate__hist_hit(struct hist_entry *he, u64 ip)
 	sym_size = sym->end - sym->start;
 	offset = ip - sym->start;
 
-	pr_debug3("%s: ip=%#Lx\n", __func__, he->map->unmap_ip(he->map, ip));
+	pr_debug3("%s: ip=%#Lx\n", __func__, he->ms.map->unmap_ip(he->ms.map, ip));
 
 	if (offset >= sym_size)
 		return 0;
@@ -94,8 +94,8 @@ static int annotate__hist_hit(struct hist_entry *he, u64 ip)
 	h->sum++;
 	h->ip[offset]++;
 
-	pr_debug3("%#Lx %s: count++ [ip: %#Lx, %#Lx] => %Ld\n", he->sym->start,
-		  he->sym->name, ip, ip - he->sym->start, h->ip[offset]);
+	pr_debug3("%#Lx %s: count++ [ip: %#Lx, %#Lx] => %Ld\n", he->ms.sym->start,
+		  he->ms.sym->name, ip, ip - he->ms.sym->start, h->ip[offset]);
 	return 0;
 }
 
@@ -187,7 +187,7 @@ static struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
 static int parse_line(FILE *file, struct hist_entry *he,
 		      struct list_head *head)
 {
-	struct symbol *sym = he->sym;
+	struct symbol *sym = he->ms.sym;
 	struct objdump_line *objdump_line;
 	char *line = NULL, *tmp, *tmp2;
 	size_t line_len;
@@ -226,7 +226,7 @@ static int parse_line(FILE *file, struct hist_entry *he,
 	}
 
 	if (line_ip != -1) {
-		u64 start = map__rip_2objdump(he->map, sym->start);
+		u64 start = map__rip_2objdump(he->ms.map, sym->start);
 		offset = line_ip - start;
 	}
 
@@ -244,7 +244,7 @@ static int objdump_line__print(struct objdump_line *self,
 			       struct list_head *head,
 			       struct hist_entry *he, u64 len)
 {
-	struct symbol *sym = he->sym;
+	struct symbol *sym = he->ms.sym;
 	static const char *prev_line;
 	static const char *prev_color;
 
@@ -327,7 +327,7 @@ static void insert_source_line(struct sym_ext *sym_ext)
 
 static void free_source_line(struct hist_entry *he, int len)
 {
-	struct sym_priv *priv = symbol__priv(he->sym);
+	struct sym_priv *priv = symbol__priv(he->ms.sym);
 	struct sym_ext *sym_ext = priv->ext;
 	int i;
 
@@ -346,7 +346,7 @@ static void free_source_line(struct hist_entry *he, int len)
 static void
 get_source_line(struct hist_entry *he, int len, const char *filename)
 {
-	struct symbol *sym = he->sym;
+	struct symbol *sym = he->ms.sym;
 	u64 start;
 	int i;
 	char cmd[PATH_MAX * 2];
@@ -361,7 +361,7 @@ get_source_line(struct hist_entry *he, int len, const char *filename)
 	if (!priv->ext)
 		return;
 
-	start = he->map->unmap_ip(he->map, sym->start);
+	start = he->ms.map->unmap_ip(he->ms.map, sym->start);
 
 	for (i = 0; i < len; i++) {
 		char *path = NULL;
@@ -425,7 +425,7 @@ static void print_summary(const char *filename)
 
 static void hist_entry__print_hits(struct hist_entry *self)
 {
-	struct symbol *sym = self->sym;
+	struct symbol *sym = self->ms.sym;
 	struct sym_priv *priv = symbol__priv(sym);
 	struct sym_hist *h = priv->hist;
 	u64 len = sym->end - sym->start, offset;
@@ -439,9 +439,9 @@ static void hist_entry__print_hits(struct hist_entry *self)
 
 static void annotate_sym(struct hist_entry *he)
 {
-	struct map *map = he->map;
+	struct map *map = he->ms.map;
 	struct dso *dso = map->dso;
-	struct symbol *sym = he->sym;
+	struct symbol *sym = he->ms.sym;
 	const char *filename = dso->long_name, *d_filename;
 	u64 len;
 	char command[PATH_MAX*2];
@@ -526,17 +526,17 @@ static void perf_session__find_annotations(struct perf_session *self)
 		struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
 		struct sym_priv *priv;
 
-		if (he->sym == NULL)
+		if (he->ms.sym == NULL)
 			continue;
 
-		priv = symbol__priv(he->sym);
+		priv = symbol__priv(he->ms.sym);
 		if (priv->hist == NULL)
 			continue;
 
 		annotate_sym(he);
 		/*
 		 * Since we have a hist_entry per IP for the same symbol, free
-		 * he->sym->hist to signal we already processed this symbol.
+		 * he->ms.sym->hist to signal we already processed this symbol.
 		 */
 		free(priv->hist);
 		priv->hist = NULL;
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 5843a9c572a..4eefb52a866 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -22,8 +22,10 @@ struct hist_entry *__perf_session__add_hist_entry(struct rb_root *hists,
 	struct hist_entry *he;
 	struct hist_entry entry = {
 		.thread	= al->thread,
-		.map	= al->map,
-		.sym	= al->sym,
+		.ms = {
+			.map	= al->map,
+			.sym	= al->sym,
+		},
 		.ip	= al->addr,
 		.level	= al->level,
 		.count	= count,
@@ -654,7 +656,7 @@ print_entries:
 		if (symbol_conf.use_callchain)
 			ret += hist_entry__fprintf_callchain(h, fp, session_total);
 
-		if (h->map == NULL && verbose > 1) {
+		if (h->ms.map == NULL && verbose > 1) {
 			__map_groups__fprintf_maps(&h->thread->mg,
 						   MAP__FUNCTION, fp);
 			fprintf(fp, "%.10s end\n", graph_dotted_line);
diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index a3465a0ad70..25cd2e1f425 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -287,9 +287,9 @@ static size_t hist_entry__append_browser(struct hist_entry *self,
 
 		indexes[0] = NEWT_ARG_APPEND;
 		indexes[1] = NEWT_ARG_LAST;
-		newt_checkbox_tree__add(tree, s, self->sym, indexes);
+		newt_checkbox_tree__add(tree, s, self->ms.sym, indexes);
 	} else
-		newtListboxAppendEntry(tree, s, self->sym);
+		newtListboxAppendEntry(tree, s, self->ms.sym);
 
 	return strlen(s);
 }
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index cb0f327de9e..9b80c13cae4 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -131,8 +131,8 @@ sort__comm_print(FILE *fp, struct hist_entry *self, unsigned int width)
 int64_t
 sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-	struct dso *dso_l = left->map ? left->map->dso : NULL;
-	struct dso *dso_r = right->map ? right->map->dso : NULL;
+	struct dso *dso_l = left->ms.map ? left->ms.map->dso : NULL;
+	struct dso *dso_r = right->ms.map ? right->ms.map->dso : NULL;
 	const char *dso_name_l, *dso_name_r;
 
 	if (!dso_l || !dso_r)
@@ -152,9 +152,9 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 size_t
 sort__dso_print(FILE *fp, struct hist_entry *self, unsigned int width)
 {
-	if (self->map && self->map->dso) {
-		const char *dso_name = !verbose ? self->map->dso->short_name :
-						  self->map->dso->long_name;
+	if (self->ms.map && self->ms.map->dso) {
+		const char *dso_name = !verbose ? self->ms.map->dso->short_name :
+						  self->ms.map->dso->long_name;
 		return repsep_fprintf(fp, "%-*s", width, dso_name);
 	}
 
@@ -168,11 +168,11 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
 	u64 ip_l, ip_r;
 
-	if (left->sym == right->sym)
+	if (left->ms.sym == right->ms.sym)
 		return 0;
 
-	ip_l = left->sym ? left->sym->start : left->ip;
-	ip_r = right->sym ? right->sym->start : right->ip;
+	ip_l = left->ms.sym ? left->ms.sym->start : left->ip;
+	ip_r = right->ms.sym ? right->ms.sym->start : right->ip;
 
 	return (int64_t)(ip_r - ip_l);
 }
@@ -184,13 +184,13 @@ sort__sym_print(FILE *fp, struct hist_entry *self, unsigned int width __used)
 	size_t ret = 0;
 
 	if (verbose) {
-		char o = self->map ? dso__symtab_origin(self->map->dso) : '!';
+		char o = self->ms.map ? dso__symtab_origin(self->ms.map->dso) : '!';
 		ret += repsep_fprintf(fp, "%#018llx %c ", (u64)self->ip, o);
 	}
 
 	ret += repsep_fprintf(fp, "[%c] ", self->level);
-	if (self->sym)
-		ret += repsep_fprintf(fp, "%s", self->sym->name);
+	if (self->ms.sym)
+		ret += repsep_fprintf(fp, "%s", self->ms.sym->name);
 	else
 		ret += repsep_fprintf(fp, "%#016llx", (u64)self->ip);
 
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 753f9ea99fb..598568696f9 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -45,8 +45,7 @@ struct hist_entry {
 	struct rb_node		rb_node;
 	u64			count;
 	struct thread		*thread;
-	struct map		*map;
-	struct symbol		*sym;
+	struct map_symbol	ms;
 	u64			ip;
 	char			level;
 	struct symbol	  *parent;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 0da2455d5b9..a4a894b8ea0 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -88,6 +88,11 @@ struct ref_reloc_sym {
 	u64		unrelocated_addr;
 };
 
+struct map_symbol {
+	struct map    *map;
+	struct symbol *sym;
+};
+
 struct addr_location {
 	struct thread *thread;
 	struct map    *map;
-- 
cgit v1.2.3


From b3c9ac0846c654dea4df095999ee202e8b4cb253 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 24 Mar 2010 16:40:18 -0300
Subject: perf callchains: Store the map together with the symbol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We need this to know where a symbol in a callchain came from,
for various reasons, among them precise annotation from a
TUI/GUI tool.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269459619-982-5-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c |  3 ++-
 tools/perf/util/callchain.c | 21 ++++++++++-----------
 tools/perf/util/callchain.h |  4 ++--
 tools/perf/util/hist.c      | 14 +++++++-------
 tools/perf/util/newt.c      |  8 ++++----
 tools/perf/util/session.c   | 13 +++++++------
 tools/perf/util/session.h   |  8 ++++----
 7 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index d609afbd1a3..6ab16980dd6 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -81,7 +81,8 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 					struct addr_location *al,
 					struct sample_data *data)
 {
-	struct symbol **syms = NULL, *parent = NULL;
+	struct map_symbol *syms = NULL;
+	struct symbol *parent = NULL;
 	bool hit;
 	int err;
 	struct hist_entry *he;
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 883844eb4b0..db628af6d20 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -185,8 +185,8 @@ create_child(struct callchain_node *parent, bool inherit_children)
 
 
 struct resolved_ip {
-	u64		ip;
-	struct symbol	*sym;
+	u64		  ip;
+	struct map_symbol ms;
 };
 
 struct resolved_chain {
@@ -212,7 +212,7 @@ fill_node(struct callchain_node *node, struct resolved_chain *chain, int start)
 			return;
 		}
 		call->ip = chain->ips[i].ip;
-		call->sym = chain->ips[i].sym;
+		call->ms = chain->ips[i].ms;
 		list_add_tail(&call->list, &node->val);
 	}
 	node->val_nr = chain->nr - start;
@@ -318,10 +318,10 @@ __append_chain(struct callchain_node *root, struct resolved_chain *chain,
 		if (i == chain->nr)
 			break;
 
-		sym = chain->ips[i].sym;
+		sym = chain->ips[i].ms.sym;
 
-		if (cnode->sym && sym) {
-			if (cnode->sym->start != sym->start)
+		if (cnode->ms.sym && sym) {
+			if (cnode->ms.sym->start != sym->start)
 				break;
 		} else if (cnode->ip != chain->ips[i].ip)
 			break;
@@ -353,9 +353,8 @@ __append_chain(struct callchain_node *root, struct resolved_chain *chain,
 	return 0;
 }
 
-static void
-filter_context(struct ip_callchain *old, struct resolved_chain *new,
-	       struct symbol **syms)
+static void filter_context(struct ip_callchain *old, struct resolved_chain *new,
+			   struct map_symbol *syms)
 {
 	int i, j = 0;
 
@@ -364,7 +363,7 @@ filter_context(struct ip_callchain *old, struct resolved_chain *new,
 			continue;
 
 		new->ips[j].ip = old->ips[i];
-		new->ips[j].sym = syms[i];
+		new->ips[j].ms = syms[i];
 		j++;
 	}
 
@@ -373,7 +372,7 @@ filter_context(struct ip_callchain *old, struct resolved_chain *new,
 
 
 int append_chain(struct callchain_node *root, struct ip_callchain *chain,
-		  struct symbol **syms)
+		 struct map_symbol *syms)
 {
 	struct resolved_chain *filtered;
 
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index bbd76da27f2..8a7e8bbd0fd 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -39,7 +39,7 @@ struct callchain_param {
 
 struct callchain_list {
 	u64			ip;
-	struct symbol		*sym;
+	struct map_symbol	ms;
 	struct list_head	list;
 };
 
@@ -57,5 +57,5 @@ static inline u64 cumul_hits(struct callchain_node *node)
 
 int register_callchain_param(struct callchain_param *param);
 int append_chain(struct callchain_node *root, struct ip_callchain *chain,
-		 struct symbol **syms);
+		 struct map_symbol *syms);
 #endif	/* __PERF_CALLCHAIN_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 4eefb52a866..09e09e78cb6 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -260,8 +260,8 @@ static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain,
 		} else
 			ret += fprintf(fp, "%s", "          ");
 	}
-	if (chain->sym)
-		ret += fprintf(fp, "%s\n", chain->sym->name);
+	if (chain->ms.sym)
+		ret += fprintf(fp, "%s\n", chain->ms.sym->name);
 	else
 		ret += fprintf(fp, "%p\n", (void *)(long)chain->ip);
 
@@ -280,7 +280,7 @@ static void init_rem_hits(void)
 	}
 
 	strcpy(rem_sq_bracket->name, "[...]");
-	rem_hits.sym = rem_sq_bracket;
+	rem_hits.ms.sym = rem_sq_bracket;
 }
 
 static size_t __callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
@@ -382,8 +382,8 @@ static size_t callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 		} else
 			ret += callchain__fprintf_left_margin(fp, left_margin);
 
-		if (chain->sym)
-			ret += fprintf(fp, " %s\n", chain->sym->name);
+		if (chain->ms.sym)
+			ret += fprintf(fp, " %s\n", chain->ms.sym->name);
 		else
 			ret += fprintf(fp, " %p\n", (void *)(long)chain->ip);
 	}
@@ -408,8 +408,8 @@ static size_t callchain__fprintf_flat(FILE *fp, struct callchain_node *self,
 	list_for_each_entry(chain, &self->val, list) {
 		if (chain->ip >= PERF_CONTEXT_MAX)
 			continue;
-		if (chain->sym)
-			ret += fprintf(fp, "                %s\n", chain->sym->name);
+		if (chain->ms.sym)
+			ret += fprintf(fp, "                %s\n", chain->ms.sym->name);
 		else
 			ret += fprintf(fp, "                %p\n",
 					(void *)(long)chain->ip);
diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 25cd2e1f425..f6953ca89bb 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -104,8 +104,8 @@ static void newt_checkbox_tree__add(newtComponent tree, const char *str,
 static char *callchain_list__sym_name(struct callchain_list *self,
 				      char *bf, size_t bfsize)
 {
-	if (self->sym)
-		return self->sym->name;
+	if (self->ms.sym)
+		return self->ms.sym->name;
 
 	snprintf(bf, bfsize, "%#Lx", self->ip);
 	return bf;
@@ -157,7 +157,7 @@ static void __callchain__append_graph_browser(struct callchain_node *self,
 				indexes[depth + 2] = NEWT_ARG_LAST;
 				++chain_idx;
 			}
-			newt_checkbox_tree__add(tree, str, chain->sym, indexes);
+			newt_checkbox_tree__add(tree, str, chain->ms.sym, indexes);
 			free(alloc_str);
 			++printed;
 		}
@@ -193,7 +193,7 @@ static void callchain__append_graph_browser(struct callchain_node *self,
 			continue;
 
 		str = callchain_list__sym_name(chain, ipstr, sizeof(ipstr));
-		newt_checkbox_tree__add(tree, str, chain->sym, indexes);
+		newt_checkbox_tree__add(tree, str, chain->ms.sym, indexes);
 	}
 
 	indexes[1] = parent_idx;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index eed1cb88900..2cef3730cd9 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -117,13 +117,13 @@ static bool symbol__match_parent_regex(struct symbol *sym)
 	return 0;
 }
 
-struct symbol **perf_session__resolve_callchain(struct perf_session *self,
-						struct thread *thread,
-						struct ip_callchain *chain,
-						struct symbol **parent)
+struct map_symbol *perf_session__resolve_callchain(struct perf_session *self,
+						   struct thread *thread,
+						   struct ip_callchain *chain,
+						   struct symbol **parent)
 {
 	u8 cpumode = PERF_RECORD_MISC_USER;
-	struct symbol **syms = NULL;
+	struct map_symbol *syms = NULL;
 	unsigned int i;
 
 	if (symbol_conf.use_callchain) {
@@ -160,7 +160,8 @@ struct symbol **perf_session__resolve_callchain(struct perf_session *self,
 				*parent = al.sym;
 			if (!symbol_conf.use_callchain)
 				break;
-			syms[i] = al.sym;
+			syms[i].map = al.map;
+			syms[i].sym = al.sym;
 		}
 	}
 
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 34d73395baa..631f8157fc1 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -57,10 +57,10 @@ int __perf_session__process_events(struct perf_session *self,
 int perf_session__process_events(struct perf_session *self,
 				 struct perf_event_ops *event_ops);
 
-struct symbol **perf_session__resolve_callchain(struct perf_session *self,
-						struct thread *thread,
-						struct ip_callchain *chain,
-						struct symbol **parent);
+struct map_symbol *perf_session__resolve_callchain(struct perf_session *self,
+						   struct thread *thread,
+						   struct ip_callchain *chain,
+						   struct symbol **parent);
 
 bool perf_session__has_traces(struct perf_session *self, const char *msg);
 
-- 
cgit v1.2.3


From d5679ae4d2d4369477dc3b60027cca222aef33e9 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 24 Mar 2010 16:40:19 -0300
Subject: perf report: Pass the DSO to 'perf annotate'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So that we ensure that the symbol asked for annotation really is
in the DSO we are interested in.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269459619-982-6-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/newt.c | 51 +++++++++++++++++++++++++++-----------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index f6953ca89bb..e99bcc8d193 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -157,7 +157,7 @@ static void __callchain__append_graph_browser(struct callchain_node *self,
 				indexes[depth + 2] = NEWT_ARG_LAST;
 				++chain_idx;
 			}
-			newt_checkbox_tree__add(tree, str, chain->ms.sym, indexes);
+			newt_checkbox_tree__add(tree, str, &chain->ms, indexes);
 			free(alloc_str);
 			++printed;
 		}
@@ -193,7 +193,7 @@ static void callchain__append_graph_browser(struct callchain_node *self,
 			continue;
 
 		str = callchain_list__sym_name(chain, ipstr, sizeof(ipstr));
-		newt_checkbox_tree__add(tree, str, chain->ms.sym, indexes);
+		newt_checkbox_tree__add(tree, str, &chain->ms, indexes);
 	}
 
 	indexes[1] = parent_idx;
@@ -287,14 +287,14 @@ static size_t hist_entry__append_browser(struct hist_entry *self,
 
 		indexes[0] = NEWT_ARG_APPEND;
 		indexes[1] = NEWT_ARG_LAST;
-		newt_checkbox_tree__add(tree, s, self->ms.sym, indexes);
+		newt_checkbox_tree__add(tree, s, &self->ms, indexes);
 	} else
-		newtListboxAppendEntry(tree, s, self->ms.sym);
+		newtListboxAppendEntry(tree, s, &self->ms);
 
 	return strlen(s);
 }
 
-static void symbol__annotate_browser(const struct symbol *self)
+static void map_symbol__annotate_browser(const struct map_symbol *self)
 {
 	FILE *fp;
 	int cols, rows;
@@ -305,10 +305,11 @@ static void symbol__annotate_browser(const struct symbol *self)
 	size_t max_usable_width;
 	char *line = NULL;
 
-	if (self == NULL)
+	if (self->sym == NULL)
 		return;
 
-	if (asprintf(&str, "perf annotate %s 2>&1 | expand", self->name) < 0)
+	if (asprintf(&str, "perf annotate -d \"%s\" %s 2>&1 | expand",
+		     self->map->dso->name, self->sym->name) < 0)
 		return;
 
 	fp = popen(str, "r");
@@ -338,7 +339,7 @@ static void symbol__annotate_browser(const struct symbol *self)
 
 	newtListboxSetWidth(tree, max_line_len);
 
-	newtCenteredWindow(max_line_len + 2, rows - 5, self->name);
+	newtCenteredWindow(max_line_len + 2, rows - 5, self->sym->name);
 	form = newt_form__new();
 	newtFormAddComponents(form, tree, NULL);
 
@@ -359,7 +360,7 @@ static const void *newt__symbol_tree_get_current(newtComponent self)
 
 static void perf_session__selection(newtComponent self, void *data)
 {
-	const struct symbol **symbol_ptr = data;
+	const struct map_symbol **symbol_ptr = data;
 	*symbol_ptr = newt__symbol_tree_get_current(self);
 }
 
@@ -376,7 +377,7 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 	char str[1024];
 	newtComponent form, tree;
 	struct newtExitStruct es;
-	const struct symbol *selection;
+	const struct map_symbol *selection;
 
 	snprintf(str, sizeof(str), "Samples: %Ld", session_total);
 	newtDrawRootText(0, 0, str);
@@ -416,11 +417,8 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 		int len = hist_entry__append_browser(h, tree, session_total);
 		if (len > max_len)
 			max_len = len;
-		if (symbol_conf.use_callchain) {
+		if (symbol_conf.use_callchain)
 			hist_entry__append_callchain_browser(h, tree, session_total, idx++);
-			if (idx > 3300)
-				break;
-		}
 	}
 
 	if (max_len > cols)
@@ -441,14 +439,12 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 	while (1) {
 		char annotate[512];
 		const char *options[2];
-		int nr_options = 0, choice;
+		int nr_options = 0, choice = 0;
 
 		newtFormRun(form, &es);
 		if (es.reason == NEWT_EXIT_HOTKEY) {
-			if (toupper(es.u.key) == 'A') {
-				symbol__annotate_browser(selection);
-				continue;
-			}
+			if (toupper(es.u.key) == 'A')
+				goto do_annotate;
 			if (es.u.key == NEWT_KEY_ESCAPE ||
 			    toupper(es.u.key) == 'Q' ||
 			    es.u.key == CTRL('c')) {
@@ -459,9 +455,9 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 			}
 		}
 
-		if (selection != NULL) {
+		if (selection->sym != NULL) {
 			snprintf(annotate, sizeof(annotate),
-				 "Annotate %s", selection->name);
+				 "Annotate %s", selection->sym->name);
 			options[nr_options++] = annotate;
 		}
 
@@ -469,8 +465,17 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 		choice = popup_menu(nr_options, options);
 		if (choice == nr_options - 1)
 			break;
-		else if (selection != NULL && choice >= 0)
-			symbol__annotate_browser(selection);
+do_annotate:
+		if (selection->sym != NULL && choice >= 0) {
+			if (selection->map->dso->origin == DSO__ORIG_KERNEL) {
+				newtPopHelpLine();
+				newtPushHelpLine("No vmlinux file found, can't "
+						 "annotate with just a "
+						 "kallsyms file");
+				continue;
+			}
+			map_symbol__annotate_browser(selection);
+		}
 	}
 
 	newtFormDestroy(form);
-- 
cgit v1.2.3


From b177f63f5226e75280855bbcd106e677250778bd Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 25 Mar 2010 19:58:57 -0300
Subject: perf symbols: Pass the mmap parameters instead of using mmap_event
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To reduce the coupling of the symbol system with the rest of
perf.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269557941-15617-2-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/event.c |  3 ++-
 tools/perf/util/map.c   | 11 ++++-------
 tools/perf/util/map.h   |  5 ++---
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index c2808ad3b76..052eaeccc20 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -393,7 +393,8 @@ int event__process_mmap(event_t *self, struct perf_session *session)
 	}
 
 	thread = perf_session__findnew(session, self->mmap.pid);
-	map = map__new(&self->mmap, MAP__FUNCTION,
+	map = map__new(self->mmap.start, self->mmap.len, self->mmap.pgoff,
+		       self->mmap.pid, self->mmap.filename, MAP__FUNCTION,
 		       session->cwd, session->cwdlen);
 
 	if (thread == NULL || map == NULL)
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index e509cd59c67..a9b42273675 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -1,4 +1,3 @@
-#include "event.h"
 #include "symbol.h"
 #include <stdlib.h>
 #include <string.h>
@@ -38,13 +37,12 @@ void map__init(struct map *self, enum map_type type,
 	RB_CLEAR_NODE(&self->rb_node);
 }
 
-struct map *map__new(struct mmap_event *event, enum map_type type,
-		     char *cwd, int cwdlen)
+struct map *map__new(u64 start, u64 len, u64 pgoff, u32 pid, char *filename,
+		     enum map_type type, char *cwd, int cwdlen)
 {
 	struct map *self = malloc(sizeof(*self));
 
 	if (self != NULL) {
-		const char *filename = event->filename;
 		char newfilename[PATH_MAX];
 		struct dso *dso;
 		int anon;
@@ -62,7 +60,7 @@ struct map *map__new(struct mmap_event *event, enum map_type type,
 		anon = is_anon_memory(filename);
 
 		if (anon) {
-			snprintf(newfilename, sizeof(newfilename), "/tmp/perf-%d.map", event->pid);
+			snprintf(newfilename, sizeof(newfilename), "/tmp/perf-%d.map", pid);
 			filename = newfilename;
 		}
 
@@ -70,8 +68,7 @@ struct map *map__new(struct mmap_event *event, enum map_type type,
 		if (dso == NULL)
 			goto out_delete;
 
-		map__init(self, type, event->start, event->start + event->len,
-			  event->pgoff, dso);
+		map__init(self, type, start, start + len, pgoff, dso);
 
 		if (anon) {
 set_identity:
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index b756368076c..a4a5bc4fca6 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -68,14 +68,13 @@ u64 map__rip_2objdump(struct map *map, u64 rip);
 u64 map__objdump_2ip(struct map *map, u64 addr);
 
 struct symbol;
-struct mmap_event;
 
 typedef int (*symbol_filter_t)(struct map *map, struct symbol *sym);
 
 void map__init(struct map *self, enum map_type type,
 	       u64 start, u64 end, u64 pgoff, struct dso *dso);
-struct map *map__new(struct mmap_event *event, enum map_type,
-		     char *cwd, int cwdlen);
+struct map *map__new(u64 start, u64 len, u64 pgoff, u32 pid, char *filename,
+		     enum map_type type, char *cwd, int cwdlen);
 void map__delete(struct map *self);
 struct map *map__clone(struct map *self);
 int map__overlap(struct map *l, struct map *r);
-- 
cgit v1.2.3


From 4b8cf84624e9a58a21aaac3d064222092ae234e0 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 25 Mar 2010 19:58:58 -0300
Subject: perf symbols: Move map related routines to map.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thru series of refactorings functions were being renamed but not
moved to map.c to reduce patch noise, now lets have them in the
same place so that use of the symbol system by tools can be
constrained to building and linking fewer source files:
symbol.c, map.c and rbtree.c.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269557941-15617-3-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/map.c     | 84 ++++++++++++++++++++++++++++++++++++++++++++++-
 tools/perf/util/map.h     | 47 +++++++++++++++++++++++++-
 tools/perf/util/session.c | 29 ----------------
 tools/perf/util/thread.c  | 53 ------------------------------
 tools/perf/util/thread.h  | 48 ++-------------------------
 5 files changed, 131 insertions(+), 130 deletions(-)

diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index a9b42273675..9f2963f9ee9 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -1,8 +1,9 @@
 #include "symbol.h"
+#include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
-#include "debug.h"
+#include "map.h"
 
 const char *map_type__name[MAP__NR_TYPES] = {
 	[MAP__FUNCTION] = "Functions",
@@ -232,3 +233,84 @@ u64 map__objdump_2ip(struct map *map, u64 addr)
 			map->unmap_ip(map, addr);	/* RIP -> IP */
 	return ip;
 }
+
+struct symbol *map_groups__find_symbol(struct map_groups *self,
+				       enum map_type type, u64 addr,
+				       symbol_filter_t filter)
+{
+	struct map *map = map_groups__find(self, type, addr);
+
+	if (map != NULL)
+		return map__find_symbol(map, map->map_ip(map, addr), filter);
+
+	return NULL;
+}
+
+static u64 map__reloc_map_ip(struct map *map, u64 ip)
+{
+	return ip + (s64)map->pgoff;
+}
+
+static u64 map__reloc_unmap_ip(struct map *map, u64 ip)
+{
+	return ip - (s64)map->pgoff;
+}
+
+void map__reloc_vmlinux(struct map *self)
+{
+	struct kmap *kmap = map__kmap(self);
+	s64 reloc;
+
+	if (!kmap->ref_reloc_sym || !kmap->ref_reloc_sym->unrelocated_addr)
+		return;
+
+	reloc = (kmap->ref_reloc_sym->unrelocated_addr -
+		 kmap->ref_reloc_sym->addr);
+
+	if (!reloc)
+		return;
+
+	self->map_ip   = map__reloc_map_ip;
+	self->unmap_ip = map__reloc_unmap_ip;
+	self->pgoff    = reloc;
+}
+
+void maps__insert(struct rb_root *maps, struct map *map)
+{
+	struct rb_node **p = &maps->rb_node;
+	struct rb_node *parent = NULL;
+	const u64 ip = map->start;
+	struct map *m;
+
+	while (*p != NULL) {
+		parent = *p;
+		m = rb_entry(parent, struct map, rb_node);
+		if (ip < m->start)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&map->rb_node, parent, p);
+	rb_insert_color(&map->rb_node, maps);
+}
+
+struct map *maps__find(struct rb_root *maps, u64 ip)
+{
+	struct rb_node **p = &maps->rb_node;
+	struct rb_node *parent = NULL;
+	struct map *m;
+
+	while (*p != NULL) {
+		parent = *p;
+		m = rb_entry(parent, struct map, rb_node);
+		if (ip < m->start)
+			p = &(*p)->rb_left;
+		else if (ip > m->end)
+			p = &(*p)->rb_right;
+		else
+			return m;
+	}
+
+	return NULL;
+}
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index a4a5bc4fca6..6a703fa7470 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -4,7 +4,8 @@
 #include <linux/compiler.h>
 #include <linux/list.h>
 #include <linux/rbtree.h>
-#include <linux/types.h>
+#include <stdio.h>
+#include "types.h"
 
 enum map_type {
 	MAP__FUNCTION = 0,
@@ -90,4 +91,48 @@ void map__fixup_end(struct map *self);
 
 void map__reloc_vmlinux(struct map *self);
 
+struct map_groups {
+	struct rb_root		maps[MAP__NR_TYPES];
+	struct list_head	removed_maps[MAP__NR_TYPES];
+};
+
+size_t __map_groups__fprintf_maps(struct map_groups *self,
+				  enum map_type type, FILE *fp);
+void maps__insert(struct rb_root *maps, struct map *map);
+struct map *maps__find(struct rb_root *maps, u64 addr);
+void map_groups__init(struct map_groups *self);
+size_t map_groups__fprintf_maps(struct map_groups *self, FILE *fp);
+
+static inline void map_groups__insert(struct map_groups *self, struct map *map)
+{
+	 maps__insert(&self->maps[map->type], map);
+}
+
+static inline struct map *map_groups__find(struct map_groups *self,
+					   enum map_type type, u64 addr)
+{
+	return maps__find(&self->maps[type], addr);
+}
+
+struct symbol *map_groups__find_symbol(struct map_groups *self,
+				       enum map_type type, u64 addr,
+				       symbol_filter_t filter);
+
+static inline struct symbol *map_groups__find_function(struct map_groups *self,
+						       u64 addr,
+						       symbol_filter_t filter)
+{
+	return map_groups__find_symbol(self, MAP__FUNCTION, addr, filter);
+}
+
+struct map *map_groups__find_by_name(struct map_groups *self,
+				     enum map_type type, const char *name);
+int __map_groups__create_kernel_maps(struct map_groups *self,
+				     struct map *vmlinux_maps[MAP__NR_TYPES],
+				     struct dso *kernel);
+int map_groups__create_kernel_maps(struct map_groups *self,
+				   struct map *vmlinux_maps[MAP__NR_TYPES]);
+struct map *map_groups__new_module(struct map_groups *self, u64 start,
+				   const char *filename);
+
 #endif /* __PERF_MAP_H */
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 2cef3730cd9..76b4ac689df 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -544,32 +544,3 @@ int perf_session__set_kallsyms_ref_reloc_sym(struct perf_session *self,
 
 	return 0;
 }
-
-static u64 map__reloc_map_ip(struct map *map, u64 ip)
-{
-	return ip + (s64)map->pgoff;
-}
-
-static u64 map__reloc_unmap_ip(struct map *map, u64 ip)
-{
-	return ip - (s64)map->pgoff;
-}
-
-void map__reloc_vmlinux(struct map *self)
-{
-	struct kmap *kmap = map__kmap(self);
-	s64 reloc;
-
-	if (!kmap->ref_reloc_sym || !kmap->ref_reloc_sym->unrelocated_addr)
-		return;
-
-	reloc = (kmap->ref_reloc_sym->unrelocated_addr -
-		 kmap->ref_reloc_sym->addr);
-
-	if (!reloc)
-		return;
-
-	self->map_ip   = map__reloc_map_ip;
-	self->unmap_ip = map__reloc_unmap_ip;
-	self->pgoff    = reloc;
-}
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index ea6506234d5..9bbe27d7530 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -272,46 +272,6 @@ static int map_groups__fixup_overlappings(struct map_groups *self,
 	return 0;
 }
 
-void maps__insert(struct rb_root *maps, struct map *map)
-{
-	struct rb_node **p = &maps->rb_node;
-	struct rb_node *parent = NULL;
-	const u64 ip = map->start;
-	struct map *m;
-
-	while (*p != NULL) {
-		parent = *p;
-		m = rb_entry(parent, struct map, rb_node);
-		if (ip < m->start)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	rb_link_node(&map->rb_node, parent, p);
-	rb_insert_color(&map->rb_node, maps);
-}
-
-struct map *maps__find(struct rb_root *maps, u64 ip)
-{
-	struct rb_node **p = &maps->rb_node;
-	struct rb_node *parent = NULL;
-	struct map *m;
-
-	while (*p != NULL) {
-		parent = *p;
-		m = rb_entry(parent, struct map, rb_node);
-		if (ip < m->start)
-			p = &(*p)->rb_left;
-		else if (ip > m->end)
-			p = &(*p)->rb_right;
-		else
-			return m;
-	}
-
-	return NULL;
-}
-
 void thread__insert_map(struct thread *self, struct map *map)
 {
 	map_groups__fixup_overlappings(&self->mg, map);
@@ -367,16 +327,3 @@ size_t perf_session__fprintf(struct perf_session *self, FILE *fp)
 
 	return ret;
 }
-
-struct symbol *map_groups__find_symbol(struct map_groups *self,
-				       enum map_type type, u64 addr,
-				       symbol_filter_t filter)
-{
-	struct map *map = map_groups__find(self, type, addr);
-
-	if (map != NULL)
-		return map__find_symbol(map, map->map_ip(map, addr), filter);
-
-	return NULL;
-}
-
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index a81426a891b..9c488fcadec 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -5,14 +5,6 @@
 #include <unistd.h>
 #include "symbol.h"
 
-struct map_groups {
-	struct rb_root		maps[MAP__NR_TYPES];
-	struct list_head	removed_maps[MAP__NR_TYPES];
-};
-
-size_t __map_groups__fprintf_maps(struct map_groups *self,
-				  enum map_type type, FILE *fp);
-
 struct thread {
 	struct rb_node		rb_node;
 	struct map_groups	mg;
@@ -23,30 +15,16 @@ struct thread {
 	int			comm_len;
 };
 
+struct perf_session;
+
 int find_all_tid(int pid, pid_t ** all_tid);
-void map_groups__init(struct map_groups *self);
 int thread__set_comm(struct thread *self, const char *comm);
 int thread__comm_len(struct thread *self);
 struct thread *perf_session__findnew(struct perf_session *self, pid_t pid);
 void thread__insert_map(struct thread *self, struct map *map);
 int thread__fork(struct thread *self, struct thread *parent);
-size_t map_groups__fprintf_maps(struct map_groups *self, FILE *fp);
 size_t perf_session__fprintf(struct perf_session *self, FILE *fp);
 
-void maps__insert(struct rb_root *maps, struct map *map);
-struct map *maps__find(struct rb_root *maps, u64 addr);
-
-static inline void map_groups__insert(struct map_groups *self, struct map *map)
-{
-	 maps__insert(&self->maps[map->type], map);
-}
-
-static inline struct map *map_groups__find(struct map_groups *self,
-					   enum map_type type, u64 addr)
-{
-	return maps__find(&self->maps[type], addr);
-}
-
 static inline struct map *thread__find_map(struct thread *self,
 					   enum map_type type, u64 addr)
 {
@@ -63,26 +41,4 @@ void thread__find_addr_location(struct thread *self,
 				enum map_type type, u64 addr,
 				struct addr_location *al,
 				symbol_filter_t filter);
-struct symbol *map_groups__find_symbol(struct map_groups *self,
-				       enum map_type type, u64 addr,
-				       symbol_filter_t filter);
-
-static inline struct symbol *map_groups__find_function(struct map_groups *self,
-						       u64 addr,
-						       symbol_filter_t filter)
-{
-	return map_groups__find_symbol(self, MAP__FUNCTION, addr, filter);
-}
-
-struct map *map_groups__find_by_name(struct map_groups *self,
-				     enum map_type type, const char *name);
-
-int __map_groups__create_kernel_maps(struct map_groups *self,
-				     struct map *vmlinux_maps[MAP__NR_TYPES],
-				     struct dso *kernel);
-int map_groups__create_kernel_maps(struct map_groups *self,
-				   struct map *vmlinux_maps[MAP__NR_TYPES]);
-
-struct map *map_groups__new_module(struct map_groups *self, u64 start,
-				   const char *filename);
 #endif	/* __PERF_THREAD_H */
-- 
cgit v1.2.3


From 618038df3588fdfcaccfd40057f36ce792bee252 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 25 Mar 2010 19:58:59 -0300
Subject: perf tools: Move __used from perf.h to linux/compiler.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Just like in the kernel and also to remove the need to include
perf.h in the symbol subsystem.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269557941-15617-4-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/perf.h                        | 2 --
 tools/perf/util/include/linux/compiler.h | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index aa786158b66..ec212748d65 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -106,8 +106,6 @@ static inline unsigned long long rdclock(void)
 #define __user
 #define asmlinkage
 
-#define __used		__attribute__((__unused__))
-
 #define unlikely(x)	__builtin_expect(!!(x), 0)
 #define min(x, y) ({				\
 	typeof(x) _min1 = (x);			\
diff --git a/tools/perf/util/include/linux/compiler.h b/tools/perf/util/include/linux/compiler.h
index dfb0713ed47..791f9dd27eb 100644
--- a/tools/perf/util/include/linux/compiler.h
+++ b/tools/perf/util/include/linux/compiler.h
@@ -7,4 +7,6 @@
 #define __user
 #define __attribute_const__
 
+#define __used		__attribute__((__unused__))
+
 #endif
-- 
cgit v1.2.3


From 5aab621b7bf024608f0c089e21656e7fe875a150 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 25 Mar 2010 19:59:00 -0300
Subject: perf symbols: Move hex2u64 and strxfrchar to symbol.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mostly used in symbol.c so move them there to reduce the number
of files needed to use the symbol system.

Also do some header adjustments with the same intent.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1269557941-15617-5-git-send-email-acme@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/parse-events.c |  1 +
 tools/perf/util/string.c       | 43 ---------------------
 tools/perf/util/string.h       |  2 -
 tools/perf/util/symbol.c       | 85 +++++++++++++++++++++++++++++++++---------
 tools/perf/util/symbol.h       | 10 ++++-
 5 files changed, 76 insertions(+), 65 deletions(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index a2014459125..435781e0c20 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -5,6 +5,7 @@
 #include "parse-events.h"
 #include "exec_cmd.h"
 #include "string.h"
+#include "symbol.h"
 #include "cache.h"
 #include "header.h"
 #include "debugfs.h"
diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c
index a175949ed21..d4389242cfd 100644
--- a/tools/perf/util/string.c
+++ b/tools/perf/util/string.c
@@ -1,49 +1,6 @@
 #include "string.h"
 #include "util.h"
 
-static int hex(char ch)
-{
-	if ((ch >= '0') && (ch <= '9'))
-		return ch - '0';
-	if ((ch >= 'a') && (ch <= 'f'))
-		return ch - 'a' + 10;
-	if ((ch >= 'A') && (ch <= 'F'))
-		return ch - 'A' + 10;
-	return -1;
-}
-
-/*
- * While we find nice hex chars, build a long_val.
- * Return number of chars processed.
- */
-int hex2u64(const char *ptr, u64 *long_val)
-{
-	const char *p = ptr;
-	*long_val = 0;
-
-	while (*p) {
-		const int hex_val = hex(*p);
-
-		if (hex_val < 0)
-			break;
-
-		*long_val = (*long_val << 4) | hex_val;
-		p++;
-	}
-
-	return p - ptr;
-}
-
-char *strxfrchar(char *s, char from, char to)
-{
-	char *p = s;
-
-	while ((p = strchr(p, from)) != NULL)
-		*p++ = to;
-
-	return s;
-}
-
 #define K 1024LL
 /*
  * perf_atoll()
diff --git a/tools/perf/util/string.h b/tools/perf/util/string.h
index 542e44de371..70058241666 100644
--- a/tools/perf/util/string.h
+++ b/tools/perf/util/string.h
@@ -4,8 +4,6 @@
 #include <stdbool.h>
 #include "types.h"
 
-int hex2u64(const char *ptr, u64 *val);
-char *strxfrchar(char *s, char from, char to);
 s64 perf_atoll(const char *str);
 char **argv_split(const char *str, int *argcp);
 void argv_free(char **argv);
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 3eb9de4baef..f3d4151e46a 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -1,13 +1,19 @@
-#include "util.h"
-#include "../perf.h"
-#include "sort.h"
-#include "string.h"
+#define _GNU_SOURCE
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <libgen.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <fcntl.h>
+#include <unistd.h>
 #include "symbol.h"
-#include "thread.h"
+#include "strlist.h"
 
-#include "debug.h"
-
-#include <asm/bug.h>
 #include <libelf.h>
 #include <gelf.h>
 #include <elf.h>
@@ -114,8 +120,8 @@ static void map_groups__fixup_end(struct map_groups *self)
 static struct symbol *symbol__new(u64 start, u64 len, const char *name)
 {
 	size_t namelen = strlen(name) + 1;
-	struct symbol *self = zalloc(symbol_conf.priv_size +
-				     sizeof(*self) + namelen);
+	struct symbol *self = calloc(1, (symbol_conf.priv_size +
+					 sizeof(*self) + namelen));
 	if (self == NULL)
 		return NULL;
 
@@ -166,7 +172,7 @@ static void dso__set_basename(struct dso *self)
 
 struct dso *dso__new(const char *name)
 {
-	struct dso *self = zalloc(sizeof(*self) + strlen(name) + 1);
+	struct dso *self = calloc(1, sizeof(*self) + strlen(name) + 1);
 
 	if (self != NULL) {
 		int i;
@@ -1382,13 +1388,13 @@ static int dso__kernel_module_get_build_id(struct dso *self)
 	return 0;
 }
 
-static int map_groups__set_modules_path_dir(struct map_groups *self, char *dirname)
+static int map_groups__set_modules_path_dir(struct map_groups *self, char *dir_name)
 {
 	struct dirent *dent;
-	DIR *dir = opendir(dirname);
+	DIR *dir = opendir(dir_name);
 
 	if (!dir) {
-		pr_debug("%s: cannot open %s dir\n", __func__, dirname);
+		pr_debug("%s: cannot open %s dir\n", __func__, dir_name);
 		return -1;
 	}
 
@@ -1401,7 +1407,7 @@ static int map_groups__set_modules_path_dir(struct map_groups *self, char *dirna
 				continue;
 
 			snprintf(path, sizeof(path), "%s/%s",
-				 dirname, dent->d_name);
+				 dir_name, dent->d_name);
 			if (map_groups__set_modules_path_dir(self, path) < 0)
 				goto failure;
 		} else {
@@ -1421,7 +1427,7 @@ static int map_groups__set_modules_path_dir(struct map_groups *self, char *dirna
 				continue;
 
 			snprintf(path, sizeof(path), "%s/%s",
-				 dirname, dent->d_name);
+				 dir_name, dent->d_name);
 
 			long_name = strdup(path);
 			if (long_name == NULL)
@@ -1458,8 +1464,8 @@ static int map_groups__set_modules_path(struct map_groups *self)
  */
 static struct map *map__new2(u64 start, struct dso *dso, enum map_type type)
 {
-	struct map *self = zalloc(sizeof(*self) +
-				  (dso->kernel ? sizeof(struct kmap) : 0));
+	struct map *self = calloc(1, (sizeof(*self) +
+				      (dso->kernel ? sizeof(struct kmap) : 0)));
 	if (self != NULL) {
 		/*
 		 * ->end will be filled after we load all the symbols
@@ -1963,3 +1969,46 @@ int map_groups__create_kernel_maps(struct map_groups *self,
 	map_groups__fixup_end(self);
 	return 0;
 }
+
+static int hex(char ch)
+{
+	if ((ch >= '0') && (ch <= '9'))
+		return ch - '0';
+	if ((ch >= 'a') && (ch <= 'f'))
+		return ch - 'a' + 10;
+	if ((ch >= 'A') && (ch <= 'F'))
+		return ch - 'A' + 10;
+	return -1;
+}
+
+/*
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+int hex2u64(const char *ptr, u64 *long_val)
+{
+	const char *p = ptr;
+	*long_val = 0;
+
+	while (*p) {
+		const int hex_val = hex(*p);
+
+		if (hex_val < 0)
+			break;
+
+		*long_val = (*long_val << 4) | hex_val;
+		p++;
+	}
+
+	return p - ptr;
+}
+
+char *strxfrchar(char *s, char from, char to)
+{
+	char *p = s;
+
+	while ((p = strchr(p, from)) != NULL)
+		*p++ = to;
+
+	return s;
+}
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index a4a894b8ea0..757fae3f5ee 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -3,10 +3,11 @@
 
 #include <linux/types.h>
 #include <stdbool.h>
-#include "types.h"
+#include <stdint.h>
+#include "map.h"
 #include <linux/list.h>
 #include <linux/rbtree.h>
-#include "event.h"
+#include <stdio.h>
 
 #define DEBUG_CACHE_DIR ".debug"
 
@@ -29,6 +30,9 @@ static inline char *bfd_demangle(void __used *v, const char __used *c,
 #endif
 #endif
 
+int hex2u64(const char *ptr, u64 *val);
+char *strxfrchar(char *s, char from, char to);
+
 /*
  * libelf 0.8.x and earlier do not support ELF_C_READ_MMAP;
  * for newer versions we can use mmap to reduce memory usage:
@@ -44,6 +48,8 @@ static inline char *bfd_demangle(void __used *v, const char __used *c,
 #define DMGL_ANSI        (1 << 1)       /* Include const, volatile, etc */
 #endif
 
+#define BUILD_ID_SIZE 20
+
 struct symbol {
 	struct rb_node	rb_node;
 	u64		start;
-- 
cgit v1.2.3


From 5a10317483f606106395814ee2fdaa2f1256a3b3 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Thu, 25 Mar 2010 19:59:01 -0300
Subject: perf record: Zero out mmap_array to fix segfault

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Tested-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1269557941-15617-6-git-send-email-acme@infradead.org>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 2 +-
 tools/perf/builtin-top.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index bb5b23db423..60ecdd3dd26 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -751,7 +751,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	for (i = 0; i < MAX_NR_CPUS; i++) {
 		for (j = 0; j < MAX_COUNTERS; j++) {
 			fd[i][j] = malloc(sizeof(int)*thread_num);
-			mmap_array[i][j] = malloc(
+			mmap_array[i][j] = zalloc(
 				sizeof(struct mmap_data)*thread_num);
 			if (!fd[i][j] || !mmap_array[i][j])
 				return -ENOMEM;
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 5f3ac9ff354..4abdd9b646b 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1371,7 +1371,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used)
 	for (i = 0; i < MAX_NR_CPUS; i++) {
 		for (j = 0; j < MAX_COUNTERS; j++) {
 			fd[i][j] = malloc(sizeof(int)*thread_num);
-			mmap_array[i][j] = malloc(
+			mmap_array[i][j] = zalloc(
 				sizeof(struct mmap_data)*thread_num);
 			if (!fd[i][j] || !mmap_array[i][j])
 				return -ENOMEM;
-- 
cgit v1.2.3


From 7c5ecaf7666617889f337296c610815b519abfa9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 25 Mar 2010 14:51:49 +0100
Subject: perf, x86: Clean up debugctlmsr bit definitions

Move all debugctlmsr thingies into msr-index.h

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100325135413.861425293@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/msr-index.h           | 13 ++++++++-----
 arch/x86/kernel/cpu/perf_event_intel_ds.c  | 23 +++++++----------------
 arch/x86/kernel/cpu/perf_event_intel_lbr.c |  7 ++-----
 3 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index aef562c0a64..06e4cf0d384 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -71,11 +71,14 @@
 #define MSR_IA32_LASTINTTOIP		0x000001de
 
 /* DEBUGCTLMSR bits (others vary by model): */
-#define _DEBUGCTLMSR_LBR	0 /* last branch recording */
-#define _DEBUGCTLMSR_BTF	1 /* single-step on branches */
-
-#define DEBUGCTLMSR_LBR		(1UL << _DEBUGCTLMSR_LBR)
-#define DEBUGCTLMSR_BTF		(1UL << _DEBUGCTLMSR_BTF)
+#define DEBUGCTLMSR_LBR			(1UL <<  0) /* last branch recording */
+#define DEBUGCTLMSR_BTF			(1UL <<  1) /* single-step on branches */
+#define DEBUGCTLMSR_TR			(1UL <<  6)
+#define DEBUGCTLMSR_BTS			(1UL <<  7)
+#define DEBUGCTLMSR_BTINT		(1UL <<  8)
+#define DEBUGCTLMSR_BTS_OFF_OS		(1UL <<  9)
+#define DEBUGCTLMSR_BTS_OFF_USR		(1UL << 10)
+#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI	(1UL << 11)
 
 #define MSR_IA32_MC0_CTL		0x00000400
 #define MSR_IA32_MC0_STATUS		0x00000401
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index c59678a14a2..2fea3622af7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -37,15 +37,6 @@ struct pebs_record_nhm {
 	u64 status, dla, dse, lat;
 };
 
-/*
- * Bits in the debugctlmsr controlling branch tracing.
- */
-#define X86_DEBUGCTL_TR			(1 << 6)
-#define X86_DEBUGCTL_BTS		(1 << 7)
-#define X86_DEBUGCTL_BTINT		(1 << 8)
-#define X86_DEBUGCTL_BTS_OFF_OS		(1 << 9)
-#define X86_DEBUGCTL_BTS_OFF_USR	(1 << 10)
-
 /*
  * A debug store configuration.
  *
@@ -193,15 +184,15 @@ static void intel_pmu_enable_bts(u64 config)
 
 	debugctlmsr = get_debugctlmsr();
 
-	debugctlmsr |= X86_DEBUGCTL_TR;
-	debugctlmsr |= X86_DEBUGCTL_BTS;
-	debugctlmsr |= X86_DEBUGCTL_BTINT;
+	debugctlmsr |= DEBUGCTLMSR_TR;
+	debugctlmsr |= DEBUGCTLMSR_BTS;
+	debugctlmsr |= DEBUGCTLMSR_BTINT;
 
 	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
-		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+		debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
 
 	if (!(config & ARCH_PERFMON_EVENTSEL_USR))
-		debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+		debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
 
 	update_debugctlmsr(debugctlmsr);
 }
@@ -217,8 +208,8 @@ static void intel_pmu_disable_bts(void)
 	debugctlmsr = get_debugctlmsr();
 
 	debugctlmsr &=
-		~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
-		  X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+		~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
+		  DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
 
 	update_debugctlmsr(debugctlmsr);
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index df4c98e26c5..d202c1bece1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -12,15 +12,12 @@ enum {
  * otherwise it becomes near impossible to get a reliable stack.
  */
 
-#define X86_DEBUGCTL_LBR               		(1 << 0)
-#define X86_DEBUGCTL_FREEZE_LBRS_ON_PMI		(1 << 11)
-
 static void __intel_pmu_lbr_enable(void)
 {
 	u64 debugctl;
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-	debugctl |= (X86_DEBUGCTL_LBR | X86_DEBUGCTL_FREEZE_LBRS_ON_PMI);
+	debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 }
 
@@ -29,7 +26,7 @@ static void __intel_pmu_lbr_disable(void)
 	u64 debugctl;
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-	debugctl &= ~(X86_DEBUGCTL_LBR | X86_DEBUGCTL_FREEZE_LBRS_ON_PMI);
+	debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 }
 
-- 
cgit v1.2.3


From faa4602e47690fb11221e00f9b9697c8dc0d4b19 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 25 Mar 2010 14:51:50 +0100
Subject: x86, perf, bts, mm: Delete the never used BTS-ptrace code

Support for the PMU's BTS features has been upstreamed in
v2.6.32, but we still have the old and disabled ptrace-BTS,
as Linus noticed it not so long ago.

It's buggy: TIF_DEBUGCTLMSR is trampling all over that MSR without
regard for other uses (perf) and doesn't provide the flexibility
needed for perf either.

Its users are ptrace-block-step and ptrace-bts, since ptrace-bts
was never used and ptrace-block-step can be implemented using a
much simpler approach.

So axe all 3000 lines of it. That includes the *locked_memory*()
APIs in mm/mlock.c as well.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Markus Metzger <markus.t.metzger@intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20100325135413.938004390@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.cpu               |   20 -
 arch/x86/Kconfig.debug             |    9 -
 arch/x86/include/asm/ds.h          |  302 --------
 arch/x86/include/asm/processor.h   |   33 +-
 arch/x86/include/asm/ptrace-abi.h  |   57 +-
 arch/x86/include/asm/ptrace.h      |    6 -
 arch/x86/include/asm/thread_info.h |    6 +-
 arch/x86/kernel/Makefile           |    2 -
 arch/x86/kernel/cpu/intel.c        |    2 -
 arch/x86/kernel/ds.c               | 1437 ------------------------------------
 arch/x86/kernel/ds_selftest.c      |  408 ----------
 arch/x86/kernel/ds_selftest.h      |   15 -
 arch/x86/kernel/dumpstack.c        |    5 -
 arch/x86/kernel/kprobes.c          |    6 +-
 arch/x86/kernel/process.c          |    9 -
 arch/x86/kernel/process_32.c       |    8 -
 arch/x86/kernel/process_64.c       |    8 -
 arch/x86/kernel/ptrace.c           |  382 ----------
 arch/x86/kernel/step.c             |   36 +-
 arch/x86/kernel/traps.c            |    5 -
 include/linux/ftrace.h             |   12 -
 include/linux/mm.h                 |    4 -
 include/linux/ptrace.h             |   12 -
 include/linux/sched.h              |    9 -
 kernel/fork.c                      |    3 -
 kernel/ptrace.c                    |    1 -
 kernel/sched.c                     |   43 --
 kernel/trace/Kconfig               |   11 -
 kernel/trace/Makefile              |    1 -
 kernel/trace/trace.h               |    4 -
 kernel/trace/trace_entries.h       |   12 -
 kernel/trace/trace_hw_branches.c   |  312 --------
 kernel/trace/trace_selftest.c      |   57 --
 mm/mlock.c                         |   41 -
 34 files changed, 9 insertions(+), 3269 deletions(-)
 delete mode 100644 arch/x86/include/asm/ds.h
 delete mode 100644 arch/x86/kernel/ds.c
 delete mode 100644 arch/x86/kernel/ds_selftest.c
 delete mode 100644 arch/x86/kernel/ds_selftest.h
 delete mode 100644 kernel/trace/trace_hw_branches.c

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index a19829374e6..918fbb1855c 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -502,23 +502,3 @@ config CPU_SUP_UMC_32
 	  CPU might render the kernel unbootable.
 
 	  If unsure, say N.
-
-config X86_DS
-	def_bool X86_PTRACE_BTS
-	depends on X86_DEBUGCTLMSR
-	select HAVE_HW_BRANCH_TRACER
-
-config X86_PTRACE_BTS
-	bool "Branch Trace Store"
-	default y
-	depends on X86_DEBUGCTLMSR
-	depends on BROKEN
-	---help---
-	  This adds a ptrace interface to the hardware's branch trace store.
-
-	  Debuggers may use it to collect an execution trace of the debugged
-	  application in order to answer the question 'how did I get here?'.
-	  Debuggers may trace user mode as well as kernel mode.
-
-	  Say Y unless there is no application development on this machine
-	  and you want to save a small amount of code size.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bc01e3ebfeb..bd58c8abbfb 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -174,15 +174,6 @@ config IOMMU_LEAK
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  are debugging a buggy device driver that leaks IOMMU mappings.
 
-config X86_DS_SELFTEST
-    bool "DS selftest"
-    default y
-    depends on DEBUG_KERNEL
-    depends on X86_DS
-	---help---
-	  Perform Debug Store selftests at boot time.
-	  If in doubt, say "N".
-
 config HAVE_MMIOTRACE_SUPPORT
 	def_bool y
 
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
deleted file mode 100644
index 70dac199b09..00000000000
--- a/arch/x86/include/asm/ds.h
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- * Debug Store (DS) support
- *
- * This provides a low-level interface to the hardware's Debug Store
- * feature that is used for branch trace store (BTS) and
- * precise-event based sampling (PEBS).
- *
- * It manages:
- * - DS and BTS hardware configuration
- * - buffer overflow handling (to be done)
- * - buffer access
- *
- * It does not do:
- * - security checking (is the caller allowed to trace the task)
- * - buffer allocation (memory accounting)
- *
- *
- * Copyright (C) 2007-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
- */
-
-#ifndef _ASM_X86_DS_H
-#define _ASM_X86_DS_H
-
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/err.h>
-
-
-#ifdef CONFIG_X86_DS
-
-struct task_struct;
-struct ds_context;
-struct ds_tracer;
-struct bts_tracer;
-struct pebs_tracer;
-
-typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
-typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
-
-
-/*
- * A list of features plus corresponding macros to talk about them in
- * the ds_request function's flags parameter.
- *
- * We use the enum to index an array of corresponding control bits;
- * we use the macro to index a flags bit-vector.
- */
-enum ds_feature {
-	dsf_bts = 0,
-	dsf_bts_kernel,
-#define BTS_KERNEL (1 << dsf_bts_kernel)
-	/* trace kernel-mode branches */
-
-	dsf_bts_user,
-#define BTS_USER (1 << dsf_bts_user)
-	/* trace user-mode branches */
-
-	dsf_bts_overflow,
-	dsf_bts_max,
-	dsf_pebs = dsf_bts_max,
-
-	dsf_pebs_max,
-	dsf_ctl_max = dsf_pebs_max,
-	dsf_bts_timestamps = dsf_ctl_max,
-#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps)
-	/* add timestamps into BTS trace */
-
-#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS)
-};
-
-
-/*
- * Request BTS or PEBS
- *
- * Due to alignement constraints, the actual buffer may be slightly
- * smaller than the requested or provided buffer.
- *
- * Returns a pointer to a tracer structure on success, or
- * ERR_PTR(errcode) on failure.
- *
- * The interrupt threshold is independent from the overflow callback
- * to allow users to use their own overflow interrupt handling mechanism.
- *
- * The function might sleep.
- *
- * task: the task to request recording for
- * cpu:  the cpu to request recording for
- * base: the base pointer for the (non-pageable) buffer;
- * size: the size of the provided buffer in bytes
- * ovfl: pointer to a function to be called on buffer overflow;
- *       NULL if cyclic buffer requested
- * th: the interrupt threshold in records from the end of the buffer;
- *     -1 if no interrupt threshold is requested.
- * flags: a bit-mask of the above flags
- */
-extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
-					      void *base, size_t size,
-					      bts_ovfl_callback_t ovfl,
-					      size_t th, unsigned int flags);
-extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
-					     bts_ovfl_callback_t ovfl,
-					     size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
-						void *base, size_t size,
-						pebs_ovfl_callback_t ovfl,
-						size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
-					       void *base, size_t size,
-					       pebs_ovfl_callback_t ovfl,
-					       size_t th, unsigned int flags);
-
-/*
- * Release BTS or PEBS resources
- * Suspend and resume BTS or PEBS tracing
- *
- * Must be called with irq's enabled.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern void ds_release_bts(struct bts_tracer *tracer);
-extern void ds_suspend_bts(struct bts_tracer *tracer);
-extern void ds_resume_bts(struct bts_tracer *tracer);
-extern void ds_release_pebs(struct pebs_tracer *tracer);
-extern void ds_suspend_pebs(struct pebs_tracer *tracer);
-extern void ds_resume_pebs(struct pebs_tracer *tracer);
-
-/*
- * Release BTS or PEBS resources
- * Suspend and resume BTS or PEBS tracing
- *
- * Cpu tracers must call this on the traced cpu.
- * Task tracers must call ds_release_~_noirq() for themselves.
- *
- * May be called with irq's disabled.
- *
- * Returns 0 if successful;
- * -EPERM if the cpu tracer does not trace the current cpu.
- * -EPERM if the task tracer does not trace itself.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern int ds_release_bts_noirq(struct bts_tracer *tracer);
-extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
-extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
-extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
-extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
-extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
-
-
-/*
- * The raw DS buffer state as it is used for BTS and PEBS recording.
- *
- * This is the low-level, arch-dependent interface for working
- * directly on the raw trace data.
- */
-struct ds_trace {
-	/* the number of bts/pebs records */
-	size_t n;
-	/* the size of a bts/pebs record in bytes */
-	size_t size;
-	/* pointers into the raw buffer:
-	   - to the first entry */
-	void *begin;
-	/* - one beyond the last entry */
-	void *end;
-	/* - one beyond the newest entry */
-	void *top;
-	/* - the interrupt threshold */
-	void *ith;
-	/* flags given on ds_request() */
-	unsigned int flags;
-};
-
-/*
- * An arch-independent view on branch trace data.
- */
-enum bts_qualifier {
-	bts_invalid,
-#define BTS_INVALID bts_invalid
-
-	bts_branch,
-#define BTS_BRANCH bts_branch
-
-	bts_task_arrives,
-#define BTS_TASK_ARRIVES bts_task_arrives
-
-	bts_task_departs,
-#define BTS_TASK_DEPARTS bts_task_departs
-
-	bts_qual_bit_size = 4,
-	bts_qual_max = (1 << bts_qual_bit_size),
-};
-
-struct bts_struct {
-	__u64 qualifier;
-	union {
-		/* BTS_BRANCH */
-		struct {
-			__u64 from;
-			__u64 to;
-		} lbr;
-		/* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
-		struct {
-			__u64 clock;
-			pid_t pid;
-		} event;
-	} variant;
-};
-
-
-/*
- * The BTS state.
- *
- * This gives access to the raw DS state and adds functions to provide
- * an arch-independent view of the BTS data.
- */
-struct bts_trace {
-	struct ds_trace ds;
-
-	int (*read)(struct bts_tracer *tracer, const void *at,
-		    struct bts_struct *out);
-	int (*write)(struct bts_tracer *tracer, const struct bts_struct *in);
-};
-
-
-/*
- * The PEBS state.
- *
- * This gives access to the raw DS state and the PEBS-specific counter
- * reset value.
- */
-struct pebs_trace {
-	struct ds_trace ds;
-
-	/* the number of valid counters in the below array */
-	unsigned int counters;
-
-#define MAX_PEBS_COUNTERS 4
-	/* the counter reset value */
-	unsigned long long counter_reset[MAX_PEBS_COUNTERS];
-};
-
-
-/*
- * Read the BTS or PEBS trace.
- *
- * Returns a view on the trace collected for the parameter tracer.
- *
- * The view remains valid as long as the traced task is not running or
- * the tracer is suspended.
- * Writes into the trace buffer are not reflected.
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer);
-extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer);
-
-
-/*
- * Reset the write pointer of the BTS/PEBS buffer.
- *
- * Returns 0 on success; -Eerrno on error
- *
- * tracer: the tracer handle returned from ds_request_~()
- */
-extern int ds_reset_bts(struct bts_tracer *tracer);
-extern int ds_reset_pebs(struct pebs_tracer *tracer);
-
-/*
- * Set the PEBS counter reset value.
- *
- * Returns 0 on success; -Eerrno on error
- *
- * tracer: the tracer handle returned from ds_request_pebs()
- * counter: the index of the counter
- * value: the new counter reset value
- */
-extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
-			     unsigned int counter, u64 value);
-
-/*
- * Initialization
- */
-struct cpuinfo_x86;
-extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
-
-/*
- * Context switch work
- */
-extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
-
-#else /* CONFIG_X86_DS */
-
-struct cpuinfo_x86;
-static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
-static inline void ds_switch_to(struct task_struct *prev,
-				struct task_struct *next) {}
-
-#endif /* CONFIG_X86_DS */
-#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b753ea59703..5bec21a66dc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -21,7 +21,6 @@ struct mm_struct;
 #include <asm/msr.h>
 #include <asm/desc_defs.h>
 #include <asm/nops.h>
-#include <asm/ds.h>
 
 #include <linux/personality.h>
 #include <linux/cpumask.h>
@@ -29,6 +28,7 @@ struct mm_struct;
 #include <linux/threads.h>
 #include <linux/math64.h>
 #include <linux/init.h>
+#include <linux/err.h>
 
 #define HBP_NUM 4
 /*
@@ -473,10 +473,6 @@ struct thread_struct {
 	unsigned long		iopl;
 	/* Max allowed port in the bitmap, in bytes: */
 	unsigned		io_bitmap_max;
-/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set.  */
-	unsigned long	debugctlmsr;
-	/* Debug Store context; see asm/ds.h */
-	struct ds_context	*ds_ctx;
 };
 
 static inline unsigned long native_get_debugreg(int regno)
@@ -814,21 +810,6 @@ static inline unsigned long get_debugctlmsr(void)
     return debugctlmsr;
 }
 
-static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
-{
-	u64 debugctlmsr = 0;
-	u32 val1, val2;
-
-#ifndef CONFIG_X86_DEBUGCTLMSR
-	if (boot_cpu_data.x86 < 6)
-		return 0;
-#endif
-	rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
-	debugctlmsr = val1 | ((u64)val2 << 32);
-
-	return debugctlmsr;
-}
-
 static inline void update_debugctlmsr(unsigned long debugctlmsr)
 {
 #ifndef CONFIG_X86_DEBUGCTLMSR
@@ -838,18 +819,6 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
 }
 
-static inline void update_debugctlmsr_on_cpu(int cpu,
-					     unsigned long debugctlmsr)
-{
-#ifndef CONFIG_X86_DEBUGCTLMSR
-	if (boot_cpu_data.x86 < 6)
-		return;
-#endif
-	wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
-		     (u32)((u64)debugctlmsr),
-		     (u32)((u64)debugctlmsr >> 32));
-}
-
 /*
  * from system description table in BIOS. Mostly for MCA use, but
  * others may find it useful:
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 86723035a51..52b098a6eeb 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -82,61 +82,6 @@
 
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
-
-/* configuration/status structure used in PTRACE_BTS_CONFIG and
-   PTRACE_BTS_STATUS commands.
-*/
-struct ptrace_bts_config {
-	/* requested or actual size of BTS buffer in bytes */
-	__u32 size;
-	/* bitmask of below flags */
-	__u32 flags;
-	/* buffer overflow signal */
-	__u32 signal;
-	/* actual size of bts_struct in bytes */
-	__u32 bts_size;
-};
-#endif /* __ASSEMBLY__ */
-
-#define PTRACE_BTS_O_TRACE	0x1 /* branch trace */
-#define PTRACE_BTS_O_SCHED	0x2 /* scheduling events w/ jiffies */
-#define PTRACE_BTS_O_SIGNAL     0x4 /* send SIG<signal> on buffer overflow
-				       instead of wrapping around */
-#define PTRACE_BTS_O_ALLOC	0x8 /* (re)allocate buffer */
-
-#define PTRACE_BTS_CONFIG	40
-/* Configure branch trace recording.
-   ADDR points to a struct ptrace_bts_config.
-   DATA gives the size of that buffer.
-   A new buffer is allocated, if requested in the flags.
-   An overflow signal may only be requested for new buffers.
-   Returns the number of bytes read.
-*/
-#define PTRACE_BTS_STATUS	41
-/* Return the current configuration in a struct ptrace_bts_config
-   pointed to by ADDR; DATA gives the size of that buffer.
-   Returns the number of bytes written.
-*/
-#define PTRACE_BTS_SIZE		42
-/* Return the number of available BTS records for draining.
-   DATA and ADDR are ignored.
-*/
-#define PTRACE_BTS_GET		43
-/* Get a single BTS record.
-   DATA defines the index into the BTS array, where 0 is the newest
-   entry, and higher indices refer to older entries.
-   ADDR is pointing to struct bts_struct (see asm/ds.h).
-*/
-#define PTRACE_BTS_CLEAR	44
-/* Clear the BTS buffer.
-   DATA and ADDR are ignored.
-*/
-#define PTRACE_BTS_DRAIN	45
-/* Read all available BTS records and clear the buffer.
-   ADDR points to an array of struct bts_struct.
-   DATA gives the size of that buffer.
-   BTS records are read from oldest to newest.
-   Returns number of BTS records drained.
-*/
+#endif
 
 #endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 69a686a7dff..78cd1ea9450 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -289,12 +289,6 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info, int can_allocate);
 
-#ifdef CONFIG_X86_PTRACE_BTS
-extern void ptrace_bts_untrace(struct task_struct *tsk);
-
-#define arch_ptrace_untrace(tsk)	ptrace_bts_untrace(tsk)
-#endif /* CONFIG_X86_PTRACE_BTS */
-
 #endif /* __KERNEL__ */
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e0d28901e96..dc85e12d140 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -92,8 +92,6 @@ struct thread_info {
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
 #define TIF_FREEZE		23	/* is freezing for suspend */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
-#define TIF_DEBUGCTLMSR		25	/* uses thread_struct.debugctlmsr */
-#define TIF_DS_AREA_MSR		26      /* uses thread_struct.ds_area_msr */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 
@@ -115,8 +113,6 @@ struct thread_info {
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_FREEZE		(1 << TIF_FREEZE)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
-#define _TIF_DEBUGCTLMSR	(1 << TIF_DEBUGCTLMSR)
-#define _TIF_DS_AREA_MSR	(1 << TIF_DS_AREA_MSR)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 
@@ -147,7 +143,7 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
+	(_TIF_IO_BITMAP|_TIF_NOTSC)
 
 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4c58352209e..e77b2208372 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -47,8 +47,6 @@ obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-y				+= process.o
 obj-y				+= i387.o xsave.o
 obj-y				+= ptrace.o
-obj-$(CONFIG_X86_DS)		+= ds.o
-obj-$(CONFIG_X86_DS_SELFTEST)		+= ds_selftest.o
 obj-$(CONFIG_X86_32)		+= tls.o
 obj-$(CONFIG_IA32_EMULATION)	+= tls.o
 obj-y				+= step.o
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7e1cca13af3..d72377c41c7 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -12,7 +12,6 @@
 #include <asm/processor.h>
 #include <asm/pgtable.h>
 #include <asm/msr.h>
-#include <asm/ds.h>
 #include <asm/bugs.h>
 #include <asm/cpu.h>
 
@@ -367,7 +366,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 			set_cpu_cap(c, X86_FEATURE_BTS);
 		if (!(l1 & (1<<12)))
 			set_cpu_cap(c, X86_FEATURE_PEBS);
-		ds_init_intel(c);
 	}
 
 	if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
deleted file mode 100644
index 1c47390dd0e..00000000000
--- a/arch/x86/kernel/ds.c
+++ /dev/null
@@ -1,1437 +0,0 @@
-/*
- * Debug Store support
- *
- * This provides a low-level interface to the hardware's Debug Store
- * feature that is used for branch trace store (BTS) and
- * precise-event based sampling (PEBS).
- *
- * It manages:
- * - DS and BTS hardware configuration
- * - buffer overflow handling (to be done)
- * - buffer access
- *
- * It does not do:
- * - security checking (is the caller allowed to trace the task)
- * - buffer allocation (memory accounting)
- *
- *
- * Copyright (C) 2007-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/trace_clock.h>
-
-#include <asm/ds.h>
-
-#include "ds_selftest.h"
-
-/*
- * The configuration for a particular DS hardware implementation:
- */
-struct ds_configuration {
-	/* The name of the configuration: */
-	const char		*name;
-
-	/* The size of pointer-typed fields in DS, BTS, and PEBS: */
-	unsigned char		sizeof_ptr_field;
-
-	/* The size of a BTS/PEBS record in bytes: */
-	unsigned char		sizeof_rec[2];
-
-	/* The number of pebs counter reset values in the DS structure. */
-	unsigned char		nr_counter_reset;
-
-	/* Control bit-masks indexed by enum ds_feature: */
-	unsigned long		ctl[dsf_ctl_max];
-};
-static struct ds_configuration ds_cfg __read_mostly;
-
-
-/* Maximal size of a DS configuration: */
-#define MAX_SIZEOF_DS		0x80
-
-/* Maximal size of a BTS record: */
-#define MAX_SIZEOF_BTS		(3 * 8)
-
-/* BTS and PEBS buffer alignment: */
-#define DS_ALIGNMENT		(1 << 3)
-
-/* Number of buffer pointers in DS: */
-#define NUM_DS_PTR_FIELDS	8
-
-/* Size of a pebs reset value in DS: */
-#define PEBS_RESET_FIELD_SIZE	8
-
-/* Mask of control bits in the DS MSR register: */
-#define BTS_CONTROL				  \
-	( ds_cfg.ctl[dsf_bts]			| \
-	  ds_cfg.ctl[dsf_bts_kernel]		| \
-	  ds_cfg.ctl[dsf_bts_user]		| \
-	  ds_cfg.ctl[dsf_bts_overflow] )
-
-/*
- * A BTS or PEBS tracer.
- *
- * This holds the configuration of the tracer and serves as a handle
- * to identify tracers.
- */
-struct ds_tracer {
-	/* The DS context (partially) owned by this tracer. */
-	struct ds_context	*context;
-	/* The buffer provided on ds_request() and its size in bytes. */
-	void			*buffer;
-	size_t			size;
-};
-
-struct bts_tracer {
-	/* The common DS part: */
-	struct ds_tracer	ds;
-
-	/* The trace including the DS configuration: */
-	struct bts_trace	trace;
-
-	/* Buffer overflow notification function: */
-	bts_ovfl_callback_t	ovfl;
-
-	/* Active flags affecting trace collection. */
-	unsigned int		flags;
-};
-
-struct pebs_tracer {
-	/* The common DS part: */
-	struct ds_tracer	ds;
-
-	/* The trace including the DS configuration: */
-	struct pebs_trace	trace;
-
-	/* Buffer overflow notification function: */
-	pebs_ovfl_callback_t	ovfl;
-};
-
-/*
- * Debug Store (DS) save area configuration (see Intel64 and IA32
- * Architectures Software Developer's Manual, section 18.5)
- *
- * The DS configuration consists of the following fields; different
- * architetures vary in the size of those fields.
- *
- * - double-word aligned base linear address of the BTS buffer
- * - write pointer into the BTS buffer
- * - end linear address of the BTS buffer (one byte beyond the end of
- *   the buffer)
- * - interrupt pointer into BTS buffer
- *   (interrupt occurs when write pointer passes interrupt pointer)
- * - double-word aligned base linear address of the PEBS buffer
- * - write pointer into the PEBS buffer
- * - end linear address of the PEBS buffer (one byte beyond the end of
- *   the buffer)
- * - interrupt pointer into PEBS buffer
- *   (interrupt occurs when write pointer passes interrupt pointer)
- * - value to which counter is reset following counter overflow
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- *
- * We compute the base address for the first 8 fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- * - an offset giving the start of the respective region
- *
- * This offset is further used to index various arrays holding
- * information for BTS and PEBS at the respective index.
- *
- * On later 32bit processors, we only access the lower 32bit of the
- * 64bit pointer fields. The upper halves will be zeroed out.
- */
-
-enum ds_field {
-	ds_buffer_base = 0,
-	ds_index,
-	ds_absolute_maximum,
-	ds_interrupt_threshold,
-};
-
-enum ds_qualifier {
-	ds_bts = 0,
-	ds_pebs
-};
-
-static inline unsigned long
-ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
-{
-	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
-	return *(unsigned long *)base;
-}
-
-static inline void
-ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
-       unsigned long value)
-{
-	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
-	(*(unsigned long *)base) = value;
-}
-
-
-/*
- * Locking is done only for allocating BTS or PEBS resources.
- */
-static DEFINE_SPINLOCK(ds_lock);
-
-/*
- * We either support (system-wide) per-cpu or per-thread allocation.
- * We distinguish the two based on the task_struct pointer, where a
- * NULL pointer indicates per-cpu allocation for the current cpu.
- *
- * Allocations are use-counted. As soon as resources are allocated,
- * further allocations must be of the same type (per-cpu or
- * per-thread). We model this by counting allocations (i.e. the number
- * of tracers of a certain type) for one type negatively:
- *   =0  no tracers
- *   >0  number of per-thread tracers
- *   <0  number of per-cpu tracers
- *
- * Tracers essentially gives the number of ds contexts for a certain
- * type of allocation.
- */
-static atomic_t tracers = ATOMIC_INIT(0);
-
-static inline int get_tracer(struct task_struct *task)
-{
-	int error;
-
-	spin_lock_irq(&ds_lock);
-
-	if (task) {
-		error = -EPERM;
-		if (atomic_read(&tracers) < 0)
-			goto out;
-		atomic_inc(&tracers);
-	} else {
-		error = -EPERM;
-		if (atomic_read(&tracers) > 0)
-			goto out;
-		atomic_dec(&tracers);
-	}
-
-	error = 0;
-out:
-	spin_unlock_irq(&ds_lock);
-	return error;
-}
-
-static inline void put_tracer(struct task_struct *task)
-{
-	if (task)
-		atomic_dec(&tracers);
-	else
-		atomic_inc(&tracers);
-}
-
-/*
- * The DS context is either attached to a thread or to a cpu:
- * - in the former case, the thread_struct contains a pointer to the
- *   attached context.
- * - in the latter case, we use a static array of per-cpu context
- *   pointers.
- *
- * Contexts are use-counted. They are allocated on first access and
- * deallocated when the last user puts the context.
- */
-struct ds_context {
-	/* The DS configuration; goes into MSR_IA32_DS_AREA: */
-	unsigned char		ds[MAX_SIZEOF_DS];
-
-	/* The owner of the BTS and PEBS configuration, respectively: */
-	struct bts_tracer	*bts_master;
-	struct pebs_tracer	*pebs_master;
-
-	/* Use count: */
-	unsigned long		count;
-
-	/* Pointer to the context pointer field: */
-	struct ds_context	**this;
-
-	/* The traced task; NULL for cpu tracing: */
-	struct task_struct	*task;
-
-	/* The traced cpu; only valid if task is NULL: */
-	int			cpu;
-};
-
-static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
-
-
-static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
-{
-	struct ds_context **p_context =
-		(task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
-	struct ds_context *context = NULL;
-	struct ds_context *new_context = NULL;
-
-	/* Chances are small that we already have a context. */
-	new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
-	if (!new_context)
-		return NULL;
-
-	spin_lock_irq(&ds_lock);
-
-	context = *p_context;
-	if (likely(!context)) {
-		context = new_context;
-
-		context->this = p_context;
-		context->task = task;
-		context->cpu = cpu;
-		context->count = 0;
-
-		*p_context = context;
-	}
-
-	context->count++;
-
-	spin_unlock_irq(&ds_lock);
-
-	if (context != new_context)
-		kfree(new_context);
-
-	return context;
-}
-
-static void ds_put_context(struct ds_context *context)
-{
-	struct task_struct *task;
-	unsigned long irq;
-
-	if (!context)
-		return;
-
-	spin_lock_irqsave(&ds_lock, irq);
-
-	if (--context->count) {
-		spin_unlock_irqrestore(&ds_lock, irq);
-		return;
-	}
-
-	*(context->this) = NULL;
-
-	task = context->task;
-
-	if (task)
-		clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
-	/*
-	 * We leave the (now dangling) pointer to the DS configuration in
-	 * the DS_AREA msr. This is as good or as bad as replacing it with
-	 * NULL - the hardware would crash if we enabled tracing.
-	 *
-	 * This saves us some problems with having to write an msr on a
-	 * different cpu while preventing others from doing the same for the
-	 * next context for that same cpu.
-	 */
-
-	spin_unlock_irqrestore(&ds_lock, irq);
-
-	/* The context might still be in use for context switching. */
-	if (task && (task != current))
-		wait_task_context_switch(task);
-
-	kfree(context);
-}
-
-static void ds_install_ds_area(struct ds_context *context)
-{
-	unsigned long ds;
-
-	ds = (unsigned long)context->ds;
-
-	/*
-	 * There is a race between the bts master and the pebs master.
-	 *
-	 * The thread/cpu access is synchronized via get/put_cpu() for
-	 * task tracing and via wrmsr_on_cpu for cpu tracing.
-	 *
-	 * If bts and pebs are collected for the same task or same cpu,
-	 * the same confiuration is written twice.
-	 */
-	if (context->task) {
-		get_cpu();
-		if (context->task == current)
-			wrmsrl(MSR_IA32_DS_AREA, ds);
-		set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
-		put_cpu();
-	} else
-		wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
-			     (u32)((u64)ds), (u32)((u64)ds >> 32));
-}
-
-/*
- * Call the tracer's callback on a buffer overflow.
- *
- * context: the ds context
- * qual: the buffer type
- */
-static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
-{
-	switch (qual) {
-	case ds_bts:
-		if (context->bts_master &&
-		    context->bts_master->ovfl)
-			context->bts_master->ovfl(context->bts_master);
-		break;
-	case ds_pebs:
-		if (context->pebs_master &&
-		    context->pebs_master->ovfl)
-			context->pebs_master->ovfl(context->pebs_master);
-		break;
-	}
-}
-
-
-/*
- * Write raw data into the BTS or PEBS buffer.
- *
- * The remainder of any partially written record is zeroed out.
- *
- * context: the DS context
- * qual:    the buffer type
- * record:  the data to write
- * size:    the size of the data
- */
-static int ds_write(struct ds_context *context, enum ds_qualifier qual,
-		    const void *record, size_t size)
-{
-	int bytes_written = 0;
-
-	if (!record)
-		return -EINVAL;
-
-	while (size) {
-		unsigned long base, index, end, write_end, int_th;
-		unsigned long write_size, adj_write_size;
-
-		/*
-		 * Write as much as possible without producing an
-		 * overflow interrupt.
-		 *
-		 * Interrupt_threshold must either be
-		 * - bigger than absolute_maximum or
-		 * - point to a record between buffer_base and absolute_maximum
-		 *
-		 * Index points to a valid record.
-		 */
-		base   = ds_get(context->ds, qual, ds_buffer_base);
-		index  = ds_get(context->ds, qual, ds_index);
-		end    = ds_get(context->ds, qual, ds_absolute_maximum);
-		int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
-
-		write_end = min(end, int_th);
-
-		/*
-		 * If we are already beyond the interrupt threshold,
-		 * we fill the entire buffer.
-		 */
-		if (write_end <= index)
-			write_end = end;
-
-		if (write_end <= index)
-			break;
-
-		write_size = min((unsigned long) size, write_end - index);
-		memcpy((void *)index, record, write_size);
-
-		record = (const char *)record + write_size;
-		size -= write_size;
-		bytes_written += write_size;
-
-		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
-		adj_write_size *= ds_cfg.sizeof_rec[qual];
-
-		/* Zero out trailing bytes. */
-		memset((char *)index + write_size, 0,
-		       adj_write_size - write_size);
-		index += adj_write_size;
-
-		if (index >= end)
-			index = base;
-		ds_set(context->ds, qual, ds_index, index);
-
-		if (index >= int_th)
-			ds_overflow(context, qual);
-	}
-
-	return bytes_written;
-}
-
-
-/*
- * Branch Trace Store (BTS) uses the following format. Different
- * architectures vary in the size of those fields.
- * - source linear address
- * - destination linear address
- * - flags
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- * We compute the base address for the fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- *
- * In order to store additional information in the BTS buffer, we use
- * a special source address to indicate that the record requires
- * special interpretation.
- *
- * Netburst indicated via a bit in the flags field whether the branch
- * was predicted; this is ignored.
- *
- * We use two levels of abstraction:
- * - the raw data level defined here
- * - an arch-independent level defined in ds.h
- */
-
-enum bts_field {
-	bts_from,
-	bts_to,
-	bts_flags,
-
-	bts_qual		= bts_from,
-	bts_clock		= bts_to,
-	bts_pid			= bts_flags,
-
-	bts_qual_mask		= (bts_qual_max - 1),
-	bts_escape		= ((unsigned long)-1 & ~bts_qual_mask)
-};
-
-static inline unsigned long bts_get(const char *base, unsigned long field)
-{
-	base += (ds_cfg.sizeof_ptr_field * field);
-	return *(unsigned long *)base;
-}
-
-static inline void bts_set(char *base, unsigned long field, unsigned long val)
-{
-	base += (ds_cfg.sizeof_ptr_field * field);
-	(*(unsigned long *)base) = val;
-}
-
-
-/*
- * The raw BTS data is architecture dependent.
- *
- * For higher-level users, we give an arch-independent view.
- * - ds.h defines struct bts_struct
- * - bts_read translates one raw bts record into a bts_struct
- * - bts_write translates one bts_struct into the raw format and
- *   writes it into the top of the parameter tracer's buffer.
- *
- * return: bytes read/written on success; -Eerrno, otherwise
- */
-static int
-bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
-{
-	if (!tracer)
-		return -EINVAL;
-
-	if (at < tracer->trace.ds.begin)
-		return -EINVAL;
-
-	if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
-		return -EINVAL;
-
-	memset(out, 0, sizeof(*out));
-	if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
-		out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
-		out->variant.event.clock = bts_get(at, bts_clock);
-		out->variant.event.pid = bts_get(at, bts_pid);
-	} else {
-		out->qualifier = bts_branch;
-		out->variant.lbr.from = bts_get(at, bts_from);
-		out->variant.lbr.to   = bts_get(at, bts_to);
-
-		if (!out->variant.lbr.from && !out->variant.lbr.to)
-			out->qualifier = bts_invalid;
-	}
-
-	return ds_cfg.sizeof_rec[ds_bts];
-}
-
-static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
-{
-	unsigned char raw[MAX_SIZEOF_BTS];
-
-	if (!tracer)
-		return -EINVAL;
-
-	if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
-		return -EOVERFLOW;
-
-	switch (in->qualifier) {
-	case bts_invalid:
-		bts_set(raw, bts_from, 0);
-		bts_set(raw, bts_to, 0);
-		bts_set(raw, bts_flags, 0);
-		break;
-	case bts_branch:
-		bts_set(raw, bts_from, in->variant.lbr.from);
-		bts_set(raw, bts_to,   in->variant.lbr.to);
-		bts_set(raw, bts_flags, 0);
-		break;
-	case bts_task_arrives:
-	case bts_task_departs:
-		bts_set(raw, bts_qual, (bts_escape | in->qualifier));
-		bts_set(raw, bts_clock, in->variant.event.clock);
-		bts_set(raw, bts_pid, in->variant.event.pid);
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return ds_write(tracer->ds.context, ds_bts, raw,
-			ds_cfg.sizeof_rec[ds_bts]);
-}
-
-
-static void ds_write_config(struct ds_context *context,
-			    struct ds_trace *cfg, enum ds_qualifier qual)
-{
-	unsigned char *ds = context->ds;
-
-	ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
-	ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
-	ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
-	ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
-}
-
-static void ds_read_config(struct ds_context *context,
-			   struct ds_trace *cfg, enum ds_qualifier qual)
-{
-	unsigned char *ds = context->ds;
-
-	cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
-	cfg->top = (void *)ds_get(ds, qual, ds_index);
-	cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
-	cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
-}
-
-static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
-			     void *base, size_t size, size_t ith,
-			     unsigned int flags) {
-	unsigned long buffer, adj;
-
-	/*
-	 * Adjust the buffer address and size to meet alignment
-	 * constraints:
-	 * - buffer is double-word aligned
-	 * - size is multiple of record size
-	 *
-	 * We checked the size at the very beginning; we have enough
-	 * space to do the adjustment.
-	 */
-	buffer = (unsigned long)base;
-
-	adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
-	buffer += adj;
-	size   -= adj;
-
-	trace->n = size / ds_cfg.sizeof_rec[qual];
-	trace->size = ds_cfg.sizeof_rec[qual];
-
-	size = (trace->n * trace->size);
-
-	trace->begin = (void *)buffer;
-	trace->top = trace->begin;
-	trace->end = (void *)(buffer + size);
-	/*
-	 * The value for 'no threshold' is -1, which will set the
-	 * threshold outside of the buffer, just like we want it.
-	 */
-	ith *= ds_cfg.sizeof_rec[qual];
-	trace->ith = (void *)(buffer + size - ith);
-
-	trace->flags = flags;
-}
-
-
-static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
-		      enum ds_qualifier qual, struct task_struct *task,
-		      int cpu, void *base, size_t size, size_t th)
-{
-	struct ds_context *context;
-	int error;
-	size_t req_size;
-
-	error = -EOPNOTSUPP;
-	if (!ds_cfg.sizeof_rec[qual])
-		goto out;
-
-	error = -EINVAL;
-	if (!base)
-		goto out;
-
-	req_size = ds_cfg.sizeof_rec[qual];
-	/* We might need space for alignment adjustments. */
-	if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
-		req_size += DS_ALIGNMENT;
-
-	error = -EINVAL;
-	if (size < req_size)
-		goto out;
-
-	if (th != (size_t)-1) {
-		th *= ds_cfg.sizeof_rec[qual];
-
-		error = -EINVAL;
-		if (size <= th)
-			goto out;
-	}
-
-	tracer->buffer = base;
-	tracer->size = size;
-
-	error = -ENOMEM;
-	context = ds_get_context(task, cpu);
-	if (!context)
-		goto out;
-	tracer->context = context;
-
-	/*
-	 * Defer any tracer-specific initialization work for the context until
-	 * context ownership has been clarified.
-	 */
-
-	error = 0;
- out:
-	return error;
-}
-
-static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
-					 void *base, size_t size,
-					 bts_ovfl_callback_t ovfl, size_t th,
-					 unsigned int flags)
-{
-	struct bts_tracer *tracer;
-	int error;
-
-	/* Buffer overflow notification is not yet implemented. */
-	error = -EOPNOTSUPP;
-	if (ovfl)
-		goto out;
-
-	error = get_tracer(task);
-	if (error < 0)
-		goto out;
-
-	error = -ENOMEM;
-	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
-	if (!tracer)
-		goto out_put_tracer;
-	tracer->ovfl = ovfl;
-
-	/* Do some more error checking and acquire a tracing context. */
-	error = ds_request(&tracer->ds, &tracer->trace.ds,
-			   ds_bts, task, cpu, base, size, th);
-	if (error < 0)
-		goto out_tracer;
-
-	/* Claim the bts part of the tracing context we acquired above. */
-	spin_lock_irq(&ds_lock);
-
-	error = -EPERM;
-	if (tracer->ds.context->bts_master)
-		goto out_unlock;
-	tracer->ds.context->bts_master = tracer;
-
-	spin_unlock_irq(&ds_lock);
-
-	/*
-	 * Now that we own the bts part of the context, let's complete the
-	 * initialization for that part.
-	 */
-	ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
-	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
-	ds_install_ds_area(tracer->ds.context);
-
-	tracer->trace.read  = bts_read;
-	tracer->trace.write = bts_write;
-
-	/* Start tracing. */
-	ds_resume_bts(tracer);
-
-	return tracer;
-
- out_unlock:
-	spin_unlock_irq(&ds_lock);
-	ds_put_context(tracer->ds.context);
- out_tracer:
-	kfree(tracer);
- out_put_tracer:
-	put_tracer(task);
- out:
-	return ERR_PTR(error);
-}
-
-struct bts_tracer *ds_request_bts_task(struct task_struct *task,
-				       void *base, size_t size,
-				       bts_ovfl_callback_t ovfl,
-				       size_t th, unsigned int flags)
-{
-	return ds_request_bts(task, 0, base, size, ovfl, th, flags);
-}
-
-struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
-				      bts_ovfl_callback_t ovfl,
-				      size_t th, unsigned int flags)
-{
-	return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
-}
-
-static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
-					   void *base, size_t size,
-					   pebs_ovfl_callback_t ovfl, size_t th,
-					   unsigned int flags)
-{
-	struct pebs_tracer *tracer;
-	int error;
-
-	/* Buffer overflow notification is not yet implemented. */
-	error = -EOPNOTSUPP;
-	if (ovfl)
-		goto out;
-
-	error = get_tracer(task);
-	if (error < 0)
-		goto out;
-
-	error = -ENOMEM;
-	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
-	if (!tracer)
-		goto out_put_tracer;
-	tracer->ovfl = ovfl;
-
-	/* Do some more error checking and acquire a tracing context. */
-	error = ds_request(&tracer->ds, &tracer->trace.ds,
-			   ds_pebs, task, cpu, base, size, th);
-	if (error < 0)
-		goto out_tracer;
-
-	/* Claim the pebs part of the tracing context we acquired above. */
-	spin_lock_irq(&ds_lock);
-
-	error = -EPERM;
-	if (tracer->ds.context->pebs_master)
-		goto out_unlock;
-	tracer->ds.context->pebs_master = tracer;
-
-	spin_unlock_irq(&ds_lock);
-
-	/*
-	 * Now that we own the pebs part of the context, let's complete the
-	 * initialization for that part.
-	 */
-	ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
-	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-	ds_install_ds_area(tracer->ds.context);
-
-	/* Start tracing. */
-	ds_resume_pebs(tracer);
-
-	return tracer;
-
- out_unlock:
-	spin_unlock_irq(&ds_lock);
-	ds_put_context(tracer->ds.context);
- out_tracer:
-	kfree(tracer);
- out_put_tracer:
-	put_tracer(task);
- out:
-	return ERR_PTR(error);
-}
-
-struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
-					 void *base, size_t size,
-					 pebs_ovfl_callback_t ovfl,
-					 size_t th, unsigned int flags)
-{
-	return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
-}
-
-struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
-					pebs_ovfl_callback_t ovfl,
-					size_t th, unsigned int flags)
-{
-	return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
-}
-
-static void ds_free_bts(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-
-	task = tracer->ds.context->task;
-
-	WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
-	tracer->ds.context->bts_master = NULL;
-
-	/* Make sure tracing stopped and the tracer is not in use. */
-	if (task && (task != current))
-		wait_task_context_switch(task);
-
-	ds_put_context(tracer->ds.context);
-	put_tracer(task);
-
-	kfree(tracer);
-}
-
-void ds_release_bts(struct bts_tracer *tracer)
-{
-	might_sleep();
-
-	if (!tracer)
-		return;
-
-	ds_suspend_bts(tracer);
-	ds_free_bts(tracer);
-}
-
-int ds_release_bts_noirq(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long irq;
-	int error;
-
-	if (!tracer)
-		return 0;
-
-	task = tracer->ds.context->task;
-
-	local_irq_save(irq);
-
-	error = -EPERM;
-	if (!task &&
-	    (tracer->ds.context->cpu != smp_processor_id()))
-		goto out;
-
-	error = -EPERM;
-	if (task && (task != current))
-		goto out;
-
-	ds_suspend_bts_noirq(tracer);
-	ds_free_bts(tracer);
-
-	error = 0;
- out:
-	local_irq_restore(irq);
-	return error;
-}
-
-static void update_task_debugctlmsr(struct task_struct *task,
-				    unsigned long debugctlmsr)
-{
-	task->thread.debugctlmsr = debugctlmsr;
-
-	get_cpu();
-	if (task == current)
-		update_debugctlmsr(debugctlmsr);
-	put_cpu();
-}
-
-void ds_suspend_bts(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long debugctlmsr;
-	int cpu;
-
-	if (!tracer)
-		return;
-
-	tracer->flags = 0;
-
-	task = tracer->ds.context->task;
-	cpu  = tracer->ds.context->cpu;
-
-	WARN_ON(!task && irqs_disabled());
-
-	debugctlmsr = (task ?
-		       task->thread.debugctlmsr :
-		       get_debugctlmsr_on_cpu(cpu));
-	debugctlmsr &= ~BTS_CONTROL;
-
-	if (task)
-		update_task_debugctlmsr(task, debugctlmsr);
-	else
-		update_debugctlmsr_on_cpu(cpu, debugctlmsr);
-}
-
-int ds_suspend_bts_noirq(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long debugctlmsr, irq;
-	int cpu, error = 0;
-
-	if (!tracer)
-		return 0;
-
-	tracer->flags = 0;
-
-	task = tracer->ds.context->task;
-	cpu  = tracer->ds.context->cpu;
-
-	local_irq_save(irq);
-
-	error = -EPERM;
-	if (!task && (cpu != smp_processor_id()))
-		goto out;
-
-	debugctlmsr = (task ?
-		       task->thread.debugctlmsr :
-		       get_debugctlmsr());
-	debugctlmsr &= ~BTS_CONTROL;
-
-	if (task)
-		update_task_debugctlmsr(task, debugctlmsr);
-	else
-		update_debugctlmsr(debugctlmsr);
-
-	error = 0;
- out:
-	local_irq_restore(irq);
-	return error;
-}
-
-static unsigned long ds_bts_control(struct bts_tracer *tracer)
-{
-	unsigned long control;
-
-	control = ds_cfg.ctl[dsf_bts];
-	if (!(tracer->trace.ds.flags & BTS_KERNEL))
-		control |= ds_cfg.ctl[dsf_bts_kernel];
-	if (!(tracer->trace.ds.flags & BTS_USER))
-		control |= ds_cfg.ctl[dsf_bts_user];
-
-	return control;
-}
-
-void ds_resume_bts(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long debugctlmsr;
-	int cpu;
-
-	if (!tracer)
-		return;
-
-	tracer->flags = tracer->trace.ds.flags;
-
-	task = tracer->ds.context->task;
-	cpu  = tracer->ds.context->cpu;
-
-	WARN_ON(!task && irqs_disabled());
-
-	debugctlmsr = (task ?
-		       task->thread.debugctlmsr :
-		       get_debugctlmsr_on_cpu(cpu));
-	debugctlmsr |= ds_bts_control(tracer);
-
-	if (task)
-		update_task_debugctlmsr(task, debugctlmsr);
-	else
-		update_debugctlmsr_on_cpu(cpu, debugctlmsr);
-}
-
-int ds_resume_bts_noirq(struct bts_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long debugctlmsr, irq;
-	int cpu, error = 0;
-
-	if (!tracer)
-		return 0;
-
-	tracer->flags = tracer->trace.ds.flags;
-
-	task = tracer->ds.context->task;
-	cpu  = tracer->ds.context->cpu;
-
-	local_irq_save(irq);
-
-	error = -EPERM;
-	if (!task && (cpu != smp_processor_id()))
-		goto out;
-
-	debugctlmsr = (task ?
-		       task->thread.debugctlmsr :
-		       get_debugctlmsr());
-	debugctlmsr |= ds_bts_control(tracer);
-
-	if (task)
-		update_task_debugctlmsr(task, debugctlmsr);
-	else
-		update_debugctlmsr(debugctlmsr);
-
-	error = 0;
- out:
-	local_irq_restore(irq);
-	return error;
-}
-
-static void ds_free_pebs(struct pebs_tracer *tracer)
-{
-	struct task_struct *task;
-
-	task = tracer->ds.context->task;
-
-	WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
-	tracer->ds.context->pebs_master = NULL;
-
-	ds_put_context(tracer->ds.context);
-	put_tracer(task);
-
-	kfree(tracer);
-}
-
-void ds_release_pebs(struct pebs_tracer *tracer)
-{
-	might_sleep();
-
-	if (!tracer)
-		return;
-
-	ds_suspend_pebs(tracer);
-	ds_free_pebs(tracer);
-}
-
-int ds_release_pebs_noirq(struct pebs_tracer *tracer)
-{
-	struct task_struct *task;
-	unsigned long irq;
-	int error;
-
-	if (!tracer)
-		return 0;
-
-	task = tracer->ds.context->task;
-
-	local_irq_save(irq);
-
-	error = -EPERM;
-	if (!task &&
-	    (tracer->ds.context->cpu != smp_processor_id()))
-		goto out;
-
-	error = -EPERM;
-	if (task && (task != current))
-		goto out;
-
-	ds_suspend_pebs_noirq(tracer);
-	ds_free_pebs(tracer);
-
-	error = 0;
- out:
-	local_irq_restore(irq);
-	return error;
-}
-
-void ds_suspend_pebs(struct pebs_tracer *tracer)
-{
-
-}
-
-int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
-{
-	return 0;
-}
-
-void ds_resume_pebs(struct pebs_tracer *tracer)
-{
-
-}
-
-int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
-{
-	return 0;
-}
-
-const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
-{
-	if (!tracer)
-		return NULL;
-
-	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
-	return &tracer->trace;
-}
-
-const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
-{
-	if (!tracer)
-		return NULL;
-
-	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-
-	tracer->trace.counters = ds_cfg.nr_counter_reset;
-	memcpy(tracer->trace.counter_reset,
-	       tracer->ds.context->ds +
-	       (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
-	       ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
-
-	return &tracer->trace;
-}
-
-int ds_reset_bts(struct bts_tracer *tracer)
-{
-	if (!tracer)
-		return -EINVAL;
-
-	tracer->trace.ds.top = tracer->trace.ds.begin;
-
-	ds_set(tracer->ds.context->ds, ds_bts, ds_index,
-	       (unsigned long)tracer->trace.ds.top);
-
-	return 0;
-}
-
-int ds_reset_pebs(struct pebs_tracer *tracer)
-{
-	if (!tracer)
-		return -EINVAL;
-
-	tracer->trace.ds.top = tracer->trace.ds.begin;
-
-	ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
-	       (unsigned long)tracer->trace.ds.top);
-
-	return 0;
-}
-
-int ds_set_pebs_reset(struct pebs_tracer *tracer,
-		      unsigned int counter, u64 value)
-{
-	if (!tracer)
-		return -EINVAL;
-
-	if (ds_cfg.nr_counter_reset < counter)
-		return -EINVAL;
-
-	*(u64 *)(tracer->ds.context->ds +
-		 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
-		 (counter * PEBS_RESET_FIELD_SIZE)) = value;
-
-	return 0;
-}
-
-static const struct ds_configuration ds_cfg_netburst = {
-	.name = "Netburst",
-	.ctl[dsf_bts]		= (1 << 2) | (1 << 3),
-	.ctl[dsf_bts_kernel]	= (1 << 5),
-	.ctl[dsf_bts_user]	= (1 << 6),
-	.nr_counter_reset	= 1,
-};
-static const struct ds_configuration ds_cfg_pentium_m = {
-	.name = "Pentium M",
-	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
-	.nr_counter_reset	= 1,
-};
-static const struct ds_configuration ds_cfg_core2_atom = {
-	.name = "Core 2/Atom",
-	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
-	.ctl[dsf_bts_kernel]	= (1 << 9),
-	.ctl[dsf_bts_user]	= (1 << 10),
-	.nr_counter_reset	= 1,
-};
-static const struct ds_configuration ds_cfg_core_i7 = {
-	.name = "Core i7",
-	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
-	.ctl[dsf_bts_kernel]	= (1 << 9),
-	.ctl[dsf_bts_user]	= (1 << 10),
-	.nr_counter_reset	= 4,
-};
-
-static void
-ds_configure(const struct ds_configuration *cfg,
-	     struct cpuinfo_x86 *cpu)
-{
-	unsigned long nr_pebs_fields = 0;
-
-	printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
-
-#ifdef __i386__
-	nr_pebs_fields = 10;
-#else
-	nr_pebs_fields = 18;
-#endif
-
-	/*
-	 * Starting with version 2, architectural performance
-	 * monitoring supports a format specifier.
-	 */
-	if ((cpuid_eax(0xa) & 0xff) > 1) {
-		unsigned long perf_capabilities, format;
-
-		rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
-
-		format = (perf_capabilities >> 8) & 0xf;
-
-		switch (format) {
-		case 0:
-			nr_pebs_fields = 18;
-			break;
-		case 1:
-			nr_pebs_fields = 22;
-			break;
-		default:
-			printk(KERN_INFO
-			       "[ds] unknown PEBS format: %lu\n", format);
-			nr_pebs_fields = 0;
-			break;
-		}
-	}
-
-	memset(&ds_cfg, 0, sizeof(ds_cfg));
-	ds_cfg = *cfg;
-
-	ds_cfg.sizeof_ptr_field =
-		(cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
-
-	ds_cfg.sizeof_rec[ds_bts]  = ds_cfg.sizeof_ptr_field * 3;
-	ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
-
-	if (!cpu_has(cpu, X86_FEATURE_BTS)) {
-		ds_cfg.sizeof_rec[ds_bts] = 0;
-		printk(KERN_INFO "[ds] bts not available\n");
-	}
-	if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
-		ds_cfg.sizeof_rec[ds_pebs] = 0;
-		printk(KERN_INFO "[ds] pebs not available\n");
-	}
-
-	printk(KERN_INFO "[ds] sizes: address: %u bit, ",
-	       8 * ds_cfg.sizeof_ptr_field);
-	printk("bts/pebs record: %u/%u bytes\n",
-	       ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
-
-	WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
-}
-
-void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
-{
-	/* Only configure the first cpu. Others are identical. */
-	if (ds_cfg.name)
-		return;
-
-	switch (c->x86) {
-	case 0x6:
-		switch (c->x86_model) {
-		case 0x9:
-		case 0xd: /* Pentium M */
-			ds_configure(&ds_cfg_pentium_m, c);
-			break;
-		case 0xf:
-		case 0x17: /* Core2 */
-		case 0x1c: /* Atom */
-			ds_configure(&ds_cfg_core2_atom, c);
-			break;
-		case 0x1a: /* Core i7 */
-			ds_configure(&ds_cfg_core_i7, c);
-			break;
-		default:
-			/* Sorry, don't know about them. */
-			break;
-		}
-		break;
-	case 0xf:
-		switch (c->x86_model) {
-		case 0x0:
-		case 0x1:
-		case 0x2: /* Netburst */
-			ds_configure(&ds_cfg_netburst, c);
-			break;
-		default:
-			/* Sorry, don't know about them. */
-			break;
-		}
-		break;
-	default:
-		/* Sorry, don't know about them. */
-		break;
-	}
-}
-
-static inline void ds_take_timestamp(struct ds_context *context,
-				     enum bts_qualifier qualifier,
-				     struct task_struct *task)
-{
-	struct bts_tracer *tracer = context->bts_master;
-	struct bts_struct ts;
-
-	/* Prevent compilers from reading the tracer pointer twice. */
-	barrier();
-
-	if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
-		return;
-
-	memset(&ts, 0, sizeof(ts));
-	ts.qualifier		= qualifier;
-	ts.variant.event.clock	= trace_clock_global();
-	ts.variant.event.pid	= task->pid;
-
-	bts_write(tracer, &ts);
-}
-
-/*
- * Change the DS configuration from tracing prev to tracing next.
- */
-void ds_switch_to(struct task_struct *prev, struct task_struct *next)
-{
-	struct ds_context *prev_ctx	= prev->thread.ds_ctx;
-	struct ds_context *next_ctx	= next->thread.ds_ctx;
-	unsigned long debugctlmsr	= next->thread.debugctlmsr;
-
-	/* Make sure all data is read before we start. */
-	barrier();
-
-	if (prev_ctx) {
-		update_debugctlmsr(0);
-
-		ds_take_timestamp(prev_ctx, bts_task_departs, prev);
-	}
-
-	if (next_ctx) {
-		ds_take_timestamp(next_ctx, bts_task_arrives, next);
-
-		wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
-	}
-
-	update_debugctlmsr(debugctlmsr);
-}
-
-static __init int ds_selftest(void)
-{
-	if (ds_cfg.sizeof_rec[ds_bts]) {
-		int error;
-
-		error = ds_selftest_bts();
-		if (error) {
-			WARN(1, "[ds] selftest failed. disabling bts.\n");
-			ds_cfg.sizeof_rec[ds_bts] = 0;
-		}
-	}
-
-	if (ds_cfg.sizeof_rec[ds_pebs]) {
-		int error;
-
-		error = ds_selftest_pebs();
-		if (error) {
-			WARN(1, "[ds] selftest failed. disabling pebs.\n");
-			ds_cfg.sizeof_rec[ds_pebs] = 0;
-		}
-	}
-
-	return 0;
-}
-device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
deleted file mode 100644
index 6bc7c199ab9..00000000000
--- a/arch/x86/kernel/ds_selftest.c
+++ /dev/null
@@ -1,408 +0,0 @@
-/*
- * Debug Store support - selftest
- *
- *
- * Copyright (C) 2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2009
- */
-
-#include "ds_selftest.h"
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-
-#include <asm/ds.h>
-
-
-#define BUFFER_SIZE		521	/* Intentionally chose an odd size. */
-#define SMALL_BUFFER_SIZE	24	/* A single bts entry. */
-
-struct ds_selftest_bts_conf {
-	struct bts_tracer *tracer;
-	int error;
-	int (*suspend)(struct bts_tracer *);
-	int (*resume)(struct bts_tracer *);
-};
-
-static int ds_selftest_bts_consistency(const struct bts_trace *trace)
-{
-	int error = 0;
-
-	if (!trace) {
-		printk(KERN_CONT "failed to access trace...");
-		/* Bail out. Other tests are pointless. */
-		return -1;
-	}
-
-	if (!trace->read) {
-		printk(KERN_CONT "bts read not available...");
-		error = -1;
-	}
-
-	/* Do some sanity checks on the trace configuration. */
-	if (!trace->ds.n) {
-		printk(KERN_CONT "empty bts buffer...");
-		error = -1;
-	}
-	if (!trace->ds.size) {
-		printk(KERN_CONT "bad bts trace setup...");
-		error = -1;
-	}
-	if (trace->ds.end !=
-	    (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
-		printk(KERN_CONT "bad bts buffer setup...");
-		error = -1;
-	}
-	/*
-	 * We allow top in [begin; end], since its not clear when the
-	 * overflow adjustment happens: after the increment or before the
-	 * write.
-	 */
-	if ((trace->ds.top < trace->ds.begin) ||
-	    (trace->ds.end < trace->ds.top)) {
-		printk(KERN_CONT "bts top out of bounds...");
-		error = -1;
-	}
-
-	return error;
-}
-
-static int ds_selftest_bts_read(struct bts_tracer *tracer,
-				const struct bts_trace *trace,
-				const void *from, const void *to)
-{
-	const unsigned char *at;
-
-	/*
-	 * Check a few things which do not belong to this test.
-	 * They should be covered by other tests.
-	 */
-	if (!trace)
-		return -1;
-
-	if (!trace->read)
-		return -1;
-
-	if (to < from)
-		return -1;
-
-	if (from < trace->ds.begin)
-		return -1;
-
-	if (trace->ds.end < to)
-		return -1;
-
-	if (!trace->ds.size)
-		return -1;
-
-	/* Now to the test itself. */
-	for (at = from; (void *)at < to; at += trace->ds.size) {
-		struct bts_struct bts;
-		unsigned long index;
-		int error;
-
-		if (((void *)at - trace->ds.begin) % trace->ds.size) {
-			printk(KERN_CONT
-			       "read from non-integer index...");
-			return -1;
-		}
-		index = ((void *)at - trace->ds.begin) / trace->ds.size;
-
-		memset(&bts, 0, sizeof(bts));
-		error = trace->read(tracer, at, &bts);
-		if (error < 0) {
-			printk(KERN_CONT
-			       "error reading bts trace at [%lu] (0x%p)...",
-			       index, at);
-			return error;
-		}
-
-		switch (bts.qualifier) {
-		case BTS_BRANCH:
-			break;
-		default:
-			printk(KERN_CONT
-			       "unexpected bts entry %llu at [%lu] (0x%p)...",
-			       bts.qualifier, index, at);
-			return -1;
-		}
-	}
-
-	return 0;
-}
-
-static void ds_selftest_bts_cpu(void *arg)
-{
-	struct ds_selftest_bts_conf *conf = arg;
-	const struct bts_trace *trace;
-	void *top;
-
-	if (IS_ERR(conf->tracer)) {
-		conf->error = PTR_ERR(conf->tracer);
-		conf->tracer = NULL;
-
-		printk(KERN_CONT
-		       "initialization failed (err: %d)...", conf->error);
-		return;
-	}
-
-	/* We should meanwhile have enough trace. */
-	conf->error = conf->suspend(conf->tracer);
-	if (conf->error < 0)
-		return;
-
-	/* Let's see if we can access the trace. */
-	trace = ds_read_bts(conf->tracer);
-
-	conf->error = ds_selftest_bts_consistency(trace);
-	if (conf->error < 0)
-		return;
-
-	/* If everything went well, we should have a few trace entries. */
-	if (trace->ds.top == trace->ds.begin) {
-		/*
-		 * It is possible but highly unlikely that we got a
-		 * buffer overflow and end up at exactly the same
-		 * position we started from.
-		 * Let's issue a warning, but continue.
-		 */
-		printk(KERN_CONT "no trace/overflow...");
-	}
-
-	/* Let's try to read the trace we collected. */
-	conf->error =
-		ds_selftest_bts_read(conf->tracer, trace,
-				     trace->ds.begin, trace->ds.top);
-	if (conf->error < 0)
-		return;
-
-	/*
-	 * Let's read the trace again.
-	 * Since we suspended tracing, we should get the same result.
-	 */
-	top = trace->ds.top;
-
-	trace = ds_read_bts(conf->tracer);
-	conf->error = ds_selftest_bts_consistency(trace);
-	if (conf->error < 0)
-		return;
-
-	if (top != trace->ds.top) {
-		printk(KERN_CONT "suspend not working...");
-		conf->error = -1;
-		return;
-	}
-
-	/* Let's collect some more trace - see if resume is working. */
-	conf->error = conf->resume(conf->tracer);
-	if (conf->error < 0)
-		return;
-
-	conf->error = conf->suspend(conf->tracer);
-	if (conf->error < 0)
-		return;
-
-	trace = ds_read_bts(conf->tracer);
-
-	conf->error = ds_selftest_bts_consistency(trace);
-	if (conf->error < 0)
-		return;
-
-	if (trace->ds.top == top) {
-		/*
-		 * It is possible but highly unlikely that we got a
-		 * buffer overflow and end up at exactly the same
-		 * position we started from.
-		 * Let's issue a warning and check the full trace.
-		 */
-		printk(KERN_CONT
-		       "no resume progress/overflow...");
-
-		conf->error =
-			ds_selftest_bts_read(conf->tracer, trace,
-					     trace->ds.begin, trace->ds.end);
-	} else if (trace->ds.top < top) {
-		/*
-		 * We had a buffer overflow - the entire buffer should
-		 * contain trace records.
-		 */
-		conf->error =
-			ds_selftest_bts_read(conf->tracer, trace,
-					     trace->ds.begin, trace->ds.end);
-	} else {
-		/*
-		 * It is quite likely that the buffer did not overflow.
-		 * Let's just check the delta trace.
-		 */
-		conf->error =
-			ds_selftest_bts_read(conf->tracer, trace, top,
-					     trace->ds.top);
-	}
-	if (conf->error < 0)
-		return;
-
-	conf->error = 0;
-}
-
-static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
-{
-	ds_suspend_bts(tracer);
-	return 0;
-}
-
-static int ds_resume_bts_wrap(struct bts_tracer *tracer)
-{
-	ds_resume_bts(tracer);
-	return 0;
-}
-
-static void ds_release_bts_noirq_wrap(void *tracer)
-{
-	(void)ds_release_bts_noirq(tracer);
-}
-
-static int ds_selftest_bts_bad_release_noirq(int cpu,
-					     struct bts_tracer *tracer)
-{
-	int error = -EPERM;
-
-	/* Try to release the tracer on the wrong cpu. */
-	get_cpu();
-	if (cpu != smp_processor_id()) {
-		error = ds_release_bts_noirq(tracer);
-		if (error != -EPERM)
-			printk(KERN_CONT "release on wrong cpu...");
-	}
-	put_cpu();
-
-	return error ? 0 : -1;
-}
-
-static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
-{
-	struct bts_tracer *tracer;
-	int error;
-
-	/* Try to request cpu tracing while task tracing is active. */
-	tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
-				    (size_t)-1, BTS_KERNEL);
-	error = PTR_ERR(tracer);
-	if (!IS_ERR(tracer)) {
-		ds_release_bts(tracer);
-		error = 0;
-	}
-
-	if (error != -EPERM)
-		printk(KERN_CONT "cpu/task tracing overlap...");
-
-	return error ? 0 : -1;
-}
-
-static int ds_selftest_bts_bad_request_task(void *buffer)
-{
-	struct bts_tracer *tracer;
-	int error;
-
-	/* Try to request cpu tracing while task tracing is active. */
-	tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
-				    (size_t)-1, BTS_KERNEL);
-	error = PTR_ERR(tracer);
-	if (!IS_ERR(tracer)) {
-		error = 0;
-		ds_release_bts(tracer);
-	}
-
-	if (error != -EPERM)
-		printk(KERN_CONT "task/cpu tracing overlap...");
-
-	return error ? 0 : -1;
-}
-
-int ds_selftest_bts(void)
-{
-	struct ds_selftest_bts_conf conf;
-	unsigned char buffer[BUFFER_SIZE], *small_buffer;
-	unsigned long irq;
-	int cpu;
-
-	printk(KERN_INFO "[ds] bts selftest...");
-	conf.error = 0;
-
-	small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu) {
-		conf.suspend = ds_suspend_bts_wrap;
-		conf.resume = ds_resume_bts_wrap;
-		conf.tracer =
-			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
-					   NULL, (size_t)-1, BTS_KERNEL);
-		ds_selftest_bts_cpu(&conf);
-		if (conf.error >= 0)
-			conf.error = ds_selftest_bts_bad_request_task(buffer);
-		ds_release_bts(conf.tracer);
-		if (conf.error < 0)
-			goto out;
-
-		conf.suspend = ds_suspend_bts_noirq;
-		conf.resume = ds_resume_bts_noirq;
-		conf.tracer =
-			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
-					   NULL, (size_t)-1, BTS_KERNEL);
-		smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
-		if (conf.error >= 0) {
-			conf.error =
-				ds_selftest_bts_bad_release_noirq(cpu,
-								  conf.tracer);
-			/* We must not release the tracer twice. */
-			if (conf.error < 0)
-				conf.tracer = NULL;
-		}
-		if (conf.error >= 0)
-			conf.error = ds_selftest_bts_bad_request_task(buffer);
-		smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
-					 conf.tracer, 1);
-		if (conf.error < 0)
-			goto out;
-	}
-
-	conf.suspend = ds_suspend_bts_wrap;
-	conf.resume = ds_resume_bts_wrap;
-	conf.tracer =
-		ds_request_bts_task(current, buffer, BUFFER_SIZE,
-				    NULL, (size_t)-1, BTS_KERNEL);
-	ds_selftest_bts_cpu(&conf);
-	if (conf.error >= 0)
-		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
-	ds_release_bts(conf.tracer);
-	if (conf.error < 0)
-		goto out;
-
-	conf.suspend = ds_suspend_bts_noirq;
-	conf.resume = ds_resume_bts_noirq;
-	conf.tracer =
-		ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
-				   NULL, (size_t)-1, BTS_KERNEL);
-	local_irq_save(irq);
-	ds_selftest_bts_cpu(&conf);
-	if (conf.error >= 0)
-		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
-	ds_release_bts_noirq(conf.tracer);
-	local_irq_restore(irq);
-	if (conf.error < 0)
-		goto out;
-
-	conf.error = 0;
- out:
-	put_online_cpus();
-	printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
-
-	return conf.error;
-}
-
-int ds_selftest_pebs(void)
-{
-	return 0;
-}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
deleted file mode 100644
index 2ba8745c666..00000000000
--- a/arch/x86/kernel/ds_selftest.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Debug Store support - selftest
- *
- *
- * Copyright (C) 2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2009
- */
-
-#ifdef CONFIG_X86_DS_SELFTEST
-extern int ds_selftest_bts(void);
-extern int ds_selftest_pebs(void);
-#else
-static inline int ds_selftest_bts(void) { return 0; }
-static inline int ds_selftest_pebs(void) { return 0; }
-#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6d817554780..c89a386930b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -224,11 +224,6 @@ unsigned __kprobes long oops_begin(void)
 	int cpu;
 	unsigned long flags;
 
-	/* notify the hw-branch tracer so it may disable tracing and
-	   add the last trace to the trace buffer -
-	   the earlier this happens, the more useful the trace. */
-	trace_hw_branch_oops();
-
 	oops_enter();
 
 	/* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b43bbaebe2c..7a880ad3a20 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -422,14 +422,12 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 
 static void __kprobes clear_btf(void)
 {
-	if (test_thread_flag(TIF_DEBUGCTLMSR))
-		update_debugctlmsr(0);
+	/* XXX */
 }
 
 static void __kprobes restore_btf(void)
 {
-	if (test_thread_flag(TIF_DEBUGCTLMSR))
-		update_debugctlmsr(current->thread.debugctlmsr);
+	/* XXX */
 }
 
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ad9540676fc..1a60beb32ed 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -20,7 +20,6 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
-#include <asm/ds.h>
 #include <asm/debugreg.h>
 
 unsigned long idle_halt;
@@ -50,8 +49,6 @@ void free_thread_xstate(struct task_struct *tsk)
 		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
 		tsk->thread.xstate = NULL;
 	}
-
-	WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
 
 void free_thread_info(struct thread_info *ti)
@@ -198,12 +195,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	prev = &prev_p->thread;
 	next = &next_p->thread;
 
-	if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
-	    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
-		ds_switch_to(prev_p, next_p);
-	else if (next->debugctlmsr != prev->debugctlmsr)
-		update_debugctlmsr(next->debugctlmsr);
-
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f6c62667e30..75090c589b7 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,7 +55,6 @@
 #include <asm/cpu.h>
 #include <asm/idle.h>
 #include <asm/syscalls.h>
-#include <asm/ds.h>
 #include <asm/debugreg.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -238,13 +237,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		kfree(p->thread.io_bitmap_ptr);
 		p->thread.io_bitmap_max = 0;
 	}
-
-	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
-	p->thread.ds_ctx = NULL;
-
-	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
-	p->thread.debugctlmsr = 0;
-
 	return err;
 }
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index dc9690b4c4c..cc4258f2beb 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,7 +49,6 @@
 #include <asm/ia32.h>
 #include <asm/idle.h>
 #include <asm/syscalls.h>
-#include <asm/ds.h>
 #include <asm/debugreg.h>
 
 asmlinkage extern void ret_from_fork(void);
@@ -313,13 +312,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		if (err)
 			goto out;
 	}
-
-	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
-	p->thread.ds_ctx = NULL;
-
-	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
-	p->thread.debugctlmsr = 0;
-
 	err = 0;
 out:
 	if (err && p->thread.io_bitmap_ptr) {
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a503b1fd04e..f2fd3b80e56 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -2,9 +2,6 @@
 /*
  * Pentium III FXSR, SSE support
  *	Gareth Hughes <gareth@valinux.com>, May 2000
- *
- * BTS tracing
- *	Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
  */
 
 #include <linux/kernel.h>
@@ -21,7 +18,6 @@
 #include <linux/audit.h>
 #include <linux/seccomp.h>
 #include <linux/signal.h>
-#include <linux/workqueue.h>
 #include <linux/perf_event.h>
 #include <linux/hw_breakpoint.h>
 
@@ -35,7 +31,6 @@
 #include <asm/desc.h>
 #include <asm/prctl.h>
 #include <asm/proto.h>
-#include <asm/ds.h>
 #include <asm/hw_breakpoint.h>
 
 #include "tls.h"
@@ -788,342 +783,6 @@ static int ioperm_get(struct task_struct *target,
 				   0, IO_BITMAP_BYTES);
 }
 
-#ifdef CONFIG_X86_PTRACE_BTS
-/*
- * A branch trace store context.
- *
- * Contexts may only be installed by ptrace_bts_config() and only for
- * ptraced tasks.
- *
- * Contexts are destroyed when the tracee is detached from the tracer.
- * The actual destruction work requires interrupts enabled, so the
- * work is deferred and will be scheduled during __ptrace_unlink().
- *
- * Contexts hold an additional task_struct reference on the traced
- * task, as well as a reference on the tracer's mm.
- *
- * Ptrace already holds a task_struct for the duration of ptrace operations,
- * but since destruction is deferred, it may be executed after both
- * tracer and tracee exited.
- */
-struct bts_context {
-	/* The branch trace handle. */
-	struct bts_tracer	*tracer;
-
-	/* The buffer used to store the branch trace and its size. */
-	void			*buffer;
-	unsigned int		size;
-
-	/* The mm that paid for the above buffer. */
-	struct mm_struct	*mm;
-
-	/* The task this context belongs to. */
-	struct task_struct	*task;
-
-	/* The signal to send on a bts buffer overflow. */
-	unsigned int		bts_ovfl_signal;
-
-	/* The work struct to destroy a context. */
-	struct work_struct	work;
-};
-
-static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
-{
-	void *buffer = NULL;
-	int err = -ENOMEM;
-
-	err = account_locked_memory(current->mm, current->signal->rlim, size);
-	if (err < 0)
-		return err;
-
-	buffer = kzalloc(size, GFP_KERNEL);
-	if (!buffer)
-		goto out_refund;
-
-	context->buffer = buffer;
-	context->size = size;
-	context->mm = get_task_mm(current);
-
-	return 0;
-
- out_refund:
-	refund_locked_memory(current->mm, size);
-	return err;
-}
-
-static inline void free_bts_buffer(struct bts_context *context)
-{
-	if (!context->buffer)
-		return;
-
-	kfree(context->buffer);
-	context->buffer = NULL;
-
-	refund_locked_memory(context->mm, context->size);
-	context->size = 0;
-
-	mmput(context->mm);
-	context->mm = NULL;
-}
-
-static void free_bts_context_work(struct work_struct *w)
-{
-	struct bts_context *context;
-
-	context = container_of(w, struct bts_context, work);
-
-	ds_release_bts(context->tracer);
-	put_task_struct(context->task);
-	free_bts_buffer(context);
-	kfree(context);
-}
-
-static inline void free_bts_context(struct bts_context *context)
-{
-	INIT_WORK(&context->work, free_bts_context_work);
-	schedule_work(&context->work);
-}
-
-static inline struct bts_context *alloc_bts_context(struct task_struct *task)
-{
-	struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (context) {
-		context->task = task;
-		task->bts = context;
-
-		get_task_struct(task);
-	}
-
-	return context;
-}
-
-static int ptrace_bts_read_record(struct task_struct *child, size_t index,
-				  struct bts_struct __user *out)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-	struct bts_struct bts;
-	const unsigned char *at;
-	int error;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	at = trace->ds.top - ((index + 1) * trace->ds.size);
-	if ((void *)at < trace->ds.begin)
-		at += (trace->ds.n * trace->ds.size);
-
-	if (!trace->read)
-		return -EOPNOTSUPP;
-
-	error = trace->read(context->tracer, at, &bts);
-	if (error < 0)
-		return error;
-
-	if (copy_to_user(out, &bts, sizeof(bts)))
-		return -EFAULT;
-
-	return sizeof(bts);
-}
-
-static int ptrace_bts_drain(struct task_struct *child,
-			    long size,
-			    struct bts_struct __user *out)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-	const unsigned char *at;
-	int error, drained = 0;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	if (!trace->read)
-		return -EOPNOTSUPP;
-
-	if (size < (trace->ds.top - trace->ds.begin))
-		return -EIO;
-
-	for (at = trace->ds.begin; (void *)at < trace->ds.top;
-	     out++, drained++, at += trace->ds.size) {
-		struct bts_struct bts;
-
-		error = trace->read(context->tracer, at, &bts);
-		if (error < 0)
-			return error;
-
-		if (copy_to_user(out, &bts, sizeof(bts)))
-			return -EFAULT;
-	}
-
-	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
-
-	error = ds_reset_bts(context->tracer);
-	if (error < 0)
-		return error;
-
-	return drained;
-}
-
-static int ptrace_bts_config(struct task_struct *child,
-			     long cfg_size,
-			     const struct ptrace_bts_config __user *ucfg)
-{
-	struct bts_context *context;
-	struct ptrace_bts_config cfg;
-	unsigned int flags = 0;
-
-	if (cfg_size < sizeof(cfg))
-		return -EIO;
-
-	if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
-		return -EFAULT;
-
-	context = child->bts;
-	if (!context)
-		context = alloc_bts_context(child);
-	if (!context)
-		return -ENOMEM;
-
-	if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
-		if (!cfg.signal)
-			return -EINVAL;
-
-		return -EOPNOTSUPP;
-		context->bts_ovfl_signal = cfg.signal;
-	}
-
-	ds_release_bts(context->tracer);
-	context->tracer = NULL;
-
-	if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
-		int err;
-
-		free_bts_buffer(context);
-		if (!cfg.size)
-			return 0;
-
-		err = alloc_bts_buffer(context, cfg.size);
-		if (err < 0)
-			return err;
-	}
-
-	if (cfg.flags & PTRACE_BTS_O_TRACE)
-		flags |= BTS_USER;
-
-	if (cfg.flags & PTRACE_BTS_O_SCHED)
-		flags |= BTS_TIMESTAMPS;
-
-	context->tracer =
-		ds_request_bts_task(child, context->buffer, context->size,
-				    NULL, (size_t)-1, flags);
-	if (unlikely(IS_ERR(context->tracer))) {
-		int error = PTR_ERR(context->tracer);
-
-		free_bts_buffer(context);
-		context->tracer = NULL;
-		return error;
-	}
-
-	return sizeof(cfg);
-}
-
-static int ptrace_bts_status(struct task_struct *child,
-			     long cfg_size,
-			     struct ptrace_bts_config __user *ucfg)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-	struct ptrace_bts_config cfg;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	if (cfg_size < sizeof(cfg))
-		return -EIO;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	memset(&cfg, 0, sizeof(cfg));
-	cfg.size	= trace->ds.end - trace->ds.begin;
-	cfg.signal	= context->bts_ovfl_signal;
-	cfg.bts_size	= sizeof(struct bts_struct);
-
-	if (cfg.signal)
-		cfg.flags |= PTRACE_BTS_O_SIGNAL;
-
-	if (trace->ds.flags & BTS_USER)
-		cfg.flags |= PTRACE_BTS_O_TRACE;
-
-	if (trace->ds.flags & BTS_TIMESTAMPS)
-		cfg.flags |= PTRACE_BTS_O_SCHED;
-
-	if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
-		return -EFAULT;
-
-	return sizeof(cfg);
-}
-
-static int ptrace_bts_clear(struct task_struct *child)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
-
-	return ds_reset_bts(context->tracer);
-}
-
-static int ptrace_bts_size(struct task_struct *child)
-{
-	struct bts_context *context;
-	const struct bts_trace *trace;
-
-	context = child->bts;
-	if (!context)
-		return -ESRCH;
-
-	trace = ds_read_bts(context->tracer);
-	if (!trace)
-		return -ESRCH;
-
-	return (trace->ds.top - trace->ds.begin) / trace->ds.size;
-}
-
-/*
- * Called from __ptrace_unlink() after the child has been moved back
- * to its original parent.
- */
-void ptrace_bts_untrace(struct task_struct *child)
-{
-	if (unlikely(child->bts)) {
-		free_bts_context(child->bts);
-		child->bts = NULL;
-	}
-}
-#endif /* CONFIG_X86_PTRACE_BTS */
-
 /*
  * Called by kernel/ptrace.c when detaching..
  *
@@ -1251,39 +910,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 #endif
 
-	/*
-	 * These bits need more cooking - not enabled yet:
-	 */
-#ifdef CONFIG_X86_PTRACE_BTS
-	case PTRACE_BTS_CONFIG:
-		ret = ptrace_bts_config
-			(child, data, (struct ptrace_bts_config __user *)addr);
-		break;
-
-	case PTRACE_BTS_STATUS:
-		ret = ptrace_bts_status
-			(child, data, (struct ptrace_bts_config __user *)addr);
-		break;
-
-	case PTRACE_BTS_SIZE:
-		ret = ptrace_bts_size(child);
-		break;
-
-	case PTRACE_BTS_GET:
-		ret = ptrace_bts_read_record
-			(child, data, (struct bts_struct __user *) addr);
-		break;
-
-	case PTRACE_BTS_CLEAR:
-		ret = ptrace_bts_clear(child);
-		break;
-
-	case PTRACE_BTS_DRAIN:
-		ret = ptrace_bts_drain
-			(child, data, (struct bts_struct __user *) addr);
-		break;
-#endif /* CONFIG_X86_PTRACE_BTS */
-
 	default:
 		ret = ptrace_request(child, request, addr, data);
 		break;
@@ -1543,14 +1169,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 
 	case PTRACE_GET_THREAD_AREA:
 	case PTRACE_SET_THREAD_AREA:
-#ifdef CONFIG_X86_PTRACE_BTS
-	case PTRACE_BTS_CONFIG:
-	case PTRACE_BTS_STATUS:
-	case PTRACE_BTS_SIZE:
-	case PTRACE_BTS_GET:
-	case PTRACE_BTS_CLEAR:
-	case PTRACE_BTS_DRAIN:
-#endif /* CONFIG_X86_PTRACE_BTS */
 		return arch_ptrace(child, request, addr, data);
 
 	default:
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 3149032ff10..7beba0769a8 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -157,22 +157,6 @@ static int enable_single_step(struct task_struct *child)
 	return 1;
 }
 
-/*
- * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
- */
-static void write_debugctlmsr(struct task_struct *child, unsigned long val)
-{
-	if (child->thread.debugctlmsr == val)
-		return;
-
-	child->thread.debugctlmsr = val;
-
-	if (child != current)
-		return;
-
-	update_debugctlmsr(val);
-}
-
 /*
  * Enable single or block step.
  */
@@ -185,17 +169,9 @@ static void enable_step(struct task_struct *child, bool block)
 	 * So noone should try to use debugger block stepping in a program
 	 * that uses user-mode single stepping itself.
 	 */
-	if (enable_single_step(child) && block) {
-		set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-		write_debugctlmsr(child,
-				  child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
-	} else {
-		write_debugctlmsr(child,
-				  child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
-
-		if (!child->thread.debugctlmsr)
-			clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-	}
+	if (!enable_single_step(child))
+		return;
+	/* XXX */
 }
 
 void user_enable_single_step(struct task_struct *child)
@@ -213,11 +189,7 @@ void user_disable_single_step(struct task_struct *child)
 	/*
 	 * Make sure block stepping (BTF) is disabled.
 	 */
-	write_debugctlmsr(child,
-			  child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
-
-	if (!child->thread.debugctlmsr)
-		clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+	/* XXX */
 
 	/* Always clear TIF_SINGLESTEP... */
 	clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e445418..e3da5d726a3 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -543,11 +543,6 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 
 	/* DR6 may or may not be cleared by the CPU */
 	set_debugreg(0, 6);
-	/*
-	 * The processor cleared BTF, so don't mark that we need it set.
-	 */
-	clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
-	tsk->thread.debugctlmsr = 0;
 
 	/* Store the virtualized DR6 value */
 	tsk->thread.debugreg6 = dr6;
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 01e6adea07e..cc12b3c556b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -504,18 +504,6 @@ extern int ftrace_dump_on_oops;
 #define INIT_TRACE_RECURSION
 #endif
 
-#ifdef CONFIG_HW_BRANCH_TRACER
-
-void trace_hw_branch(u64 from, u64 to);
-void trace_hw_branch_oops(void);
-
-#else /* CONFIG_HW_BRANCH_TRACER */
-
-static inline void trace_hw_branch(u64 from, u64 to) {}
-static inline void trace_hw_branch_oops(void) {}
-
-#endif /* CONFIG_HW_BRANCH_TRACER */
-
 #ifdef CONFIG_FTRACE_SYSCALLS
 
 unsigned long arch_syscall_addr(int nr);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e70f21beb4b..c8442b65511 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -19,7 +19,6 @@ struct anon_vma;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
-struct rlimit;
 
 #ifndef CONFIG_DISCONTIGMEM          /* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
@@ -1449,9 +1448,6 @@ int vmemmap_populate_basepages(struct page *start_page,
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
 
-extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
-				 size_t size);
-extern void refund_locked_memory(struct mm_struct *mm, size_t size);
 
 enum mf_flags {
 	MF_COUNT_INCREASED = 1 << 0,
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index e1fb6072997..4272521e29e 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -345,18 +345,6 @@ static inline void user_single_step_siginfo(struct task_struct *tsk,
 #define arch_ptrace_stop(code, info)		do { } while (0)
 #endif
 
-#ifndef arch_ptrace_untrace
-/*
- * Do machine-specific work before untracing child.
- *
- * This is called for a normal detach as well as from ptrace_exit()
- * when the tracing task dies.
- *
- * Called with write_lock(&tasklist_lock) held.
- */
-#define arch_ptrace_untrace(task)		do { } while (0)
-#endif
-
 extern int task_current_syscall(struct task_struct *target, long *callno,
 				unsigned long args[6], unsigned int maxargs,
 				unsigned long *sp, unsigned long *pc);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dad7f668ebf..e0447c64af6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -99,7 +99,6 @@ struct futex_pi_state;
 struct robust_list_head;
 struct bio_list;
 struct fs_struct;
-struct bts_context;
 struct perf_event_context;
 
 /*
@@ -1272,12 +1271,6 @@ struct task_struct {
 	struct list_head ptraced;
 	struct list_head ptrace_entry;
 
-	/*
-	 * This is the tracer handle for the ptrace BTS extension.
-	 * This field actually belongs to the ptracer task.
-	 */
-	struct bts_context *bts;
-
 	/* PID/PID hash table linkage. */
 	struct pid_link pids[PIDTYPE_MAX];
 	struct list_head thread_group;
@@ -2123,10 +2116,8 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
 extern char *get_task_comm(char *to, struct task_struct *tsk);
 
 #ifdef CONFIG_SMP
-extern void wait_task_context_switch(struct task_struct *p);
 extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
-static inline void wait_task_context_switch(struct task_struct *p) {}
 static inline unsigned long wait_task_inactive(struct task_struct *p,
 					       long match_state)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 4799c5f0e6d..d67f1dbfbe0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1108,9 +1108,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->memcg_batch.do_batch = 0;
 	p->memcg_batch.memcg = NULL;
 #endif
-
-	p->bts = NULL;
-
 	p->stack_start = stack_start;
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42ad8ae729a..9fb51237b18 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -76,7 +76,6 @@ void __ptrace_unlink(struct task_struct *child)
 	child->parent = child->real_parent;
 	list_del_init(&child->ptrace_entry);
 
-	arch_ptrace_untrace(child);
 	if (task_is_traced(child))
 		ptrace_untrace(child);
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 9ab3cd7858d..117b7cad31b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2076,49 +2076,6 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 	return 1;
 }
 
-/*
- * wait_task_context_switch -	wait for a thread to complete at least one
- *				context switch.
- *
- * @p must not be current.
- */
-void wait_task_context_switch(struct task_struct *p)
-{
-	unsigned long nvcsw, nivcsw, flags;
-	int running;
-	struct rq *rq;
-
-	nvcsw	= p->nvcsw;
-	nivcsw	= p->nivcsw;
-	for (;;) {
-		/*
-		 * The runqueue is assigned before the actual context
-		 * switch. We need to take the runqueue lock.
-		 *
-		 * We could check initially without the lock but it is
-		 * very likely that we need to take the lock in every
-		 * iteration.
-		 */
-		rq = task_rq_lock(p, &flags);
-		running = task_running(rq, p);
-		task_rq_unlock(rq, &flags);
-
-		if (likely(!running))
-			break;
-		/*
-		 * The switch count is incremented before the actual
-		 * context switch. We thus wait for two switches to be
-		 * sure at least one completed.
-		 */
-		if ((p->nvcsw - nvcsw) > 1)
-			break;
-		if ((p->nivcsw - nivcsw) > 1)
-			break;
-
-		cpu_relax();
-	}
-}
-
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 13e13d428cd..8b1797c4545 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD
 	help
 	  See Documentation/trace/ftrace-design.txt
 
-config HAVE_HW_BRANCH_TRACER
-	bool
-
 config HAVE_SYSCALL_TRACEPOINTS
 	bool
 	help
@@ -374,14 +371,6 @@ config STACK_TRACER
 
 	  Say N if unsure.
 
-config HW_BRANCH_TRACER
-	depends on HAVE_HW_BRANCH_TRACER
-	bool "Trace hw branches"
-	select GENERIC_TRACER
-	help
-	  This tracer records all branches on the system in a circular
-	  buffer, giving access to the last N branches for each cpu.
-
 config KMEMTRACE
 	bool "Trace SLAB allocations"
 	select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 78edc649003..ffb1a5b0550 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
-obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2825ef2c0b1..bec2c973ff0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,7 +34,6 @@ enum trace_type {
 	TRACE_GRAPH_RET,
 	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
-	TRACE_HW_BRANCHES,
 	TRACE_KMEM_ALLOC,
 	TRACE_KMEM_FREE,
 	TRACE_BLK,
@@ -229,7 +228,6 @@ extern void __ftrace_bad_type(void);
 			  TRACE_GRAPH_ENT);		\
 		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
 			  TRACE_GRAPH_RET);		\
-		IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
 		IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry,	\
 			  TRACE_KMEM_ALLOC);	\
 		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
@@ -467,8 +465,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
 					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
-extern int trace_selftest_startup_hw_branches(struct tracer *trace,
-					      struct trace_array *tr);
 extern int trace_selftest_startup_ksym(struct tracer *trace,
 					 struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c16a08f399d..dc008c1240d 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch,
 		 __entry->func, __entry->file, __entry->correct)
 );
 
-FTRACE_ENTRY(hw_branch, hw_branch_entry,
-
-	TRACE_HW_BRANCHES,
-
-	F_STRUCT(
-		__field(	u64,	from	)
-		__field(	u64,	to	)
-	),
-
-	F_printk("from: %llx to: %llx", __entry->from, __entry->to)
-);
-
 FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
 
 	TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
deleted file mode 100644
index 7b97000745f..00000000000
--- a/kernel/trace/trace_hw_branches.c
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * h/w branch tracer for x86 based on BTS
- *
- * Copyright (C) 2008-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
- */
-#include <linux/kallsyms.h>
-#include <linux/debugfs.h>
-#include <linux/ftrace.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/fs.h>
-
-#include <asm/ds.h>
-
-#include "trace_output.h"
-#include "trace.h"
-
-
-#define BTS_BUFFER_SIZE (1 << 13)
-
-static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
-static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
-
-#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
-
-static int trace_hw_branches_enabled __read_mostly;
-static int trace_hw_branches_suspended __read_mostly;
-static struct trace_array *hw_branch_trace __read_mostly;
-
-
-static void bts_trace_init_cpu(int cpu)
-{
-	per_cpu(hwb_tracer, cpu) =
-		ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
-				   BTS_BUFFER_SIZE, NULL, (size_t)-1,
-				   BTS_KERNEL);
-
-	if (IS_ERR(per_cpu(hwb_tracer, cpu)))
-		per_cpu(hwb_tracer, cpu) = NULL;
-}
-
-static int bts_trace_init(struct trace_array *tr)
-{
-	int cpu;
-
-	hw_branch_trace = tr;
-	trace_hw_branches_enabled = 0;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu) {
-		bts_trace_init_cpu(cpu);
-
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			trace_hw_branches_enabled = 1;
-	}
-	trace_hw_branches_suspended = 0;
-	put_online_cpus();
-
-	/* If we could not enable tracing on a single cpu, we fail. */
-	return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
-}
-
-static void bts_trace_reset(struct trace_array *tr)
-{
-	int cpu;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu) {
-		if (likely(per_cpu(hwb_tracer, cpu))) {
-			ds_release_bts(per_cpu(hwb_tracer, cpu));
-			per_cpu(hwb_tracer, cpu) = NULL;
-		}
-	}
-	trace_hw_branches_enabled = 0;
-	trace_hw_branches_suspended = 0;
-	put_online_cpus();
-}
-
-static void bts_trace_start(struct trace_array *tr)
-{
-	int cpu;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu)
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			ds_resume_bts(per_cpu(hwb_tracer, cpu));
-	trace_hw_branches_suspended = 0;
-	put_online_cpus();
-}
-
-static void bts_trace_stop(struct trace_array *tr)
-{
-	int cpu;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu)
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			ds_suspend_bts(per_cpu(hwb_tracer, cpu));
-	trace_hw_branches_suspended = 1;
-	put_online_cpus();
-}
-
-static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
-				     unsigned long action, void *hcpu)
-{
-	int cpu = (long)hcpu;
-
-	switch (action) {
-	case CPU_ONLINE:
-	case CPU_DOWN_FAILED:
-		/* The notification is sent with interrupts enabled. */
-		if (trace_hw_branches_enabled) {
-			bts_trace_init_cpu(cpu);
-
-			if (trace_hw_branches_suspended &&
-			    likely(per_cpu(hwb_tracer, cpu)))
-				ds_suspend_bts(per_cpu(hwb_tracer, cpu));
-		}
-		break;
-
-	case CPU_DOWN_PREPARE:
-		/* The notification is sent with interrupts enabled. */
-		if (likely(per_cpu(hwb_tracer, cpu))) {
-			ds_release_bts(per_cpu(hwb_tracer, cpu));
-			per_cpu(hwb_tracer, cpu) = NULL;
-		}
-	}
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
-	.notifier_call = bts_hotcpu_handler
-};
-
-static void bts_trace_print_header(struct seq_file *m)
-{
-	seq_puts(m, "# CPU#        TO  <-  FROM\n");
-}
-
-static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
-{
-	unsigned long symflags = TRACE_ITER_SYM_OFFSET;
-	struct trace_entry *entry = iter->ent;
-	struct trace_seq *seq = &iter->seq;
-	struct hw_branch_entry *it;
-
-	trace_assign_type(it, entry);
-
-	if (entry->type == TRACE_HW_BRANCHES) {
-		if (trace_seq_printf(seq, "%4d  ", iter->cpu) &&
-		    seq_print_ip_sym(seq, it->to, symflags) &&
-		    trace_seq_printf(seq, "\t  <-  ") &&
-		    seq_print_ip_sym(seq, it->from, symflags) &&
-		    trace_seq_printf(seq, "\n"))
-			return TRACE_TYPE_HANDLED;
-		return TRACE_TYPE_PARTIAL_LINE;
-	}
-	return TRACE_TYPE_UNHANDLED;
-}
-
-void trace_hw_branch(u64 from, u64 to)
-{
-	struct ftrace_event_call *call = &event_hw_branch;
-	struct trace_array *tr = hw_branch_trace;
-	struct ring_buffer_event *event;
-	struct ring_buffer *buf;
-	struct hw_branch_entry *entry;
-	unsigned long irq1;
-	int cpu;
-
-	if (unlikely(!tr))
-		return;
-
-	if (unlikely(!trace_hw_branches_enabled))
-		return;
-
-	local_irq_save(irq1);
-	cpu = raw_smp_processor_id();
-	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
-		goto out;
-
-	buf = tr->buffer;
-	event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		goto out;
-	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, from);
-	entry->ent.type = TRACE_HW_BRANCHES;
-	entry->from = from;
-	entry->to   = to;
-	if (!filter_check_discard(call, entry, buf, event))
-		trace_buffer_unlock_commit(buf, event, 0, 0);
-
- out:
-	atomic_dec(&tr->data[cpu]->disabled);
-	local_irq_restore(irq1);
-}
-
-static void trace_bts_at(const struct bts_trace *trace, void *at)
-{
-	struct bts_struct bts;
-	int err = 0;
-
-	WARN_ON_ONCE(!trace->read);
-	if (!trace->read)
-		return;
-
-	err = trace->read(this_tracer, at, &bts);
-	if (err < 0)
-		return;
-
-	switch (bts.qualifier) {
-	case BTS_BRANCH:
-		trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
-		break;
-	}
-}
-
-/*
- * Collect the trace on the current cpu and write it into the ftrace buffer.
- *
- * pre: tracing must be suspended on the current cpu
- */
-static void trace_bts_cpu(void *arg)
-{
-	struct trace_array *tr = (struct trace_array *)arg;
-	const struct bts_trace *trace;
-	unsigned char *at;
-
-	if (unlikely(!tr))
-		return;
-
-	if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
-		return;
-
-	if (unlikely(!this_tracer))
-		return;
-
-	trace = ds_read_bts(this_tracer);
-	if (!trace)
-		return;
-
-	for (at = trace->ds.top; (void *)at < trace->ds.end;
-	     at += trace->ds.size)
-		trace_bts_at(trace, at);
-
-	for (at = trace->ds.begin; (void *)at < trace->ds.top;
-	     at += trace->ds.size)
-		trace_bts_at(trace, at);
-}
-
-static void trace_bts_prepare(struct trace_iterator *iter)
-{
-	int cpu;
-
-	get_online_cpus();
-	for_each_online_cpu(cpu)
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			ds_suspend_bts(per_cpu(hwb_tracer, cpu));
-	/*
-	 * We need to collect the trace on the respective cpu since ftrace
-	 * implicitly adds the record for the current cpu.
-	 * Once that is more flexible, we could collect the data from any cpu.
-	 */
-	on_each_cpu(trace_bts_cpu, iter->tr, 1);
-
-	for_each_online_cpu(cpu)
-		if (likely(per_cpu(hwb_tracer, cpu)))
-			ds_resume_bts(per_cpu(hwb_tracer, cpu));
-	put_online_cpus();
-}
-
-static void trace_bts_close(struct trace_iterator *iter)
-{
-	tracing_reset_online_cpus(iter->tr);
-}
-
-void trace_hw_branch_oops(void)
-{
-	if (this_tracer) {
-		ds_suspend_bts_noirq(this_tracer);
-		trace_bts_cpu(hw_branch_trace);
-		ds_resume_bts_noirq(this_tracer);
-	}
-}
-
-struct tracer bts_tracer __read_mostly =
-{
-	.name		= "hw-branch-tracer",
-	.init		= bts_trace_init,
-	.reset		= bts_trace_reset,
-	.print_header	= bts_trace_print_header,
-	.print_line	= bts_trace_print_line,
-	.start		= bts_trace_start,
-	.stop		= bts_trace_stop,
-	.open		= trace_bts_prepare,
-	.close		= trace_bts_close,
-#ifdef CONFIG_FTRACE_SELFTEST
-	.selftest	= trace_selftest_startup_hw_branches,
-#endif /* CONFIG_FTRACE_SELFTEST */
-};
-
-__init static int init_bts_trace(void)
-{
-	register_hotcpu_notifier(&bts_hotcpu_notifier);
-	return register_tracer(&bts_tracer);
-}
-device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 280fea470d6..a7084e7c042 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -16,7 +16,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
 	case TRACE_GRAPH_RET:
-	case TRACE_HW_BRANCHES:
 	case TRACE_KSYM:
 		return 1;
 	}
@@ -754,62 +753,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
 }
 #endif /* CONFIG_BRANCH_TRACER */
 
-#ifdef CONFIG_HW_BRANCH_TRACER
-int
-trace_selftest_startup_hw_branches(struct tracer *trace,
-				   struct trace_array *tr)
-{
-	struct trace_iterator *iter;
-	struct tracer tracer;
-	unsigned long count;
-	int ret;
-
-	if (!trace->open) {
-		printk(KERN_CONT "missing open function...");
-		return -1;
-	}
-
-	ret = tracer_init(trace, tr);
-	if (ret) {
-		warn_failed_init_tracer(trace, ret);
-		return ret;
-	}
-
-	/*
-	 * The hw-branch tracer needs to collect the trace from the various
-	 * cpu trace buffers - before tracing is stopped.
-	 */
-	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
-	if (!iter)
-		return -ENOMEM;
-
-	memcpy(&tracer, trace, sizeof(tracer));
-
-	iter->trace = &tracer;
-	iter->tr = tr;
-	iter->pos = -1;
-	mutex_init(&iter->mutex);
-
-	trace->open(iter);
-
-	mutex_destroy(&iter->mutex);
-	kfree(iter);
-
-	tracing_stop();
-
-	ret = trace_test_buffer(tr, &count);
-	trace->reset(tr);
-	tracing_start();
-
-	if (!ret && !count) {
-		printk(KERN_CONT "no entries found..");
-		ret = -1;
-	}
-
-	return ret;
-}
-#endif /* CONFIG_HW_BRANCH_TRACER */
-
 #ifdef CONFIG_KSYM_TRACER
 static int ksym_selftest_dummy;
 
diff --git a/mm/mlock.c b/mm/mlock.c
index 8f4e2dfceec..3f82720e051 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -607,44 +607,3 @@ void user_shm_unlock(size_t size, struct user_struct *user)
 	spin_unlock(&shmlock_user_lock);
 	free_uid(user);
 }
-
-int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
-			  size_t size)
-{
-	unsigned long lim, vm, pgsz;
-	int error = -ENOMEM;
-
-	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
-	down_write(&mm->mmap_sem);
-
-	lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT;
-	vm   = mm->total_vm + pgsz;
-	if (lim < vm)
-		goto out;
-
-	lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT;
-	vm   = mm->locked_vm + pgsz;
-	if (lim < vm)
-		goto out;
-
-	mm->total_vm  += pgsz;
-	mm->locked_vm += pgsz;
-
-	error = 0;
- out:
-	up_write(&mm->mmap_sem);
-	return error;
-}
-
-void refund_locked_memory(struct mm_struct *mm, size_t size)
-{
-	unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
-	down_write(&mm->mmap_sem);
-
-	mm->total_vm  -= pgsz;
-	mm->locked_vm -= pgsz;
-
-	up_write(&mm->mmap_sem);
-}
-- 
cgit v1.2.3


From ea8e61b7bbc4a2faef77db34eb2db2a2c2372ff6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 25 Mar 2010 14:51:51 +0100
Subject: x86, ptrace: Fix block-step

Implement ptrace-block-step using TIF_BLOCKSTEP which will set
DEBUGCTLMSR_BTF when set for a task while preserving any other
DEBUGCTLMSR bits.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100325135414.017536066@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/processor.h   |  4 ++--
 arch/x86/include/asm/thread_info.h |  4 +++-
 arch/x86/kernel/kprobes.c          | 14 ++++++++++++--
 arch/x86/kernel/process.c          | 11 +++++++++++
 arch/x86/kernel/step.c             | 24 ++++++++++++++++++++----
 arch/x86/kernel/traps.c            |  5 +++++
 6 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5bec21a66dc..32428b410b5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -799,7 +799,7 @@ extern void cpu_init(void);
 
 static inline unsigned long get_debugctlmsr(void)
 {
-    unsigned long debugctlmsr = 0;
+	unsigned long debugctlmsr = 0;
 
 #ifndef CONFIG_X86_DEBUGCTLMSR
 	if (boot_cpu_data.x86 < 6)
@@ -807,7 +807,7 @@ static inline unsigned long get_debugctlmsr(void)
 #endif
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
 
-    return debugctlmsr;
+	return debugctlmsr;
 }
 
 static inline void update_debugctlmsr(unsigned long debugctlmsr)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index dc85e12d140..d017ed5502e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -92,6 +92,7 @@ struct thread_info {
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
 #define TIF_FREEZE		23	/* is freezing for suspend */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
+#define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 
@@ -113,6 +114,7 @@ struct thread_info {
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_FREEZE		(1 << TIF_FREEZE)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
+#define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 
@@ -143,7 +145,7 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_NOTSC)
+	(_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
 
 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7a880ad3a20..f2f56c0967b 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -422,12 +422,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 
 static void __kprobes clear_btf(void)
 {
-	/* XXX */
+	if (test_thread_flag(TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl &= ~DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+	}
 }
 
 static void __kprobes restore_btf(void)
 {
-	/* XXX */
+	if (test_thread_flag(TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl |= DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+	}
 }
 
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1a60beb32ed..8328009416d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -195,6 +195,17 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	prev = &prev_p->thread;
 	next = &next_p->thread;
 
+	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
+	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl &= ~DEBUGCTLMSR_BTF;
+		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
+			debugctl |= DEBUGCTLMSR_BTF;
+
+		update_debugctlmsr(debugctl);
+	}
+
 	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
 	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
 		/* prev and next are different */
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 7beba0769a8..58de45ee08b 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -169,9 +169,19 @@ static void enable_step(struct task_struct *child, bool block)
 	 * So noone should try to use debugger block stepping in a program
 	 * that uses user-mode single stepping itself.
 	 */
-	if (!enable_single_step(child))
-		return;
-	/* XXX */
+	if (enable_single_step(child) && block) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl |= DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+		set_tsk_thread_flag(child, TIF_BLOCKSTEP);
+	} else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl &= ~DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+		clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
+	}
 }
 
 void user_enable_single_step(struct task_struct *child)
@@ -189,7 +199,13 @@ void user_disable_single_step(struct task_struct *child)
 	/*
 	 * Make sure block stepping (BTF) is disabled.
 	 */
-	/* XXX */
+	if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
+		unsigned long debugctl = get_debugctlmsr();
+
+		debugctl &= ~DEBUGCTLMSR_BTF;
+		update_debugctlmsr(debugctl);
+		clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
+	}
 
 	/* Always clear TIF_SINGLESTEP... */
 	clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index e3da5d726a3..36f1bd9f8e7 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -544,6 +544,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	/* DR6 may or may not be cleared by the CPU */
 	set_debugreg(0, 6);
 
+	/*
+	 * The processor cleared BTF, so don't mark that we need it set.
+	 */
+	clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
+
 	/* Store the virtualized DR6 value */
 	tsk->thread.debugreg6 = dr6;
 
-- 
cgit v1.2.3


From 11164cd4f6dab326a88bdf27f2f8f7c11977e91a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 26 Mar 2010 14:08:44 +0100
Subject: perf, x86: Add Nehelem PMU programming errata workaround

Implement the workaround for Intel Errata AAK100 and AAP53.

Also, remove the Core-i7 name for Nehalem events since there are
also Westmere based i7 chips.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
LKML-Reference: <1269608924.12097.147.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       |  8 +++----
 arch/x86/kernel/cpu/perf_event_intel.c | 43 ++++++++++++++++++++++++++++++----
 arch/x86/kernel/cpu/perf_event_p4.c    |  2 +-
 arch/x86/kernel/cpu/perf_event_p6.c    |  2 +-
 4 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index f571f514de2..6f66d4a845f 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -184,7 +184,7 @@ struct x86_pmu {
 	int		version;
 	int		(*handle_irq)(struct pt_regs *);
 	void		(*disable_all)(void);
-	void		(*enable_all)(void);
+	void		(*enable_all)(int added);
 	void		(*enable)(struct perf_event *);
 	void		(*disable)(struct perf_event *);
 	int		(*hw_config)(struct perf_event_attr *attr, struct hw_perf_event *hwc);
@@ -576,7 +576,7 @@ void hw_perf_disable(void)
 	x86_pmu.disable_all();
 }
 
-static void x86_pmu_enable_all(void)
+static void x86_pmu_enable_all(int added)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
@@ -784,7 +784,7 @@ void hw_perf_enable(void)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
-	int i;
+	int i, added = cpuc->n_added;
 
 	if (!x86_pmu_initialized())
 		return;
@@ -836,7 +836,7 @@ void hw_perf_enable(void)
 	cpuc->enabled = 1;
 	barrier();
 
-	x86_pmu.enable_all();
+	x86_pmu.enable_all(added);
 }
 
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 044b8436b19..676aac27aca 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -483,7 +483,7 @@ static void intel_pmu_disable_all(void)
 	intel_pmu_lbr_disable_all();
 }
 
-static void intel_pmu_enable_all(void)
+static void intel_pmu_enable_all(int added)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -502,6 +502,40 @@ static void intel_pmu_enable_all(void)
 	}
 }
 
+/*
+ * Workaround for:
+ *   Intel Errata AAK100 (model 26)
+ *   Intel Errata AAP53  (model 30)
+ *
+ * These chips need to be 'reset' when adding counters by programming
+ * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
+ * either in sequence on the same PMC or on different PMCs.
+ */
+static void intel_pmu_nhm_enable_all(int added)
+{
+	if (added) {
+		struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+		int i;
+
+		wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
+		wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
+		wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
+
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+
+		for (i = 0; i < 3; i++) {
+			struct perf_event *event = cpuc->events[i];
+
+			if (!event)
+				continue;
+
+			__x86_pmu_enable_event(&event->hw);
+		}
+	}
+	intel_pmu_enable_all(added);
+}
+
 static inline u64 intel_pmu_get_status(void)
 {
 	u64 status;
@@ -658,7 +692,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	intel_pmu_drain_bts_buffer();
 	status = intel_pmu_get_status();
 	if (!status) {
-		intel_pmu_enable_all();
+		intel_pmu_enable_all(0);
 		return 0;
 	}
 
@@ -707,7 +741,7 @@ again:
 		goto again;
 
 done:
-	intel_pmu_enable_all();
+	intel_pmu_enable_all(0);
 	return 1;
 }
 
@@ -920,7 +954,8 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_nhm();
 
 		x86_pmu.event_constraints = intel_nehalem_event_constraints;
-		pr_cont("Nehalem/Corei7 events, ");
+		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+		pr_cont("Nehalem events, ");
 		break;
 
 	case 28: /* Atom */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index f8fe069f14e..0d1be36cbe9 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -535,7 +535,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
 				(cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
 }
 
-static void p4_pmu_enable_all(void)
+static void p4_pmu_enable_all(int added)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 6ff4d01d880..877182c850d 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -66,7 +66,7 @@ static void p6_pmu_disable_all(void)
 	wrmsrl(MSR_P6_EVNTSEL0, val);
 }
 
-static void p6_pmu_enable_all(void)
+static void p6_pmu_enable_all(int added)
 {
 	unsigned long val;
 
-- 
cgit v1.2.3


From 1fb2f77c037624601fd214fb7c29faa84cd7bdd7 Mon Sep 17 00:00:00 2001
From: Henrik Kretzschmar <henne@nachtwindheim.de>
Date: Fri, 26 Mar 2010 20:38:35 +0100
Subject: debugobjects: Section mismatch cleanup

This patch marks two functions, which only get called at
initialization, as __init.

Here is also interesting, that modpost doesn't catch here the right
function name.

WARNING: lib/built-in.o(.text+0x585f): Section mismatch in reference
from the function T.506() to the variable .init.data:obj
The function T.506() references the variable __initdata obj.
This is often because T.506 lacks a __initdata annotation or the
annotation of obj is wrong.

Signed-off-by: Henrik Kretzschmar <henne@nachtwindheim.de>
LKML-Reference: <1269632315-19403-1-git-send-email-henne@nachtwindheim.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 lib/debugobjects.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index a9a8996d286..c4ecd3ce7fd 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -773,7 +773,7 @@ static int __init fixup_free(void *addr, enum debug_obj_state state)
 	}
 }
 
-static int
+static int __init
 check_results(void *addr, enum debug_obj_state state, int fixups, int warnings)
 {
 	struct debug_bucket *db;
@@ -916,7 +916,7 @@ void __init debug_objects_early_init(void)
 /*
  * Convert the statically allocated objects to dynamic ones:
  */
-static int debug_objects_replace_static_objects(void)
+static int __init debug_objects_replace_static_objects(void)
 {
 	struct debug_bucket *db = obj_hash;
 	struct hlist_node *node, *tmp;
-- 
cgit v1.2.3


From 4bdde044dc36ac7b01f7502394d52619af9d1927 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 24 Mar 2010 10:58:05 +0800
Subject: tracing: Convert some signal events to DEFINE_TRACE

Use DECLARE_EVENT_CLASS to remove duplicate code:

text    data     bss     dec     hex filename
  23639    6084       8   29731    7423 kernel/signal.o.orig
  22727    6084       8   28819    7093 kernel/signal.o

2 events are converted:

  signal_queue_overflow: signal_overflow_fail, signal_lose_info

No functional change.

Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BA97FBD.8070703@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/signal.h | 52 ++++++++++++++++++-------------------------
 1 file changed, 22 insertions(+), 30 deletions(-)

diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h
index a510b75ac30..814566c99d2 100644
--- a/include/trace/events/signal.h
+++ b/include/trace/events/signal.h
@@ -100,18 +100,7 @@ TRACE_EVENT(signal_deliver,
 		  __entry->sa_handler, __entry->sa_flags)
 );
 
-/**
- * signal_overflow_fail - called when signal queue is overflow
- * @sig: signal number
- * @group: signal to process group or not (bool)
- * @info: pointer to struct siginfo
- *
- * Kernel fails to generate 'sig' signal with 'info' siginfo, because
- * siginfo queue is overflow, and the signal is dropped.
- * 'group' is not 0 if the signal will be sent to a process group.
- * 'sig' is always one of RT signals.
- */
-TRACE_EVENT(signal_overflow_fail,
+DECLARE_EVENT_CLASS(signal_queue_overflow,
 
 	TP_PROTO(int sig, int group, struct siginfo *info),
 
@@ -134,6 +123,24 @@ TRACE_EVENT(signal_overflow_fail,
 		  __entry->sig, __entry->group, __entry->errno, __entry->code)
 );
 
+/**
+ * signal_overflow_fail - called when signal queue is overflow
+ * @sig: signal number
+ * @group: signal to process group or not (bool)
+ * @info: pointer to struct siginfo
+ *
+ * Kernel fails to generate 'sig' signal with 'info' siginfo, because
+ * siginfo queue is overflow, and the signal is dropped.
+ * 'group' is not 0 if the signal will be sent to a process group.
+ * 'sig' is always one of RT signals.
+ */
+DEFINE_EVENT(signal_queue_overflow, signal_overflow_fail,
+
+	TP_PROTO(int sig, int group, struct siginfo *info),
+
+	TP_ARGS(sig, group, info)
+);
+
 /**
  * signal_lose_info - called when siginfo is lost
  * @sig: signal number
@@ -145,28 +152,13 @@ TRACE_EVENT(signal_overflow_fail,
  * 'group' is not 0 if the signal will be sent to a process group.
  * 'sig' is always one of non-RT signals.
  */
-TRACE_EVENT(signal_lose_info,
+DEFINE_EVENT(signal_queue_overflow, signal_lose_info,
 
 	TP_PROTO(int sig, int group, struct siginfo *info),
 
-	TP_ARGS(sig, group, info),
-
-	TP_STRUCT__entry(
-		__field(	int,	sig	)
-		__field(	int,	group	)
-		__field(	int,	errno	)
-		__field(	int,	code	)
-	),
-
-	TP_fast_assign(
-		__entry->sig	= sig;
-		__entry->group	= group;
-		TP_STORE_SIGINFO(__entry, info);
-	),
-
-	TP_printk("sig=%d group=%d errno=%d code=%d",
-		  __entry->sig, __entry->group, __entry->errno, __entry->code)
+	TP_ARGS(sig, group, info)
 );
+
 #endif /* _TRACE_SIGNAL_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 50354a8a28d0c91695a2d6d25b5a821bfe557a07 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 24 Mar 2010 10:58:24 +0800
Subject: tracing: Update comments

Make some comments consistent with the code.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BA97FD0.7090202@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/ftrace.h | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index ea6f9d4a20e..75dd7787fb3 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -154,9 +154,11 @@
  *
  *	field = (typeof(field))entry;
  *
- *	p = get_cpu_var(ftrace_event_seq);
+ *	p = &get_cpu_var(ftrace_event_seq);
  *	trace_seq_init(p);
- *	ret = trace_seq_printf(s, <TP_printk> "\n");
+ *	ret = trace_seq_printf(s, "%s: ", <call>);
+ *	if (ret)
+ *		ret = trace_seq_printf(s, <TP_printk> "\n");
  *	put_cpu();
  *	if (!ret)
  *		return TRACE_TYPE_PARTIAL_LINE;
@@ -450,38 +452,38 @@ perf_trace_disable_##name(struct ftrace_event_call *unused)		\
  *
  * static void ftrace_raw_event_<call>(proto)
  * {
+ *	struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
  *	struct ring_buffer_event *event;
  *	struct ftrace_raw_<call> *entry; <-- defined in stage 1
  *	struct ring_buffer *buffer;
  *	unsigned long irq_flags;
+ *	int __data_size;
  *	int pc;
  *
  *	local_save_flags(irq_flags);
  *	pc = preempt_count();
  *
+ *	__data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
+ *
  *	event = trace_current_buffer_lock_reserve(&buffer,
  *				  event_<call>.id,
- *				  sizeof(struct ftrace_raw_<call>),
+ *				  sizeof(*entry) + __data_size,
  *				  irq_flags, pc);
  *	if (!event)
  *		return;
  *	entry	= ring_buffer_event_data(event);
  *
- *	<assign>;  <-- Here we assign the entries by the __field and
- *			__array macros.
+ *	{ <assign>; }  <-- Here we assign the entries by the __field and
+ *			   __array macros.
  *
- *	trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
+ *	if (!filter_current_check_discard(buffer, event_call, entry, event))
+ *		trace_current_buffer_unlock_commit(buffer,
+ *						   event, irq_flags, pc);
  * }
  *
  * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *unused)
  * {
- *	int ret;
- *
- *	ret = register_trace_<call>(ftrace_raw_event_<call>);
- *	if (!ret)
- *		pr_info("event trace: Could not activate trace point "
- *			"probe to <call>");
- *	return ret;
+ *	return register_trace_<call>(ftrace_raw_event_<call>);
  * }
  *
  * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
@@ -493,6 +495,8 @@ perf_trace_disable_##name(struct ftrace_event_call *unused)		\
  *	.trace			= ftrace_raw_output_<call>, <-- stage 2
  * };
  *
+ * static const char print_fmt_<call>[] = <TP_printk>;
+ *
  * static struct ftrace_event_call __used
  * __attribute__((__aligned__(4)))
  * __attribute__((section("_ftrace_events"))) event_<call> = {
@@ -501,6 +505,8 @@ perf_trace_disable_##name(struct ftrace_event_call *unused)		\
  *	.raw_init		= trace_event_raw_init,
  *	.regfunc		= ftrace_reg_event_<call>,
  *	.unregfunc		= ftrace_unreg_event_<call>,
+ *	.print_fmt		= print_fmt_<call>,
+ *	.define_fields		= ftrace_define_fields_<call>,
  * }
  *
  */
@@ -569,7 +575,6 @@ ftrace_raw_event_id_##call(struct ftrace_event_call *event_call,	\
 		return;							\
 	entry	= ring_buffer_event_data(event);			\
 									\
-									\
 	tstruct								\
 									\
 	{ assign; }							\
-- 
cgit v1.2.3


From ae832d1e03ac9bf09fb8a07fb37908ab40c7cd0e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 24 Mar 2010 10:57:43 +0800
Subject: tracing: Remove side effect from module tracepoints that caused a GPF

Remove the @refcnt argument, because it has side-effects, and arguments with
side-effects are not skipped by the jump over disabled instrumentation and are
executed even when the tracepoint is disabled.

This was also causing a GPF as found by Randy Dunlap:

Subject: 2.6.33 GP fault only when built with tracing
LKML-Reference: <4BA2B69D.3000309@oracle.com>

Note, the current 2.6.34-rc has a fix for the actual cause of the GPF,
but this fixes one of its triggers.

Tested-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BA97FA7.6040406@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/module.h        |  6 ++----
 include/trace/events/module.h | 14 +++++++-------
 kernel/module.c               |  3 +--
 3 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/include/linux/module.h b/include/linux/module.h
index 5e869ffd34a..393ec39b580 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -460,8 +460,7 @@ static inline void __module_get(struct module *module)
 	if (module) {
 		preempt_disable();
 		__this_cpu_inc(module->refptr->count);
-		trace_module_get(module, _THIS_IP_,
-				 __this_cpu_read(module->refptr->count));
+		trace_module_get(module, _THIS_IP_);
 		preempt_enable();
 	}
 }
@@ -475,8 +474,7 @@ static inline int try_module_get(struct module *module)
 
 		if (likely(module_is_live(module))) {
 			__this_cpu_inc(module->refptr->count);
-			trace_module_get(module, _THIS_IP_,
-				__this_cpu_read(module->refptr->count));
+			trace_module_get(module, _THIS_IP_);
 		}
 		else
 			ret = 0;
diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index 4b0f48ba16a..a585f8135bd 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -53,9 +53,9 @@ TRACE_EVENT(module_free,
 
 DECLARE_EVENT_CLASS(module_refcnt,
 
-	TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
+	TP_PROTO(struct module *mod, unsigned long ip),
 
-	TP_ARGS(mod, ip, refcnt),
+	TP_ARGS(mod, ip),
 
 	TP_STRUCT__entry(
 		__field(	unsigned long,	ip		)
@@ -65,7 +65,7 @@ DECLARE_EVENT_CLASS(module_refcnt,
 
 	TP_fast_assign(
 		__entry->ip	= ip;
-		__entry->refcnt	= refcnt;
+		__entry->refcnt	= __this_cpu_read(mod->refptr->count);
 		__assign_str(name, mod->name);
 	),
 
@@ -75,16 +75,16 @@ DECLARE_EVENT_CLASS(module_refcnt,
 
 DEFINE_EVENT(module_refcnt, module_get,
 
-	TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
+	TP_PROTO(struct module *mod, unsigned long ip),
 
-	TP_ARGS(mod, ip, refcnt)
+	TP_ARGS(mod, ip)
 );
 
 DEFINE_EVENT(module_refcnt, module_put,
 
-	TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
+	TP_PROTO(struct module *mod, unsigned long ip),
 
-	TP_ARGS(mod, ip, refcnt)
+	TP_ARGS(mod, ip)
 );
 
 TRACE_EVENT(module_request,
diff --git a/kernel/module.c b/kernel/module.c
index c968d3606dc..21591ad921f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -800,8 +800,7 @@ void module_put(struct module *module)
 		preempt_disable();
 		__this_cpu_dec(module->refptr->count);
 
-		trace_module_put(module, _RET_IP_,
-				 __this_cpu_read(module->refptr->count));
+		trace_module_put(module, _RET_IP_);
 		/* Maybe they're waiting for us to drop reference? */
 		if (unlikely(!module_is_live(module)))
 			wake_up_process(module->waiter);
-- 
cgit v1.2.3


From eb0c53771fb2f5f66b0edb3ebce33be4bbf1c285 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 29 Mar 2010 14:25:18 -0400
Subject: tracing: Fix compile error in module tracepoints when MODULE_UNLOAD
 not set

If modules are configured in the build but unloading of modules is not,
then the refcnt is not defined. Place the get/put module tracepoints
under CONFIG_MODULE_UNLOAD since it references this field in the module
structure.

As a side-effect, this patch also reduces the code when MODULE_UNLOAD
is not set, because these unused tracepoints are not created.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/module.h | 4 ++++
 kernel/module.c               | 5 +++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index a585f8135bd..f07b44a2b24 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -51,6 +51,9 @@ TRACE_EVENT(module_free,
 	TP_printk("%s", __get_str(name))
 );
 
+#ifdef CONFIG_MODULE_UNLOAD
+/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */
+
 DECLARE_EVENT_CLASS(module_refcnt,
 
 	TP_PROTO(struct module *mod, unsigned long ip),
@@ -86,6 +89,7 @@ DEFINE_EVENT(module_refcnt, module_put,
 
 	TP_ARGS(mod, ip)
 );
+#endif /* CONFIG_MODULE_UNLOAD */
 
 TRACE_EVENT(module_request,
 
diff --git a/kernel/module.c b/kernel/module.c
index 21591ad921f..d9e237926b6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -59,8 +59,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/module.h>
 
-EXPORT_TRACEPOINT_SYMBOL(module_get);
-
 #if 0
 #define DEBUGP printk
 #else
@@ -467,6 +465,9 @@ MODINFO_ATTR(srcversion);
 static char last_unloaded_module[MODULE_NAME_LEN+1];
 
 #ifdef CONFIG_MODULE_UNLOAD
+
+EXPORT_TRACEPOINT_SYMBOL(module_get);
+
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 {
-- 
cgit v1.2.3


From 66a8cb95ed04025664d1db4e952155ee1dccd048 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 31 Mar 2010 13:21:56 -0400
Subject: ring-buffer: Add place holder recording of dropped events

Currently, when the ring buffer drops events, it does not record
the fact that it did so. It does inform the writer that the event
was dropped by returning a NULL event, but it does not put in any
place holder where the event was dropped.

This is not a trivial thing to add because the ring buffer mostly
runs in overwrite (flight recorder) mode. That is, when the ring
buffer is full, new data will overwrite old data.

In a produce/consumer mode, where new data is simply dropped when
the ring buffer is full, it is trivial to add the placeholder
for dropped events. When there's more room to write new data, then
a special event can be added to notify the reader about the dropped
events.

But in overwrite mode, any new write can overwrite events. A place
holder can not be inserted into the ring buffer since there never
may be room. A reader could also come in at anytime and miss the
placeholder.

Luckily, the way the ring buffer works, the read side can find out
if events were lost or not, and how many events. Everytime a write
takes place, if it overwrites the header page (the next read) it
updates a "overrun" variable that keeps track of the number of
lost events. When a reader swaps out a page from the ring buffer,
it can record this number, perfom the swap, and then check to
see if the number changed, and take the diff if it has, which would be
the number of events dropped. This can be stored by the reader
and returned to callers of the reader.

Since the reader page swap will fail if the writer moved the head
page since the time the reader page set up the swap, this gives room
to record the overruns without worrying about races. If the reader
sets up the pages, records the overrun, than performs the swap,
if the swap succeeds, then the overrun variable has not been
updated since the setup before the swap.

For binary readers of the ring buffer, a flag is set in the header
of each sub page (sub buffer) of the ring buffer. This flag is embedded
in the size field of the data on the sub buffer, in the 31st bit (the size
can be 32 or 64 bits depending on the architecture), but only 27
bits needs to be used for the actual size (less actually).

We could add a new field in the sub buffer header to also record the
number of events dropped since the last read, but this will change the
format of the binary ring buffer a bit too much. Perhaps this change can
be made if the information on the number of events dropped is considered
important enough.

Note, the notification of dropped events is only used by consuming reads
or peeking at the ring buffer. Iterating over the ring buffer does not
keep this information because the necessary data is only available when
a page swap is made, and the iterator does not swap out pages.

Cc: Robert Richter <robert.richter@amd.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 drivers/oprofile/cpu_buffer.c        |  4 +-
 include/linux/ring_buffer.h          |  6 ++-
 kernel/trace/ring_buffer.c           | 72 +++++++++++++++++++++++++++++++++---
 kernel/trace/ring_buffer_benchmark.c |  2 +-
 kernel/trace/trace.c                 |  4 +-
 kernel/trace/trace_functions_graph.c |  5 ++-
 kernel/trace/trace_selftest.c        |  2 +-
 7 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index 166b67ea622..7581dbe456d 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -186,14 +186,14 @@ int op_cpu_buffer_write_commit(struct op_entry *entry)
 struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
 {
 	struct ring_buffer_event *e;
-	e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
+	e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL);
 	if (e)
 		goto event;
 	if (ring_buffer_swap_cpu(op_ring_buffer_read,
 				 op_ring_buffer_write,
 				 cpu))
 		return NULL;
-	e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
+	e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL, NULL);
 	if (e)
 		goto event;
 	return NULL;
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 5fcc31ed577..c8297761e41 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -120,9 +120,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
 		      unsigned long length, void *data);
 
 struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+		 unsigned long *lost_events);
 struct ring_buffer_event *
-ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
 ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d1187ef20ca..8295650444c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -318,6 +318,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 #define TS_MASK		((1ULL << TS_SHIFT) - 1)
 #define TS_DELTA_TEST	(~TS_MASK)
 
+/* Flag when events were overwritten */
+#define RB_MISSED_EVENTS	(1 << 31)
+
 struct buffer_data_page {
 	u64		 time_stamp;	/* page time stamp */
 	local_t		 commit;	/* write committed index */
@@ -416,6 +419,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
 			       (unsigned int)sizeof(field.commit),
 			       (unsigned int)is_signed_type(long));
 
+	ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
+			       "offset:%u;\tsize:%u;\tsigned:%u;\n",
+			       (unsigned int)offsetof(typeof(field), commit),
+			       1,
+			       (unsigned int)is_signed_type(long));
+
 	ret = trace_seq_printf(s, "\tfield: char data;\t"
 			       "offset:%u;\tsize:%u;\tsigned:%u;\n",
 			       (unsigned int)offsetof(typeof(field), data),
@@ -439,6 +448,8 @@ struct ring_buffer_per_cpu {
 	struct buffer_page		*tail_page;	/* write to tail */
 	struct buffer_page		*commit_page;	/* committed pages */
 	struct buffer_page		*reader_page;
+	unsigned long			lost_events;
+	unsigned long			last_overrun;
 	local_t				commit_overrun;
 	local_t				overrun;
 	local_t				entries;
@@ -2835,6 +2846,7 @@ static struct buffer_page *
 rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	struct buffer_page *reader = NULL;
+	unsigned long overwrite;
 	unsigned long flags;
 	int nr_loops = 0;
 	int ret;
@@ -2895,6 +2907,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	/* The reader page will be pointing to the new head */
 	rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
 
+	/*
+	 * We want to make sure we read the overruns after we set up our
+	 * pointers to the next object. The writer side does a
+	 * cmpxchg to cross pages which acts as the mb on the writer
+	 * side. Note, the reader will constantly fail the swap
+	 * while the writer is updating the pointers, so this
+	 * guarantees that the overwrite recorded here is the one we
+	 * want to compare with the last_overrun.
+	 */
+	smp_mb();
+	overwrite = local_read(&(cpu_buffer->overrun));
+
 	/*
 	 * Here's the tricky part.
 	 *
@@ -2926,6 +2950,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->reader_page = reader;
 	rb_reset_reader_page(cpu_buffer);
 
+	if (overwrite != cpu_buffer->last_overrun) {
+		cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
+		cpu_buffer->last_overrun = overwrite;
+	}
+
 	goto again;
 
  out:
@@ -3002,8 +3031,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
 		rb_advance_iter(iter);
 }
 
+static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	return cpu_buffer->lost_events;
+}
+
 static struct ring_buffer_event *
-rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
+rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
+	       unsigned long *lost_events)
 {
 	struct ring_buffer_event *event;
 	struct buffer_page *reader;
@@ -3055,6 +3090,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
 			ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
 							 cpu_buffer->cpu, ts);
 		}
+		if (lost_events)
+			*lost_events = rb_lost_events(cpu_buffer);
 		return event;
 
 	default:
@@ -3165,12 +3202,14 @@ static inline int rb_ok_to_lock(void)
  * @buffer: The ring buffer to read
  * @cpu: The cpu to peak at
  * @ts: The timestamp counter of this event.
+ * @lost_events: a variable to store if events were lost (may be NULL)
  *
  * This will return the event that will be read next, but does
  * not consume the data.
  */
 struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+		 unsigned long *lost_events)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	struct ring_buffer_event *event;
@@ -3185,7 +3224,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	local_irq_save(flags);
 	if (dolock)
 		spin_lock(&cpu_buffer->reader_lock);
-	event = rb_buffer_peek(cpu_buffer, ts);
+	event = rb_buffer_peek(cpu_buffer, ts, lost_events);
 	if (event && event->type_len == RINGBUF_TYPE_PADDING)
 		rb_advance_reader(cpu_buffer);
 	if (dolock)
@@ -3227,13 +3266,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 /**
  * ring_buffer_consume - return an event and consume it
  * @buffer: The ring buffer to get the next event from
+ * @cpu: the cpu to read the buffer from
+ * @ts: a variable to store the timestamp (may be NULL)
+ * @lost_events: a variable to store if events were lost (may be NULL)
  *
  * Returns the next event in the ring buffer, and that event is consumed.
  * Meaning, that sequential reads will keep returning a different event,
  * and eventually empty the ring buffer if the producer is slower.
  */
 struct ring_buffer_event *
-ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+		    unsigned long *lost_events)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event = NULL;
@@ -3254,9 +3297,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 	if (dolock)
 		spin_lock(&cpu_buffer->reader_lock);
 
-	event = rb_buffer_peek(cpu_buffer, ts);
-	if (event)
+	event = rb_buffer_peek(cpu_buffer, ts, lost_events);
+	if (event) {
+		cpu_buffer->lost_events = 0;
 		rb_advance_reader(cpu_buffer);
+	}
 
 	if (dolock)
 		spin_unlock(&cpu_buffer->reader_lock);
@@ -3405,6 +3450,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->write_stamp = 0;
 	cpu_buffer->read_stamp = 0;
 
+	cpu_buffer->lost_events = 0;
+	cpu_buffer->last_overrun = 0;
+
 	rb_head_page_activate(cpu_buffer);
 }
 
@@ -3684,6 +3732,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	unsigned int commit;
 	unsigned int read;
 	u64 save_timestamp;
+	int missed_events = 0;
 	int ret = -1;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -3716,6 +3765,10 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	read = reader->read;
 	commit = rb_page_commit(reader);
 
+	/* Check if any events were dropped */
+	if (cpu_buffer->lost_events)
+		missed_events = 1;
+
 	/*
 	 * If this page has been partially read or
 	 * if len is not big enough to read the rest of the page or
@@ -3779,6 +3832,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	}
 	ret = read;
 
+	cpu_buffer->lost_events = 0;
+	/*
+	 * Set a flag in the commit field if we lost events
+	 */
+	if (missed_events)
+		local_add(RB_MISSED_EVENTS, &bpage->commit);
+
  out_unlock:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index df74c798225..dc56556b55a 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -81,7 +81,7 @@ static enum event_status read_event(int cpu)
 	int *entry;
 	u64 ts;
 
-	event = ring_buffer_consume(buffer, cpu, &ts);
+	event = ring_buffer_consume(buffer, cpu, &ts, NULL);
 	if (!event)
 		return EVENT_DROPPED;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3ec2ee6f656..fabb0033a9b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1556,7 +1556,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
 	if (buf_iter)
 		event = ring_buffer_iter_peek(buf_iter, ts);
 	else
-		event = ring_buffer_peek(iter->tr->buffer, cpu, ts);
+		event = ring_buffer_peek(iter->tr->buffer, cpu, ts, NULL);
 
 	ftrace_enable_cpu();
 
@@ -1635,7 +1635,7 @@ static void trace_consume(struct trace_iterator *iter)
 {
 	/* Don't allow ftrace to trace into the ring buffers */
 	ftrace_disable_cpu();
-	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts);
+	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, NULL);
 	ftrace_enable_cpu();
 }
 
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index e6989d9b44d..a7f75fb10aa 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -489,9 +489,10 @@ get_return_for_leaf(struct trace_iterator *iter,
 			 * We need to consume the current entry to see
 			 * the next one.
 			 */
-			ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+			ring_buffer_consume(iter->tr->buffer, iter->cpu,
+					    NULL, NULL);
 			event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
-						 NULL);
+						 NULL, NULL);
 		}
 
 		if (!event)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 280fea470d6..e50180874c6 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -29,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
 	struct trace_entry *entry;
 	unsigned int loops = 0;
 
-	while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
+	while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
 		entry = ring_buffer_event_data(event);
 
 		/*
-- 
cgit v1.2.3


From bc21b478425ac73f66a5ec0b375a5e0d12d609ce Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 31 Mar 2010 19:49:26 -0400
Subject: tracing: Show the lost events in the trace_pipe output

Now that the ring buffer can keep track of where events are lost.
Use this information to the output of trace_pipe:

       hackbench-3588  [001]  1326.701660: lock_acquire: ffffffff816591e0 read rcu_read_lock
       hackbench-3588  [001]  1326.701661: lock_acquire: ffff88003f4091f0 &(&dentry->d_lock)->rlock
       hackbench-3588  [001]  1326.701664: lock_release: ffff88003f4091f0 &(&dentry->d_lock)->rlock
CPU:1 [LOST 673 EVENTS]
       hackbench-3588  [001]  1326.702711: kmem_cache_free: call_site=ffffffff81102b85 ptr=ffff880026d96738
       hackbench-3588  [001]  1326.702712: lock_release: ffff88003e1480a8 &mm->mmap_sem
       hackbench-3588  [001]  1326.702713: lock_acquire: ffff88003e1480a8 &mm->mmap_sem

Even works with the function graph tracer:

 2) ! 170.098 us  |                                            }
 2)   4.036 us    |                                            rcu_irq_exit();
 2)   3.657 us    |                                            idle_cpu();
 2) ! 190.301 us  |                                          }
CPU:2 [LOST 2196 EVENTS]
 2)   0.853 us    |                            } /* cancel_dirty_page */
 2)               |                            remove_from_page_cache() {
 2)   1.578 us    |                              _raw_spin_lock_irq();
 2)               |                              __remove_from_page_cache() {

Note, it does not work with the iterator "trace" file, since it requires
the use of consuming the page from the ring buffer to determine how many
events were lost, which the iterator does not do.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h |  1 +
 kernel/trace/trace.c         | 30 ++++++++++++++++++++++--------
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index c0f4b364c71..39e71b0a3bf 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -58,6 +58,7 @@ struct trace_iterator {
 	/* The below is zeroed out in pipe_read */
 	struct trace_seq	seq;
 	struct trace_entry	*ent;
+	unsigned long		lost_events;
 	int			leftover;
 	int			cpu;
 	u64			ts;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fabb0033a9b..0498bebcbfd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1545,7 +1545,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
 }
 
 static struct trace_entry *
-peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
+peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
+		unsigned long *lost_events)
 {
 	struct ring_buffer_event *event;
 	struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1556,7 +1557,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
 	if (buf_iter)
 		event = ring_buffer_iter_peek(buf_iter, ts);
 	else
-		event = ring_buffer_peek(iter->tr->buffer, cpu, ts, NULL);
+		event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
+					 lost_events);
 
 	ftrace_enable_cpu();
 
@@ -1564,10 +1566,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
 }
 
 static struct trace_entry *
-__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
+__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
+		  unsigned long *missing_events, u64 *ent_ts)
 {
 	struct ring_buffer *buffer = iter->tr->buffer;
 	struct trace_entry *ent, *next = NULL;
+	unsigned long lost_events, next_lost = 0;
 	int cpu_file = iter->cpu_file;
 	u64 next_ts = 0, ts;
 	int next_cpu = -1;
@@ -1580,7 +1584,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 	if (cpu_file > TRACE_PIPE_ALL_CPU) {
 		if (ring_buffer_empty_cpu(buffer, cpu_file))
 			return NULL;
-		ent = peek_next_entry(iter, cpu_file, ent_ts);
+		ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
 		if (ent_cpu)
 			*ent_cpu = cpu_file;
 
@@ -1592,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 		if (ring_buffer_empty_cpu(buffer, cpu))
 			continue;
 
-		ent = peek_next_entry(iter, cpu, &ts);
+		ent = peek_next_entry(iter, cpu, &ts, &lost_events);
 
 		/*
 		 * Pick the entry with the smallest timestamp:
@@ -1601,6 +1605,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 			next = ent;
 			next_cpu = cpu;
 			next_ts = ts;
+			next_lost = lost_events;
 		}
 	}
 
@@ -1610,6 +1615,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 	if (ent_ts)
 		*ent_ts = next_ts;
 
+	if (missing_events)
+		*missing_events = next_lost;
+
 	return next;
 }
 
@@ -1617,13 +1625,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 					  int *ent_cpu, u64 *ent_ts)
 {
-	return __find_next_entry(iter, ent_cpu, ent_ts);
+	return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
 }
 
 /* Find the next real entry, and increment the iterator to the next entry */
 static void *find_next_entry_inc(struct trace_iterator *iter)
 {
-	iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
+	iter->ent = __find_next_entry(iter, &iter->cpu,
+				      &iter->lost_events, &iter->ts);
 
 	if (iter->ent)
 		trace_iterator_increment(iter);
@@ -1635,7 +1644,8 @@ static void trace_consume(struct trace_iterator *iter)
 {
 	/* Don't allow ftrace to trace into the ring buffers */
 	ftrace_disable_cpu();
-	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, NULL);
+	ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
+			    &iter->lost_events);
 	ftrace_enable_cpu();
 }
 
@@ -2030,6 +2040,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 {
 	enum print_line_t ret;
 
+	if (iter->lost_events)
+		trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+				 iter->cpu, iter->lost_events);
+
 	if (iter->trace && iter->trace->print_line) {
 		ret = iter->trace->print_line(iter);
 		if (ret != TRACE_TYPE_UNHANDLED)
-- 
cgit v1.2.3


From ff0ff84a0767df48d728c36510365344a7e7d582 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 31 Mar 2010 22:11:42 -0400
Subject: ring-buffer: Add lost event count to end of sub buffer

Currently, binary readers of the ring buffer only know where events were
lost, but not how many events were lost at that location.
This information is available, but it would require adding another
field to the sub buffer header to include it.

But when a event can not fit at the end of a sub buffer, it is written
to the next sub buffer. This means there is a good chance that the
buffer may have room to hold this counter. If it does, write
the counter at the end of the sub buffer and set another flag
in the data size field that states that this information exists.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8295650444c..dc6d563a6d2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -320,6 +320,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 
 /* Flag when events were overwritten */
 #define RB_MISSED_EVENTS	(1 << 31)
+/* Missed count stored at end */
+#define RB_MISSED_STORED	(1 << 30)
 
 struct buffer_data_page {
 	u64		 time_stamp;	/* page time stamp */
@@ -340,6 +342,7 @@ struct buffer_page {
 	local_t		 write;		/* index for next write */
 	unsigned	 read;		/* index for next read */
 	local_t		 entries;	/* entries on this page */
+	unsigned long	 real_end;	/* real end of data */
 	struct buffer_data_page *page;	/* Actual data page */
 };
 
@@ -1769,6 +1772,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	event = __rb_page_index(tail_page, tail);
 	kmemcheck_annotate_bitfield(event, bitfield);
 
+	/*
+	 * Save the original length to the meta data.
+	 * This will be used by the reader to add lost event
+	 * counter.
+	 */
+	tail_page->real_end = tail;
+
 	/*
 	 * If this event is bigger than the minimum size, then
 	 * we need to be careful that we don't subtract the
@@ -2888,6 +2898,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	local_set(&cpu_buffer->reader_page->write, 0);
 	local_set(&cpu_buffer->reader_page->entries, 0);
 	local_set(&cpu_buffer->reader_page->page->commit, 0);
+	cpu_buffer->reader_page->real_end = 0;
 
  spin:
 	/*
@@ -3728,11 +3739,11 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	struct ring_buffer_event *event;
 	struct buffer_data_page *bpage;
 	struct buffer_page *reader;
+	unsigned long missed_events;
 	unsigned long flags;
 	unsigned int commit;
 	unsigned int read;
 	u64 save_timestamp;
-	int missed_events = 0;
 	int ret = -1;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -3766,8 +3777,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	commit = rb_page_commit(reader);
 
 	/* Check if any events were dropped */
-	if (cpu_buffer->lost_events)
-		missed_events = 1;
+	missed_events = cpu_buffer->lost_events;
 
 	/*
 	 * If this page has been partially read or
@@ -3829,6 +3839,14 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		local_set(&reader->entries, 0);
 		reader->read = 0;
 		*data_page = bpage;
+
+		/*
+		 * Use the real_end for the data size,
+		 * This gives us a chance to store the lost events
+		 * on the page.
+		 */
+		if (reader->real_end)
+			local_set(&bpage->commit, reader->real_end);
 	}
 	ret = read;
 
@@ -3836,8 +3854,19 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	/*
 	 * Set a flag in the commit field if we lost events
 	 */
-	if (missed_events)
+	if (missed_events) {
+		commit = local_read(&bpage->commit);
+
+		/* If there is room at the end of the page to save the
+		 * missed events, then record it there.
+		 */
+		if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+			memcpy(&bpage->data[commit], &missed_events,
+			       sizeof(missed_events));
+			local_add(RB_MISSED_STORED, &bpage->commit);
+		}
 		local_add(RB_MISSED_EVENTS, &bpage->commit);
+	}
 
  out_unlock:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-- 
cgit v1.2.3


From 085ea739adf107b5a5d131f3625e517ff4a5181e Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Fri, 2 Apr 2010 12:50:39 -0400
Subject: perf probe: Fix --line syntax help and document

Just fix typos. --line option accepts ':START-END' syntax,
not ':START:END'.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
LKML-Reference: <20100402165038.23551.62590.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Documentation/perf-probe.txt | 2 +-
 tools/perf/builtin-probe.c              | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 0f944b3be9e..bb671b34677 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -85,7 +85,7 @@ LINE SYNTAX
 -----------
 Line range is descripted by following syntax.
 
- "FUNC[:RLN[+NUM|:RLN2]]|SRC:ALN[+NUM|:ALN2]"
+ "FUNC[:RLN[+NUM|-RLN2]]|SRC:ALN[+NUM|-ALN2]"
 
 FUNC specifies the function name of showing lines. 'RLN' is the start line
 number from function entry line, and 'RLN2' is the end line number. As same as
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index cf2ffa5a384..b3ba25a910f 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -167,7 +167,7 @@ static const struct option options[] = {
 		    " with existing name"),
 #ifdef DWARF_SUPPORT
 	OPT_CALLBACK('L', "line", NULL,
-		     "FUNC[:RLN[+NUM|:RLN2]]|SRC:ALN[+NUM|:ALN2]",
+		     "FUNC[:RLN[+NUM|-RLN2]]|SRC:ALN[+NUM|-ALN2]",
 		     "Show source code lines.", opt_show_lines),
 	OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
 		   "file", "vmlinux pathname"),
-- 
cgit v1.2.3


From c9e385826d4f1ca5a72005ab8503598f791a8dc0 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Fri, 2 Apr 2010 12:50:45 -0400
Subject: perf probe: Fix not to return non-matched file

Fix cu_find_realpath() not to return the last file path
if that is not matched to input pattern.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
LKML-Reference: <20100402165045.23551.47780.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/probe-finder.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index db52ec2e84d..b44132ead95 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -183,6 +183,8 @@ static const char *cu_find_realpath(Dwarf_Die *cu_die, const char *fname)
 		if (strtailcmp(src, fname) == 0)
 			break;
 	}
+	if (i == nfiles)
+		return NULL;
 	return src;
 }
 
-- 
cgit v1.2.3


From 12e5a7ae475ccb2733d740ffb95d9ca0a18392da Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Fri, 2 Apr 2010 12:50:53 -0400
Subject: perf probe: Correct error message for non-structure type

perf probe outputs incorrect error message when it is called with
non-existent field on a non-data structure local variable.

<Before>
 # perf probe vfs_read 'count.hoge'
  Fatal: Structure on a register is not supported yet.
 # perf probe vfs_read 'count->hoge'
  Fatal: Semantic error: hoge must be referred by '.'

This corrects the messsage.

<After>
 # perf probe vfs_read 'count.hoge'
  Fatal: count is not a data structure.
 # perf probe vfs_read 'count->hoge'
  Fatal: count is not a data structure.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
LKML-Reference: <20100402165052.23551.75866.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/probe-finder.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index b44132ead95..59b0115de30 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -429,12 +429,20 @@ static void convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 		if (die_get_real_type(&type, &type) == NULL)
 			die("Failed to get a type information of %s.", varname);
 
+		/* Verify it is a data structure  */
+		if (dwarf_tag(&type) != DW_TAG_structure_type)
+			die("%s is not a data structure.", varname);
+
 		ref = xzalloc(sizeof(struct kprobe_trace_arg_ref));
 		if (*ref_ptr)
 			(*ref_ptr)->next = ref;
 		else
 			*ref_ptr = ref;
 	} else {
+		/* Verify it is a data structure  */
+		if (dwarf_tag(&type) != DW_TAG_structure_type)
+			die("%s is not a data structure.", varname);
+
 		if (field->ref)
 			die("Semantic error: %s must be referred by '.'",
 			    field->name);
@@ -442,10 +450,6 @@ static void convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 			die("Structure on a register is not supported yet.");
 	}
 
-	/* Verify it is a data structure  */
-	if (dwarf_tag(&type) != DW_TAG_structure_type)
-		die("%s is not a data structure.", varname);
-
 	if (die_find_member(&type, field->name, &member) == NULL)
 		die("%s(tyep:%s) has no member %s.", varname,
 		    dwarf_diename(&type), field->name);
-- 
cgit v1.2.3


From 75ec5a245c7763c397f31ec8964d0a46c54a7386 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Fri, 2 Apr 2010 12:50:59 -0400
Subject: perf probe: Fix to close dwarf when failing to analyze it

Fix to close libdw routine when failing to analyze it in
find_perf_probe_point().

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
LKML-Reference: <20100402165059.23551.95587.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/probe-finder.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 59b0115de30..a8513772df0 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -818,8 +818,10 @@ int find_perf_probe_point(int fd, unsigned long addr,
 		return -ENOENT;
 
 	/* Find cu die */
-	if (!dwarf_addrdie(dbg, (Dwarf_Addr)addr, &cudie))
-		return -EINVAL;
+	if (!dwarf_addrdie(dbg, (Dwarf_Addr)addr, &cudie)) {
+		ret = -EINVAL;
+		goto end;
+	}
 
 	/* Find a corresponding line */
 	line = dwarf_getsrc_die(&cudie, (Dwarf_Addr)addr);
-- 
cgit v1.2.3


From 948b1bb89a44561560531394c18da4a99215f772 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 29 Mar 2010 18:36:50 +0200
Subject: perf, x86: Undo some some *_counter* -> *_event* renames

The big rename:

 cdd6c48 perf: Do the big rename: Performance Counters -> Performance Events

accidentally renamed some members of stucts that were named after
registers in the spec. To avoid confusion this patch reverts some
changes. The related specs are MSR descriptions in AMD's BKDGs and the
ARCHITECTURAL PERFORMANCE MONITORING section in the Intel 64 and IA-32
Architectures Software Developer's Manuals.

This patch does:

 $ sed -i -e 's:num_events:num_counters:g' \
   arch/x86/include/asm/perf_event.h \
   arch/x86/kernel/cpu/perf_event_amd.c \
   arch/x86/kernel/cpu/perf_event.c \
   arch/x86/kernel/cpu/perf_event_intel.c \
   arch/x86/kernel/cpu/perf_event_p6.c \
   arch/x86/kernel/cpu/perf_event_p4.c \
   arch/x86/oprofile/op_model_ppro.c

 $ sed -i -e 's:event_bits:cntval_bits:g' -e 's:event_mask:cntval_mask:g' \
   arch/x86/kernel/cpu/perf_event_amd.c \
   arch/x86/kernel/cpu/perf_event.c \
   arch/x86/kernel/cpu/perf_event_intel.c \
   arch/x86/kernel/cpu/perf_event_p6.c \
   arch/x86/kernel/cpu/perf_event_p4.c

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1269880612-25800-2-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h      |  4 +-
 arch/x86/kernel/cpu/perf_event.c       | 74 +++++++++++++++++-----------------
 arch/x86/kernel/cpu/perf_event_amd.c   | 12 +++---
 arch/x86/kernel/cpu/perf_event_intel.c | 16 ++++----
 arch/x86/kernel/cpu/perf_event_p4.c    | 14 +++----
 arch/x86/kernel/cpu/perf_event_p6.c    |  6 +--
 arch/x86/oprofile/op_model_ppro.c      |  4 +-
 7 files changed, 65 insertions(+), 65 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 124dddd598f..987bf673141 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -67,7 +67,7 @@
 union cpuid10_eax {
 	struct {
 		unsigned int version_id:8;
-		unsigned int num_events:8;
+		unsigned int num_counters:8;
 		unsigned int bit_width:8;
 		unsigned int mask_length:8;
 	} split;
@@ -76,7 +76,7 @@ union cpuid10_eax {
 
 union cpuid10_edx {
 	struct {
-		unsigned int num_events_fixed:4;
+		unsigned int num_counters_fixed:4;
 		unsigned int reserved:28;
 	} split;
 	unsigned int full;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b5343566181..9daaa1ef504 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -195,10 +195,10 @@ struct x86_pmu {
 	u64		(*event_map)(int);
 	u64		(*raw_event)(u64);
 	int		max_events;
-	int		num_events;
-	int		num_events_fixed;
-	int		event_bits;
-	u64		event_mask;
+	int		num_counters;
+	int		num_counters_fixed;
+	int		cntval_bits;
+	u64		cntval_mask;
 	int		apic;
 	u64		max_period;
 	struct event_constraint *
@@ -268,7 +268,7 @@ static u64
 x86_perf_event_update(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	int shift = 64 - x86_pmu.event_bits;
+	int shift = 64 - x86_pmu.cntval_bits;
 	u64 prev_raw_count, new_raw_count;
 	int idx = hwc->idx;
 	s64 delta;
@@ -320,12 +320,12 @@ static bool reserve_pmc_hardware(void)
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		disable_lapic_nmi_watchdog();
 
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
 			goto perfctr_fail;
 	}
 
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
@@ -336,7 +336,7 @@ eventsel_fail:
 	for (i--; i >= 0; i--)
 		release_evntsel_nmi(x86_pmu.eventsel + i);
 
-	i = x86_pmu.num_events;
+	i = x86_pmu.num_counters;
 
 perfctr_fail:
 	for (i--; i >= 0; i--)
@@ -352,7 +352,7 @@ static void release_pmc_hardware(void)
 {
 	int i;
 
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		release_perfctr_nmi(x86_pmu.perfctr + i);
 		release_evntsel_nmi(x86_pmu.eventsel + i);
 	}
@@ -547,7 +547,7 @@ static void x86_pmu_disable_all(void)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
 		if (!test_bit(idx, cpuc->active_mask))
@@ -582,7 +582,7 @@ static void x86_pmu_enable_all(int added)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		struct perf_event *event = cpuc->events[idx];
 		u64 val;
 
@@ -657,14 +657,14 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	 * assign events to counters starting with most
 	 * constrained events.
 	 */
-	wmax = x86_pmu.num_events;
+	wmax = x86_pmu.num_counters;
 
 	/*
 	 * when fixed event counters are present,
 	 * wmax is incremented by 1 to account
 	 * for one more choice
 	 */
-	if (x86_pmu.num_events_fixed)
+	if (x86_pmu.num_counters_fixed)
 		wmax++;
 
 	for (w = 1, num = n; num && w <= wmax; w++) {
@@ -714,7 +714,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
 	struct perf_event *event;
 	int n, max_count;
 
-	max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
+	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
 
 	/* current number of events already accepted */
 	n = cpuc->n_events;
@@ -904,7 +904,7 @@ x86_perf_event_set_period(struct perf_event *event)
 	atomic64_set(&hwc->prev_count, (u64)-left);
 
 	wrmsrl(hwc->event_base + idx,
-			(u64)(-left) & x86_pmu.event_mask);
+			(u64)(-left) & x86_pmu.cntval_mask);
 
 	perf_event_update_userpage(event);
 
@@ -987,7 +987,7 @@ void perf_event_print_debug(void)
 	unsigned long flags;
 	int cpu, idx;
 
-	if (!x86_pmu.num_events)
+	if (!x86_pmu.num_counters)
 		return;
 
 	local_irq_save(flags);
@@ -1011,7 +1011,7 @@ void perf_event_print_debug(void)
 	}
 	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
 		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
 
@@ -1024,7 +1024,7 @@ void perf_event_print_debug(void)
 		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
 			cpu, idx, prev_left);
 	}
-	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
 
 		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1089,7 +1089,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 
@@ -1097,7 +1097,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 		hwc = &event->hw;
 
 		val = x86_perf_event_update(event);
-		if (val & (1ULL << (x86_pmu.event_bits - 1)))
+		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
 			continue;
 
 		/*
@@ -1401,46 +1401,46 @@ void __init init_hw_perf_events(void)
 	if (x86_pmu.quirks)
 		x86_pmu.quirks();
 
-	if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
+	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
 		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
-		     x86_pmu.num_events, X86_PMC_MAX_GENERIC);
-		x86_pmu.num_events = X86_PMC_MAX_GENERIC;
+		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
+		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
 	}
-	x86_pmu.intel_ctrl = (1 << x86_pmu.num_events) - 1;
-	perf_max_events = x86_pmu.num_events;
+	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+	perf_max_events = x86_pmu.num_counters;
 
-	if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
+	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
 		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
-		     x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
-		x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
+		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
+		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
 	}
 
 	x86_pmu.intel_ctrl |=
-		((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
+		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
 	perf_events_lapic_init();
 	register_die_notifier(&perf_event_nmi_notifier);
 
 	unconstrained = (struct event_constraint)
-		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
-				   0, x86_pmu.num_events);
+		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
+				   0, x86_pmu.num_counters);
 
 	if (x86_pmu.event_constraints) {
 		for_each_event_constraint(c, x86_pmu.event_constraints) {
 			if (c->cmask != INTEL_ARCH_FIXED_MASK)
 				continue;
 
-			c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
-			c->weight += x86_pmu.num_events;
+			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+			c->weight += x86_pmu.num_counters;
 		}
 	}
 
 	pr_info("... version:                %d\n",     x86_pmu.version);
-	pr_info("... bit width:              %d\n",     x86_pmu.event_bits);
-	pr_info("... generic registers:      %d\n",     x86_pmu.num_events);
-	pr_info("... value mask:             %016Lx\n", x86_pmu.event_mask);
+	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
+	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
+	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
 	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
-	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed);
+	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
 	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);
 
 	perf_cpu_notifier(x86_pmu_notifier);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 285623bc3cc..7753a5c7653 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -165,7 +165,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
 	 * be removed on one CPU at a time AND PMU is disabled
 	 * when we come here
 	 */
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (nb->owners[i] == event) {
 			cmpxchg(nb->owners+i, event, NULL);
 			break;
@@ -215,7 +215,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	struct amd_nb *nb = cpuc->amd_nb;
 	struct perf_event *old = NULL;
-	int max = x86_pmu.num_events;
+	int max = x86_pmu.num_counters;
 	int i, j, k = -1;
 
 	/*
@@ -293,7 +293,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
 	/*
 	 * initialize all possible NB constraints
 	 */
-	for (i = 0; i < x86_pmu.num_events; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		__set_bit(i, nb->event_constraints[i].idxmsk);
 		nb->event_constraints[i].weight = 1;
 	}
@@ -385,9 +385,9 @@ static __initconst struct x86_pmu amd_pmu = {
 	.event_map		= amd_pmu_event_map,
 	.raw_event		= amd_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
-	.num_events		= 4,
-	.event_bits		= 48,
-	.event_mask		= (1ULL << 48) - 1,
+	.num_counters		= 4,
+	.cntval_bits		= 48,
+	.cntval_mask		= (1ULL << 48) - 1,
 	.apic			= 1,
 	/* use highest bit to detect overflow */
 	.max_period		= (1ULL << 47) - 1,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 676aac27aca..cc4d90a13d5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -653,20 +653,20 @@ static void intel_pmu_reset(void)
 	unsigned long flags;
 	int idx;
 
-	if (!x86_pmu.num_events)
+	if (!x86_pmu.num_counters)
 		return;
 
 	local_irq_save(flags);
 
 	printk("clearing PMU state on CPU#%d\n", smp_processor_id());
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
 		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
 	}
-	for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
 		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
-	}
+
 	if (ds)
 		ds->bts_index = ds->bts_buffer_base;
 
@@ -901,16 +901,16 @@ static __init int intel_pmu_init(void)
 		x86_pmu = intel_pmu;
 
 	x86_pmu.version			= version;
-	x86_pmu.num_events		= eax.split.num_events;
-	x86_pmu.event_bits		= eax.split.bit_width;
-	x86_pmu.event_mask		= (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.cntval_bits		= eax.split.bit_width;
+	x86_pmu.cntval_mask		= (1ULL << eax.split.bit_width) - 1;
 
 	/*
 	 * Quirk: v2 perfmon does not report fixed-purpose events, so
 	 * assume at least 3 events:
 	 */
 	if (version > 1)
-		x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
+		x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
 
 	/*
 	 * v2 and above have a perf capabilities MSR
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 0d1be36cbe9..4139100404e 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -483,7 +483,7 @@ static void p4_pmu_disable_all(void)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		struct perf_event *event = cpuc->events[idx];
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
@@ -540,7 +540,7 @@ static void p4_pmu_enable_all(int added)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int idx;
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		struct perf_event *event = cpuc->events[idx];
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
@@ -562,7 +562,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
-	for (idx = 0; idx < x86_pmu.num_events; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
@@ -579,7 +579,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 		p4_pmu_clear_cccr_ovf(hwc);
 
 		val = x86_perf_event_update(event);
-		if (val & (1ULL << (x86_pmu.event_bits - 1)))
+		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
 			continue;
 
 		/*
@@ -794,10 +794,10 @@ static __initconst struct x86_pmu p4_pmu = {
 	 * though leave it restricted at moment assuming
 	 * HT is on
 	 */
-	.num_events		= ARCH_P4_MAX_CCCR,
+	.num_counters		= ARCH_P4_MAX_CCCR,
 	.apic			= 1,
-	.event_bits		= 40,
-	.event_mask		= (1ULL << 40) - 1,
+	.cntval_bits		= 40,
+	.cntval_mask		= (1ULL << 40) - 1,
 	.max_period		= (1ULL << 39) - 1,
 	.hw_config		= p4_hw_config,
 	.schedule_events	= p4_pmu_schedule_events,
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 877182c850d..b26fbc7eb93 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -119,7 +119,7 @@ static __initconst struct x86_pmu p6_pmu = {
 	.apic			= 1,
 	.max_period		= (1ULL << 31) - 1,
 	.version		= 0,
-	.num_events		= 2,
+	.num_counters		= 2,
 	/*
 	 * Events have 40 bits implemented. However they are designed such
 	 * that bits [32-39] are sign extensions of bit 31. As such the
@@ -127,8 +127,8 @@ static __initconst struct x86_pmu p6_pmu = {
 	 *
 	 * See IA-32 Intel Architecture Software developer manual Vol 3B
 	 */
-	.event_bits		= 32,
-	.event_mask		= (1ULL << 32) - 1,
+	.cntval_bits		= 32,
+	.cntval_mask		= (1ULL << 32) - 1,
 	.get_event_constraints	= x86_get_event_constraints,
 	.event_constraints	= p6_event_constraints,
 };
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 2bf90fafa7b..c8abc4d1bf3 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -239,11 +239,11 @@ static void arch_perfmon_setup_counters(void)
 	if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
 		current_cpu_data.x86_model == 15) {
 		eax.split.version_id = 2;
-		eax.split.num_events = 2;
+		eax.split.num_counters = 2;
 		eax.split.bit_width = 40;
 	}
 
-	num_counters = eax.split.num_events;
+	num_counters = eax.split.num_counters;
 
 	op_arch_perfmon_spec.num_counters = num_counters;
 	op_arch_perfmon_spec.num_controls = num_counters;
-- 
cgit v1.2.3


From a098f4484bc7dae23f5b62360954007b99b64600 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 30 Mar 2010 11:28:21 +0200
Subject: perf, x86: implement ARCH_PERFMON_EVENTSEL bit masks

ARCH_PERFMON_EVENTSEL bit masks are often used in the kernel. This
patch adds macros for the bit masks and removes local defines. The
function intel_pmu_raw_event() becomes x86_pmu_raw_event() which is
generic for x86 models and same also for p6. Duplicate code is
removed.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100330092821.GH11907@erda.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event.h      | 58 +++++++++++++++-------------------
 arch/x86/kernel/cpu/perf_event.c       | 19 +++++++++--
 arch/x86/kernel/cpu/perf_event_amd.c   | 15 +--------
 arch/x86/kernel/cpu/perf_event_intel.c | 22 ++-----------
 arch/x86/kernel/cpu/perf_event_p6.c    | 20 +-----------
 5 files changed, 45 insertions(+), 89 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 987bf673141..f6d43dbfd8e 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -18,39 +18,31 @@
 #define MSR_ARCH_PERFMON_EVENTSEL0			     0x186
 #define MSR_ARCH_PERFMON_EVENTSEL1			     0x187
 
-#define ARCH_PERFMON_EVENTSEL_ENABLE			  (1 << 22)
-#define ARCH_PERFMON_EVENTSEL_ANY			  (1 << 21)
-#define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
-
-/*
- * Includes eventsel and unit mask as well:
- */
-
-
-#define INTEL_ARCH_EVTSEL_MASK		0x000000FFULL
-#define INTEL_ARCH_UNIT_MASK		0x0000FF00ULL
-#define INTEL_ARCH_EDGE_MASK		0x00040000ULL
-#define INTEL_ARCH_INV_MASK		0x00800000ULL
-#define INTEL_ARCH_CNT_MASK		0xFF000000ULL
-#define INTEL_ARCH_EVENT_MASK	(INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK)
-
-/*
- * filter mask to validate fixed counter events.
- * the following filters disqualify for fixed counters:
- *  - inv
- *  - edge
- *  - cnt-mask
- *  The other filters are supported by fixed counters.
- *  The any-thread option is supported starting with v3.
- */
-#define INTEL_ARCH_FIXED_MASK \
-	(INTEL_ARCH_CNT_MASK| \
-	 INTEL_ARCH_INV_MASK| \
-	 INTEL_ARCH_EDGE_MASK|\
-	 INTEL_ARCH_UNIT_MASK|\
-	 INTEL_ARCH_EVENT_MASK)
+#define ARCH_PERFMON_EVENTSEL_EVENT			0x000000FFULL
+#define ARCH_PERFMON_EVENTSEL_UMASK			0x0000FF00ULL
+#define ARCH_PERFMON_EVENTSEL_USR			(1ULL << 16)
+#define ARCH_PERFMON_EVENTSEL_OS			(1ULL << 17)
+#define ARCH_PERFMON_EVENTSEL_EDGE			(1ULL << 18)
+#define ARCH_PERFMON_EVENTSEL_INT			(1ULL << 20)
+#define ARCH_PERFMON_EVENTSEL_ANY			(1ULL << 21)
+#define ARCH_PERFMON_EVENTSEL_ENABLE			(1ULL << 22)
+#define ARCH_PERFMON_EVENTSEL_INV			(1ULL << 23)
+#define ARCH_PERFMON_EVENTSEL_CMASK			0xFF000000ULL
+
+#define AMD64_EVENTSEL_EVENT	\
+	(ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32))
+#define INTEL_ARCH_EVENT_MASK	\
+	(ARCH_PERFMON_EVENTSEL_UMASK | ARCH_PERFMON_EVENTSEL_EVENT)
+
+#define X86_RAW_EVENT_MASK		\
+	(ARCH_PERFMON_EVENTSEL_EVENT |	\
+	 ARCH_PERFMON_EVENTSEL_UMASK |	\
+	 ARCH_PERFMON_EVENTSEL_EDGE  |	\
+	 ARCH_PERFMON_EVENTSEL_INV   |	\
+	 ARCH_PERFMON_EVENTSEL_CMASK)
+#define AMD64_RAW_EVENT_MASK		\
+	(X86_RAW_EVENT_MASK          |  \
+	 AMD64_EVENTSEL_EVENT)
 
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 9daaa1ef504..1dd42c18f1c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -143,13 +143,21 @@ struct cpu_hw_events {
  * Constraint on the Event code.
  */
 #define INTEL_EVENT_CONSTRAINT(c, n)	\
-	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
+	EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
 
 /*
  * Constraint on the Event code + UMask + fixed-mask
+ *
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ *  - inv
+ *  - edge
+ *  - cnt-mask
+ *  The other filters are supported by fixed counters.
+ *  The any-thread option is supported starting with v3.
  */
 #define FIXED_EVENT_CONSTRAINT(c, n)	\
-	EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
+	EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
 
 /*
  * Constraint on the Event code + UMask
@@ -437,6 +445,11 @@ static int x86_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc
 	return 0;
 }
 
+static u64 x86_pmu_raw_event(u64 hw_event)
+{
+	return hw_event & X86_RAW_EVENT_MASK;
+}
+
 /*
  * Setup the hardware configuration for a given attr_type
  */
@@ -1427,7 +1440,7 @@ void __init init_hw_perf_events(void)
 
 	if (x86_pmu.event_constraints) {
 		for_each_event_constraint(c, x86_pmu.event_constraints) {
-			if (c->cmask != INTEL_ARCH_FIXED_MASK)
+			if (c->cmask != X86_RAW_EVENT_MASK)
 				continue;
 
 			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 7753a5c7653..37e9517729d 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -113,20 +113,7 @@ static u64 amd_pmu_event_map(int hw_event)
 
 static u64 amd_pmu_raw_event(u64 hw_event)
 {
-#define K7_EVNTSEL_EVENT_MASK	0xF000000FFULL
-#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
-#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
-#define K7_EVNTSEL_INV_MASK	0x000800000ULL
-#define K7_EVNTSEL_REG_MASK	0x0FF000000ULL
-
-#define K7_EVNTSEL_MASK			\
-	(K7_EVNTSEL_EVENT_MASK |	\
-	 K7_EVNTSEL_UNIT_MASK  |	\
-	 K7_EVNTSEL_EDGE_MASK  |	\
-	 K7_EVNTSEL_INV_MASK   |	\
-	 K7_EVNTSEL_REG_MASK)
-
-	return hw_event & K7_EVNTSEL_MASK;
+	return hw_event & AMD64_RAW_EVENT_MASK;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index cc4d90a13d5..dfdd6f90fc8 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -452,24 +452,6 @@ static __initconst u64 atom_hw_cache_event_ids
  },
 };
 
-static u64 intel_pmu_raw_event(u64 hw_event)
-{
-#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
-#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
-#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
-#define CORE_EVNTSEL_INV_MASK		0x00800000ULL
-#define CORE_EVNTSEL_REG_MASK		0xFF000000ULL
-
-#define CORE_EVNTSEL_MASK		\
-	(INTEL_ARCH_EVTSEL_MASK |	\
-	 INTEL_ARCH_UNIT_MASK   |	\
-	 INTEL_ARCH_EDGE_MASK   |	\
-	 INTEL_ARCH_INV_MASK    |	\
-	 INTEL_ARCH_CNT_MASK)
-
-	return hw_event & CORE_EVNTSEL_MASK;
-}
-
 static void intel_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -788,7 +770,7 @@ static __initconst struct x86_pmu core_pmu = {
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
-	.raw_event		= intel_pmu_raw_event,
+	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	/*
@@ -827,7 +809,7 @@ static __initconst struct x86_pmu intel_pmu = {
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
-	.raw_event		= intel_pmu_raw_event,
+	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index b26fbc7eb93..03c139a67ba 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -27,24 +27,6 @@ static u64 p6_pmu_event_map(int hw_event)
  */
 #define P6_NOP_EVENT			0x0000002EULL
 
-static u64 p6_pmu_raw_event(u64 hw_event)
-{
-#define P6_EVNTSEL_EVENT_MASK		0x000000FFULL
-#define P6_EVNTSEL_UNIT_MASK		0x0000FF00ULL
-#define P6_EVNTSEL_EDGE_MASK		0x00040000ULL
-#define P6_EVNTSEL_INV_MASK		0x00800000ULL
-#define P6_EVNTSEL_REG_MASK		0xFF000000ULL
-
-#define P6_EVNTSEL_MASK			\
-	(P6_EVNTSEL_EVENT_MASK |	\
-	 P6_EVNTSEL_UNIT_MASK  |	\
-	 P6_EVNTSEL_EDGE_MASK  |	\
-	 P6_EVNTSEL_INV_MASK   |	\
-	 P6_EVNTSEL_REG_MASK)
-
-	return hw_event & P6_EVNTSEL_MASK;
-}
-
 static struct event_constraint p6_event_constraints[] =
 {
 	INTEL_EVENT_CONSTRAINT(0xc1, 0x1),	/* FLOPS */
@@ -114,7 +96,7 @@ static __initconst struct x86_pmu p6_pmu = {
 	.eventsel		= MSR_P6_EVNTSEL0,
 	.perfctr		= MSR_P6_PERFCTR0,
 	.event_map		= p6_pmu_event_map,
-	.raw_event		= p6_pmu_raw_event,
+	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(p6_perfmon_event_map),
 	.apic			= 1,
 	.max_period		= (1ULL << 31) - 1,
-- 
cgit v1.2.3


From b4cdc5c264b35c67007800dec3928e9547a9d70b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 30 Mar 2010 17:00:06 +0200
Subject: perf, x86: Fix up the ANY flag stuff

Stephane noticed that the ANY flag was in generic arch code, and Cyrill
reported that it broke the P4 code.

Solve this by merging x86_pmu::raw_event into x86_pmu::hw_config and
provide intel_pmu and amd_pmu specific versions of this callback.

The intel_pmu one deals with the ANY flag, the amd_pmu adds the few extra
event bits AMD64 has.

Reported-by: Stephane Eranian <eranian@google.com>
Reported-by: Cyrill Gorcunov <gorcunov@gmail.com>
Acked-by: Robert Richter <robert.richter@amd.com>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Acked-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1269968113.5258.442.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       | 35 ++++++++++-------------------
 arch/x86/kernel/cpu/perf_event_amd.c   | 17 +++++++++++----
 arch/x86/kernel/cpu/perf_event_intel.c | 30 +++++++++++++++++++++----
 arch/x86/kernel/cpu/perf_event_p4.c    | 40 +++++++++++++++++++---------------
 arch/x86/kernel/cpu/perf_event_p6.c    |  3 +--
 5 files changed, 74 insertions(+), 51 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1dd42c18f1c..65e9c5efb61 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -196,12 +196,11 @@ struct x86_pmu {
 	void		(*enable_all)(int added);
 	void		(*enable)(struct perf_event *);
 	void		(*disable)(struct perf_event *);
-	int		(*hw_config)(struct perf_event_attr *attr, struct hw_perf_event *hwc);
+	int		(*hw_config)(struct perf_event *event);
 	int		(*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
 	unsigned	eventsel;
 	unsigned	perfctr;
 	u64		(*event_map)(int);
-	u64		(*raw_event)(u64);
 	int		max_events;
 	int		num_counters;
 	int		num_counters_fixed;
@@ -426,28 +425,26 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 	return 0;
 }
 
-static int x86_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
+static int x86_pmu_hw_config(struct perf_event *event)
 {
 	/*
 	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
 	 */
-	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
 
 	/*
 	 * Count user and OS events unless requested not to
 	 */
-	if (!attr->exclude_user)
-		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
-	if (!attr->exclude_kernel)
-		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+	if (!event->attr.exclude_user)
+		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
+	if (!event->attr.exclude_kernel)
+		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
 
-	return 0;
-}
+	if (event->attr.type == PERF_TYPE_RAW)
+		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
-static u64 x86_pmu_raw_event(u64 hw_event)
-{
-	return hw_event & X86_RAW_EVENT_MASK;
+	return 0;
 }
 
 /*
@@ -489,7 +486,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	hwc->last_tag = ~0ULL;
 
 	/* Processor specifics */
-	err = x86_pmu.hw_config(attr, hwc);
+	err = x86_pmu.hw_config(event);
 	if (err)
 		return err;
 
@@ -508,16 +505,8 @@ static int __hw_perf_event_init(struct perf_event *event)
 			return -EOPNOTSUPP;
 	}
 
-	/*
-	 * Raw hw_event type provide the config in the hw_event structure
-	 */
-	if (attr->type == PERF_TYPE_RAW) {
-		hwc->config |= x86_pmu.raw_event(attr->config);
-		if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
-		    perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
+	if (attr->type == PERF_TYPE_RAW)
 		return 0;
-	}
 
 	if (attr->type == PERF_TYPE_HW_CACHE)
 		return set_ext_hw_attr(hwc, attr);
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 37e9517729d..bbd7339f08a 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -111,9 +111,19 @@ static u64 amd_pmu_event_map(int hw_event)
 	return amd_perfmon_event_map[hw_event];
 }
 
-static u64 amd_pmu_raw_event(u64 hw_event)
+static int amd_pmu_hw_config(struct perf_event *event)
 {
-	return hw_event & AMD64_RAW_EVENT_MASK;
+	int ret = x86_pmu_hw_config(event);
+
+	if (ret)
+		return ret;
+
+	if (event->attr.type != PERF_TYPE_RAW)
+		return 0;
+
+	event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+
+	return 0;
 }
 
 /*
@@ -365,12 +375,11 @@ static __initconst struct x86_pmu amd_pmu = {
 	.enable_all		= x86_pmu_enable_all,
 	.enable			= x86_pmu_enable_event,
 	.disable		= x86_pmu_disable_event,
-	.hw_config		= x86_hw_config,
+	.hw_config		= amd_pmu_hw_config,
 	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_K7_EVNTSEL0,
 	.perfctr		= MSR_K7_PERFCTR0,
 	.event_map		= amd_pmu_event_map,
-	.raw_event		= amd_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
 	.num_counters		= 4,
 	.cntval_bits		= 48,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index dfdd6f90fc8..30bf10c55f1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -758,6 +758,30 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 	return x86_get_event_constraints(cpuc, event);
 }
 
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+	int ret = x86_pmu_hw_config(event);
+
+	if (ret)
+		return ret;
+
+	if (event->attr.type != PERF_TYPE_RAW)
+		return 0;
+
+	if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
+		return 0;
+
+	if (x86_pmu.version < 3)
+		return -EINVAL;
+
+	if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
+
+	return 0;
+}
+
 static __initconst struct x86_pmu core_pmu = {
 	.name			= "core",
 	.handle_irq		= x86_pmu_handle_irq,
@@ -765,12 +789,11 @@ static __initconst struct x86_pmu core_pmu = {
 	.enable_all		= x86_pmu_enable_all,
 	.enable			= x86_pmu_enable_event,
 	.disable		= x86_pmu_disable_event,
-	.hw_config		= x86_hw_config,
+	.hw_config		= x86_pmu_hw_config,
 	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
-	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	/*
@@ -804,12 +827,11 @@ static __initconst struct x86_pmu intel_pmu = {
 	.enable_all		= intel_pmu_enable_all,
 	.enable			= intel_pmu_enable_event,
 	.disable		= intel_pmu_disable_event,
-	.hw_config		= x86_hw_config,
+	.hw_config		= intel_pmu_hw_config,
 	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= intel_pmu_event_map,
-	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 4139100404e..acd237d29f1 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -419,20 +419,7 @@ static u64 p4_pmu_event_map(int hw_event)
 	return config;
 }
 
-/*
- * We don't control raw events so it's up to the caller
- * to pass sane values (and we don't count the thread number
- * on HT machine but allow HT-compatible specifics to be
- * passed on)
- */
-static u64 p4_pmu_raw_event(u64 hw_event)
-{
-	return hw_event &
-		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
-		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
-}
-
-static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
+static int p4_hw_config(struct perf_event *event)
 {
 	int cpu = raw_smp_processor_id();
 	u32 escr, cccr;
@@ -444,11 +431,29 @@ static int p4_hw_config(struct perf_event_attr *attr, struct hw_perf_event *hwc)
 	 */
 
 	cccr = p4_default_cccr_conf(cpu);
-	escr = p4_default_escr_conf(cpu, attr->exclude_kernel, attr->exclude_user);
-	hwc->config = p4_config_pack_escr(escr) | p4_config_pack_cccr(cccr);
+	escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
+					 event->attr.exclude_user);
+	event->hw.config = p4_config_pack_escr(escr) |
+			   p4_config_pack_cccr(cccr);
 
 	if (p4_ht_active() && p4_ht_thread(cpu))
-		hwc->config = p4_set_ht_bit(hwc->config);
+		event->hw.config = p4_set_ht_bit(event->hw.config);
+
+	if (event->attr.type != PERF_TYPE_RAW)
+		return 0;
+
+	/*
+	 * We don't control raw events so it's up to the caller
+	 * to pass sane values (and we don't count the thread number
+	 * on HT machine but allow HT-compatible specifics to be
+	 * passed on)
+	 *
+	 * XXX: HT wide things should check perf_paranoid_cpu() &&
+	 *      CAP_SYS_ADMIN
+	 */
+	event->hw.config |= event->attr.config &
+		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
+		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
 
 	return 0;
 }
@@ -785,7 +790,6 @@ static __initconst struct x86_pmu p4_pmu = {
 	.eventsel		= MSR_P4_BPU_CCCR0,
 	.perfctr		= MSR_P4_BPU_PERFCTR0,
 	.event_map		= p4_pmu_event_map,
-	.raw_event		= p4_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(p4_general_events),
 	.get_event_constraints	= x86_get_event_constraints,
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 03c139a67ba..9123e8ec995 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -91,12 +91,11 @@ static __initconst struct x86_pmu p6_pmu = {
 	.enable_all		= p6_pmu_enable_all,
 	.enable			= p6_pmu_enable_event,
 	.disable		= p6_pmu_disable_event,
-	.hw_config		= x86_hw_config,
+	.hw_config		= x86_pmu_hw_config,
 	.schedule_events	= x86_schedule_events,
 	.eventsel		= MSR_P6_EVNTSEL0,
 	.perfctr		= MSR_P6_PERFCTR0,
 	.event_map		= p6_pmu_event_map,
-	.raw_event		= x86_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(p6_perfmon_event_map),
 	.apic			= 1,
 	.max_period		= (1ULL << 31) - 1,
-- 
cgit v1.2.3


From caaa8be3b6707cb9664e573a28b00f845ce9f32e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 29 Mar 2010 13:09:53 +0200
Subject: perf, x86: Fix __initconst vs const

All variables that have __initconst should also be const.

Suggested-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_amd.c   |  4 ++--
 arch/x86/kernel/cpu/perf_event_intel.c | 12 ++++++------
 arch/x86/kernel/cpu/perf_event_p4.c    |  4 ++--
 arch/x86/kernel/cpu/perf_event_p6.c    |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index bbd7339f08a..611df11ba15 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -2,7 +2,7 @@
 
 static DEFINE_RAW_SPINLOCK(amd_nb_lock);
 
-static __initconst u64 amd_hw_cache_event_ids
+static __initconst const u64 amd_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -368,7 +368,7 @@ static void amd_pmu_cpu_dead(int cpu)
 	raw_spin_unlock(&amd_nb_lock);
 }
 
-static __initconst struct x86_pmu amd_pmu = {
+static __initconst const struct x86_pmu amd_pmu = {
 	.name			= "AMD",
 	.handle_irq		= x86_pmu_handle_irq,
 	.disable_all		= x86_pmu_disable_all,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 30bf10c55f1..1957e3f14c0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -88,7 +88,7 @@ static u64 intel_pmu_event_map(int hw_event)
 	return intel_perfmon_event_map[hw_event];
 }
 
-static __initconst u64 westmere_hw_cache_event_ids
+static __initconst const u64 westmere_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -179,7 +179,7 @@ static __initconst u64 westmere_hw_cache_event_ids
  },
 };
 
-static __initconst u64 nehalem_hw_cache_event_ids
+static __initconst const u64 nehalem_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -270,7 +270,7 @@ static __initconst u64 nehalem_hw_cache_event_ids
  },
 };
 
-static __initconst u64 core2_hw_cache_event_ids
+static __initconst const u64 core2_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -361,7 +361,7 @@ static __initconst u64 core2_hw_cache_event_ids
  },
 };
 
-static __initconst u64 atom_hw_cache_event_ids
+static __initconst const u64 atom_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -782,7 +782,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 	return 0;
 }
 
-static __initconst struct x86_pmu core_pmu = {
+static __initconst const struct x86_pmu core_pmu = {
 	.name			= "core",
 	.handle_irq		= x86_pmu_handle_irq,
 	.disable_all		= x86_pmu_disable_all,
@@ -820,7 +820,7 @@ static void intel_pmu_cpu_dying(int cpu)
 	fini_debug_store_on_cpu(cpu);
 }
 
-static __initconst struct x86_pmu intel_pmu = {
+static __initconst const struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
 	.disable_all		= intel_pmu_disable_all,
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index acd237d29f1..15367cce66b 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -287,7 +287,7 @@ static struct p4_event_bind p4_event_bind_map[] = {
 	p4_config_pack_cccr(cache_event					| \
 			    P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
 
-static __initconst u64 p4_hw_cache_event_ids
+static __initconst const u64 p4_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -780,7 +780,7 @@ done:
 	return num ? -ENOSPC : 0;
 }
 
-static __initconst struct x86_pmu p4_pmu = {
+static __initconst const struct x86_pmu p4_pmu = {
 	.name			= "Netburst P4/Xeon",
 	.handle_irq		= p4_pmu_handle_irq,
 	.disable_all		= p4_pmu_disable_all,
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 9123e8ec995..34ba07be2cd 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -84,7 +84,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
 	(void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
 }
 
-static __initconst struct x86_pmu p6_pmu = {
+static __initconst const struct x86_pmu p6_pmu = {
 	.name			= "p6",
 	.handle_irq		= x86_pmu_handle_irq,
 	.disable_all		= p6_pmu_disable_all,
-- 
cgit v1.2.3


From 40b91cd10f000b4c4934e48e2e5c0bec66def144 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 29 Mar 2010 16:37:17 +0200
Subject: perf, x86: Add Nehalem programming quirk to Westmere

According to the Xeon-5600 errata the Westmere suffers the same PMU
programming bug as the original Nehalem did.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 1957e3f14c0..f168b4030d4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -488,6 +488,7 @@ static void intel_pmu_enable_all(int added)
  * Workaround for:
  *   Intel Errata AAK100 (model 26)
  *   Intel Errata AAP53  (model 30)
+ *   Intel Errata BD53   (model 44)
  *
  * These chips need to be 'reset' when adding counters by programming
  * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
@@ -980,6 +981,7 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_nhm();
 
 		x86_pmu.event_constraints = intel_westmere_event_constraints;
+		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
 		pr_cont("Westmere events, ");
 		break;
 
-- 
cgit v1.2.3


From 32bd7eb5a7f4596c8440dd9440322fe9e686634d Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 24 Mar 2010 13:17:19 +0800
Subject: sched: Remove remaining USER_SCHED code

This is left over from commit 7c9414385e ("sched: Remove USER_SCHED"")

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Dhaval Giani <dhaval.giani@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: David Howells <dhowells@redhat.com>
LKML-Reference: <4BA9A05F.7010407@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 init/Kconfig            |  3 +--
 kernel/capability.c     |  1 -
 kernel/cred-internals.h | 21 ---------------------
 kernel/cred.c           |  3 ---
 kernel/exit.c           |  1 -
 kernel/sched_debug.c    |  5 -----
 kernel/user.c           | 10 +---------
 7 files changed, 2 insertions(+), 42 deletions(-)
 delete mode 100644 kernel/cred-internals.h

diff --git a/init/Kconfig b/init/Kconfig
index eb77e8ccde1..5fe94b82e4c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -604,8 +604,7 @@ config RT_GROUP_SCHED
 	default n
 	help
 	  This feature lets you explicitly allocate real CPU bandwidth
-	  to users or control groups (depending on the "Basis for grouping tasks"
-	  setting below. If enabled, it will also make it impossible to
+	  to task groups. If enabled, it will also make it impossible to
 	  schedule realtime tasks for non-root users until you allocate
 	  realtime bandwidth for them.
 	  See Documentation/scheduler/sched-rt-group.txt for more information.
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e4697e9b27..2f05303715a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,7 +15,6 @@
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
 #include <asm/uaccess.h>
-#include "cred-internals.h"
 
 /*
  * Leveraged for setting/resetting capabilities
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644
index 2dc4fc2d0bf..00000000000
--- a/kernel/cred-internals.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Internal credentials stuff
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-/*
- * user.c
- */
-static inline void sched_switch_user(struct task_struct *p)
-{
-#ifdef CONFIG_USER_SCHED
-	sched_move_task(p);
-#endif	/* CONFIG_USER_SCHED */
-}
-
diff --git a/kernel/cred.c b/kernel/cred.c
index 1b1129d0cce..19d3ccce3d4 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -16,7 +16,6 @@
 #include <linux/init_task.h>
 #include <linux/security.h>
 #include <linux/cn_proc.h>
-#include "cred-internals.h"
 
 #if 0
 #define kdebug(FMT, ...) \
@@ -557,8 +556,6 @@ int commit_creds(struct cred *new)
 		atomic_dec(&old->user->processes);
 	alter_cred_subscribers(old, -2);
 
-	sched_switch_user(task);
-
 	/* send notifications */
 	if (new->uid   != old->uid  ||
 	    new->euid  != old->euid ||
diff --git a/kernel/exit.c b/kernel/exit.c
index cce59cb5ee6..84dc4b294e4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,7 +55,6 @@
 #include <asm/unistd.h>
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
-#include "cred-internals.h"
 
 static void exit_mm(struct task_struct * tsk);
 
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8a46a719f36..0932c5c45b3 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -173,11 +173,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	task_group_path(tg, path, sizeof(path));
 
 	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
-#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
-	{
-		uid_t uid = cfs_rq->tg->uid;
-		SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
-	}
 #else
 	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
 #endif
diff --git a/kernel/user.c b/kernel/user.c
index ec3b2229893..8e1c8c0a496 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,6 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/user_namespace.h>
-#include "cred-internals.h"
 
 struct user_namespace init_user_ns = {
 	.kref = {
@@ -137,9 +136,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 	struct hlist_head *hashent = uidhashentry(ns, uid);
 	struct user_struct *up, *new;
 
-	/* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
-	 * atomic.
-	 */
+	/* Make uid_hash_find() + uid_hash_insert() atomic. */
 	spin_lock_irq(&uidhash_lock);
 	up = uid_hash_find(uid, hashent);
 	spin_unlock_irq(&uidhash_lock);
@@ -161,11 +158,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 		spin_lock_irq(&uidhash_lock);
 		up = uid_hash_find(uid, hashent);
 		if (up) {
-			/* This case is not possible when CONFIG_USER_SCHED
-			 * is defined, since we serialize alloc_uid() using
-			 * uids_mutex. Hence no need to call
-			 * sched_destroy_user() or remove_user_sysfs_dir().
-			 */
 			key_put(new->uid_keyring);
 			key_put(new->session_keyring);
 			kmem_cache_free(uid_cachep, new);
-- 
cgit v1.2.3


From 25c2d55c00c6097e6792ebf21e31342f23b9b768 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 24 Mar 2010 13:17:50 +0800
Subject: sched: Remove USER_SCHED from documentation

USER_SCHED has been removed, so update the documentation
accordingly.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
LKML-Reference: <4BA9A07E.8070508@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/scheduler/sched-design-CFS.txt | 54 ++--------------------------
 Documentation/scheduler/sched-rt-group.txt   | 20 +++--------
 2 files changed, 7 insertions(+), 67 deletions(-)

diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index 6f33593e59e..8239ebbcddc 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -211,7 +211,7 @@ provide fair CPU time to each such task group.  For example, it may be
 desirable to first provide fair CPU time to each user on the system and then to
 each task belonging to a user.
 
-CONFIG_GROUP_SCHED strives to achieve exactly that.  It lets tasks to be
+CONFIG_CGROUP_SCHED strives to achieve exactly that.  It lets tasks to be
 grouped and divides CPU time fairly among such groups.
 
 CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and
@@ -220,38 +220,11 @@ SCHED_RR) tasks.
 CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and
 SCHED_BATCH) tasks.
 
-At present, there are two (mutually exclusive) mechanisms to group tasks for
-CPU bandwidth control purposes:
-
- - Based on user id (CONFIG_USER_SCHED)
-
-   With this option, tasks are grouped according to their user id.
-
- - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED)
-
-   This options needs CONFIG_CGROUPS to be defined, and lets the administrator
+   These options need CONFIG_CGROUPS to be defined, and let the administrator
    create arbitrary groups of tasks, using the "cgroup" pseudo filesystem.  See
    Documentation/cgroups/cgroups.txt for more information about this filesystem.
 
-Only one of these options to group tasks can be chosen and not both.
-
-When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new
-user and a "cpu_share" file is added in that directory.
-
-	# cd /sys/kernel/uids
-	# cat 512/cpu_share		# Display user 512's CPU share
-	1024
-	# echo 2048 > 512/cpu_share	# Modify user 512's CPU share
-	# cat 512/cpu_share		# Display user 512's CPU share
-	2048
-	#
-
-CPU bandwidth between two users is divided in the ratio of their CPU shares.
-For example: if you would like user "root" to get twice the bandwidth of user
-"guest," then set the cpu_share for both the users such that "root"'s cpu_share
-is twice "guest"'s cpu_share.
-
-When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each
+When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
 group created using the pseudo filesystem.  See example steps below to create
 task groups and modify their CPU share using the "cgroups" pseudo filesystem.
 
@@ -273,24 +246,3 @@ task groups and modify their CPU share using the "cgroups" pseudo filesystem.
 
 	# #Launch gmplayer (or your favourite movie player)
 	# echo <movie_player_pid> > multimedia/tasks
-
-8. Implementation note: user namespaces
-
-User namespaces are intended to be hierarchical.  But they are currently
-only partially implemented.  Each of those has ramifications for CFS.
-
-First, since user namespaces are hierarchical, the /sys/kernel/uids
-presentation is inadequate.  Eventually we will likely want to use sysfs
-tagging to provide private views of /sys/kernel/uids within each user
-namespace.
-
-Second, the hierarchical nature is intended to support completely
-unprivileged use of user namespaces.  So if using user groups, then
-we want the users in a user namespace to be children of the user
-who created it.
-
-That is currently unimplemented.  So instead, every user in a new
-user namespace will receive 1024 shares just like any user in the
-initial user namespace.  Note that at the moment creation of a new
-user namespace requires each of CAP_SYS_ADMIN, CAP_SETUID, and
-CAP_SETGID.
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 86eabe6c341..605b0d40329 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -126,23 +126,12 @@ priority!
 2.3 Basis for grouping tasks
 ----------------------------
 
-There are two compile-time settings for allocating CPU bandwidth. These are
-configured using the "Basis for grouping tasks" multiple choice menu under
-General setup > Group CPU Scheduler:
-
-a. CONFIG_USER_SCHED (aka "Basis for grouping tasks" =  "user id")
-
-This lets you use the virtual files under
-"/sys/kernel/uids/<uid>/cpu_rt_runtime_us" to control he CPU time reserved for
-each user .
-
-The other option is:
-
-.o CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups")
+Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real
+CPU bandwidth to task groups.
 
 This uses the /cgroup virtual file system and
 "/cgroup/<cgroup>/cpu.rt_runtime_us" to control the CPU time reserved for each
-control group instead.
+control group.
 
 For more information on working with control groups, you should read
 Documentation/cgroups/cgroups.txt as well.
@@ -161,8 +150,7 @@ For now, this can be simplified to just the following (but see Future plans):
 ===============
 
 There is work in progress to make the scheduling period for each group
-("/sys/kernel/uids/<uid>/cpu_rt_period_us" or
-"/cgroup/<cgroup>/cpu.rt_period_us" respectively) configurable as well.
+("/cgroup/<cgroup>/cpu.rt_period_us") configurable as well.
 
 The constraint on the period is that a subgroup must have a smaller or
 equal period to its parent. But realistically its not very useful _yet_
-- 
cgit v1.2.3


From 897f0b3c3ff40b443c84e271bef19bd6ae885195 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 15 Mar 2010 10:10:03 +0100
Subject: sched: Kill the broken and deadlockable
 cpuset_lock/cpuset_cpus_allowed_locked code

This patch just states the fact the cpusets/cpuhotplug interaction is
broken and removes the deadlockable code which only pretends to work.

- cpuset_lock() doesn't really work. It is needed for
  cpuset_cpus_allowed_locked() but we can't take this lock in
  try_to_wake_up()->select_fallback_rq() path.

- cpuset_lock() is deadlockable. Suppose that a task T bound to CPU takes
  callback_mutex. If cpu_down(CPU) happens before T drops callback_mutex
  stop_machine() preempts T, then migration_call(CPU_DEAD) tries to take
  cpuset_lock() and hangs forever because CPU is already dead and thus
  T can't be scheduled.

- cpuset_cpus_allowed_locked() is deadlockable too. It takes task_lock()
  which is not irq-safe, but try_to_wake_up() can be called from irq.

Kill them, and change select_fallback_rq() to use cpu_possible_mask, like
we currently do without CONFIG_CPUSETS.

Also, with or without this patch, with or without CONFIG_CPUSETS, the
callers of select_fallback_rq() can race with each other or with
set_cpus_allowed() pathes.

The subsequent patches try to to fix these problems.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100315091003.GA9123@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/cpuset.h | 13 -------------
 kernel/cpuset.c        | 27 +--------------------------
 kernel/sched.c         | 10 +++-------
 3 files changed, 4 insertions(+), 46 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a5740fc4d04..eeaaee746be 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -21,8 +21,6 @@ extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
-extern void cpuset_cpus_allowed_locked(struct task_struct *p,
-				       struct cpumask *mask);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@@ -69,9 +67,6 @@ struct seq_file;
 extern void cpuset_task_status_allowed(struct seq_file *m,
 					struct task_struct *task);
 
-extern void cpuset_lock(void);
-extern void cpuset_unlock(void);
-
 extern int cpuset_mem_spread_node(void);
 
 static inline int cpuset_do_page_mem_spread(void)
@@ -105,11 +100,6 @@ static inline void cpuset_cpus_allowed(struct task_struct *p,
 {
 	cpumask_copy(mask, cpu_possible_mask);
 }
-static inline void cpuset_cpus_allowed_locked(struct task_struct *p,
-					      struct cpumask *mask)
-{
-	cpumask_copy(mask, cpu_possible_mask);
-}
 
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 {
@@ -157,9 +147,6 @@ static inline void cpuset_task_status_allowed(struct seq_file *m,
 {
 }
 
-static inline void cpuset_lock(void) {}
-static inline void cpuset_unlock(void) {}
-
 static inline int cpuset_mem_spread_node(void)
 {
 	return 0;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d10946748ec..9a747f56d58 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2182,19 +2182,10 @@ void __init cpuset_init_smp(void)
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
 	mutex_lock(&callback_mutex);
-	cpuset_cpus_allowed_locked(tsk, pmask);
-	mutex_unlock(&callback_mutex);
-}
-
-/**
- * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
- * Must be called with callback_mutex held.
- **/
-void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
-{
 	task_lock(tsk);
 	guarantee_online_cpus(task_cs(tsk), pmask);
 	task_unlock(tsk);
+	mutex_unlock(&callback_mutex);
 }
 
 void cpuset_init_current_mems_allowed(void)
@@ -2382,22 +2373,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
 	return 0;
 }
 
-/**
- * cpuset_lock - lock out any changes to cpuset structures
- *
- * The out of memory (oom) code needs to mutex_lock cpusets
- * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset.  Expose callback_mutex via this
- * cpuset_lock() routine, so the oom code can lock it, before
- * locking the task list.  The tasklist_lock is a spinlock, so
- * must be taken inside callback_mutex.
- */
-
-void cpuset_lock(void)
-{
-	mutex_lock(&callback_mutex);
-}
-
 /**
  * cpuset_unlock - release lock on cpuset changes
  *
diff --git a/kernel/sched.c b/kernel/sched.c
index 52b7efd2741..c0b3ebc1631 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2296,11 +2296,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 		return dest_cpu;
 
 	/* No more Mr. Nice Guy. */
-	if (dest_cpu >= nr_cpu_ids) {
-		rcu_read_lock();
-		cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
-		rcu_read_unlock();
-		dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
+	if (unlikely(dest_cpu >= nr_cpu_ids)) {
+		cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
+		dest_cpu = cpumask_any(cpu_active_mask);
 
 		/*
 		 * Don't tell them about moving exiting tasks or
@@ -5866,7 +5864,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
 		migrate_live_tasks(cpu);
 		rq = cpu_rq(cpu);
 		kthread_stop(rq->migration_thread);
@@ -5879,7 +5876,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		rq->idle->sched_class = &idle_sched_class;
 		migrate_dead_tasks(cpu);
 		raw_spin_unlock_irq(&rq->lock);
-		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
 		calc_global_load_remove(rq);
-- 
cgit v1.2.3


From 1445c08d06c5594895b4fae952ef8a457e89c390 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 15 Mar 2010 10:10:10 +0100
Subject: sched: move_task_off_dead_cpu(): Take rq->lock around
 select_fallback_rq()

move_task_off_dead_cpu()->select_fallback_rq() reads/updates ->cpus_allowed
lockless. We can race with set_cpus_allowed() running in parallel.

Change it to take rq->lock around select_fallback_rq(). Note that it is not
trivial to move this spin_lock() into select_fallback_rq(), we must recheck
the task was not migrated after we take the lock and other callers do not
need this lock.

To avoid the races with other callers of select_fallback_rq() which rely on
TASK_WAKING, we also check p->state != TASK_WAKING and do nothing otherwise.
The owner of TASK_WAKING must update ->cpus_allowed and choose the correct
CPU anyway, and the subsequent __migrate_task() is just meaningless because
p->se.on_rq must be false.

Alternatively, we could change select_task_rq() to take rq->lock right
after it calls sched_class->select_task_rq(), but this looks a bit ugly.

Also, change it to not assume irqs are disabled and absorb __migrate_task_irq().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100315091010.GA9131@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index c0b3ebc1631..27774b5aeb6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5448,29 +5448,29 @@ static int migration_thread(void *data)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-
-static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
-{
-	int ret;
-
-	local_irq_disable();
-	ret = __migrate_task(p, src_cpu, dest_cpu);
-	local_irq_enable();
-	return ret;
-}
-
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
  */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
-	int dest_cpu;
-
+	struct rq *rq = cpu_rq(dead_cpu);
+	int needs_cpu, uninitialized_var(dest_cpu);
+	unsigned long flags;
 again:
-	dest_cpu = select_fallback_rq(dead_cpu, p);
+	local_irq_save(flags);
+
+	raw_spin_lock(&rq->lock);
+	needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
+	if (needs_cpu)
+		dest_cpu = select_fallback_rq(dead_cpu, p);
+	raw_spin_unlock(&rq->lock);
 
 	/* It can have affinity changed while we were choosing. */
-	if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
+	if (needs_cpu)
+		needs_cpu = !__migrate_task(p, dead_cpu, dest_cpu);
+	local_irq_restore(flags);
+
+	if (unlikely(needs_cpu))
 		goto again;
 }
 
-- 
cgit v1.2.3


From c1804d547dc098363443667609c272d1e4d15ee8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 15 Mar 2010 10:10:14 +0100
Subject: sched: move_task_off_dead_cpu(): Remove retry logic

The previous patch preserved the retry logic, but it looks unneeded.

__migrate_task() can only fail if we raced with migration after we dropped
the lock, but in this case the caller of set_cpus_allowed/etc must initiate
migration itself if ->on_rq == T.

We already fixed p->cpus_allowed, the changes in active/online masks must
be visible to racer, it should migrate the task to online cpu correctly.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100315091014.GA9138@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 27774b5aeb6..f475c608b07 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5456,7 +5456,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 	struct rq *rq = cpu_rq(dead_cpu);
 	int needs_cpu, uninitialized_var(dest_cpu);
 	unsigned long flags;
-again:
+
 	local_irq_save(flags);
 
 	raw_spin_lock(&rq->lock);
@@ -5464,14 +5464,13 @@ again:
 	if (needs_cpu)
 		dest_cpu = select_fallback_rq(dead_cpu, p);
 	raw_spin_unlock(&rq->lock);
-
-	/* It can have affinity changed while we were choosing. */
+	/*
+	 * It can only fail if we race with set_cpus_allowed(),
+	 * in the racer should migrate the task anyway.
+	 */
 	if (needs_cpu)
-		needs_cpu = !__migrate_task(p, dead_cpu, dest_cpu);
+		__migrate_task(p, dead_cpu, dest_cpu);
 	local_irq_restore(flags);
-
-	if (unlikely(needs_cpu))
-		goto again;
 }
 
 /*
-- 
cgit v1.2.3


From 30da688ef6b76e01969b00608202fff1eed2accc Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 15 Mar 2010 10:10:19 +0100
Subject: sched: sched_exec(): Remove the select_fallback_rq() logic

sched_exec()->select_task_rq() reads/updates ->cpus_allowed lockless.
This can race with other CPUs updating our ->cpus_allowed, and this
looks meaningless to me.

The task is current and running, it must have online cpus in ->cpus_allowed,
the fallback mode is bogus. And, if ->sched_class returns the "wrong" cpu,
this likely means we raced with set_cpus_allowed() which was called
for reason, why should sched_exec() retry and call ->select_task_rq()
again?

Change the code to call sched_class->select_task_rq() directly and do
nothing if the returned cpu is wrong after re-checking under rq->lock.

From now task_struct->cpus_allowed is always stable under TASK_WAKING,
select_fallback_rq() is always called under rq-lock or the caller or
the caller owns TASK_WAKING (select_task_rq).

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100315091019.GA9141@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index f475c608b07..165b532dd8c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2280,6 +2280,9 @@ void task_oncpu_function_call(struct task_struct *p,
 }
 
 #ifdef CONFIG_SMP
+/*
+ * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
 	int dest_cpu;
@@ -2316,12 +2319,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 }
 
 /*
- * Gets called from 3 sites (exec, fork, wakeup), since it is called without
- * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
- * by:
- *
- *  exec:           is unstable, retry loop
- *  fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
+ * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
  */
 static inline
 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
@@ -3076,9 +3074,8 @@ void sched_exec(void)
 	unsigned long flags;
 	struct rq *rq;
 
-again:
 	this_cpu = get_cpu();
-	dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
+	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
 	if (dest_cpu == this_cpu) {
 		put_cpu();
 		return;
@@ -3086,18 +3083,12 @@ again:
 
 	rq = task_rq_lock(p, &flags);
 	put_cpu();
-
 	/*
 	 * select_task_rq() can race against ->cpus_allowed
 	 */
-	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
-	    || unlikely(!cpu_active(dest_cpu))) {
-		task_rq_unlock(rq, &flags);
-		goto again;
-	}
-
-	/* force the process onto the specified CPU */
-	if (migrate_task(p, dest_cpu, &req)) {
+	if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
+	    likely(cpu_active(dest_cpu)) &&
+	    migrate_task(p, dest_cpu, &req)) {
 		/* Need to wait for migration thread (might exit: take ref). */
 		struct task_struct *mt = rq->migration_thread;
 
-- 
cgit v1.2.3


From 6a1bdc1b577ebcb65f6603c57f8347309bc4ab13 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 15 Mar 2010 10:10:23 +0100
Subject: sched: _cpu_down(): Don't play with current->cpus_allowed

_cpu_down() changes the current task's affinity and then recovers it at
the end. The problems are well known: we can't restore old_allowed if it
was bound to the now-dead-cpu, and we can race with the userspace which
can change cpu-affinity during unplug.

_cpu_down() should not play with current->cpus_allowed at all. Instead,
take_cpu_down() can migrate the caller of _cpu_down() after __cpu_disable()
removes the dying cpu from cpu_online_mask.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100315091023.GA9148@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  1 +
 kernel/cpu.c          | 18 ++++++------------
 kernel/sched.c        |  2 +-
 3 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 43c94515273..8bea40725c7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1843,6 +1843,7 @@ extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
 #ifdef CONFIG_HOTPLUG_CPU
+extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
 extern void idle_task_exit(void);
 #else
 static inline void idle_task_exit(void) {}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f8cced2692b..8d340faac38 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -163,6 +163,7 @@ static inline void check_for_tasks(int cpu)
 }
 
 struct take_cpu_down_param {
+	struct task_struct *caller;
 	unsigned long mod;
 	void *hcpu;
 };
@@ -171,6 +172,7 @@ struct take_cpu_down_param {
 static int __ref take_cpu_down(void *_param)
 {
 	struct take_cpu_down_param *param = _param;
+	unsigned int cpu = (unsigned long)param->hcpu;
 	int err;
 
 	/* Ensure this CPU doesn't handle any more interrupts. */
@@ -181,6 +183,8 @@ static int __ref take_cpu_down(void *_param)
 	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
 				param->hcpu);
 
+	if (task_cpu(param->caller) == cpu)
+		move_task_off_dead_cpu(cpu, param->caller);
 	/* Force idle task to run as soon as we yield: it should
 	   immediately notice cpu is offline and die quickly. */
 	sched_idle_next();
@@ -191,10 +195,10 @@ static int __ref take_cpu_down(void *_param)
 static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 {
 	int err, nr_calls = 0;
-	cpumask_var_t old_allowed;
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
 	struct take_cpu_down_param tcd_param = {
+		.caller = current,
 		.mod = mod,
 		.hcpu = hcpu,
 	};
@@ -205,9 +209,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	if (!cpu_online(cpu))
 		return -EINVAL;
 
-	if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
-		return -ENOMEM;
-
 	cpu_hotplug_begin();
 	set_cpu_active(cpu, false);
 	err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
@@ -224,10 +225,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 		goto out_release;
 	}
 
-	/* Ensure that we are not runnable on dying cpu */
-	cpumask_copy(old_allowed, &current->cpus_allowed);
-	set_cpus_allowed_ptr(current, cpu_active_mask);
-
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
 		set_cpu_active(cpu, true);
@@ -236,7 +233,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 					    hcpu) == NOTIFY_BAD)
 			BUG();
 
-		goto out_allowed;
+		goto out_release;
 	}
 	BUG_ON(cpu_online(cpu));
 
@@ -254,8 +251,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	check_for_tasks(cpu);
 
-out_allowed:
-	set_cpus_allowed_ptr(current, old_allowed);
 out_release:
 	cpu_hotplug_done();
 	if (!err) {
@@ -263,7 +258,6 @@ out_release:
 					    hcpu) == NOTIFY_BAD)
 			BUG();
 	}
-	free_cpumask_var(old_allowed);
 	return err;
 }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 165b532dd8c..11119deffa4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5442,7 +5442,7 @@ static int migration_thread(void *data)
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
  */
-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
 	struct rq *rq = cpu_rq(dead_cpu);
 	int needs_cpu, uninitialized_var(dest_cpu);
-- 
cgit v1.2.3


From 9084bb8246ea935b98320554229e2f371f7f52fa Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 15 Mar 2010 10:10:27 +0100
Subject: sched: Make select_fallback_rq() cpuset friendly

Introduce cpuset_cpus_allowed_fallback() helper to fix the cpuset problems
with select_fallback_rq(). It can be called from any context and can't use
any cpuset locks including task_lock(). It is called when the task doesn't
have online cpus in ->cpus_allowed but ttwu/etc must be able to find a
suitable cpu.

I am not proud of this patch. Everything which needs such a fat comment
can't be good even if correct. But I'd prefer to not change the locking
rules in the code I hardly understand, and in any case I believe this
simple change make the code much more correct compared to deadlocks we
currently have.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100315091027.GA9155@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/cpuset.h |  7 +++++++
 kernel/cpuset.c        | 42 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched.c         |  4 +---
 3 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index eeaaee746be..a73454aec33 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -21,6 +21,7 @@ extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
+extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@@ -101,6 +102,12 @@ static inline void cpuset_cpus_allowed(struct task_struct *p,
 	cpumask_copy(mask, cpu_possible_mask);
 }
 
+static inline int cpuset_cpus_allowed_fallback(struct task_struct *p)
+{
+	cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
+	return cpumask_any(cpu_active_mask);
+}
+
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 {
 	return node_possible_map;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9a747f56d58..9a50c5f6e72 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2188,6 +2188,48 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 	mutex_unlock(&callback_mutex);
 }
 
+int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+{
+	const struct cpuset *cs;
+	int cpu;
+
+	rcu_read_lock();
+	cs = task_cs(tsk);
+	if (cs)
+		cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+	rcu_read_unlock();
+
+	/*
+	 * We own tsk->cpus_allowed, nobody can change it under us.
+	 *
+	 * But we used cs && cs->cpus_allowed lockless and thus can
+	 * race with cgroup_attach_task() or update_cpumask() and get
+	 * the wrong tsk->cpus_allowed. However, both cases imply the
+	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+	 * which takes task_rq_lock().
+	 *
+	 * If we are called after it dropped the lock we must see all
+	 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
+	 * set any mask even if it is not right from task_cs() pov,
+	 * the pending set_cpus_allowed_ptr() will fix things.
+	 */
+
+	cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
+	if (cpu >= nr_cpu_ids) {
+		/*
+		 * Either tsk->cpus_allowed is wrong (see above) or it
+		 * is actually empty. The latter case is only possible
+		 * if we are racing with remove_tasks_in_empty_cpuset().
+		 * Like above we can temporary set any mask and rely on
+		 * set_cpus_allowed_ptr() as synchronization point.
+		 */
+		cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+		cpu = cpumask_any(cpu_active_mask);
+	}
+
+	return cpu;
+}
+
 void cpuset_init_current_mems_allowed(void)
 {
 	nodes_setall(current->mems_allowed);
diff --git a/kernel/sched.c b/kernel/sched.c
index 11119deffa4..9a38c7a24ed 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2300,9 +2300,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 
 	/* No more Mr. Nice Guy. */
 	if (unlikely(dest_cpu >= nr_cpu_ids)) {
-		cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
-		dest_cpu = cpumask_any(cpu_active_mask);
-
+		dest_cpu = cpuset_cpus_allowed_fallback(p);
 		/*
 		 * Don't tell them about moving exiting tasks or
 		 * kernel threads (both mm NULL), since they never
-- 
cgit v1.2.3


From 0017d735092844118bef006696a750a0e4ef6ebd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 24 Mar 2010 18:34:10 +0100
Subject: sched: Fix TASK_WAKING vs fork deadlock

Oleg noticed a few races with the TASK_WAKING usage on fork.

 - since TASK_WAKING is basically a spinlock, it should be IRQ safe
 - since we set TASK_WAKING (*) without holding rq->lock it could
   be there still is a rq->lock holder, thereby not actually
   providing full serialization.

(*) in fact we clear PF_STARTING, which in effect enables TASK_WAKING.

Cure the second issue by not setting TASK_WAKING in sched_fork(), but
only temporarily in wake_up_new_task() while calling select_task_rq().

Cure the first by holding rq->lock around the select_task_rq() call,
this will disable IRQs, this however requires that we push down the
rq->lock release into select_task_rq_fair()'s cgroup stuff.

Because select_task_rq_fair() still needs to drop the rq->lock we
cannot fully get rid of TASK_WAKING.

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |  3 ++-
 kernel/sched.c          | 65 ++++++++++++++++++-------------------------------
 kernel/sched_fair.c     |  8 ++++--
 kernel/sched_idletask.c |  3 ++-
 kernel/sched_rt.c       |  5 ++--
 5 files changed, 36 insertions(+), 48 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8bea40725c7..fb6c18843ee 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1046,7 +1046,8 @@ struct sched_class {
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
-	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+	int  (*select_task_rq)(struct rq *rq, struct task_struct *p,
+			       int sd_flag, int flags);
 
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 	void (*post_schedule) (struct rq *this_rq);
diff --git a/kernel/sched.c b/kernel/sched.c
index 9a38c7a24ed..dcd17736dae 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -916,14 +916,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 /*
  * Check whether the task is waking, we use this to synchronize against
  * ttwu() so that task_cpu() reports a stable number.
- *
- * We need to make an exception for PF_STARTING tasks because the fork
- * path might require task_rq_lock() to work, eg. it can call
- * set_cpus_allowed_ptr() from the cpuset clone_ns code.
  */
 static inline int task_is_waking(struct task_struct *p)
 {
-	return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+	return unlikely(p->state == TASK_WAKING);
 }
 
 /*
@@ -2320,9 +2316,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
  * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
 {
-	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+	int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
@@ -2393,17 +2389,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	if (p->sched_class->task_waking)
 		p->sched_class->task_waking(rq, p);
 
-	__task_rq_unlock(rq);
-
-	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-	if (cpu != orig_cpu) {
-		/*
-		 * Since we migrate the task without holding any rq->lock,
-		 * we need to be careful with task_rq_lock(), since that
-		 * might end up locking an invalid rq.
-		 */
+	cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
+	if (cpu != orig_cpu)
 		set_task_cpu(p, cpu);
-	}
+	__task_rq_unlock(rq);
 
 	rq = cpu_rq(cpu);
 	raw_spin_lock(&rq->lock);
@@ -2530,11 +2519,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
 
 	__sched_fork(p);
 	/*
-	 * We mark the process as waking here. This guarantees that
+	 * We mark the process as running here. This guarantees that
 	 * nobody will actually run it, and a signal or other external
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
-	p->state = TASK_WAKING;
+	p->state = TASK_RUNNING;
 
 	/*
 	 * Revert to default priority/policy on fork if requested.
@@ -2601,28 +2590,25 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	int cpu __maybe_unused = get_cpu();
 
 #ifdef CONFIG_SMP
+	rq = task_rq_lock(p, &flags);
+	p->state = TASK_WAKING;
+
 	/*
 	 * Fork balancing, do it here and not earlier because:
 	 *  - cpus_allowed can change in the fork path
 	 *  - any previously selected cpu might disappear through hotplug
 	 *
-	 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
-	 * ->cpus_allowed is stable, we have preemption disabled, meaning
-	 * cpu_online_mask is stable.
+	 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
+	 * without people poking at ->cpus_allowed.
 	 */
-	cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+	cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
 	set_task_cpu(p, cpu);
-#endif
-
-	/*
-	 * Since the task is not on the rq and we still have TASK_WAKING set
-	 * nobody else will migrate this task.
-	 */
-	rq = cpu_rq(cpu);
-	raw_spin_lock_irqsave(&rq->lock, flags);
 
-	BUG_ON(p->state != TASK_WAKING);
 	p->state = TASK_RUNNING;
+	task_rq_unlock(rq, &flags);
+#endif
+
+	rq = task_rq_lock(p, &flags);
 	activate_task(rq, p, 0);
 	trace_sched_wakeup_new(rq, p, 1);
 	check_preempt_curr(rq, p, WF_FORK);
@@ -3068,19 +3054,15 @@ void sched_exec(void)
 {
 	struct task_struct *p = current;
 	struct migration_req req;
-	int dest_cpu, this_cpu;
 	unsigned long flags;
 	struct rq *rq;
-
-	this_cpu = get_cpu();
-	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
-	if (dest_cpu == this_cpu) {
-		put_cpu();
-		return;
-	}
+	int dest_cpu;
 
 	rq = task_rq_lock(p, &flags);
-	put_cpu();
+	dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+	if (dest_cpu == smp_processor_id())
+		goto unlock;
+
 	/*
 	 * select_task_rq() can race against ->cpus_allowed
 	 */
@@ -3098,6 +3080,7 @@ void sched_exec(void)
 
 		return;
 	}
+unlock:
 	task_rq_unlock(rq, &flags);
 }
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 49ad99378f8..8a5e7632d09 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1423,7 +1423,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
  *
  * preempt must be disabled.
  */
-static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+static int
+select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
 {
 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
@@ -1521,8 +1522,11 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
 				  cpumask_weight(sched_domain_span(sd))))
 			tmp = affine_sd;
 
-		if (tmp)
+		if (tmp) {
+			raw_spin_unlock(&rq->lock);
 			update_shares(tmp);
+			raw_spin_lock(&rq->lock);
+		}
 	}
 #endif
 
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a8a6d8a5094..5af709f503b 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
  */
 
 #ifdef CONFIG_SMP
-static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
+static int
+select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 012d69bb67c..fde895f8044 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
 #ifdef CONFIG_SMP
 static int find_lowest_rq(struct task_struct *task);
 
-static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
+static int
+select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
 {
-	struct rq *rq = task_rq(p);
-
 	if (sd_flag != SD_BALANCE_WAKE)
 		return smp_processor_id();
 
-- 
cgit v1.2.3


From 65cc8e4859ff29a9ddc989c88557d6059834c2a2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 25 Mar 2010 21:05:16 +0100
Subject: sched: Optimize task_rq_lock()

Now that we hold the rq->lock over set_task_cpu() again, we can do
away with most of the TASK_WAKING checks and reduce them again to
set_cpus_allowed_ptr().

Removes some conditionals from scheduling hot-paths.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Oleg Nesterov <oleg@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index dcd17736dae..51d336e08a9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -914,8 +914,8 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
- * Check whether the task is waking, we use this to synchronize against
- * ttwu() so that task_cpu() reports a stable number.
+ * Check whether the task is waking, we use this to synchronize ->cpus_allowed
+ * against ttwu().
  */
 static inline int task_is_waking(struct task_struct *p)
 {
@@ -932,11 +932,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
 	struct rq *rq;
 
 	for (;;) {
-		while (task_is_waking(p))
-			cpu_relax();
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p) && !task_is_waking(p)))
+		if (likely(rq == task_rq(p)))
 			return rq;
 		raw_spin_unlock(&rq->lock);
 	}
@@ -953,12 +951,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	struct rq *rq;
 
 	for (;;) {
-		while (task_is_waking(p))
-			cpu_relax();
 		local_irq_save(*flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p) && !task_is_waking(p)))
+		if (likely(rq == task_rq(p)))
 			return rq;
 		raw_spin_unlock_irqrestore(&rq->lock, *flags);
 	}
@@ -5262,7 +5258,18 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 	struct rq *rq;
 	int ret = 0;
 
+	/*
+	 * Serialize against TASK_WAKING so that ttwu() and wunt() can
+	 * drop the rq->lock and still rely on ->cpus_allowed.
+	 */
+again:
+	while (task_is_waking(p))
+		cpu_relax();
 	rq = task_rq_lock(p, &flags);
+	if (task_is_waking(p)) {
+		task_rq_unlock(rq, &flags);
+		goto again;
+	}
 
 	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
 		ret = -EINVAL;
-- 
cgit v1.2.3


From cc87f76a601d2d256118f7bab15e35254356ae21 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 26 Mar 2010 12:22:14 +0100
Subject: sched: Fix nr_uninterruptible count

The cpuload calculation in calc_load_account_active() assumes
rq->nr_uninterruptible will not change on an offline cpu after
migrate_nr_uninterruptible(). However the recent migrate on wakeup
changes broke that and would result in decrementing the offline cpu's
rq->nr_uninterruptible.

Fix this by accounting the nr_uninterruptible on the waking cpu.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 51d336e08a9..14c8d2a1b38 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2378,8 +2378,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	 *
 	 * First fix up the nr_uninterruptible count:
 	 */
-	if (task_contributes_to_load(p))
-		rq->nr_uninterruptible--;
+	if (task_contributes_to_load(p)) {
+		if (likely(cpu_online(orig_cpu)))
+			rq->nr_uninterruptible--;
+		else
+			this_rq()->nr_uninterruptible--;
+	}
 	p->state = TASK_WAKING;
 
 	if (p->sched_class->task_waking)
-- 
cgit v1.2.3


From 371fd7e7a56a5c136d31aa980011bd2f131c3ef5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 24 Mar 2010 16:38:48 +0100
Subject: sched: Add enqueue/dequeue flags

In order to reduce the dependency on TASK_WAKING rework the enqueue
interface to support a proper flags field.

Replace the int wakeup, bool head arguments with an int flags argument
and create the following flags:

  ENQUEUE_WAKEUP - the enqueue is a wakeup of a sleeping task,
  ENQUEUE_WAKING - the enqueue has relative vruntime due to
                   having sched_class::task_waking() called,
  ENQUEUE_HEAD - the waking task should be places on the head
                 of the priority queue (where appropriate).

For symmetry also convert sched_class::dequeue() to a flags scheme.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   | 11 ++++++++---
 kernel/sched.c          | 32 +++++++++++++++++---------------
 kernel/sched_fair.c     | 25 ++++++++-----------------
 kernel/sched_idletask.c |  2 +-
 kernel/sched_rt.c       |  8 ++++----
 5 files changed, 38 insertions(+), 40 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fb6c18843ee..e3e900f318d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1032,12 +1032,17 @@ struct sched_domain;
 #define WF_SYNC		0x01		/* waker goes to sleep after wakup */
 #define WF_FORK		0x02		/* child wakeup after fork */
 
+#define ENQUEUE_WAKEUP		1
+#define ENQUEUE_WAKING		2
+#define ENQUEUE_HEAD		4
+
+#define DEQUEUE_SLEEP		1
+
 struct sched_class {
 	const struct sched_class *next;
 
-	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup,
-			      bool head);
-	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
+	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
 	void (*yield_task) (struct rq *rq);
 
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
diff --git a/kernel/sched.c b/kernel/sched.c
index 14c8d2a1b38..4a57e96dd6c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1877,44 +1877,43 @@ static void update_avg(u64 *avg, u64 sample)
 	*avg += diff >> 3;
 }
 
-static void
-enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
 	sched_info_queued(p);
-	p->sched_class->enqueue_task(rq, p, wakeup, head);
+	p->sched_class->enqueue_task(rq, p, flags);
 	p->se.on_rq = 1;
 }
 
-static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
 	sched_info_dequeued(p);
-	p->sched_class->dequeue_task(rq, p, sleep);
+	p->sched_class->dequeue_task(rq, p, flags);
 	p->se.on_rq = 0;
 }
 
 /*
  * activate_task - move a task to the runqueue.
  */
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (task_contributes_to_load(p))
 		rq->nr_uninterruptible--;
 
-	enqueue_task(rq, p, wakeup, false);
+	enqueue_task(rq, p, flags);
 	inc_nr_running(rq);
 }
 
 /*
  * deactivate_task - remove a task from the runqueue.
  */
-static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	if (task_contributes_to_load(p))
 		rq->nr_uninterruptible++;
 
-	dequeue_task(rq, p, sleep);
+	dequeue_task(rq, p, flags);
 	dec_nr_running(rq);
 }
 
@@ -2353,6 +2352,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 {
 	int cpu, orig_cpu, this_cpu, success = 0;
 	unsigned long flags;
+	unsigned long en_flags = ENQUEUE_WAKEUP;
 	struct rq *rq;
 
 	this_cpu = get_cpu();
@@ -2386,8 +2386,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	}
 	p->state = TASK_WAKING;
 
-	if (p->sched_class->task_waking)
+	if (p->sched_class->task_waking) {
 		p->sched_class->task_waking(rq, p);
+		en_flags |= ENQUEUE_WAKING;
+	}
 
 	cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
 	if (cpu != orig_cpu)
@@ -2432,7 +2434,7 @@ out_activate:
 		schedstat_inc(p, se.statistics.nr_wakeups_local);
 	else
 		schedstat_inc(p, se.statistics.nr_wakeups_remote);
-	activate_task(rq, p, 1);
+	activate_task(rq, p, en_flags);
 	success = 1;
 
 out_running:
@@ -3623,7 +3625,7 @@ need_resched_nonpreemptible:
 		if (unlikely(signal_pending_state(prev->state, prev)))
 			prev->state = TASK_RUNNING;
 		else
-			deactivate_task(rq, prev, 1);
+			deactivate_task(rq, prev, DEQUEUE_SLEEP);
 		switch_count = &prev->nvcsw;
 	}
 
@@ -4193,7 +4195,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
-		enqueue_task(rq, p, 0, oldprio < prio);
+		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
 		check_class_changed(rq, p, prev_class, oldprio, running);
 	}
@@ -4236,7 +4238,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	delta = p->prio - old_prio;
 
 	if (on_rq) {
-		enqueue_task(rq, p, 0, false);
+		enqueue_task(rq, p, 0);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -8180,7 +8182,7 @@ void sched_move_task(struct task_struct *tsk)
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
 	if (on_rq)
-		enqueue_task(rq, tsk, 0, false);
+		enqueue_task(rq, tsk, 0);
 
 	task_rq_unlock(rq, &flags);
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8a5e7632d09..88d3053ac7c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -757,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	se->vruntime = vruntime;
 }
 
-#define ENQUEUE_WAKEUP	1
-#define ENQUEUE_MIGRATE 2
-
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -767,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update the normalized vruntime before updating min_vruntime
 	 * through callig update_curr().
 	 */
-	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
+	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
 		se->vruntime += cfs_rq->min_vruntime;
 
 	/*
@@ -803,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 static void
-dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
 	/*
 	 * Update run-time statistics of the 'current'.
@@ -811,7 +808,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 	update_curr(cfs_rq);
 
 	update_stats_dequeue(cfs_rq, se);
-	if (sleep) {
+	if (flags & DEQUEUE_SLEEP) {
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -836,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 	 * update can refer to the ->curr item and we need to reflect this
 	 * movement in our normalized position.
 	 */
-	if (!sleep)
+	if (!(flags & DEQUEUE_SLEEP))
 		se->vruntime -= cfs_rq->min_vruntime;
 }
 
@@ -1045,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq)
  * then put the task into the rbtree:
  */
 static void
-enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
-	int flags = 0;
-
-	if (wakeup)
-		flags |= ENQUEUE_WAKEUP;
-	if (p->state == TASK_WAKING)
-		flags |= ENQUEUE_MIGRATE;
 
 	for_each_sched_entity(se) {
 		if (se->on_rq)
@@ -1072,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
  * decreased. We remove the task from the rbtree and
  * update the fair scheduling stats:
  */
-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
-		dequeue_entity(cfs_rq, se, sleep);
+		dequeue_entity(cfs_rq, se, flags);
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight)
 			break;
-		sleep = 1;
+		flags |= DEQUEUE_SLEEP;
 	}
 
 	hrtick_update(rq);
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5af709f503b..bea2b8f1202 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -33,7 +33,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
  * message if some code attempts to do it:
  */
 static void
-dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
+dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
 {
 	raw_spin_unlock_irq(&rq->lock);
 	printk(KERN_ERR "bad: scheduling from the idle thread!\n");
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index fde895f8044..8afb953e31c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
  * Adding/removing a task to/from a priority array:
  */
 static void
-enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
 
-	if (wakeup)
+	if (flags & ENQUEUE_WAKEUP)
 		rt_se->timeout = 0;
 
-	enqueue_rt_entity(rt_se, head);
+	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
 
 	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
 		enqueue_pushable_task(rq, p);
 }
 
-static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
+static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct sched_rt_entity *rt_se = &p->rt;
 
-- 
cgit v1.2.3


From c6e718ff8cdcf5e7855077687720b37c4a07650a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 26 Mar 2010 12:11:06 -0300
Subject: perf symbols: Move more map_groups methods to map.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While writing a standalone test app that uses the symbol system to
find kernel space symbols I noticed these also need to be moved.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/hist.c   |   2 +-
 tools/perf/util/map.c    | 168 ++++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/map.h    |  11 ++-
 tools/perf/util/thread.c | 169 +----------------------------------------------
 4 files changed, 180 insertions(+), 170 deletions(-)

diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 09e09e78cb6..de3190102cc 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -658,7 +658,7 @@ print_entries:
 
 		if (h->ms.map == NULL && verbose > 1) {
 			__map_groups__fprintf_maps(&h->thread->mg,
-						   MAP__FUNCTION, fp);
+						   MAP__FUNCTION, verbose, fp);
 			fprintf(fp, "%.10s end\n", graph_dotted_line);
 		}
 	}
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 9f2963f9ee9..e21f9800173 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -1,4 +1,5 @@
 #include "symbol.h"
+#include <errno.h>
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
@@ -234,6 +235,37 @@ u64 map__objdump_2ip(struct map *map, u64 addr)
 	return ip;
 }
 
+void map_groups__init(struct map_groups *self)
+{
+	int i;
+	for (i = 0; i < MAP__NR_TYPES; ++i) {
+		self->maps[i] = RB_ROOT;
+		INIT_LIST_HEAD(&self->removed_maps[i]);
+	}
+}
+
+void map_groups__flush(struct map_groups *self)
+{
+	int type;
+
+	for (type = 0; type < MAP__NR_TYPES; type++) {
+		struct rb_root *root = &self->maps[type];
+		struct rb_node *next = rb_first(root);
+
+		while (next) {
+			struct map *pos = rb_entry(next, struct map, rb_node);
+			next = rb_next(&pos->rb_node);
+			rb_erase(&pos->rb_node, root);
+			/*
+			 * We may have references to this map, for
+			 * instance in some hist_entry instances, so
+			 * just move them to a separate list.
+			 */
+			list_add_tail(&pos->node, &self->removed_maps[pos->type]);
+		}
+	}
+}
+
 struct symbol *map_groups__find_symbol(struct map_groups *self,
 				       enum map_type type, u64 addr,
 				       symbol_filter_t filter)
@@ -246,6 +278,142 @@ struct symbol *map_groups__find_symbol(struct map_groups *self,
 	return NULL;
 }
 
+size_t __map_groups__fprintf_maps(struct map_groups *self,
+				  enum map_type type, int verbose, FILE *fp)
+{
+	size_t printed = fprintf(fp, "%s:\n", map_type__name[type]);
+	struct rb_node *nd;
+
+	for (nd = rb_first(&self->maps[type]); nd; nd = rb_next(nd)) {
+		struct map *pos = rb_entry(nd, struct map, rb_node);
+		printed += fprintf(fp, "Map:");
+		printed += map__fprintf(pos, fp);
+		if (verbose > 2) {
+			printed += dso__fprintf(pos->dso, type, fp);
+			printed += fprintf(fp, "--\n");
+		}
+	}
+
+	return printed;
+}
+
+size_t map_groups__fprintf_maps(struct map_groups *self, int verbose, FILE *fp)
+{
+	size_t printed = 0, i;
+	for (i = 0; i < MAP__NR_TYPES; ++i)
+		printed += __map_groups__fprintf_maps(self, i, verbose, fp);
+	return printed;
+}
+
+static size_t __map_groups__fprintf_removed_maps(struct map_groups *self,
+						 enum map_type type,
+						 int verbose, FILE *fp)
+{
+	struct map *pos;
+	size_t printed = 0;
+
+	list_for_each_entry(pos, &self->removed_maps[type], node) {
+		printed += fprintf(fp, "Map:");
+		printed += map__fprintf(pos, fp);
+		if (verbose > 1) {
+			printed += dso__fprintf(pos->dso, type, fp);
+			printed += fprintf(fp, "--\n");
+		}
+	}
+	return printed;
+}
+
+static size_t map_groups__fprintf_removed_maps(struct map_groups *self,
+					       int verbose, FILE *fp)
+{
+	size_t printed = 0, i;
+	for (i = 0; i < MAP__NR_TYPES; ++i)
+		printed += __map_groups__fprintf_removed_maps(self, i, verbose, fp);
+	return printed;
+}
+
+size_t map_groups__fprintf(struct map_groups *self, int verbose, FILE *fp)
+{
+	size_t printed = map_groups__fprintf_maps(self, verbose, fp);
+	printed += fprintf(fp, "Removed maps:\n");
+	return printed + map_groups__fprintf_removed_maps(self, verbose, fp);
+}
+
+int map_groups__fixup_overlappings(struct map_groups *self, struct map *map,
+				   int verbose, FILE *fp)
+{
+	struct rb_root *root = &self->maps[map->type];
+	struct rb_node *next = rb_first(root);
+
+	while (next) {
+		struct map *pos = rb_entry(next, struct map, rb_node);
+		next = rb_next(&pos->rb_node);
+
+		if (!map__overlap(pos, map))
+			continue;
+
+		if (verbose >= 2) {
+			fputs("overlapping maps:\n", fp);
+			map__fprintf(map, fp);
+			map__fprintf(pos, fp);
+		}
+
+		rb_erase(&pos->rb_node, root);
+		/*
+		 * We may have references to this map, for instance in some
+		 * hist_entry instances, so just move them to a separate
+		 * list.
+		 */
+		list_add_tail(&pos->node, &self->removed_maps[map->type]);
+		/*
+		 * Now check if we need to create new maps for areas not
+		 * overlapped by the new map:
+		 */
+		if (map->start > pos->start) {
+			struct map *before = map__clone(pos);
+
+			if (before == NULL)
+				return -ENOMEM;
+
+			before->end = map->start - 1;
+			map_groups__insert(self, before);
+			if (verbose >= 2)
+				map__fprintf(before, fp);
+		}
+
+		if (map->end < pos->end) {
+			struct map *after = map__clone(pos);
+
+			if (after == NULL)
+				return -ENOMEM;
+
+			after->start = map->end + 1;
+			map_groups__insert(self, after);
+			if (verbose >= 2)
+				map__fprintf(after, fp);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * XXX This should not really _copy_ te maps, but refcount them.
+ */
+int map_groups__clone(struct map_groups *self,
+		      struct map_groups *parent, enum map_type type)
+{
+	struct rb_node *nd;
+	for (nd = rb_first(&parent->maps[type]); nd; nd = rb_next(nd)) {
+		struct map *map = rb_entry(nd, struct map, rb_node);
+		struct map *new = map__clone(map);
+		if (new == NULL)
+			return -ENOMEM;
+		map_groups__insert(self, new);
+	}
+	return 0;
+}
+
 static u64 map__reloc_map_ip(struct map *map, u64 ip)
 {
 	return ip + (s64)map->pgoff;
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index 6a703fa7470..4e7a11da8ff 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -97,11 +97,14 @@ struct map_groups {
 };
 
 size_t __map_groups__fprintf_maps(struct map_groups *self,
-				  enum map_type type, FILE *fp);
+				  enum map_type type, int verbose, FILE *fp);
 void maps__insert(struct rb_root *maps, struct map *map);
 struct map *maps__find(struct rb_root *maps, u64 addr);
 void map_groups__init(struct map_groups *self);
-size_t map_groups__fprintf_maps(struct map_groups *self, FILE *fp);
+int map_groups__clone(struct map_groups *self,
+		      struct map_groups *parent, enum map_type type);
+size_t map_groups__fprintf(struct map_groups *self, int verbose, FILE *fp);
+size_t map_groups__fprintf_maps(struct map_groups *self, int verbose, FILE *fp);
 
 static inline void map_groups__insert(struct map_groups *self, struct map *map)
 {
@@ -125,6 +128,9 @@ static inline struct symbol *map_groups__find_function(struct map_groups *self,
 	return map_groups__find_symbol(self, MAP__FUNCTION, addr, filter);
 }
 
+int map_groups__fixup_overlappings(struct map_groups *self, struct map *map,
+				   int verbose, FILE *fp);
+
 struct map *map_groups__find_by_name(struct map_groups *self,
 				     enum map_type type, const char *name);
 int __map_groups__create_kernel_maps(struct map_groups *self,
@@ -134,5 +140,6 @@ int map_groups__create_kernel_maps(struct map_groups *self,
 				   struct map *vmlinux_maps[MAP__NR_TYPES]);
 struct map *map_groups__new_module(struct map_groups *self, u64 start,
 				   const char *filename);
+void map_groups__flush(struct map_groups *self);
 
 #endif /* __PERF_MAP_H */
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 9bbe27d7530..1f7ecd47f49 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -38,15 +38,6 @@ failure:
 	return ret;
 }
 
-void map_groups__init(struct map_groups *self)
-{
-	int i;
-	for (i = 0; i < MAP__NR_TYPES; ++i) {
-		self->maps[i] = RB_ROOT;
-		INIT_LIST_HEAD(&self->removed_maps[i]);
-	}
-}
-
 static struct thread *thread__new(pid_t pid)
 {
 	struct thread *self = zalloc(sizeof(*self));
@@ -62,28 +53,6 @@ static struct thread *thread__new(pid_t pid)
 	return self;
 }
 
-static void map_groups__flush(struct map_groups *self)
-{
-	int type;
-
-	for (type = 0; type < MAP__NR_TYPES; type++) {
-		struct rb_root *root = &self->maps[type];
-		struct rb_node *next = rb_first(root);
-
-		while (next) {
-			struct map *pos = rb_entry(next, struct map, rb_node);
-			next = rb_next(&pos->rb_node);
-			rb_erase(&pos->rb_node, root);
-			/*
-			 * We may have references to this map, for
-			 * instance in some hist_entry instances, so
-			 * just move them to a separate list.
-			 */
-			list_add_tail(&pos->node, &self->removed_maps[pos->type]);
-		}
-	}
-}
-
 int thread__set_comm(struct thread *self, const char *comm)
 {
 	int err;
@@ -110,69 +79,10 @@ int thread__comm_len(struct thread *self)
 	return self->comm_len;
 }
 
-size_t __map_groups__fprintf_maps(struct map_groups *self,
-				  enum map_type type, FILE *fp)
-{
-	size_t printed = fprintf(fp, "%s:\n", map_type__name[type]);
-	struct rb_node *nd;
-
-	for (nd = rb_first(&self->maps[type]); nd; nd = rb_next(nd)) {
-		struct map *pos = rb_entry(nd, struct map, rb_node);
-		printed += fprintf(fp, "Map:");
-		printed += map__fprintf(pos, fp);
-		if (verbose > 2) {
-			printed += dso__fprintf(pos->dso, type, fp);
-			printed += fprintf(fp, "--\n");
-		}
-	}
-
-	return printed;
-}
-
-size_t map_groups__fprintf_maps(struct map_groups *self, FILE *fp)
-{
-	size_t printed = 0, i;
-	for (i = 0; i < MAP__NR_TYPES; ++i)
-		printed += __map_groups__fprintf_maps(self, i, fp);
-	return printed;
-}
-
-static size_t __map_groups__fprintf_removed_maps(struct map_groups *self,
-						 enum map_type type, FILE *fp)
-{
-	struct map *pos;
-	size_t printed = 0;
-
-	list_for_each_entry(pos, &self->removed_maps[type], node) {
-		printed += fprintf(fp, "Map:");
-		printed += map__fprintf(pos, fp);
-		if (verbose > 1) {
-			printed += dso__fprintf(pos->dso, type, fp);
-			printed += fprintf(fp, "--\n");
-		}
-	}
-	return printed;
-}
-
-static size_t map_groups__fprintf_removed_maps(struct map_groups *self, FILE *fp)
-{
-	size_t printed = 0, i;
-	for (i = 0; i < MAP__NR_TYPES; ++i)
-		printed += __map_groups__fprintf_removed_maps(self, i, fp);
-	return printed;
-}
-
-static size_t map_groups__fprintf(struct map_groups *self, FILE *fp)
-{
-	size_t printed = map_groups__fprintf_maps(self, fp);
-	printed += fprintf(fp, "Removed maps:\n");
-	return printed + map_groups__fprintf_removed_maps(self, fp);
-}
-
 static size_t thread__fprintf(struct thread *self, FILE *fp)
 {
 	return fprintf(fp, "Thread %d %s\n", self->pid, self->comm) +
-	       map_groups__fprintf(&self->mg, fp);
+	       map_groups__fprintf(&self->mg, verbose, fp);
 }
 
 struct thread *perf_session__findnew(struct perf_session *self, pid_t pid)
@@ -214,87 +124,12 @@ struct thread *perf_session__findnew(struct perf_session *self, pid_t pid)
 	return th;
 }
 
-static int map_groups__fixup_overlappings(struct map_groups *self,
-					  struct map *map)
-{
-	struct rb_root *root = &self->maps[map->type];
-	struct rb_node *next = rb_first(root);
-
-	while (next) {
-		struct map *pos = rb_entry(next, struct map, rb_node);
-		next = rb_next(&pos->rb_node);
-
-		if (!map__overlap(pos, map))
-			continue;
-
-		if (verbose >= 2) {
-			fputs("overlapping maps:\n", stderr);
-			map__fprintf(map, stderr);
-			map__fprintf(pos, stderr);
-		}
-
-		rb_erase(&pos->rb_node, root);
-		/*
-		 * We may have references to this map, for instance in some
-		 * hist_entry instances, so just move them to a separate
-		 * list.
-		 */
-		list_add_tail(&pos->node, &self->removed_maps[map->type]);
-		/*
-		 * Now check if we need to create new maps for areas not
-		 * overlapped by the new map:
-		 */
-		if (map->start > pos->start) {
-			struct map *before = map__clone(pos);
-
-			if (before == NULL)
-				return -ENOMEM;
-
-			before->end = map->start - 1;
-			map_groups__insert(self, before);
-			if (verbose >= 2)
-				map__fprintf(before, stderr);
-		}
-
-		if (map->end < pos->end) {
-			struct map *after = map__clone(pos);
-
-			if (after == NULL)
-				return -ENOMEM;
-
-			after->start = map->end + 1;
-			map_groups__insert(self, after);
-			if (verbose >= 2)
-				map__fprintf(after, stderr);
-		}
-	}
-
-	return 0;
-}
-
 void thread__insert_map(struct thread *self, struct map *map)
 {
-	map_groups__fixup_overlappings(&self->mg, map);
+	map_groups__fixup_overlappings(&self->mg, map, verbose, stderr);
 	map_groups__insert(&self->mg, map);
 }
 
-/*
- * XXX This should not really _copy_ te maps, but refcount them.
- */
-static int map_groups__clone(struct map_groups *self,
-			     struct map_groups *parent, enum map_type type)
-{
-	struct rb_node *nd;
-	for (nd = rb_first(&parent->maps[type]); nd; nd = rb_next(nd)) {
-		struct map *map = rb_entry(nd, struct map, rb_node);
-		struct map *new = map__clone(map);
-		if (new == NULL)
-			return -ENOMEM;
-		map_groups__insert(self, new);
-	}
-	return 0;
-}
-
 int thread__fork(struct thread *self, struct thread *parent)
 {
 	int i;
-- 
cgit v1.2.3


From 7e5e1b1404c30db5f6bc3f5203b6c21c1d244f99 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 26 Mar 2010 12:30:40 -0300
Subject: perf symbols: map_groups__find_symbol must return the map too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tools need to know from which map in the map_group a symbol was resolved
to, so that, for isntance, we can annotate kernel modules symbols by
getting its precise name, etc.

Also add the _by_name variants for completeness.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-kmem.c |  3 ++-
 tools/perf/util/map.c     | 28 +++++++++++++++++++++++++++-
 tools/perf/util/map.h     | 23 +++++++++++++++++++----
 3 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 924a9518931..32edb6a8687 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -369,7 +369,8 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 		if (is_caller) {
 			addr = data->call_site;
 			if (!raw_ip)
-				sym = map_groups__find_function(&session->kmaps, addr, NULL);
+				sym = map_groups__find_function(&session->kmaps,
+								addr, NULL, NULL);
 		} else
 			addr = data->ptr;
 
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index e21f9800173..37913b241bd 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -268,12 +268,38 @@ void map_groups__flush(struct map_groups *self)
 
 struct symbol *map_groups__find_symbol(struct map_groups *self,
 				       enum map_type type, u64 addr,
+				       struct map **mapp,
 				       symbol_filter_t filter)
 {
 	struct map *map = map_groups__find(self, type, addr);
 
-	if (map != NULL)
+	if (map != NULL) {
+		if (mapp != NULL)
+			*mapp = map;
 		return map__find_symbol(map, map->map_ip(map, addr), filter);
+	}
+
+	return NULL;
+}
+
+struct symbol *map_groups__find_symbol_by_name(struct map_groups *self,
+					       enum map_type type,
+					       const char *name,
+					       struct map **mapp,
+					       symbol_filter_t filter)
+{
+	struct rb_node *nd;
+
+	for (nd = rb_first(&self->maps[type]); nd; nd = rb_next(nd)) {
+		struct map *pos = rb_entry(nd, struct map, rb_node);
+		struct symbol *sym = map__find_symbol_by_name(pos, name, filter);
+
+		if (sym == NULL)
+			continue;
+		if (mapp != NULL)
+			*mapp = pos;
+		return sym;
+	}
 
 	return NULL;
 }
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index 4e7a11da8ff..2031278cc06 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -119,13 +119,28 @@ static inline struct map *map_groups__find(struct map_groups *self,
 
 struct symbol *map_groups__find_symbol(struct map_groups *self,
 				       enum map_type type, u64 addr,
+				       struct map **mapp,
 				       symbol_filter_t filter);
 
-static inline struct symbol *map_groups__find_function(struct map_groups *self,
-						       u64 addr,
-						       symbol_filter_t filter)
+struct symbol *map_groups__find_symbol_by_name(struct map_groups *self,
+					       enum map_type type,
+					       const char *name,
+					       struct map **mapp,
+					       symbol_filter_t filter);
+
+static inline
+struct symbol *map_groups__find_function(struct map_groups *self, u64 addr,
+					 struct map **mapp, symbol_filter_t filter)
+{
+	return map_groups__find_symbol(self, MAP__FUNCTION, addr, mapp, filter);
+}
+
+static inline
+struct symbol *map_groups__find_function_by_name(struct map_groups *self,
+						 const char *name, struct map **mapp,
+						 symbol_filter_t filter)
 {
-	return map_groups__find_symbol(self, MAP__FUNCTION, addr, filter);
+	return map_groups__find_symbol_by_name(self, MAP__FUNCTION, name, mapp, filter);
 }
 
 int map_groups__fixup_overlappings(struct map_groups *self, struct map *map,
-- 
cgit v1.2.3


From 5f4d3f8816461300ce54505c9117bf85b3044aa0 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 26 Mar 2010 21:16:22 -0300
Subject: perf report: Add progress bars
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For when we are processing the events and inserting the entries in the
browser.

Experimentation here: naming "ui_something" we may be treading into
creating a TUI/GUI set of routines that can then be implemented in terms
of multiple backends.

Also the time it takes for adding things to the "browser" takes, visually
(I guess I should do some profiling here ;-) ), more time than for
processing the events...

That means we probably need to create a custom hist_entry browser, so
that we reuse the structures we have in place instead of duplicating
them in newt.

But progress was made and at least we can see something while long files
are being loaded, that must be one of UI 101 bullet points :-)

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c |   7 +--
 tools/perf/util/debug.h     |  16 +++++++
 tools/perf/util/hist.c      |   5 ++-
 tools/perf/util/hist.h      |   2 +-
 tools/perf/util/newt.c      | 103 ++++++++++++++++++++++++++++++++++----------
 tools/perf/util/session.c   |   6 +++
 tools/perf/util/session.h   |  12 ++++--
 7 files changed, 119 insertions(+), 32 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 6ab16980dd6..381918515a5 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -303,13 +303,14 @@ static int __cmd_report(void)
 	next = rb_first(&session->stats_by_id);
 	while (next) {
 		struct event_stat_id *stats;
+		u64 nr_hists;
 
 		stats = rb_entry(next, struct event_stat_id, rb_node);
 		perf_session__collapse_resort(&stats->hists);
-		perf_session__output_resort(&stats->hists, stats->stats.total);
-
+		nr_hists = perf_session__output_resort(&stats->hists,
+						       stats->stats.total);
 		if (use_browser)
-			perf_session__browse_hists(&stats->hists,
+			perf_session__browse_hists(&stats->hists, nr_hists,
 						   stats->stats.total, help);
 		else {
 			if (rb_first(&session->stats_by_id) ==
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index 0172edf3f15..5cb0a1b1401 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -10,13 +10,29 @@ extern int dump_trace;
 int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
 void trace_event(event_t *event);
 
+struct ui_progress;
+
 #ifdef NO_NEWT_SUPPORT
 static inline int browser__show_help(const char *format __used, va_list ap __used)
 {
 	return 0;
 }
+
+static inline struct ui_progress *ui_progress__new(const char *title __used,
+						   u64 total __used)
+{
+	return (struct ui_progress *)1;
+}
+
+static inline void ui_progress__update(struct ui_progress *self __used,
+				       u64 curr __used) {}
+
+static inline void ui_progress__delete(struct ui_progress *self __used) {}
 #else
 int browser__show_help(const char *format, va_list ap);
+struct ui_progress *ui_progress__new(const char *title, u64 total);
+void ui_progress__update(struct ui_progress *self, u64 curr);
+void ui_progress__delete(struct ui_progress *self);
 #endif
 
 #endif	/* __PERF_DEBUG_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index de3190102cc..a46d0933246 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -185,12 +185,13 @@ static void perf_session__insert_output_hist_entry(struct rb_root *root,
 	rb_insert_color(&he->rb_node, root);
 }
 
-void perf_session__output_resort(struct rb_root *hists, u64 total_samples)
+u64 perf_session__output_resort(struct rb_root *hists, u64 total_samples)
 {
 	struct rb_root tmp;
 	struct rb_node *next;
 	struct hist_entry *n;
 	u64 min_callchain_hits;
+	u64 nr_hists = 0;
 
 	min_callchain_hits =
 		total_samples * (callchain_param.min_percent / 100);
@@ -205,9 +206,11 @@ void perf_session__output_resort(struct rb_root *hists, u64 total_samples)
 		rb_erase(&n->rb_node, hists);
 		perf_session__insert_output_hist_entry(&tmp, n,
 						       min_callchain_hits);
+		++nr_hists;
 	}
 
 	*hists = tmp;
+	return nr_hists;
 }
 
 static size_t callchain__fprintf_left_margin(FILE *fp, int left_margin)
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index fe366ce5db4..da6a8c1320f 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -25,7 +25,7 @@ size_t hist_entry__fprintf(struct hist_entry *self,
 			   u64 session_total);
 void hist_entry__free(struct hist_entry *);
 
-void perf_session__output_resort(struct rb_root *hists, u64 total_samples);
+u64 perf_session__output_resort(struct rb_root *hists, u64 total_samples);
 void perf_session__collapse_resort(struct rb_root *hists);
 size_t perf_session__fprintf_hists(struct rb_root *hists,
 				   struct perf_session *pair,
diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index e99bcc8d193..b0210ae5b93 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -12,6 +12,72 @@
 #include "sort.h"
 #include "symbol.h"
 
+struct ui_progress {
+	newtComponent form, scale;
+};
+
+struct ui_progress *ui_progress__new(const char *title, u64 total)
+{
+	struct ui_progress *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		int cols;
+		newtGetScreenSize(&cols, NULL);
+		cols -= 4;
+		newtCenteredWindow(cols, 1, title);
+		self->form  = newtForm(NULL, NULL, 0);
+		if (self->form == NULL)
+			goto out_free_self;
+		self->scale = newtScale(0, 0, cols, total);
+		if (self->scale == NULL)
+			goto out_free_form;
+		newtFormAddComponents(self->form, self->scale, NULL);
+		newtRefresh();
+	}
+
+	return self;
+
+out_free_form:
+	newtFormDestroy(self->form);
+out_free_self:
+	free(self);
+	return NULL;
+}
+
+void ui_progress__update(struct ui_progress *self, u64 curr)
+{
+	newtScaleSet(self->scale, curr);
+	newtRefresh();
+}
+
+void ui_progress__delete(struct ui_progress *self)
+{
+	newtFormDestroy(self->form);
+	newtPopWindow();
+	free(self);
+}
+
+static char browser__last_msg[1024];
+
+int browser__show_help(const char *format, va_list ap)
+{
+	int ret;
+	static int backlog;
+
+        ret = vsnprintf(browser__last_msg + backlog,
+			sizeof(browser__last_msg) - backlog, format, ap);
+	backlog += ret;
+
+	if (browser__last_msg[backlog - 1] == '\n') {
+		newtPopHelpLine();
+		newtPushHelpLine(browser__last_msg);
+		newtRefresh();
+		backlog = 0;
+	}
+
+	return ret;
+}
+
 static void newt_form__set_exit_keys(newtComponent self)
 {
 	newtFormAddHotKey(self, NEWT_KEY_ESCAPE);
@@ -364,8 +430,8 @@ static void perf_session__selection(newtComponent self, void *data)
 	*symbol_ptr = newt__symbol_tree_get_current(self);
 }
 
-void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
-				const char *helpline)
+int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
+			       u64 session_total, const char *helpline)
 {
 	struct sort_entry *se;
 	struct rb_node *nd;
@@ -378,6 +444,12 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 	newtComponent form, tree;
 	struct newtExitStruct es;
 	const struct map_symbol *selection;
+	u64 curr_hist = 0;
+	struct ui_progress *progress;
+
+	progress = ui_progress__new("Adding entries to the browser...", nr_hists);
+	if (progress == NULL)
+		return -1;
 
 	snprintf(str, sizeof(str), "Samples: %Ld", session_total);
 	newtDrawRootText(0, 0, str);
@@ -419,8 +491,13 @@ void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
 			max_len = len;
 		if (symbol_conf.use_callchain)
 			hist_entry__append_callchain_browser(h, tree, session_total, idx++);
+		++curr_hist;
+		if (curr_hist % 5)
+			ui_progress__update(progress, curr_hist);
 	}
 
+	ui_progress__delete(progress);
+
 	if (max_len > cols)
 		max_len = cols - 3;
 
@@ -480,27 +557,7 @@ do_annotate:
 
 	newtFormDestroy(form);
 	newtPopWindow();
-}
-
-static char browser__last_msg[1024];
-
-int browser__show_help(const char *format, va_list ap)
-{
-	int ret;
-	static int backlog;
-
-        ret = vsnprintf(browser__last_msg + backlog,
-			sizeof(browser__last_msg) - backlog, format, ap);
-	backlog += ret;
-
-	if (browser__last_msg[backlog - 1] == '\n') {
-		newtPopHelpLine();
-		newtPushHelpLine(browser__last_msg);
-		newtRefresh();
-		backlog = 0;
-	}
-
-	return ret;
+	return 0;
 }
 
 void setup_browser(void)
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 76b4ac689df..32765cdca05 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -397,6 +397,10 @@ int __perf_session__process_events(struct perf_session *self,
 	event_t *event;
 	uint32_t size;
 	char *buf;
+	struct ui_progress *progress = ui_progress__new("Processing events...",
+							self->size);
+	if (progress == NULL)
+		return -1;
 
 	perf_event_ops__fill_defaults(ops);
 
@@ -425,6 +429,7 @@ remap:
 
 more:
 	event = (event_t *)(buf + head);
+	ui_progress__update(progress, offset);
 
 	if (self->header.needs_swap)
 		perf_event_header__bswap(&event->header);
@@ -475,6 +480,7 @@ more:
 done:
 	err = 0;
 out_err:
+	ui_progress__delete(progress);
 	return err;
 }
 
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 631f8157fc1..6a15daeda57 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -88,11 +88,15 @@ static inline struct map *
 }
 
 #ifdef NO_NEWT_SUPPORT
-static inline void perf_session__browse_hists(struct rb_root *hists __used,
+static inline int perf_session__browse_hists(struct rb_root *hists __used,
+					      u64 nr_hists __used,
 					      u64 session_total __used,
-					      const char *helpline __used) {}
+					      const char *helpline __used)
+{
+	return 0;
+}
 #else
-void perf_session__browse_hists(struct rb_root *hists, u64 session_total,
-				const char *helpline);
+int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
+				u64 session_total, const char *helpline);
 #endif
 #endif /* __PERF_SESSION_H */
-- 
cgit v1.2.3


From 8b2c551f9635bf1c5c2d38de300137998915478f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 27 Mar 2010 11:43:36 -0300
Subject: perf tools: Use -o $(BITBUCKET) in one more case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As described in 1703f2c some gcc versions has issues using /dev/null, so
use the mechanism used elsewhere.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 69036457761..120736c03a5 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -542,7 +542,7 @@ PYTHON_EMBED_LDOPTS = `python-config --ldflags 2>/dev/null`
 PYTHON_EMBED_CCOPTS = `python-config --cflags 2>/dev/null`
 endif
 
-ifneq ($(shell sh -c "(echo '\#include <Python.h>'; echo 'int main(void) { Py_Initialize(); return 0; }') | $(CC) -x c - $(PYTHON_EMBED_CCOPTS) -o /dev/null $(PYTHON_EMBED_LDOPTS) > /dev/null 2>&1 && echo y"), y)
+ifneq ($(shell sh -c "(echo '\#include <Python.h>'; echo 'int main(void) { Py_Initialize(); return 0; }') | $(CC) -x c - $(PYTHON_EMBED_CCOPTS) -o $(BITBUCKET) $(PYTHON_EMBED_LDOPTS) > /dev/null 2>&1 && echo y"), y)
 	BASIC_CFLAGS += -DNO_LIBPYTHON
 else
 	ALL_LDFLAGS += $(PYTHON_EMBED_LDOPTS)
-- 
cgit v1.2.3


From c29ede615fd35a640e771fbbb1778e915fac43a7 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 27 Mar 2010 14:30:45 -0300
Subject: perf tools: Allow specifying O= to build files in a separate
 directory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Avoiding polluting the source tree with build files.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile              | 322 ++++++++++++++++++++-------------------
 tools/perf/util/PERF-VERSION-GEN |   6 +-
 2 files changed, 172 insertions(+), 156 deletions(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 120736c03a5..93029c09cd6 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -1,3 +1,7 @@
+ifeq ("$(origin O)", "command line")
+	OUTPUT := $(O)/
+endif
+
 # The default target of this Makefile is...
 all::
 
@@ -153,9 +157,13 @@ all::
 #
 # Define NO_DWARF if you do not want debug-info analysis feature at all.
 
-PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
-	@$(SHELL_PATH) util/PERF-VERSION-GEN
--include PERF-VERSION-FILE
+$(shell sh -c 'mkdir -p $(OUTPUT)scripts/python/Perf-Trace-Util/' 2> /dev/null)
+$(shell sh -c 'mkdir -p $(OUTPUT)util/scripting-engines/' 2> /dev/null)
+$(shell sh -c 'mkdir $(OUTPUT)bench' 2> /dev/null)
+
+$(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
+	@$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT)
+-include $(OUTPUT)PERF-VERSION-FILE
 
 uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
 uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
@@ -310,7 +318,7 @@ PROGRAMS += $(EXTRA_PROGRAMS)
 #
 # Single 'perf' binary right now:
 #
-PROGRAMS += perf
+PROGRAMS += $(OUTPUT)perf
 
 # List built-in command $C whose implementation cmd_$C() is not in
 # builtin-$C.o but is linked in as part of some other command.
@@ -320,7 +328,7 @@ PROGRAMS += perf
 ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
 
 # what 'all' will build but not install in perfexecdir
-OTHER_PROGRAMS = perf$X
+OTHER_PROGRAMS = $(OUTPUT)perf$X
 
 # Set paths to tools early so that they can be used for version tests.
 ifndef SHELL_PATH
@@ -332,7 +340,7 @@ endif
 
 export PERL_PATH
 
-LIB_FILE=libperf.a
+LIB_FILE=$(OUTPUT)libperf.a
 
 LIB_H += ../../include/linux/perf_event.h
 LIB_H += ../../include/linux/rbtree.h
@@ -393,77 +401,77 @@ LIB_H += util/probe-finder.h
 LIB_H += util/probe-event.h
 LIB_H += util/cpumap.h
 
-LIB_OBJS += util/abspath.o
-LIB_OBJS += util/alias.o
-LIB_OBJS += util/build-id.o
-LIB_OBJS += util/config.o
-LIB_OBJS += util/ctype.o
-LIB_OBJS += util/debugfs.o
-LIB_OBJS += util/environment.o
-LIB_OBJS += util/event.o
-LIB_OBJS += util/exec_cmd.o
-LIB_OBJS += util/help.o
-LIB_OBJS += util/levenshtein.o
-LIB_OBJS += util/parse-options.o
-LIB_OBJS += util/parse-events.o
-LIB_OBJS += util/path.o
-LIB_OBJS += util/rbtree.o
-LIB_OBJS += util/bitmap.o
-LIB_OBJS += util/hweight.o
-LIB_OBJS += util/find_next_bit.o
-LIB_OBJS += util/run-command.o
-LIB_OBJS += util/quote.o
-LIB_OBJS += util/strbuf.o
-LIB_OBJS += util/string.o
-LIB_OBJS += util/strlist.o
-LIB_OBJS += util/usage.o
-LIB_OBJS += util/wrapper.o
-LIB_OBJS += util/sigchain.o
-LIB_OBJS += util/symbol.o
-LIB_OBJS += util/color.o
-LIB_OBJS += util/pager.o
-LIB_OBJS += util/header.o
-LIB_OBJS += util/callchain.o
-LIB_OBJS += util/values.o
-LIB_OBJS += util/debug.o
-LIB_OBJS += util/map.o
-LIB_OBJS += util/session.o
-LIB_OBJS += util/thread.o
-LIB_OBJS += util/trace-event-parse.o
-LIB_OBJS += util/trace-event-read.o
-LIB_OBJS += util/trace-event-info.o
-LIB_OBJS += util/trace-event-scripting.o
-LIB_OBJS += util/svghelper.o
-LIB_OBJS += util/sort.o
-LIB_OBJS += util/hist.o
-LIB_OBJS += util/probe-event.o
-LIB_OBJS += util/util.o
-LIB_OBJS += util/cpumap.o
-
-BUILTIN_OBJS += builtin-annotate.o
-
-BUILTIN_OBJS += builtin-bench.o
+LIB_OBJS += $(OUTPUT)util/abspath.o
+LIB_OBJS += $(OUTPUT)util/alias.o
+LIB_OBJS += $(OUTPUT)util/build-id.o
+LIB_OBJS += $(OUTPUT)util/config.o
+LIB_OBJS += $(OUTPUT)util/ctype.o
+LIB_OBJS += $(OUTPUT)util/debugfs.o
+LIB_OBJS += $(OUTPUT)util/environment.o
+LIB_OBJS += $(OUTPUT)util/event.o
+LIB_OBJS += $(OUTPUT)util/exec_cmd.o
+LIB_OBJS += $(OUTPUT)util/help.o
+LIB_OBJS += $(OUTPUT)util/levenshtein.o
+LIB_OBJS += $(OUTPUT)util/parse-options.o
+LIB_OBJS += $(OUTPUT)util/parse-events.o
+LIB_OBJS += $(OUTPUT)util/path.o
+LIB_OBJS += $(OUTPUT)util/rbtree.o
+LIB_OBJS += $(OUTPUT)util/bitmap.o
+LIB_OBJS += $(OUTPUT)util/hweight.o
+LIB_OBJS += $(OUTPUT)util/find_next_bit.o
+LIB_OBJS += $(OUTPUT)util/run-command.o
+LIB_OBJS += $(OUTPUT)util/quote.o
+LIB_OBJS += $(OUTPUT)util/strbuf.o
+LIB_OBJS += $(OUTPUT)util/string.o
+LIB_OBJS += $(OUTPUT)util/strlist.o
+LIB_OBJS += $(OUTPUT)util/usage.o
+LIB_OBJS += $(OUTPUT)util/wrapper.o
+LIB_OBJS += $(OUTPUT)util/sigchain.o
+LIB_OBJS += $(OUTPUT)util/symbol.o
+LIB_OBJS += $(OUTPUT)util/color.o
+LIB_OBJS += $(OUTPUT)util/pager.o
+LIB_OBJS += $(OUTPUT)util/header.o
+LIB_OBJS += $(OUTPUT)util/callchain.o
+LIB_OBJS += $(OUTPUT)util/values.o
+LIB_OBJS += $(OUTPUT)util/debug.o
+LIB_OBJS += $(OUTPUT)util/map.o
+LIB_OBJS += $(OUTPUT)util/session.o
+LIB_OBJS += $(OUTPUT)util/thread.o
+LIB_OBJS += $(OUTPUT)util/trace-event-parse.o
+LIB_OBJS += $(OUTPUT)util/trace-event-read.o
+LIB_OBJS += $(OUTPUT)util/trace-event-info.o
+LIB_OBJS += $(OUTPUT)util/trace-event-scripting.o
+LIB_OBJS += $(OUTPUT)util/svghelper.o
+LIB_OBJS += $(OUTPUT)util/sort.o
+LIB_OBJS += $(OUTPUT)util/hist.o
+LIB_OBJS += $(OUTPUT)util/probe-event.o
+LIB_OBJS += $(OUTPUT)util/util.o
+LIB_OBJS += $(OUTPUT)util/cpumap.o
+
+BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o
+
+BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
 
 # Benchmark modules
-BUILTIN_OBJS += bench/sched-messaging.o
-BUILTIN_OBJS += bench/sched-pipe.o
-BUILTIN_OBJS += bench/mem-memcpy.o
-
-BUILTIN_OBJS += builtin-diff.o
-BUILTIN_OBJS += builtin-help.o
-BUILTIN_OBJS += builtin-sched.o
-BUILTIN_OBJS += builtin-buildid-list.o
-BUILTIN_OBJS += builtin-buildid-cache.o
-BUILTIN_OBJS += builtin-list.o
-BUILTIN_OBJS += builtin-record.o
-BUILTIN_OBJS += builtin-report.o
-BUILTIN_OBJS += builtin-stat.o
-BUILTIN_OBJS += builtin-timechart.o
-BUILTIN_OBJS += builtin-top.o
-BUILTIN_OBJS += builtin-trace.o
-BUILTIN_OBJS += builtin-probe.o
-BUILTIN_OBJS += builtin-kmem.o
-BUILTIN_OBJS += builtin-lock.o
+BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
+BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
+
+BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
+BUILTIN_OBJS += $(OUTPUT)builtin-help.o
+BUILTIN_OBJS += $(OUTPUT)builtin-sched.o
+BUILTIN_OBJS += $(OUTPUT)builtin-buildid-list.o
+BUILTIN_OBJS += $(OUTPUT)builtin-buildid-cache.o
+BUILTIN_OBJS += $(OUTPUT)builtin-list.o
+BUILTIN_OBJS += $(OUTPUT)builtin-record.o
+BUILTIN_OBJS += $(OUTPUT)builtin-report.o
+BUILTIN_OBJS += $(OUTPUT)builtin-stat.o
+BUILTIN_OBJS += $(OUTPUT)builtin-timechart.o
+BUILTIN_OBJS += $(OUTPUT)builtin-top.o
+BUILTIN_OBJS += $(OUTPUT)builtin-trace.o
+BUILTIN_OBJS += $(OUTPUT)builtin-probe.o
+BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o
+BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
 
 PERFLIBS = $(LIB_FILE)
 
@@ -494,6 +502,10 @@ ifeq ($(uname_S),Darwin)
 	PTHREAD_LIBS =
 endif
 
+ifneq ($(OUTPUT),)
+	BASIC_CFLAGS += -I$(OUTPUT)
+endif
+
 ifeq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
 ifneq ($(shell sh -c "(echo '\#include <gnu/libc-version.h>'; echo 'int main(void) { const char * version = gnu_get_libc_version(); return (long)version; }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
 	msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]/glibc-static);
@@ -512,7 +524,7 @@ else
 ifndef NO_DWARF
 	BASIC_CFLAGS += -I/usr/include/elfutils -DDWARF_SUPPORT
 	EXTLIBS += -lelf -ldw
-	LIB_OBJS += util/probe-finder.o
+	LIB_OBJS += $(OUTPUT)util/probe-finder.o
 endif
 endif
 
@@ -521,7 +533,7 @@ ifneq ($(shell sh -c "(echo '\#include <newt.h>'; echo 'int main(void) { newtIni
 	BASIC_CFLAGS += -DNO_NEWT_SUPPORT
 else
 	EXTLIBS += -lnewt
-	LIB_OBJS += util/newt.o
+	LIB_OBJS += $(OUTPUT)util/newt.o
 endif
 
 ifndef NO_LIBPERL
@@ -533,8 +545,8 @@ ifneq ($(shell sh -c "(echo '\#include <EXTERN.h>'; echo '\#include <perl.h>'; e
 	BASIC_CFLAGS += -DNO_LIBPERL
 else
 	ALL_LDFLAGS += $(PERL_EMBED_LDOPTS)
-	LIB_OBJS += util/scripting-engines/trace-event-perl.o
-	LIB_OBJS += scripts/perl/Perf-Trace-Util/Context.o
+	LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-perl.o
+	LIB_OBJS += $(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o
 endif
 
 ifndef NO_LIBPYTHON
@@ -546,8 +558,8 @@ ifneq ($(shell sh -c "(echo '\#include <Python.h>'; echo 'int main(void) { Py_In
 	BASIC_CFLAGS += -DNO_LIBPYTHON
 else
 	ALL_LDFLAGS += $(PYTHON_EMBED_LDOPTS)
-	LIB_OBJS += util/scripting-engines/trace-event-python.o
-	LIB_OBJS += scripts/python/Perf-Trace-Util/Context.o
+	LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o
+	LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o
 endif
 
 ifdef NO_DEMANGLE
@@ -618,53 +630,53 @@ ifdef NO_C99_FORMAT
 endif
 ifdef SNPRINTF_RETURNS_BOGUS
 	COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS
-	COMPAT_OBJS += compat/snprintf.o
+	COMPAT_OBJS += $(OUTPUT)compat/snprintf.o
 endif
 ifdef FREAD_READS_DIRECTORIES
 	COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
-	COMPAT_OBJS += compat/fopen.o
+	COMPAT_OBJS += $(OUTPUT)compat/fopen.o
 endif
 ifdef NO_SYMLINK_HEAD
 	BASIC_CFLAGS += -DNO_SYMLINK_HEAD
 endif
 ifdef NO_STRCASESTR
 	COMPAT_CFLAGS += -DNO_STRCASESTR
-	COMPAT_OBJS += compat/strcasestr.o
+	COMPAT_OBJS += $(OUTPUT)compat/strcasestr.o
 endif
 ifdef NO_STRTOUMAX
 	COMPAT_CFLAGS += -DNO_STRTOUMAX
-	COMPAT_OBJS += compat/strtoumax.o
+	COMPAT_OBJS += $(OUTPUT)compat/strtoumax.o
 endif
 ifdef NO_STRTOULL
 	COMPAT_CFLAGS += -DNO_STRTOULL
 endif
 ifdef NO_SETENV
 	COMPAT_CFLAGS += -DNO_SETENV
-	COMPAT_OBJS += compat/setenv.o
+	COMPAT_OBJS += $(OUTPUT)compat/setenv.o
 endif
 ifdef NO_MKDTEMP
 	COMPAT_CFLAGS += -DNO_MKDTEMP
-	COMPAT_OBJS += compat/mkdtemp.o
+	COMPAT_OBJS += $(OUTPUT)compat/mkdtemp.o
 endif
 ifdef NO_UNSETENV
 	COMPAT_CFLAGS += -DNO_UNSETENV
-	COMPAT_OBJS += compat/unsetenv.o
+	COMPAT_OBJS += $(OUTPUT)compat/unsetenv.o
 endif
 ifdef NO_SYS_SELECT_H
 	BASIC_CFLAGS += -DNO_SYS_SELECT_H
 endif
 ifdef NO_MMAP
 	COMPAT_CFLAGS += -DNO_MMAP
-	COMPAT_OBJS += compat/mmap.o
+	COMPAT_OBJS += $(OUTPUT)compat/mmap.o
 else
 	ifdef USE_WIN32_MMAP
 		COMPAT_CFLAGS += -DUSE_WIN32_MMAP
-		COMPAT_OBJS += compat/win32mmap.o
+		COMPAT_OBJS += $(OUTPUT)compat/win32mmap.o
 	endif
 endif
 ifdef NO_PREAD
 	COMPAT_CFLAGS += -DNO_PREAD
-	COMPAT_OBJS += compat/pread.o
+	COMPAT_OBJS += $(OUTPUT)compat/pread.o
 endif
 ifdef NO_FAST_WORKING_DIRECTORY
 	BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
@@ -686,10 +698,10 @@ else
 endif
 endif
 ifdef NO_INET_NTOP
-	LIB_OBJS += compat/inet_ntop.o
+	LIB_OBJS += $(OUTPUT)compat/inet_ntop.o
 endif
 ifdef NO_INET_PTON
-	LIB_OBJS += compat/inet_pton.o
+	LIB_OBJS += $(OUTPUT)compat/inet_pton.o
 endif
 
 ifdef NO_ICONV
@@ -706,15 +718,15 @@ endif
 
 ifdef PPC_SHA1
 	SHA1_HEADER = "ppc/sha1.h"
-	LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
+	LIB_OBJS += $(OUTPUT)ppc/sha1.o ppc/sha1ppc.o
 else
 ifdef ARM_SHA1
 	SHA1_HEADER = "arm/sha1.h"
-	LIB_OBJS += arm/sha1.o arm/sha1_arm.o
+	LIB_OBJS += $(OUTPUT)arm/sha1.o $(OUTPUT)arm/sha1_arm.o
 else
 ifdef MOZILLA_SHA1
 	SHA1_HEADER = "mozilla-sha1/sha1.h"
-	LIB_OBJS += mozilla-sha1/sha1.o
+	LIB_OBJS += $(OUTPUT)mozilla-sha1/sha1.o
 else
 	SHA1_HEADER = <openssl/sha.h>
 	EXTLIBS += $(LIB_4_CRYPTO)
@@ -726,15 +738,15 @@ ifdef NO_PERL_MAKEMAKER
 endif
 ifdef NO_HSTRERROR
 	COMPAT_CFLAGS += -DNO_HSTRERROR
-	COMPAT_OBJS += compat/hstrerror.o
+	COMPAT_OBJS += $(OUTPUT)compat/hstrerror.o
 endif
 ifdef NO_MEMMEM
 	COMPAT_CFLAGS += -DNO_MEMMEM
-	COMPAT_OBJS += compat/memmem.o
+	COMPAT_OBJS += $(OUTPUT)compat/memmem.o
 endif
 ifdef INTERNAL_QSORT
 	COMPAT_CFLAGS += -DINTERNAL_QSORT
-	COMPAT_OBJS += compat/qsort.o
+	COMPAT_OBJS += $(OUTPUT)compat/qsort.o
 endif
 ifdef RUNTIME_PREFIX
 	COMPAT_CFLAGS += -DRUNTIME_PREFIX
@@ -814,7 +826,7 @@ export TAR INSTALL DESTDIR SHELL_PATH
 
 SHELL = $(SHELL_PATH)
 
-all:: .perf.dev.null shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) PERF-BUILD-OPTIONS
+all:: .perf.dev.null shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) $(OUTPUT)PERF-BUILD-OPTIONS
 ifneq (,$X)
 	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
 endif
@@ -826,39 +838,39 @@ please_set_SHELL_PATH_to_a_more_modern_shell:
 
 shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
 
-strip: $(PROGRAMS) perf$X
-	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) perf$X
+strip: $(PROGRAMS) $(OUTPUT)perf$X
+	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf$X
 
-perf.o: perf.c common-cmds.h PERF-CFLAGS
+$(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
 	$(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
 		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
-		$(ALL_CFLAGS) -c $(filter %.c,$^)
+		$(ALL_CFLAGS) -c $(filter %.c,$^) -o $@
 
-perf$X: perf.o $(BUILTIN_OBJS) $(PERFLIBS)
-	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ perf.o \
+$(OUTPUT)perf$X: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(OUTPUT)perf.o \
 		$(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
 
-builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
+$(OUTPUT)builtin-help.o: builtin-help.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) \
 		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
 		'-DPERF_MAN_PATH="$(mandir_SQ)"' \
 		'-DPERF_INFO_PATH="$(infodir_SQ)"' $<
 
-builtin-timechart.o: builtin-timechart.c common-cmds.h PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
+$(OUTPUT)builtin-timechart.o: builtin-timechart.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) \
 		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
 		'-DPERF_MAN_PATH="$(mandir_SQ)"' \
 		'-DPERF_INFO_PATH="$(infodir_SQ)"' $<
 
-$(BUILT_INS): perf$X
+$(BUILT_INS): $(OUTPUT)perf$X
 	$(QUIET_BUILT_IN)$(RM) $@ && \
 	ln perf$X $@ 2>/dev/null || \
 	ln -s perf$X $@ 2>/dev/null || \
 	cp perf$X $@
 
-common-cmds.h: util/generate-cmdlist.sh command-list.txt
+$(OUTPUT)common-cmds.h: util/generate-cmdlist.sh command-list.txt
 
-common-cmds.h: $(wildcard Documentation/perf-*.txt)
+$(OUTPUT)common-cmds.h: $(wildcard Documentation/perf-*.txt)
 	$(QUIET_GEN). util/generate-cmdlist.sh > $@+ && mv $@+ $@
 
 $(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
@@ -870,7 +882,7 @@ $(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
 	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
 	    $@.sh >$@+ && \
 	chmod +x $@+ && \
-	mv $@+ $@
+	mv $@+ $(OUTPUT)$@
 
 configure: configure.ac
 	$(QUIET_GEN)$(RM) $@ $<+ && \
@@ -880,60 +892,60 @@ configure: configure.ac
 	$(RM) $<+
 
 # These can record PERF_VERSION
-perf.o perf.spec \
+$(OUTPUT)perf.o perf.spec \
 	$(patsubst %.sh,%,$(SCRIPT_SH)) \
 	$(patsubst %.perl,%,$(SCRIPT_PERL)) \
-	: PERF-VERSION-FILE
+	: $(OUTPUT)PERF-VERSION-FILE
 
-%.o: %.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
-%.s: %.c PERF-CFLAGS
+$(OUTPUT)%.o: %.c $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $<
+$(OUTPUT)%.s: %.c $(OUTPUT)PERF-CFLAGS
 	$(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
-%.o: %.S
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
+$(OUTPUT)%.o: %.S
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $<
 
-util/exec_cmd.o: util/exec_cmd.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
+$(OUTPUT)util/exec_cmd.o: util/exec_cmd.c $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) \
 		'-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
 		'-DBINDIR="$(bindir_relative_SQ)"' \
 		'-DPREFIX="$(prefix_SQ)"' \
 		$<
 
-builtin-init-db.o: builtin-init-db.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
+$(OUTPUT)builtin-init-db.o: builtin-init-db.c $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
 
-util/config.o: util/config.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+$(OUTPUT)util/config.o: util/config.c $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
 
-util/rbtree.o: ../../lib/rbtree.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o util/rbtree.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+$(OUTPUT)util/rbtree.o: ../../lib/rbtree.c $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
 
 # some perf warning policies can't fit to lib/bitmap.c, eg: it warns about variable shadowing
 # from <string.h> that comes from kernel headers wrapping.
 KBITMAP_FLAGS=`echo $(ALL_CFLAGS) | sed s/-Wshadow// | sed s/-Wswitch-default// | sed s/-Wextra//`
 
-util/bitmap.o: ../../lib/bitmap.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o util/bitmap.o -c $(KBITMAP_FLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+$(OUTPUT)util/bitmap.o: ../../lib/bitmap.c $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(KBITMAP_FLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
 
-util/hweight.o: ../../lib/hweight.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o util/hweight.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+$(OUTPUT)util/hweight.o: ../../lib/hweight.c $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
 
-util/find_next_bit.o: ../../lib/find_next_bit.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o util/find_next_bit.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+$(OUTPUT)util/find_next_bit.o: ../../lib/find_next_bit.c $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
 
-util/scripting-engines/trace-event-perl.o: util/scripting-engines/trace-event-perl.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o util/scripting-engines/trace-event-perl.o -c $(ALL_CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow $<
+$(OUTPUT)util/scripting-engines/trace-event-perl.o: util/scripting-engines/trace-event-perl.c $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow $<
 
-scripts/perl/Perf-Trace-Util/Context.o: scripts/perl/Perf-Trace-Util/Context.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o scripts/perl/Perf-Trace-Util/Context.o -c $(ALL_CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $<
+$(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o: scripts/perl/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $<
 
-util/scripting-engines/trace-event-python.o: util/scripting-engines/trace-event-python.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o util/scripting-engines/trace-event-python.o -c $(ALL_CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow $<
+$(OUTPUT)util/scripting-engines/trace-event-python.o: util/scripting-engines/trace-event-python.c $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow $<
 
-scripts/python/Perf-Trace-Util/Context.o: scripts/python/Perf-Trace-Util/Context.c PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o scripts/python/Perf-Trace-Util/Context.o -c $(ALL_CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $<
+$(OUTPUT)scripts/python/Perf-Trace-Util/Context.o: scripts/python/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $<
 
-perf-%$X: %.o $(PERFLIBS)
+$(OUTPUT)perf-%$X: %.o $(PERFLIBS)
 	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
 
 $(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
@@ -974,17 +986,17 @@ cscope:
 TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\
              $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
 
-PERF-CFLAGS: .FORCE-PERF-CFLAGS
+$(OUTPUT)PERF-CFLAGS: .FORCE-PERF-CFLAGS
 	@FLAGS='$(TRACK_CFLAGS)'; \
-	    if test x"$$FLAGS" != x"`cat PERF-CFLAGS 2>/dev/null`" ; then \
+	    if test x"$$FLAGS" != x"`cat $(OUTPUT)PERF-CFLAGS 2>/dev/null`" ; then \
 		echo 1>&2 "    * new build flags or prefix"; \
-		echo "$$FLAGS" >PERF-CFLAGS; \
+		echo "$$FLAGS" >$(OUTPUT)PERF-CFLAGS; \
             fi
 
 # We need to apply sq twice, once to protect from the shell
-# that runs PERF-BUILD-OPTIONS, and then again to protect it
+# that runs $(OUTPUT)PERF-BUILD-OPTIONS, and then again to protect it
 # and the first level quoting from the shell that runs "echo".
-PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
+$(OUTPUT)PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
 	@echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
 	@echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
 	@echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
@@ -1005,7 +1017,7 @@ all:: $(TEST_PROGRAMS)
 
 export NO_SVN_TESTS
 
-check: common-cmds.h
+check: $(OUTPUT)common-cmds.h
 	if sparse; \
 	then \
 		for i in *.c */*.c; \
@@ -1039,10 +1051,10 @@ export perfexec_instdir
 
 install: all
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
-	$(INSTALL) perf$X '$(DESTDIR_SQ)$(bindir_SQ)'
+	$(INSTALL) $(OUTPUT)perf$X '$(DESTDIR_SQ)$(bindir_SQ)'
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin'
-	$(INSTALL) perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+	$(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 	$(INSTALL) scripts/perl/Perf-Trace-Util/lib/Perf/Trace/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'
 	$(INSTALL) scripts/perl/*.pl -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl'
 	$(INSTALL) scripts/perl/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin'
@@ -1056,7 +1068,7 @@ ifdef BUILT_INS
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 	$(INSTALL) $(BUILT_INS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 ifneq (,$X)
-	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
+	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) $(OUTPUT)perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
 endif
 endif
 
@@ -1140,14 +1152,14 @@ clean:
 	$(RM) *.o */*.o */*/*.o */*/*/*.o $(LIB_FILE)
 	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
 	$(RM) $(TEST_PROGRAMS)
-	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
+	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope*
 	$(RM) -r autom4te.cache
 	$(RM) config.log config.mak.autogen config.mak.append config.status config.cache
 	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir
 	$(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
 	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
 	$(MAKE) -C Documentation/ clean
-	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
+	$(RM) $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS $(OUTPUT)PERF-BUILD-OPTIONS
 
 .PHONY: all install clean strip
 .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
diff --git a/tools/perf/util/PERF-VERSION-GEN b/tools/perf/util/PERF-VERSION-GEN
index 54552a00a11..49ece792191 100755
--- a/tools/perf/util/PERF-VERSION-GEN
+++ b/tools/perf/util/PERF-VERSION-GEN
@@ -1,6 +1,10 @@
 #!/bin/sh
 
-GVF=PERF-VERSION-FILE
+if [ $# -eq 1 ]  ; then
+	OUTPUT=$1
+fi
+
+GVF=${OUTPUT}PERF-VERSION-FILE
 DEF_VER=v0.0.2.PERF
 
 LF='
-- 
cgit v1.2.3


From 70162138c91b040da3162fe1f34fe8aaf6506f10 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 30 Mar 2010 18:27:39 -0300
Subject: perf record: Add a fallback to the reference relocation symbol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Usually "_text" is enough, but I received reports that its not always
available, so fallback to "_stext" for the symbol we use to check if we
need to apply any relocation to all the symbols in the kernel symtab,
for when, for instance, kexec is being used.

Reported-by: Darren Hart <dvhltc@us.ibm.com>
Cc: Darren Hart <dvhltc@us.ibm.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 60ecdd3dd26..80dc444031d 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -575,6 +575,9 @@ static int __cmd_record(int argc, const char **argv)
 
 	err = event__synthesize_kernel_mmap(process_synthesized_event,
 					    session, "_text");
+	if (err < 0)
+		err = event__synthesize_kernel_mmap(process_synthesized_event,
+						    session, "_stext");
 	if (err < 0) {
 		pr_err("Couldn't record kernel reference relocation symbol.\n");
 		return err;
-- 
cgit v1.2.3


From a4e3b956a820162b7c1d616117b4f23b6017f504 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 31 Mar 2010 11:33:40 -0300
Subject: perf hist: Replace ->print() routines by ->snprintf() equivalents
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Then hist_entry__fprintf will just us the newly introduced
hist_entry__snprintf, add the newline and fprintf it to the supplied
FILE descriptor.

This allows us to remove the use_browser checking in the color_printf
routines, that now got color_snprintf variants too.

The newt TUI browser (and other GUIs that may come in the future) don't
have to worry about stdio specific stuff in the strings they get from
the se->snprintf routines and instead use whatever means to do the
equivalent.

Also the newt TUI browser don't have to use the fmemopen() hack, instead
it can use the se->snprintf routines directly. For now tho use the
hist_entry__snprintf routine to reduce the patch size.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/color.c | 53 ++++++++++++++++++++++++++---
 tools/perf/util/color.h |  4 +++
 tools/perf/util/hist.c  | 54 ++++++++++++++++++++----------
 tools/perf/util/hist.h  |  7 +++-
 tools/perf/util/newt.c  | 53 +++--------------------------
 tools/perf/util/sort.c  | 88 ++++++++++++++++++++++++++-----------------------
 tools/perf/util/sort.h  |  4 +--
 7 files changed, 149 insertions(+), 114 deletions(-)

diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c
index 9da01914e0a..e191eb9a667 100644
--- a/tools/perf/util/color.c
+++ b/tools/perf/util/color.c
@@ -166,6 +166,31 @@ int perf_color_default_config(const char *var, const char *value, void *cb)
 	return perf_default_config(var, value, cb);
 }
 
+static int __color_vsnprintf(char *bf, size_t size, const char *color,
+			     const char *fmt, va_list args, const char *trail)
+{
+	int r = 0;
+
+	/*
+	 * Auto-detect:
+	 */
+	if (perf_use_color_default < 0) {
+		if (isatty(1) || pager_in_use())
+			perf_use_color_default = 1;
+		else
+			perf_use_color_default = 0;
+	}
+
+	if (perf_use_color_default && *color)
+		r += snprintf(bf, size, "%s", color);
+	r += vsnprintf(bf + r, size - r, fmt, args);
+	if (perf_use_color_default && *color)
+		r += snprintf(bf + r, size - r, "%s", PERF_COLOR_RESET);
+	if (trail)
+		r += snprintf(bf + r, size - r, "%s", trail);
+	return r;
+}
+
 static int __color_vfprintf(FILE *fp, const char *color, const char *fmt,
 		va_list args, const char *trail)
 {
@@ -191,11 +216,28 @@ static int __color_vfprintf(FILE *fp, const char *color, const char *fmt,
 	return r;
 }
 
+int color_vsnprintf(char *bf, size_t size, const char *color,
+		    const char *fmt, va_list args)
+{
+	return __color_vsnprintf(bf, size, color, fmt, args, NULL);
+}
+
 int color_vfprintf(FILE *fp, const char *color, const char *fmt, va_list args)
 {
 	return __color_vfprintf(fp, color, fmt, args, NULL);
 }
 
+int color_snprintf(char *bf, size_t size, const char *color,
+		   const char *fmt, ...)
+{
+	va_list args;
+	int r;
+
+	va_start(args, fmt);
+	r = color_vsnprintf(bf, size, color, fmt, args);
+	va_end(args);
+	return r;
+}
 
 int color_fprintf(FILE *fp, const char *color, const char *fmt, ...)
 {
@@ -203,10 +245,7 @@ int color_fprintf(FILE *fp, const char *color, const char *fmt, ...)
 	int r;
 
 	va_start(args, fmt);
-	if (use_browser)
-		r = vfprintf(fp, fmt, args);
-	else
-		r = color_vfprintf(fp, color, fmt, args);
+	r = color_vfprintf(fp, color, fmt, args);
 	va_end(args);
 	return r;
 }
@@ -277,3 +316,9 @@ int percent_color_fprintf(FILE *fp, const char *fmt, double percent)
 
 	return r;
 }
+
+int percent_color_snprintf(char *bf, size_t size, const char *fmt, double percent)
+{
+	const char *color = get_percent_color(percent);
+	return color_snprintf(bf, size, color, fmt, percent);
+}
diff --git a/tools/perf/util/color.h b/tools/perf/util/color.h
index 24e8809210b..dea082b7960 100644
--- a/tools/perf/util/color.h
+++ b/tools/perf/util/color.h
@@ -32,10 +32,14 @@ int perf_color_default_config(const char *var, const char *value, void *cb);
 int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty);
 void color_parse(const char *value, const char *var, char *dst);
 void color_parse_mem(const char *value, int len, const char *var, char *dst);
+int color_vsnprintf(char *bf, size_t size, const char *color,
+		    const char *fmt, va_list args);
 int color_vfprintf(FILE *fp, const char *color, const char *fmt, va_list args);
 int color_fprintf(FILE *fp, const char *color, const char *fmt, ...);
+int color_snprintf(char *bf, size_t size, const char *color, const char *fmt, ...);
 int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...);
 int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *buf);
+int percent_color_snprintf(char *bf, size_t size, const char *fmt, double percent);
 int percent_color_fprintf(FILE *fp, const char *fmt, double percent);
 const char *get_percent_color(double percent);
 
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index a46d0933246..f0794913d57 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -455,16 +455,17 @@ static size_t hist_entry_callchain__fprintf(FILE *fp, struct hist_entry *self,
 	return ret;
 }
 
-size_t hist_entry__fprintf(struct hist_entry *self,
+int hist_entry__snprintf(struct hist_entry *self,
+			   char *s, size_t size,
 			   struct perf_session *pair_session,
 			   bool show_displacement,
-			   long displacement, FILE *fp,
+			   long displacement, bool color,
 			   u64 session_total)
 {
 	struct sort_entry *se;
 	u64 count, total;
 	const char *sep = symbol_conf.field_sep;
-	size_t ret;
+	int ret;
 
 	if (symbol_conf.exclude_other && !self->parent)
 		return 0;
@@ -477,17 +478,22 @@ size_t hist_entry__fprintf(struct hist_entry *self,
 		total = session_total;
 	}
 
-	if (total)
-		ret = percent_color_fprintf(fp, sep ? "%.2f" : "   %6.2f%%",
-					    (count * 100.0) / total);
-	else
-		ret = fprintf(fp, sep ? "%lld" : "%12lld ", count);
+	if (total) {
+		if (color)
+			ret = percent_color_snprintf(s, size,
+						     sep ? "%.2f" : "   %6.2f%%",
+						     (count * 100.0) / total);
+		else
+			ret = snprintf(s, size, sep ? "%.2f" : "   %6.2f%%",
+				       (count * 100.0) / total);
+	} else
+		ret = snprintf(s, size, sep ? "%lld" : "%12lld ", count);
 
 	if (symbol_conf.show_nr_samples) {
 		if (sep)
-			ret += fprintf(fp, "%c%lld", *sep, count);
+			ret += snprintf(s + ret, size - ret, "%c%lld", *sep, count);
 		else
-			ret += fprintf(fp, "%11lld", count);
+			ret += snprintf(s + ret, size - ret, "%11lld", count);
 	}
 
 	if (pair_session) {
@@ -507,9 +513,9 @@ size_t hist_entry__fprintf(struct hist_entry *self,
 			snprintf(bf, sizeof(bf), " ");
 
 		if (sep)
-			ret += fprintf(fp, "%c%s", *sep, bf);
+			ret += snprintf(s + ret, size - ret, "%c%s", *sep, bf);
 		else
-			ret += fprintf(fp, "%11.11s", bf);
+			ret += snprintf(s + ret, size - ret, "%11.11s", bf);
 
 		if (show_displacement) {
 			if (displacement)
@@ -518,9 +524,9 @@ size_t hist_entry__fprintf(struct hist_entry *self,
 				snprintf(bf, sizeof(bf), " ");
 
 			if (sep)
-				ret += fprintf(fp, "%c%s", *sep, bf);
+				ret += snprintf(s + ret, size - ret, "%c%s", *sep, bf);
 			else
-				ret += fprintf(fp, "%6.6s", bf);
+				ret += snprintf(s + ret, size - ret, "%6.6s", bf);
 		}
 	}
 
@@ -528,11 +534,25 @@ size_t hist_entry__fprintf(struct hist_entry *self,
 		if (se->elide)
 			continue;
 
-		ret += fprintf(fp, "%s", sep ?: "  ");
-		ret += se->print(fp, self, se->width ? *se->width : 0);
+		ret += snprintf(s + ret, size - ret, "%s", sep ?: "  ");
+		ret += se->snprintf(self, s + ret, size - ret,
+				    se->width ? *se->width : 0);
 	}
 
-	return ret + fprintf(fp, "\n");
+	return ret;
+}
+
+int hist_entry__fprintf(struct hist_entry *self,
+			struct perf_session *pair_session,
+			bool show_displacement,
+			long displacement, FILE *fp,
+			u64 session_total)
+{
+	char bf[512];
+	hist_entry__snprintf(self, bf, sizeof(bf), pair_session,
+			     show_displacement, displacement,
+			     true, session_total);
+	return fprintf(fp, "%s\n", bf);
 }
 
 static size_t hist_entry__fprintf_callchain(struct hist_entry *self, FILE *fp,
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index da6a8c1320f..ad17f0ad798 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -18,11 +18,16 @@ struct hist_entry *__perf_session__add_hist_entry(struct rb_root *hists,
 						  u64 count, bool *hit);
 extern int64_t hist_entry__cmp(struct hist_entry *, struct hist_entry *);
 extern int64_t hist_entry__collapse(struct hist_entry *, struct hist_entry *);
-size_t hist_entry__fprintf(struct hist_entry *self,
+int hist_entry__fprintf(struct hist_entry *self,
 			   struct perf_session *pair_session,
 			   bool show_displacement,
 			   long displacement, FILE *fp,
 			   u64 session_total);
+int hist_entry__snprintf(struct hist_entry *self,
+			 char *bf, size_t size,
+			 struct perf_session *pair_session,
+			 bool show_displacement, long displacement,
+			 bool color, u64 session_total);
 void hist_entry__free(struct hist_entry *);
 
 u64 perf_session__output_resort(struct rb_root *hists, u64 total_samples);
diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index b0210ae5b93..edd628f5337 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -294,60 +294,17 @@ static void hist_entry__append_callchain_browser(struct hist_entry *self,
 	}
 }
 
-/*
- * FIXME: get lib/string.c linked with perf somehow
- */
-static char *skip_spaces(const char *str)
-{
-	while (isspace(*str))
-		++str;
-	return (char *)str;
-}
-
-static char *strim(char *s)
-{
-	size_t size;
-	char *end;
-
-	s = skip_spaces(s);
-	size = strlen(s);
-	if (!size)
-		return s;
-
-	end = s + size - 1;
-	while (end >= s && isspace(*end))
-		end--;
-	*(end + 1) = '\0';
-
-	return s;
-}
-
 static size_t hist_entry__append_browser(struct hist_entry *self,
 					 newtComponent tree, u64 total)
 {
-	char bf[1024], *s;
-	FILE *fp;
+	char s[256];
+	size_t ret;
 
 	if (symbol_conf.exclude_other && !self->parent)
 		return 0;
 
-	fp = fmemopen(bf, sizeof(bf), "w");
-	if (fp == NULL)
-		return 0;
-
-	hist_entry__fprintf(self, NULL, false, 0, fp, total);
-	fclose(fp);
-
-	/*
-	 * FIXME: We shouldn't need to trim, as the printing routines shouldn't
-	 * add spaces it in the first place, the stdio output routines should
-	 * call a __snprintf method instead of the current __print (that
-	 * actually is a __fprintf) one, but get the raw string and _then_ add
-	 * the newline, as this is a detail of stdio printing, not needed in
-	 * other UIs, e.g. newt.
-	 */
-	s = strim(bf);
-
+	ret = hist_entry__snprintf(self, s, sizeof(s), NULL,
+				   false, 0, false, total);
 	if (symbol_conf.use_callchain) {
 		int indexes[2];
 
@@ -357,7 +314,7 @@ static size_t hist_entry__append_browser(struct hist_entry *self,
 	} else
 		newtListboxAppendEntry(tree, s, &self->ms);
 
-	return strlen(s);
+	return ret;
 }
 
 static void map_symbol__annotate_browser(const struct map_symbol *self)
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 9b80c13cae4..31329a1cd32 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -18,10 +18,21 @@ char * field_sep;
 
 LIST_HEAD(hist_entry__sort_list);
 
+static int hist_entry__thread_snprintf(struct hist_entry *self, char *bf,
+				       size_t size, unsigned int width);
+static int hist_entry__comm_snprintf(struct hist_entry *self, char *bf,
+				     size_t size, unsigned int width);
+static int hist_entry__dso_snprintf(struct hist_entry *self, char *bf,
+				    size_t size, unsigned int width);
+static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf,
+				    size_t size, unsigned int width);
+static int hist_entry__parent_snprintf(struct hist_entry *self, char *bf,
+				       size_t size, unsigned int width);
+
 struct sort_entry sort_thread = {
 	.header = "Command:  Pid",
 	.cmp	= sort__thread_cmp,
-	.print	= sort__thread_print,
+	.snprintf = hist_entry__thread_snprintf,
 	.width	= &threads__col_width,
 };
 
@@ -29,27 +40,27 @@ struct sort_entry sort_comm = {
 	.header		= "Command",
 	.cmp		= sort__comm_cmp,
 	.collapse	= sort__comm_collapse,
-	.print		= sort__comm_print,
+	.snprintf	= hist_entry__comm_snprintf,
 	.width		= &comms__col_width,
 };
 
 struct sort_entry sort_dso = {
 	.header = "Shared Object",
 	.cmp	= sort__dso_cmp,
-	.print	= sort__dso_print,
+	.snprintf = hist_entry__dso_snprintf,
 	.width	= &dsos__col_width,
 };
 
 struct sort_entry sort_sym = {
 	.header = "Symbol",
 	.cmp	= sort__sym_cmp,
-	.print	= sort__sym_print,
+	.snprintf = hist_entry__sym_snprintf,
 };
 
 struct sort_entry sort_parent = {
 	.header = "Parent symbol",
 	.cmp	= sort__parent_cmp,
-	.print	= sort__parent_print,
+	.snprintf = hist_entry__parent_snprintf,
 	.width	= &parent_symbol__col_width,
 };
 
@@ -85,45 +96,38 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 	return right->thread->pid - left->thread->pid;
 }
 
-int repsep_fprintf(FILE *fp, const char *fmt, ...)
+static int repsep_snprintf(char *bf, size_t size, const char *fmt, ...)
 {
 	int n;
 	va_list ap;
 
 	va_start(ap, fmt);
-	if (!field_sep)
-		n = vfprintf(fp, fmt, ap);
-	else {
-		char *bf = NULL;
-		n = vasprintf(&bf, fmt, ap);
-		if (n > 0) {
-			char *sep = bf;
-
-			while (1) {
-				sep = strchr(sep, *field_sep);
-				if (sep == NULL)
-					break;
-				*sep = '.';
-			}
+	n = vsnprintf(bf, size, fmt, ap);
+	if (field_sep && n > 0) {
+		char *sep = bf;
+
+		while (1) {
+			sep = strchr(sep, *field_sep);
+			if (sep == NULL)
+				break;
+			*sep = '.';
 		}
-		fputs(bf, fp);
-		free(bf);
 	}
 	va_end(ap);
 	return n;
 }
 
-size_t
-sort__thread_print(FILE *fp, struct hist_entry *self, unsigned int width)
+static int hist_entry__thread_snprintf(struct hist_entry *self, char *bf,
+				       size_t size, unsigned int width)
 {
-	return repsep_fprintf(fp, "%*s:%5d", width - 6,
+	return repsep_snprintf(bf, size, "%*s:%5d", width,
 			      self->thread->comm ?: "", self->thread->pid);
 }
 
-size_t
-sort__comm_print(FILE *fp, struct hist_entry *self, unsigned int width)
+static int hist_entry__comm_snprintf(struct hist_entry *self, char *bf,
+				     size_t size, unsigned int width)
 {
-	return repsep_fprintf(fp, "%*s", width, self->thread->comm);
+	return repsep_snprintf(bf, size, "%*s", width, self->thread->comm);
 }
 
 /* --sort dso */
@@ -149,16 +153,16 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 	return strcmp(dso_name_l, dso_name_r);
 }
 
-size_t
-sort__dso_print(FILE *fp, struct hist_entry *self, unsigned int width)
+static int hist_entry__dso_snprintf(struct hist_entry *self, char *bf,
+				    size_t size, unsigned int width)
 {
 	if (self->ms.map && self->ms.map->dso) {
 		const char *dso_name = !verbose ? self->ms.map->dso->short_name :
 						  self->ms.map->dso->long_name;
-		return repsep_fprintf(fp, "%-*s", width, dso_name);
+		return repsep_snprintf(bf, size, "%-*s", width, dso_name);
 	}
 
-	return repsep_fprintf(fp, "%*llx", width, (u64)self->ip);
+	return repsep_snprintf(bf, size, "%*Lx", width, self->ip);
 }
 
 /* --sort symbol */
@@ -177,22 +181,22 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(ip_r - ip_l);
 }
 
-
-size_t
-sort__sym_print(FILE *fp, struct hist_entry *self, unsigned int width __used)
+static int hist_entry__sym_snprintf(struct hist_entry *self, char *bf,
+				    size_t size, unsigned int width __used)
 {
 	size_t ret = 0;
 
 	if (verbose) {
 		char o = self->ms.map ? dso__symtab_origin(self->ms.map->dso) : '!';
-		ret += repsep_fprintf(fp, "%#018llx %c ", (u64)self->ip, o);
+		ret += repsep_snprintf(bf, size, "%#018llx %c ", self->ip, o);
 	}
 
-	ret += repsep_fprintf(fp, "[%c] ", self->level);
+	ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", self->level);
 	if (self->ms.sym)
-		ret += repsep_fprintf(fp, "%s", self->ms.sym->name);
+		ret += repsep_snprintf(bf + ret, size - ret, "%s",
+				       self->ms.sym->name);
 	else
-		ret += repsep_fprintf(fp, "%#016llx", (u64)self->ip);
+		ret += repsep_snprintf(bf + ret, size - ret, "%#016llx", self->ip);
 
 	return ret;
 }
@@ -231,10 +235,10 @@ sort__parent_cmp(struct hist_entry *left, struct hist_entry *right)
 	return strcmp(sym_l->name, sym_r->name);
 }
 
-size_t
-sort__parent_print(FILE *fp, struct hist_entry *self, unsigned int width)
+static int hist_entry__parent_snprintf(struct hist_entry *self, char *bf,
+				       size_t size, unsigned int width)
 {
-	return repsep_fprintf(fp, "%-*s", width,
+	return repsep_snprintf(bf, size, "%-*s", width,
 			      self->parent ? self->parent->name : "[other]");
 }
 
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 598568696f9..439ec5fa0f5 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -76,7 +76,8 @@ struct sort_entry {
 
 	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
 	int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
-	size_t	(*print)(FILE *fp, struct hist_entry *, unsigned int width);
+	int	(*snprintf)(struct hist_entry *self, char *bf, size_t size,
+			    unsigned int width);
 	unsigned int *width;
 	bool	elide;
 };
@@ -86,7 +87,6 @@ extern struct list_head hist_entry__sort_list;
 
 void setup_sorting(const char * const usagestr[], const struct option *opts);
 
-extern int repsep_fprintf(FILE *fp, const char *fmt, ...);
 extern size_t sort__thread_print(FILE *, struct hist_entry *, unsigned int);
 extern size_t sort__comm_print(FILE *, struct hist_entry *, unsigned int);
 extern size_t sort__dso_print(FILE *, struct hist_entry *, unsigned int);
-- 
cgit v1.2.3


From e727ca73f85d4c5be3547eda674168219d1c22d8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 1 Apr 2010 19:12:13 -0300
Subject: perf kmem: Resolve kernel symbols again
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Due to the assumption in perf_session__new that the kernel maps would be
created using the fake PERF_RECORD_MMAP event in a perf.data file 'perf
kmem --stat caller', that doesn't have such event, ends up not being
able to resolve the kernel addresses.

Fix it by calling perf_session__create_kernel_maps() in __cmd_kmem().

LKML-Reference: <new-submission>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-kmem.c | 3 +++
 tools/perf/util/session.c | 5 -----
 tools/perf/util/session.h | 5 +++++
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 32edb6a8687..7cbb5eb1510 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -489,6 +489,9 @@ static int __cmd_kmem(void)
 	if (session == NULL)
 		return -ENOMEM;
 
+	if (perf_session__create_kernel_maps(session) < 0)
+		goto out_delete;
+
 	if (!perf_session__has_traces(session, "kmem record"))
 		goto out_delete;
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 32765cdca05..9da5e723495 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -52,11 +52,6 @@ out_close:
 	return -1;
 }
 
-static inline int perf_session__create_kernel_maps(struct perf_session *self)
-{
-	return map_groups__create_kernel_maps(&self->kmaps, self->vmlinux_maps);
-}
-
 struct perf_session *perf_session__new(const char *filename, int mode, bool force)
 {
 	size_t len = filename ? strlen(filename) + 1 : 0;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 6a15daeda57..dffaff52ba4 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -80,6 +80,11 @@ static inline int __perf_session__create_kernel_maps(struct perf_session *self,
 						self->vmlinux_maps, kernel);
 }
 
+static inline int perf_session__create_kernel_maps(struct perf_session *self)
+{
+	return map_groups__create_kernel_maps(&self->kmaps, self->vmlinux_maps);
+}
+
 static inline struct map *
 	perf_session__new_module_map(struct perf_session *self,
 				     u64 start, const char *filename)
-- 
cgit v1.2.3


From 71cf8b8ff7d6a79af086be9e4c72628da9d62d58 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 1 Apr 2010 21:24:38 -0300
Subject: perf kmem: Fixup the symbol address before using it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We get absolute addresses in the events, but relative ones from the
symbol subsystem, so calculate the absolute address by asking for the
map where the symbol was found, that has the place where the DSO was
actually loaded.

For the core kernel this poses no problems if the kernel is not
relocated by things like kexec, or if we use /proc/kallsyms, but for
modules we were getting really large, negative offsets.

LKML-Reference: <new-submission>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-kmem.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 7cbb5eb1510..513aa8a55db 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -363,6 +363,7 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 		struct alloc_stat *data = rb_entry(next, struct alloc_stat,
 						   node);
 		struct symbol *sym = NULL;
+		struct map *map;
 		char buf[BUFSIZ];
 		u64 addr;
 
@@ -370,13 +371,13 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 			addr = data->call_site;
 			if (!raw_ip)
 				sym = map_groups__find_function(&session->kmaps,
-								addr, NULL, NULL);
+								addr, &map, NULL);
 		} else
 			addr = data->ptr;
 
 		if (sym != NULL)
 			snprintf(buf, sizeof(buf), "%s+%Lx", sym->name,
-				 addr - sym->start);
+				 addr - map->unmap_ip(map, sym->start));
 		else
 			snprintf(buf, sizeof(buf), "%#Lx", addr);
 		printf(" %-34s |", buf);
-- 
cgit v1.2.3


From b9fb93047756c5e4129dfda7591612de61b0e877 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 2 Apr 2010 09:50:42 -0300
Subject: perf hist: Only allocate callchain_node if processing callchains
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The struct callchain_node size is 120 bytes, that are never used when
there are no callchains or '-g none' is specified, so conditionally
allocate it, reducing sizeof(struct hist_entry) from 210 bytes to only
96, greatly speeding the non-callchain processing.

LKML-Reference: <new-submission>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c | 4 ++--
 tools/perf/util/hist.c      | 5 +++--
 tools/perf/util/sort.h      | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 381918515a5..1fb13e5fd1f 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -110,8 +110,8 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 
 	if (symbol_conf.use_callchain) {
 		if (!hit)
-			callchain_init(&he->callchain);
-		err = append_chain(&he->callchain, data->callchain, syms);
+			callchain_init(he->callchain);
+		err = append_chain(he->callchain, data->callchain, syms);
 		free(syms);
 
 		if (err)
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index f0794913d57..18cf8b32160 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -50,7 +50,8 @@ struct hist_entry *__perf_session__add_hist_entry(struct rb_root *hists,
 			p = &(*p)->rb_right;
 	}
 
-	he = malloc(sizeof(*he));
+	he = malloc(sizeof(*he) + (symbol_conf.use_callchain ?
+				    sizeof(struct callchain_node) : 0));
 	if (!he)
 		return NULL;
 	*he = entry;
@@ -168,7 +169,7 @@ static void perf_session__insert_output_hist_entry(struct rb_root *root,
 	struct hist_entry *iter;
 
 	if (symbol_conf.use_callchain)
-		callchain_param.sort(&he->sorted_chain, &he->callchain,
+		callchain_param.sort(&he->sorted_chain, he->callchain,
 				      min_callchain_hits, &callchain_param);
 
 	while (*p != NULL) {
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 439ec5fa0f5..5bf2b744e7b 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -49,12 +49,12 @@ struct hist_entry {
 	u64			ip;
 	char			level;
 	struct symbol	  *parent;
-	struct callchain_node	callchain;
 	union {
 		unsigned long	  position;
 		struct hist_entry *pair;
 		struct rb_root	  sorted_chain;
 	};
+	struct callchain_node	callchain[0];
 };
 
 enum sort_type {
-- 
cgit v1.2.3


From ad5b217b152d99ca3922153500c619d9758dd87a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 2 Apr 2010 10:04:18 -0300
Subject: perf session: Remove one more exit() call from library code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Return NULL instead and make the caller propagate the error.

LKML-Reference: <new-submission>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c |  5 ++++-
 tools/perf/util/session.c   | 11 +++--------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 1fb13e5fd1f..6767f10615e 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -89,9 +89,12 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 	struct event_stat_id *stats;
 	struct perf_event_attr *attr;
 
-	if ((sort__has_parent || symbol_conf.use_callchain) && data->callchain)
+	if ((sort__has_parent || symbol_conf.use_callchain) && data->callchain) {
 		syms = perf_session__resolve_callchain(self, al->thread,
 						       data->callchain, &parent);
+		if (syms == NULL)
+			return -ENOMEM;
+	}
 
 	attr = perf_header__find_attr(data->id, &self->header);
 	if (attr)
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 9da5e723495..ddf288fca3e 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -118,16 +118,11 @@ struct map_symbol *perf_session__resolve_callchain(struct perf_session *self,
 						   struct symbol **parent)
 {
 	u8 cpumode = PERF_RECORD_MISC_USER;
-	struct map_symbol *syms = NULL;
 	unsigned int i;
+	struct map_symbol *syms = calloc(chain->nr, sizeof(*syms));
 
-	if (symbol_conf.use_callchain) {
-		syms = calloc(chain->nr, sizeof(*syms));
-		if (!syms) {
-			fprintf(stderr, "Can't allocate memory for symbols\n");
-			exit(-1);
-		}
-	}
+	if (!syms)
+		return NULL;
 
 	for (i = 0; i < chain->nr; i++) {
 		u64 ip = chain->ips[i];
-- 
cgit v1.2.3


From 2aefa4f733f2c5ce51dd2316ffecb258463fde71 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 2 Apr 2010 12:30:57 -0300
Subject: perf tools: sort_dimension__add shouldn't die
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Propagate error instead.

LKML-Reference: <new-submission>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c | 3 ++-
 tools/perf/util/sort.c      | 5 ++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 6767f10615e..b13a7e2f839 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -473,7 +473,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 	setup_sorting(report_usage, options);
 
 	if (parent_pattern != default_parent_pattern) {
-		sort_dimension__add("parent");
+		if (sort_dimension__add("parent") < 0)
+			return -1;
 		sort_parent.elide = 1;
 	} else
 		symbol_conf.exclude_other = false;
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 31329a1cd32..9d24d4b2c8f 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -264,9 +264,8 @@ int sort_dimension__add(const char *tok)
 				char err[BUFSIZ];
 
 				regerror(ret, &parent_regex, err, sizeof(err));
-				fprintf(stderr, "Invalid regex: %s\n%s",
-					parent_pattern, err);
-				exit(-1);
+				pr_err("Invalid regex: %s\n%s", parent_pattern, err);
+				return -EINVAL;
 			}
 			sort__has_parent = 1;
 		}
-- 
cgit v1.2.3


From e206d556c5793ac5e28c0aaba2e07432e5f9a098 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 3 Apr 2010 10:19:26 -0300
Subject: perf tools: Move the prototypes in util/string.h to util.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So that we avoid conflict with libc's string.h header.

Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Suggested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile            |  1 -
 tools/perf/bench/mem-memcpy.c  |  1 -
 tools/perf/builtin-annotate.c  |  1 -
 tools/perf/builtin-record.c    |  1 -
 tools/perf/builtin-report.c    |  1 -
 tools/perf/builtin-timechart.c |  1 -
 tools/perf/perf.c              |  1 -
 tools/perf/util/string.h       | 16 ----------------
 tools/perf/util/util.h         | 12 ++++++++++++
 9 files changed, 12 insertions(+), 23 deletions(-)
 delete mode 100644 tools/perf/util/string.h

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 93029c09cd6..9f5a47e5c07 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -385,7 +385,6 @@ LIB_H += util/header.h
 LIB_H += util/help.h
 LIB_H += util/session.h
 LIB_H += util/strbuf.h
-LIB_H += util/string.h
 LIB_H += util/strlist.h
 LIB_H += util/svghelper.h
 LIB_H += util/run-command.h
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index 89773178e89..52e646e3e87 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -10,7 +10,6 @@
 #include "../perf.h"
 #include "../util/util.h"
 #include "../util/parse-options.h"
-#include "../util/string.h"
 #include "../util/header.h"
 #include "bench.h"
 
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 887e8e04a6f..ee0d9172699 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -14,7 +14,6 @@
 #include "util/cache.h"
 #include <linux/rbtree.h>
 #include "util/symbol.h"
-#include "util/string.h"
 
 #include "perf.h"
 #include "util/debug.h"
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 80dc444031d..dc61f1b68b4 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -15,7 +15,6 @@
 #include "util/util.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
-#include "util/string.h"
 
 #include "util/header.h"
 #include "util/event.h"
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index b13a7e2f839..6615e09e336 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -14,7 +14,6 @@
 #include "util/cache.h"
 #include <linux/rbtree.h>
 #include "util/symbol.h"
-#include "util/string.h"
 #include "util/callchain.h"
 #include "util/strlist.h"
 #include "util/values.h"
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 0d4d8ff7914..266e7aa996d 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -21,7 +21,6 @@
 #include "util/cache.h"
 #include <linux/rbtree.h>
 #include "util/symbol.h"
-#include "util/string.h"
 #include "util/callchain.h"
 #include "util/strlist.h"
 
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 2826e702986..d4be55b6cd3 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -13,7 +13,6 @@
 #include "util/quote.h"
 #include "util/run-command.h"
 #include "util/parse-events.h"
-#include "util/string.h"
 #include "util/debugfs.h"
 
 bool use_browser;
diff --git a/tools/perf/util/string.h b/tools/perf/util/string.h
deleted file mode 100644
index 70058241666..00000000000
--- a/tools/perf/util/string.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef __PERF_STRING_H_
-#define __PERF_STRING_H_
-
-#include <stdbool.h>
-#include "types.h"
-
-s64 perf_atoll(const char *str);
-char **argv_split(const char *str, int *argcp);
-void argv_free(char **argv);
-bool strglobmatch(const char *str, const char *pat);
-bool strlazymatch(const char *str, const char *pat);
-
-#define _STR(x) #x
-#define STR(x) _STR(x)
-
-#endif /* __PERF_STRING_H */
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 52701087ce0..fbf45d1b26f 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -42,12 +42,14 @@
 #define _ALL_SOURCE 1
 #define _GNU_SOURCE 1
 #define _BSD_SOURCE 1
+#define HAS_BOOL
 
 #include <unistd.h>
 #include <stdio.h>
 #include <sys/stat.h>
 #include <sys/statfs.h>
 #include <fcntl.h>
+#include <stdbool.h>
 #include <stddef.h>
 #include <stdlib.h>
 #include <stdarg.h>
@@ -78,6 +80,7 @@
 #include <pwd.h>
 #include <inttypes.h>
 #include "../../../include/linux/magic.h"
+#include "types.h"
 
 
 #ifndef NO_ICONV
@@ -415,4 +418,13 @@ void git_qsort(void *base, size_t nmemb, size_t size,
 int mkdir_p(char *path, mode_t mode);
 int copyfile(const char *from, const char *to);
 
+s64 perf_atoll(const char *str);
+char **argv_split(const char *str, int *argcp);
+void argv_free(char **argv);
+bool strglobmatch(const char *str, const char *pat);
+bool strlazymatch(const char *str, const char *pat);
+
+#define _STR(x) #x
+#define STR(x) _STR(x)
+
 #endif
-- 
cgit v1.2.3


From 4af8b35db6634dd1e0d616de689582b6c93550af Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Sat, 3 Apr 2010 22:53:31 +1100
Subject: perf symbols: Fill in pgoff in mmap synthesized events

When we synthesize mmap events we need to fill in the pgoff field.

I wasn't able to test this completely since I couldn't find an
executable region with a non 0 offset. We will see it when we start
doing data profiling.

Signed-off-by: Anton Blanchard <anton@samba.org>
Cc: David Miller <davem@davemloft.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20100403115331.GK5594@kryten>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/event.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 052eaeccc20..571fb25f7eb 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -130,6 +130,7 @@ static int event__synthesize_mmap_events(pid_t pid, pid_t tgid,
 			continue;
 		pbf += n + 3;
 		if (*pbf == 'x') { /* vm_exec */
+			u64 vm_pgoff;
 			char *execname = strchr(bf, '/');
 
 			/* Catch VDSO */
@@ -139,6 +140,14 @@ static int event__synthesize_mmap_events(pid_t pid, pid_t tgid,
 			if (execname == NULL)
 				continue;
 
+			pbf += 3;
+			n = hex2u64(pbf, &vm_pgoff);
+			/* pgoff is in bytes, not pages */
+			if (n >= 0)
+				ev.mmap.pgoff = vm_pgoff << getpagesize();
+			else
+				ev.mmap.pgoff = 0;
+
 			size = strlen(execname);
 			execname[size - 1] = '\0'; /* Remove \n */
 			memcpy(ev.mmap.filename, execname, size);
-- 
cgit v1.2.3


From fb6b893180faec03e1d32149ef5cc412df9714df Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 3 Apr 2010 11:04:55 -0300
Subject: perf newt: Remove useless column width calculation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Not used in the TUI interface.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index edd628f5337..a2ba3fffe04 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -390,11 +390,8 @@ static void perf_session__selection(newtComponent self, void *data)
 int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 			       u64 session_total, const char *helpline)
 {
-	struct sort_entry *se;
 	struct rb_node *nd;
 	char seq[] = ".";
-	unsigned int width;
-	char *col_width = symbol_conf.col_width_list_str;
 	int rows, cols, idx;
 	int max_len = 0;
 	char str[1024];
@@ -423,23 +420,6 @@ int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 
 	newtComponentAddCallback(tree, perf_session__selection, &selection);
 
-	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		if (se->elide)
-			continue;
-		width = strlen(se->header);
-		if (se->width) {
-			if (symbol_conf.col_width_list_str) {
-				if (col_width) {
-					*se->width = atoi(col_width);
-					col_width = strchr(col_width, ',');
-					if (col_width)
-						++col_width;
-				}
-			}
-			*se->width = max(*se->width, width);
-		}
-	}
-
 	idx = 0;
 	for (nd = rb_first(hists); nd; nd = rb_next(nd)) {
 		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
-- 
cgit v1.2.3


From e65713ea1e61e92d28284a55df2aa039ebe10003 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 3 Apr 2010 11:25:56 -0300
Subject: perf newt: Move the hist browser population bits to separare function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Next patches will use that when applying filtes to then repopulate the
browser with the narrowed vision.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 129 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 84 insertions(+), 45 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index a2ba3fffe04..509d921532e 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -381,53 +381,69 @@ static const void *newt__symbol_tree_get_current(newtComponent self)
 	return newtListboxGetCurrent(self);
 }
 
-static void perf_session__selection(newtComponent self, void *data)
+static void hist_browser__selection(newtComponent self, void *data)
 {
 	const struct map_symbol **symbol_ptr = data;
 	*symbol_ptr = newt__symbol_tree_get_current(self);
 }
 
-int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
-			       u64 session_total, const char *helpline)
+struct hist_browser {
+	newtComponent		form, tree;
+	const struct map_symbol *selection;
+};
+
+static struct hist_browser *hist_browser__new(void)
+{
+	struct hist_browser *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		char seq[] = ".";
+		int rows;
+
+		newtGetScreenSize(NULL, &rows);
+
+		if (symbol_conf.use_callchain)
+			self->tree = newtCheckboxTreeMulti(0, 0, rows - 5, seq,
+							   NEWT_FLAG_SCROLL);
+		else
+			self->tree = newtListbox(0, 0, rows - 5,
+						(NEWT_FLAG_SCROLL |
+						 NEWT_FLAG_RETURNEXIT));
+		newtComponentAddCallback(self->tree, hist_browser__selection,
+					 &self->selection);
+	}
+
+	return self;
+}
+
+static void hist_browser__delete(struct hist_browser *self)
+{
+	newtFormDestroy(self->form);
+	newtPopWindow();
+	free(self);
+}
+
+static int hist_browser__populate(struct hist_browser *self, struct rb_root *hists,
+				  u64 nr_hists, u64 session_total)
 {
+	int max_len = 0, idx, cols, rows;
+	struct ui_progress *progress;
 	struct rb_node *nd;
-	char seq[] = ".";
-	int rows, cols, idx;
-	int max_len = 0;
-	char str[1024];
-	newtComponent form, tree;
-	struct newtExitStruct es;
-	const struct map_symbol *selection;
 	u64 curr_hist = 0;
-	struct ui_progress *progress;
 
 	progress = ui_progress__new("Adding entries to the browser...", nr_hists);
 	if (progress == NULL)
 		return -1;
 
-	snprintf(str, sizeof(str), "Samples: %Ld", session_total);
-	newtDrawRootText(0, 0, str);
-	newtPushHelpLine(helpline);
-
-	newtGetScreenSize(&cols, &rows);
-
-	if (symbol_conf.use_callchain)
-		tree = newtCheckboxTreeMulti(0, 0, rows - 5, seq,
-						NEWT_FLAG_SCROLL);
-	else
-		tree = newtListbox(0, 0, rows - 5, (NEWT_FLAG_SCROLL |
-						       NEWT_FLAG_RETURNEXIT));
-
-	newtComponentAddCallback(tree, perf_session__selection, &selection);
-
 	idx = 0;
 	for (nd = rb_first(hists); nd; nd = rb_next(nd)) {
 		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
-		int len = hist_entry__append_browser(h, tree, session_total);
+		int len = hist_entry__append_browser(h, self->tree, session_total);
 		if (len > max_len)
 			max_len = len;
 		if (symbol_conf.use_callchain)
-			hist_entry__append_callchain_browser(h, tree, session_total, idx++);
+			hist_entry__append_callchain_browser(h, self->tree,
+							     session_total, idx++);
 		++curr_hist;
 		if (curr_hist % 5)
 			ui_progress__update(progress, curr_hist);
@@ -435,27 +451,50 @@ int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 
 	ui_progress__delete(progress);
 
+	newtGetScreenSize(&cols, &rows);
+
 	if (max_len > cols)
 		max_len = cols - 3;
 
 	if (!symbol_conf.use_callchain)
-		newtListboxSetWidth(tree, max_len);
+		newtListboxSetWidth(self->tree, max_len);
 
 	newtCenteredWindow(max_len + (symbol_conf.use_callchain ? 5 : 0),
 			   rows - 5, "Report");
-	form = newt_form__new();
-	newtFormAddHotKey(form, 'A');
-	newtFormAddHotKey(form, 'a');
-	newtFormAddHotKey(form, NEWT_KEY_RIGHT);
-	newtFormAddComponents(form, tree, NULL);
-	selection = newt__symbol_tree_get_current(tree);
+	self->form = newt_form__new();
+	newtFormAddHotKey(self->form, 'A');
+	newtFormAddHotKey(self->form, 'a');
+	newtFormAddHotKey(self->form, NEWT_KEY_RIGHT);
+	newtFormAddComponents(self->form, self->tree, NULL);
+	self->selection = newt__symbol_tree_get_current(self->tree);
+
+	return 0;
+}
+
+int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
+			       u64 session_total, const char *helpline)
+{
+	struct newtExitStruct es;
+	char str[1024];
+	int err = -1;
+	struct hist_browser *browser = hist_browser__new();
+
+	if (browser == NULL)
+		return -1;
+
+	snprintf(str, sizeof(str), "Samples: %Ld", session_total);
+	newtDrawRootText(0, 0, str);
+	newtPushHelpLine(helpline);
+
+	if (hist_browser__populate(browser, hists, nr_hists, session_total) < 0)
+		goto out;
 
 	while (1) {
 		char annotate[512];
 		const char *options[2];
 		int nr_options = 0, choice = 0;
 
-		newtFormRun(form, &es);
+		newtFormRun(browser->form, &es);
 		if (es.reason == NEWT_EXIT_HOTKEY) {
 			if (toupper(es.u.key) == 'A')
 				goto do_annotate;
@@ -469,9 +508,9 @@ int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 			}
 		}
 
-		if (selection->sym != NULL) {
+		if (browser->selection->sym != NULL) {
 			snprintf(annotate, sizeof(annotate),
-				 "Annotate %s", selection->sym->name);
+				 "Annotate %s", browser->selection->sym->name);
 			options[nr_options++] = annotate;
 		}
 
@@ -480,21 +519,21 @@ int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 		if (choice == nr_options - 1)
 			break;
 do_annotate:
-		if (selection->sym != NULL && choice >= 0) {
-			if (selection->map->dso->origin == DSO__ORIG_KERNEL) {
+		if (browser->selection->sym != NULL && choice >= 0) {
+			if (browser->selection->map->dso->origin == DSO__ORIG_KERNEL) {
 				newtPopHelpLine();
 				newtPushHelpLine("No vmlinux file found, can't "
 						 "annotate with just a "
 						 "kallsyms file");
 				continue;
 			}
-			map_symbol__annotate_browser(selection);
+			map_symbol__annotate_browser(browser->selection);
 		}
 	}
-
-	newtFormDestroy(form);
-	newtPopWindow();
-	return 0;
+	err = 0;
+out:
+	hist_browser__delete(browser);
+	return err;
 }
 
 void setup_browser(void)
-- 
cgit v1.2.3


From 533c46c31c0e82f19dbb087c77d85eaccd6fefdb Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 3 Apr 2010 11:54:35 -0300
Subject: perf newt: Pass the input_name to perf_session__browse_hists
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So that it can use it in the 'perf annotate' command line, otherwise
it'll use the default and not the specified -i filename passed to 'perf
report'.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c |  3 ++-
 tools/perf/util/newt.c      | 13 ++++++++-----
 tools/perf/util/session.h   |  6 ++++--
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 6615e09e336..e93c69a8e72 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -313,7 +313,8 @@ static int __cmd_report(void)
 						       stats->stats.total);
 		if (use_browser)
 			perf_session__browse_hists(&stats->hists, nr_hists,
-						   stats->stats.total, help);
+						   stats->stats.total, help,
+						   input_name);
 		else {
 			if (rb_first(&session->stats_by_id) ==
 			    rb_last(&session->stats_by_id))
diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 509d921532e..c93bc2a2d13 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -317,7 +317,8 @@ static size_t hist_entry__append_browser(struct hist_entry *self,
 	return ret;
 }
 
-static void map_symbol__annotate_browser(const struct map_symbol *self)
+static void map_symbol__annotate_browser(const struct map_symbol *self,
+					 const char *input_name)
 {
 	FILE *fp;
 	int cols, rows;
@@ -331,8 +332,8 @@ static void map_symbol__annotate_browser(const struct map_symbol *self)
 	if (self->sym == NULL)
 		return;
 
-	if (asprintf(&str, "perf annotate -d \"%s\" %s 2>&1 | expand",
-		     self->map->dso->name, self->sym->name) < 0)
+	if (asprintf(&str, "perf annotate -i \"%s\" -d \"%s\" %s 2>&1 | expand",
+		     input_name, self->map->dso->name, self->sym->name) < 0)
 		return;
 
 	fp = popen(str, "r");
@@ -472,7 +473,8 @@ static int hist_browser__populate(struct hist_browser *self, struct rb_root *his
 }
 
 int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
-			       u64 session_total, const char *helpline)
+			       u64 session_total, const char *helpline,
+			       const char *input_name)
 {
 	struct newtExitStruct es;
 	char str[1024];
@@ -527,7 +529,8 @@ do_annotate:
 						 "kallsyms file");
 				continue;
 			}
-			map_symbol__annotate_browser(browser->selection);
+			map_symbol__annotate_browser(browser->selection,
+						     input_name);
 		}
 	}
 	err = 0;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index dffaff52ba4..27f4c2dc715 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -96,12 +96,14 @@ static inline struct map *
 static inline int perf_session__browse_hists(struct rb_root *hists __used,
 					      u64 nr_hists __used,
 					      u64 session_total __used,
-					      const char *helpline __used)
+					     const char *helpline __used,
+					     const char *input_name __used)
 {
 	return 0;
 }
 #else
 int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
-				u64 session_total, const char *helpline);
+			       u64 session_total, const char *helpline,
+			       const char *input_name);
 #endif
 #endif /* __PERF_SESSION_H */
-- 
cgit v1.2.3


From 83753190c136901c916df267703937e60f24b8b8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 3 Apr 2010 16:30:44 -0300
Subject: perf newt: Add a "Zoom into foo.so DSO" and reverse operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Clicking on -> will bring as one of the popup menu options a "Zoom into
CURRENT DSO", i.e. CURRENT will be replaced by the name of the DSO in
the current line.

Choosing this option will filter out all samples that didn't took place
in a symbol in this DSO.

After that the option reverts to "Zoom out of CURRENT DSO", to allow
going back to the more compreensive view, not filtered by DSO.

Future similar operations will include zooming into a particular thread,
COMM, CPU, "last minute", "last N usecs", etc.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 119 ++++++++++++++++++++++++++++++++++++-------------
 tools/perf/util/sort.h |   3 +-
 2 files changed, 90 insertions(+), 32 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index c93bc2a2d13..bbf725d4b38 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -94,7 +94,7 @@ static newtComponent newt_form__new(void)
 	return self;
 }
 
-static int popup_menu(int argc, const char *argv[])
+static int popup_menu(int argc, char * const argv[])
 {
 	struct newtExitStruct es;
 	int i, rc = -1, max_len = 5;
@@ -397,22 +397,8 @@ static struct hist_browser *hist_browser__new(void)
 {
 	struct hist_browser *self = malloc(sizeof(*self));
 
-	if (self != NULL) {
-		char seq[] = ".";
-		int rows;
-
-		newtGetScreenSize(NULL, &rows);
-
-		if (symbol_conf.use_callchain)
-			self->tree = newtCheckboxTreeMulti(0, 0, rows - 5, seq,
-							   NEWT_FLAG_SCROLL);
-		else
-			self->tree = newtListbox(0, 0, rows - 5,
-						(NEWT_FLAG_SCROLL |
-						 NEWT_FLAG_RETURNEXIT));
-		newtComponentAddCallback(self->tree, hist_browser__selection,
-					 &self->selection);
-	}
+	if (self != NULL)
+		self->form = NULL;
 
 	return self;
 }
@@ -431,6 +417,30 @@ static int hist_browser__populate(struct hist_browser *self, struct rb_root *his
 	struct ui_progress *progress;
 	struct rb_node *nd;
 	u64 curr_hist = 0;
+	char seq[] = ".";
+	char str[256];
+
+	if (self->form) {
+		newtFormDestroy(self->form);
+		newtPopWindow();
+	}
+
+	snprintf(str, sizeof(str), "Samples: %Ld                            ",
+		 session_total);
+	newtDrawRootText(0, 0, str);
+
+	newtGetScreenSize(NULL, &rows);
+
+	if (symbol_conf.use_callchain)
+		self->tree = newtCheckboxTreeMulti(0, 0, rows - 5, seq,
+						   NEWT_FLAG_SCROLL);
+	else
+		self->tree = newtListbox(0, 0, rows - 5,
+					(NEWT_FLAG_SCROLL |
+					 NEWT_FLAG_RETURNEXIT));
+
+	newtComponentAddCallback(self->tree, hist_browser__selection,
+				 &self->selection);
 
 	progress = ui_progress__new("Adding entries to the browser...", nr_hists);
 	if (progress == NULL)
@@ -439,7 +449,12 @@ static int hist_browser__populate(struct hist_browser *self, struct rb_root *his
 	idx = 0;
 	for (nd = rb_first(hists); nd; nd = rb_next(nd)) {
 		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
-		int len = hist_entry__append_browser(h, self->tree, session_total);
+		int len;
+
+		if (h->filtered)
+			continue;
+
+		len = hist_entry__append_browser(h, self->tree, session_total);
 		if (len > max_len)
 			max_len = len;
 		if (symbol_conf.use_callchain)
@@ -463,6 +478,9 @@ static int hist_browser__populate(struct hist_browser *self, struct rb_root *his
 	newtCenteredWindow(max_len + (symbol_conf.use_callchain ? 5 : 0),
 			   rows - 5, "Report");
 	self->form = newt_form__new();
+	if (self->form == NULL)
+		return -1;
+
 	newtFormAddHotKey(self->form, 'A');
 	newtFormAddHotKey(self->form, 'a');
 	newtFormAddHotKey(self->form, NEWT_KEY_RIGHT);
@@ -472,29 +490,50 @@ static int hist_browser__populate(struct hist_browser *self, struct rb_root *his
 	return 0;
 }
 
+static u64 hists__filter_by_dso(struct rb_root *hists, struct dso *dso,
+				u64 *session_total)
+{
+	struct rb_node *nd;
+	u64 nr_hists = 0;
+
+	*session_total = 0;
+
+	for (nd = rb_first(hists); nd; nd = rb_next(nd)) {
+		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+
+		if (dso != NULL && (h->ms.map == NULL || h->ms.map->dso != dso)) {
+			h->filtered = true;
+			continue;
+		}
+		h->filtered = false;
+		++nr_hists;
+		*session_total += h->count;
+	}
+
+	return nr_hists;
+}
+
 int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 			       u64 session_total, const char *helpline,
 			       const char *input_name)
 {
 	struct newtExitStruct es;
-	char str[1024];
+	bool dso_filtered = false;
 	int err = -1;
 	struct hist_browser *browser = hist_browser__new();
 
 	if (browser == NULL)
 		return -1;
 
-	snprintf(str, sizeof(str), "Samples: %Ld", session_total);
-	newtDrawRootText(0, 0, str);
 	newtPushHelpLine(helpline);
 
 	if (hist_browser__populate(browser, hists, nr_hists, session_total) < 0)
 		goto out;
 
 	while (1) {
-		char annotate[512];
-		const char *options[2];
-		int nr_options = 0, choice = 0;
+		char *options[16];
+		int nr_options = 0, choice = 0, i,
+		    annotate = -2, zoom_dso = -2;
 
 		newtFormRun(browser->form, &es);
 		if (es.reason == NEWT_EXIT_HOTKEY) {
@@ -510,18 +549,29 @@ int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 			}
 		}
 
-		if (browser->selection->sym != NULL) {
-			snprintf(annotate, sizeof(annotate),
-				 "Annotate %s", browser->selection->sym->name);
-			options[nr_options++] = annotate;
-		}
+		if (browser->selection->sym != NULL &&
+		    asprintf(&options[nr_options], "Annotate %s",
+			     browser->selection->sym->name) > 0)
+			annotate = nr_options++;
+
+		if (browser->selection->map != NULL &&
+		    asprintf(&options[nr_options], "Zoom %s %s DSO",
+			     dso_filtered ? "out of" : "into",
+			     (browser->selection->map->dso->kernel ? "the Kernel" :
+			      browser->selection->map->dso->short_name)) > 0)
+			zoom_dso = nr_options++;
+
+		options[nr_options++] = (char *)"Exit";
 
-		options[nr_options++] = "Exit";
 		choice = popup_menu(nr_options, options);
+
+		for (i = 0; i < nr_options - 1; ++i)
+			free(options[i]);
+
 		if (choice == nr_options - 1)
 			break;
 do_annotate:
-		if (browser->selection->sym != NULL && choice >= 0) {
+		if (choice == annotate) {
 			if (browser->selection->map->dso->origin == DSO__ORIG_KERNEL) {
 				newtPopHelpLine();
 				newtPushHelpLine("No vmlinux file found, can't "
@@ -531,6 +581,13 @@ do_annotate:
 			}
 			map_symbol__annotate_browser(browser->selection,
 						     input_name);
+		} if (choice == zoom_dso) {
+			hists__filter_by_dso(hists,
+					     dso_filtered ? NULL : browser->selection->map->dso,
+					     &session_total);
+			dso_filtered = !dso_filtered;
+			if (hist_browser__populate(browser, hists, nr_hists, session_total) < 0)
+				goto out;
 		}
 	}
 	err = 0;
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 5bf2b744e7b..dce79d33e33 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -48,7 +48,8 @@ struct hist_entry {
 	struct map_symbol	ms;
 	u64			ip;
 	char			level;
-	struct symbol	  *parent;
+	bool			filtered;
+	struct symbol		*parent;
 	union {
 		unsigned long	  position;
 		struct hist_entry *pair;
-- 
cgit v1.2.3


From a5e29aca02fcecd086ac160ea29244cae6b4305e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 3 Apr 2010 22:44:37 -0300
Subject: perf TUI: Add a "Zoom into COMM(PID) thread" and reverse operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now one can press the right arrow key and in addition to being able to
filter by DSO, filter out by thread too, or a combination of both
filters.

With this one can start collecting events for the whole system, then
focus on a subset of the collected data quickly.

Cc: Avi Kivity <avi@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 82 +++++++++++++++++++++++++++++++++++++++++++++-----
 tools/perf/util/sort.h |  9 ++++--
 2 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index bbf725d4b38..6d6e022d770 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -490,6 +490,11 @@ static int hist_browser__populate(struct hist_browser *self, struct rb_root *his
 	return 0;
 }
 
+enum hist_filter {
+	HIST_FILTER__DSO,
+	HIST_FILTER__THREAD,
+};
+
 static u64 hists__filter_by_dso(struct rb_root *hists, struct dso *dso,
 				u64 *session_total)
 {
@@ -502,10 +507,10 @@ static u64 hists__filter_by_dso(struct rb_root *hists, struct dso *dso,
 		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
 
 		if (dso != NULL && (h->ms.map == NULL || h->ms.map->dso != dso)) {
-			h->filtered = true;
+			h->filtered |= (1 << HIST_FILTER__DSO);
 			continue;
 		}
-		h->filtered = false;
+		h->filtered &= ~(1 << HIST_FILTER__DSO);
 		++nr_hists;
 		*session_total += h->count;
 	}
@@ -513,12 +518,54 @@ static u64 hists__filter_by_dso(struct rb_root *hists, struct dso *dso,
 	return nr_hists;
 }
 
+static u64 hists__filter_by_thread(struct rb_root *hists, const struct thread *thread,
+				   u64 *session_total)
+{
+	struct rb_node *nd;
+	u64 nr_hists = 0;
+
+	*session_total = 0;
+
+	for (nd = rb_first(hists); nd; nd = rb_next(nd)) {
+		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+
+		if (thread != NULL && h->thread != thread) {
+			h->filtered |= (1 << HIST_FILTER__THREAD);
+			continue;
+		}
+		h->filtered &= ~(1 << HIST_FILTER__THREAD);
+		++nr_hists;
+		*session_total += h->count;
+	}
+
+	return nr_hists;
+}
+
+static struct thread *hist_browser__selected_thread(struct hist_browser *self)
+{
+	int *indexes;
+
+	if (!symbol_conf.use_callchain)
+		goto out;
+
+	indexes = newtCheckboxTreeFindItem(self->tree, (void *)self->selection);
+	if (indexes) {
+		bool is_hist_entry = indexes[1] == NEWT_ARG_LAST;
+		free(indexes);
+		if (is_hist_entry)
+			goto out;
+	}
+	return NULL;
+out:
+	return *(struct thread **)(self->selection + 1);
+}
+
 int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 			       u64 session_total, const char *helpline,
 			       const char *input_name)
 {
 	struct newtExitStruct es;
-	bool dso_filtered = false;
+	bool dso_filtered = false, thread_filtered = false;
 	int err = -1;
 	struct hist_browser *browser = hist_browser__new();
 
@@ -531,9 +578,10 @@ int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 		goto out;
 
 	while (1) {
+		const struct thread *thread;
 		char *options[16];
 		int nr_options = 0, choice = 0, i,
-		    annotate = -2, zoom_dso = -2;
+		    annotate = -2, zoom_dso = -2, zoom_thread = -2;
 
 		newtFormRun(browser->form, &es);
 		if (es.reason == NEWT_EXIT_HOTKEY) {
@@ -561,6 +609,13 @@ int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 			      browser->selection->map->dso->short_name)) > 0)
 			zoom_dso = nr_options++;
 
+		thread = hist_browser__selected_thread(browser);
+		if (thread != NULL &&
+		    asprintf(&options[nr_options], "Zoom %s %s(%d) thread",
+			     (thread_filtered ? "out of" : "into"),
+			     (thread->comm_set ?  thread->comm : ""), thread->pid) > 0)
+			zoom_thread = nr_options++;
+
 		options[nr_options++] = (char *)"Exit";
 
 		choice = popup_menu(nr_options, options);
@@ -570,6 +625,9 @@ int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 
 		if (choice == nr_options - 1)
 			break;
+
+		if (choice == -1)
+			continue;
 do_annotate:
 		if (choice == annotate) {
 			if (browser->selection->map->dso->origin == DSO__ORIG_KERNEL) {
@@ -581,13 +639,21 @@ do_annotate:
 			}
 			map_symbol__annotate_browser(browser->selection,
 						     input_name);
-		} if (choice == zoom_dso) {
-			hists__filter_by_dso(hists,
-					     dso_filtered ? NULL : browser->selection->map->dso,
-					     &session_total);
+		} else if (choice == zoom_dso) {
+			nr_hists = hists__filter_by_dso(hists,
+							(dso_filtered ? NULL :
+							 browser->selection->map->dso),
+							&session_total);
 			dso_filtered = !dso_filtered;
 			if (hist_browser__populate(browser, hists, nr_hists, session_total) < 0)
 				goto out;
+		} else if (choice == zoom_thread) {
+			nr_hists = hists__filter_by_thread(hists,
+							   (thread_filtered ? NULL : thread),
+							   &session_total);
+			thread_filtered = !thread_filtered;
+			if (hist_browser__populate(browser, hists, nr_hists, session_total) < 0)
+				goto out;
 		}
 	}
 	err = 0;
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index dce79d33e33..6d7b4be7060 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -44,11 +44,16 @@ extern enum sort_type sort__first_dimension;
 struct hist_entry {
 	struct rb_node		rb_node;
 	u64			count;
-	struct thread		*thread;
+	/*
+	 * XXX WARNING!
+	 * thread _has_ to come after ms, see
+	 * hist_browser__selected_thread in util/newt.c
+	 */
 	struct map_symbol	ms;
+	struct thread		*thread;
 	u64			ip;
 	char			level;
-	bool			filtered;
+	u8			filtered;
 	struct symbol		*parent;
 	union {
 		unsigned long	  position;
-- 
cgit v1.2.3


From 6f4dee06fbf0133917f3d76fa3fb50e18b10c1f5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 18 Mar 2010 23:47:01 +0100
Subject: perf: Drop the frame reliablity check

It is useless now that we have a pure stack frame
walker, as given addr are always reliable.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 65e9c5efb61..353a174adb4 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1602,8 +1602,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
 {
 	struct perf_callchain_entry *entry = data;
 
-	if (reliable)
-		callchain_store(entry, addr);
+	callchain_store(entry, addr);
 }
 
 static const struct stacktrace_ops backtrace_ops = {
-- 
cgit v1.2.3


From 6cc8a7c1d8560c042f486b23318a6291569ab96b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 19 Mar 2010 01:23:53 +0100
Subject: perf: Fetch hot regs from the template caller

Trace events can be defined from a template using
DECLARE_EVENT_CLASS/DEFINE_EVENT or directly with TRACE_EVENT.

In both cases we have a template tracepoint handler, used to
record the trace, to which we pass our ftrace event instance.

In the function level, if the class is named "foo" and the event
is named "blah", we have the following chain of calls:

perf_trace_blah() -> perf_trace_templ_foo()

In the case we have several events sharing the class "blah",
we'll have multiple users of perf_trace_templ_foo(), and it
won't be inlined by the compiler. This is usually what happens
with the DECLARE_EVENT_CLASS/DEFINE_EVENT based definition.

But if perf_trace_blah() is the only caller of perf_trace_templ_foo()
there are fair chances that it will be inlined.

The problem is that we fetch the regs from perf_trace_templ_foo()
after we rewinded the frame pointer to the second caller, we want
to reach the caller of perf_trace_blah() to get the right source
of the event. And we do this by always assuming that
perf_trace_templ_foo() is not inlined. But as shown above this
is not always true. And if it is inlined we miss the first caller,
losing the most important level of precision.

We get:
	    61.31%       ls  [kernel.kallsyms]  [k] do_softirq
                         |
                         --- do_softirq
                             irq_exit
                             do_IRQ
                             common_interrupt
                            |
                            |--25.00%-- tty_buffer_request_room

Instead of:
	    61.31%       ls  [kernel.kallsyms]  [k] __do_softirq
                         |
                         --- __do_softirq
                             do_softirq
                             irq_exit
                             do_IRQ
                             common_interrupt
                            |
                            |--25.00%-- tty_buffer_request_room

To fix this, we fetch the regs from perf_trace_blah() rather than
perf_trace_templ_foo() so that we don't have to deal with inlining
surprises.

That also bring us the advantage of having the true source of the
event even if we don't have frame pointers.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 include/trace/ftrace.h | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index ea6f9d4a20e..882c64832ff 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -758,13 +758,12 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 static notrace void							\
 perf_trace_templ_##call(struct ftrace_event_call *event_call,		\
-			    proto)					\
+			struct pt_regs *__regs, proto)			\
 {									\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
 	struct ftrace_raw_##call *entry;				\
 	u64 __addr = 0, __count = 1;					\
 	unsigned long irq_flags;					\
-	struct pt_regs *__regs;						\
 	int __entry_size;						\
 	int __data_size;						\
 	int rctx;							\
@@ -785,20 +784,22 @@ perf_trace_templ_##call(struct ftrace_event_call *event_call,		\
 									\
 	{ assign; }							\
 									\
-	__regs = &__get_cpu_var(perf_trace_regs);			\
-	perf_fetch_caller_regs(__regs, 2);				\
-									\
 	perf_trace_buf_submit(entry, __entry_size, rctx, __addr,	\
 			       __count, irq_flags, __regs);		\
 }
 
 #undef DEFINE_EVENT
-#define DEFINE_EVENT(template, call, proto, args)		\
-static notrace void perf_trace_##call(proto)			\
-{								\
-	struct ftrace_event_call *event_call = &event_##call;	\
-								\
-	perf_trace_templ_##template(event_call, args);		\
+#define DEFINE_EVENT(template, call, proto, args)			\
+static notrace void perf_trace_##call(proto)				\
+{									\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	struct pt_regs *__regs = &get_cpu_var(perf_trace_regs);		\
+									\
+	perf_fetch_caller_regs(__regs, 1);				\
+									\
+	perf_trace_templ_##template(event_call, __regs, args);		\
+									\
+	put_cpu_var(perf_trace_regs);					\
 }
 
 #undef DEFINE_EVENT_PRINT
-- 
cgit v1.2.3


From 3326c1ceee234e63160852720d48be8a8f7a6d08 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@relay.de.ibm.com>
Date: Tue, 23 Mar 2010 19:09:33 +0100
Subject: perf_event: Make perf fd non seekable

Perf_event does not need seeking, so prevent it in order to
get rid of default_llseek, which uses the BKL.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
[drop the nonseekable_open, not needed for anon inodes]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/perf_event.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 63fbce1c80b..4aa50ff4efc 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2645,6 +2645,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
 }
 
 static const struct file_operations perf_fops = {
+	.llseek			= no_llseek,
 	.release		= perf_release,
 	.read			= perf_read,
 	.poll			= perf_poll,
-- 
cgit v1.2.3


From 8141d0050d76e5695011b5ab577ec66fb51a998c Mon Sep 17 00:00:00 2001
From: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Date: Sun, 4 Apr 2010 17:13:18 +0900
Subject: perf: Swap inclusion order of util.h and string.h in util/string.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently util/string.c includes headers in this order: string.h, util.h
But this causes a build error because __USE_GNU definition
is needed for strndup() definition:

	% make -j
	touch .perf.dev.null
	    CC util/string.o
	cc1: warnings being treated as errors
	util/string.c: In function ‘argv_split’:
	util/string.c:171: error: implicit declaration of function ‘strndup’
	util/string.c:171: error: incompatible implicit declaration of built-in function ‘strndup’

So this patch swaps the headers inclusion order.
util.h defines _GNU_SOURCE, and /usr/include/features.h defines
__USE_GNU as 1 if _GNU_SOURCE is defined.

Signed-off-by: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1270368798-27232-1-git-send-email-mitake@dcl.info.waseda.ac.jp>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 tools/perf/util/string.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c
index d4389242cfd..0409fc7c005 100644
--- a/tools/perf/util/string.c
+++ b/tools/perf/util/string.c
@@ -1,5 +1,5 @@
-#include "string.h"
 #include "util.h"
+#include "string.h"
 
 #define K 1024LL
 /*
-- 
cgit v1.2.3


From aa27497c2fb4c7f57706099bd489e683e5cc3e3b Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 5 Apr 2010 17:11:05 +0800
Subject: tracing: Fix uninitialized variable of tracing/trace output

Because a local variable is not initialized, I got these
when I did 'cat tracing/trace'. (not trace_pipe):

CPU:0 [LOST 18446744071579453134 EVENTS]
              ps-3099  [000]   560.770221: lock_acquire: ffff880030865010 &(&dentry->d_lock)->rlock
CPU:0 [LOST 18446744071579453134 EVENTS]
              ps-3099  [000]   560.770221: lock_release: ffff880030865010 &(&dentry->d_lock)->rlock
CPU:0 [LOST 18446612133255294080 EVENTS]
              ps-3099  [000]   560.770221: lock_acquire: ffff880030865010 &(&dentry->d_lock)->rlock
CPU:0 [LOST 18446744071579453134 EVENTS]
              ps-3099  [000]   560.770222: lock_release: ffff880030865010 &(&dentry->d_lock)->rlock
CPU:0 [LOST 18446744071579453134 EVENTS]
              ps-3099  [000]   560.770222: lock_release: ffffffff816cfb98 dcache_lock

See peek_next_entry(), it does not set *lost_events when we 'cat tracing/trace'

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <4BB9A929.2000303@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0498bebcbfd..b9be232352b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1571,7 +1571,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
 {
 	struct ring_buffer *buffer = iter->tr->buffer;
 	struct trace_entry *ent, *next = NULL;
-	unsigned long lost_events, next_lost = 0;
+	unsigned long lost_events = 0, next_lost = 0;
 	int cpu_file = iter->cpu_file;
 	u64 next_ts = 0, ts;
 	int next_cpu = -1;
-- 
cgit v1.2.3


From bd6d29c25bb1a24a4c160ec5de43e0004e01f72b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 6 Apr 2010 00:10:17 +0200
Subject: lockstat: Make lockstat counting per cpu

Locking statistics are implemented using global atomic
variables. This is usually fine unless some path write them very
often.

This is the case for the function and function graph tracers
that disable irqs for each entry saved (except if the function
tracer is in preempt disabled only mode).
And calls to local_irq_save/restore() increment
hardirqs_on_events and hardirqs_off_events stats (or similar
stats for redundant versions).

Incrementing these global vars for each function ends up in too
much cache bouncing if lockstats are enabled.

To solve this, implement the debug_atomic_*() operations using
per cpu vars.

 -v2: Use per_cpu() instead of get_cpu_var() to fetch the desired
      cpu vars on debug_atomic_read()

 -v3: Store the stats in a structure. No need for local_t as we
      are NMI/irq safe.

 -v4: Fix tons of build errors. I thought I had tested it but I
      probably forgot to select the relevant config.

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1270505417-8144-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/lockdep.c           | 47 +++++++++++------------------
 kernel/lockdep_internals.h | 74 +++++++++++++++++++++++++++++++++-------------
 kernel/lockdep_proc.c      | 58 ++++++++++++++++++------------------
 3 files changed, 99 insertions(+), 80 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0c30d0455de..069af0276bf 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -430,20 +430,7 @@ static struct stack_trace lockdep_init_trace = {
 /*
  * Various lockdep statistics:
  */
-atomic_t chain_lookup_hits;
-atomic_t chain_lookup_misses;
-atomic_t hardirqs_on_events;
-atomic_t hardirqs_off_events;
-atomic_t redundant_hardirqs_on;
-atomic_t redundant_hardirqs_off;
-atomic_t softirqs_on_events;
-atomic_t softirqs_off_events;
-atomic_t redundant_softirqs_on;
-atomic_t redundant_softirqs_off;
-atomic_t nr_unused_locks;
-atomic_t nr_cyclic_checks;
-atomic_t nr_find_usage_forwards_checks;
-atomic_t nr_find_usage_backwards_checks;
+DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
 #endif
 
 /*
@@ -758,7 +745,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 		return NULL;
 	}
 	class = lock_classes + nr_lock_classes++;
-	debug_atomic_inc(&nr_unused_locks);
+	debug_atomic_inc(nr_unused_locks);
 	class->key = key;
 	class->name = lock->name;
 	class->subclass = subclass;
@@ -1215,7 +1202,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
 {
 	int result;
 
-	debug_atomic_inc(&nr_cyclic_checks);
+	debug_atomic_inc(nr_cyclic_checks);
 
 	result = __bfs_forwards(root, target, class_equal, target_entry);
 
@@ -1252,7 +1239,7 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
 {
 	int result;
 
-	debug_atomic_inc(&nr_find_usage_forwards_checks);
+	debug_atomic_inc(nr_find_usage_forwards_checks);
 
 	result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
 
@@ -1275,7 +1262,7 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
 {
 	int result;
 
-	debug_atomic_inc(&nr_find_usage_backwards_checks);
+	debug_atomic_inc(nr_find_usage_backwards_checks);
 
 	result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
 
@@ -1835,7 +1822,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
 	list_for_each_entry(chain, hash_head, entry) {
 		if (chain->chain_key == chain_key) {
 cache_hit:
-			debug_atomic_inc(&chain_lookup_hits);
+			debug_atomic_inc(chain_lookup_hits);
 			if (very_verbose(class))
 				printk("\nhash chain already cached, key: "
 					"%016Lx tail class: [%p] %s\n",
@@ -1900,7 +1887,7 @@ cache_hit:
 		chain_hlocks[chain->base + j] = class - lock_classes;
 	}
 	list_add_tail_rcu(&chain->entry, hash_head);
-	debug_atomic_inc(&chain_lookup_misses);
+	debug_atomic_inc(chain_lookup_misses);
 	inc_chains();
 
 	return 1;
@@ -2321,7 +2308,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
 		return;
 
 	if (unlikely(curr->hardirqs_enabled)) {
-		debug_atomic_inc(&redundant_hardirqs_on);
+		debug_atomic_inc(redundant_hardirqs_on);
 		return;
 	}
 	/* we'll do an OFF -> ON transition: */
@@ -2348,7 +2335,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
 
 	curr->hardirq_enable_ip = ip;
 	curr->hardirq_enable_event = ++curr->irq_events;
-	debug_atomic_inc(&hardirqs_on_events);
+	debug_atomic_inc(hardirqs_on_events);
 }
 EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
@@ -2380,9 +2367,9 @@ void trace_hardirqs_off_caller(unsigned long ip)
 		curr->hardirqs_enabled = 0;
 		curr->hardirq_disable_ip = ip;
 		curr->hardirq_disable_event = ++curr->irq_events;
-		debug_atomic_inc(&hardirqs_off_events);
+		debug_atomic_inc(hardirqs_off_events);
 	} else
-		debug_atomic_inc(&redundant_hardirqs_off);
+		debug_atomic_inc(redundant_hardirqs_off);
 }
 EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
@@ -2406,7 +2393,7 @@ void trace_softirqs_on(unsigned long ip)
 		return;
 
 	if (curr->softirqs_enabled) {
-		debug_atomic_inc(&redundant_softirqs_on);
+		debug_atomic_inc(redundant_softirqs_on);
 		return;
 	}
 
@@ -2416,7 +2403,7 @@ void trace_softirqs_on(unsigned long ip)
 	curr->softirqs_enabled = 1;
 	curr->softirq_enable_ip = ip;
 	curr->softirq_enable_event = ++curr->irq_events;
-	debug_atomic_inc(&softirqs_on_events);
+	debug_atomic_inc(softirqs_on_events);
 	/*
 	 * We are going to turn softirqs on, so set the
 	 * usage bit for all held locks, if hardirqs are
@@ -2446,10 +2433,10 @@ void trace_softirqs_off(unsigned long ip)
 		curr->softirqs_enabled = 0;
 		curr->softirq_disable_ip = ip;
 		curr->softirq_disable_event = ++curr->irq_events;
-		debug_atomic_inc(&softirqs_off_events);
+		debug_atomic_inc(softirqs_off_events);
 		DEBUG_LOCKS_WARN_ON(!softirq_count());
 	} else
-		debug_atomic_inc(&redundant_softirqs_off);
+		debug_atomic_inc(redundant_softirqs_off);
 }
 
 static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
@@ -2654,7 +2641,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 			return 0;
 		break;
 	case LOCK_USED:
-		debug_atomic_dec(&nr_unused_locks);
+		debug_atomic_dec(nr_unused_locks);
 		break;
 	default:
 		if (!debug_locks_off_graph_unlock())
@@ -2760,7 +2747,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 		if (!class)
 			return 0;
 	}
-	debug_atomic_inc((atomic_t *)&class->ops);
+	atomic_inc((atomic_t *)&class->ops);
 	if (very_verbose(class)) {
 		printk("\nacquire class [%p] %s", class->key, class->name);
 		if (class->name_version > 1)
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index a2ee95ad131..8d7d4b6c741 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -110,29 +110,61 @@ lockdep_count_backward_deps(struct lock_class *class)
 #endif
 
 #ifdef CONFIG_DEBUG_LOCKDEP
+
+#include <asm/local.h>
 /*
- * Various lockdep statistics:
+ * Various lockdep statistics.
+ * We want them per cpu as they are often accessed in fast path
+ * and we want to avoid too much cache bouncing.
  */
-extern atomic_t chain_lookup_hits;
-extern atomic_t chain_lookup_misses;
-extern atomic_t hardirqs_on_events;
-extern atomic_t hardirqs_off_events;
-extern atomic_t redundant_hardirqs_on;
-extern atomic_t redundant_hardirqs_off;
-extern atomic_t softirqs_on_events;
-extern atomic_t softirqs_off_events;
-extern atomic_t redundant_softirqs_on;
-extern atomic_t redundant_softirqs_off;
-extern atomic_t nr_unused_locks;
-extern atomic_t nr_cyclic_checks;
-extern atomic_t nr_cyclic_check_recursions;
-extern atomic_t nr_find_usage_forwards_checks;
-extern atomic_t nr_find_usage_forwards_recursions;
-extern atomic_t nr_find_usage_backwards_checks;
-extern atomic_t nr_find_usage_backwards_recursions;
-# define debug_atomic_inc(ptr)		atomic_inc(ptr)
-# define debug_atomic_dec(ptr)		atomic_dec(ptr)
-# define debug_atomic_read(ptr)		atomic_read(ptr)
+struct lockdep_stats {
+	int	chain_lookup_hits;
+	int	chain_lookup_misses;
+	int	hardirqs_on_events;
+	int	hardirqs_off_events;
+	int	redundant_hardirqs_on;
+	int	redundant_hardirqs_off;
+	int	softirqs_on_events;
+	int	softirqs_off_events;
+	int	redundant_softirqs_on;
+	int	redundant_softirqs_off;
+	int	nr_unused_locks;
+	int	nr_cyclic_checks;
+	int	nr_cyclic_check_recursions;
+	int	nr_find_usage_forwards_checks;
+	int	nr_find_usage_forwards_recursions;
+	int	nr_find_usage_backwards_checks;
+	int	nr_find_usage_backwards_recursions;
+};
+
+DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
+
+#define debug_atomic_inc(ptr)			{		\
+	struct lockdep_stats *__cpu_lockdep_stats;		\
+								\
+	WARN_ON_ONCE(!irqs_disabled());				\
+	__cpu_lockdep_stats = &__get_cpu_var(lockdep_stats);	\
+	__cpu_lockdep_stats->ptr++;				\
+}
+
+#define debug_atomic_dec(ptr)			{		\
+	struct lockdep_stats *__cpu_lockdep_stats;		\
+								\
+	WARN_ON_ONCE(!irqs_disabled());				\
+	__cpu_lockdep_stats = &__get_cpu_var(lockdep_stats);	\
+	__cpu_lockdep_stats->ptr--;				\
+}
+
+#define debug_atomic_read(ptr)		({				\
+	struct lockdep_stats *__cpu_lockdep_stats;			\
+	unsigned long long __total = 0;					\
+	int __cpu;							\
+	for_each_possible_cpu(__cpu) {					\
+		__cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu);	\
+		__total += __cpu_lockdep_stats->ptr;			\
+	}								\
+	__total;							\
+})
 #else
 # define debug_atomic_inc(ptr)		do { } while (0)
 # define debug_atomic_dec(ptr)		do { } while (0)
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4aba4f3584..59b76c8ce9d 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -184,34 +184,34 @@ static const struct file_operations proc_lockdep_chains_operations = {
 static void lockdep_stats_debug_show(struct seq_file *m)
 {
 #ifdef CONFIG_DEBUG_LOCKDEP
-	unsigned int hi1 = debug_atomic_read(&hardirqs_on_events),
-		     hi2 = debug_atomic_read(&hardirqs_off_events),
-		     hr1 = debug_atomic_read(&redundant_hardirqs_on),
-		     hr2 = debug_atomic_read(&redundant_hardirqs_off),
-		     si1 = debug_atomic_read(&softirqs_on_events),
-		     si2 = debug_atomic_read(&softirqs_off_events),
-		     sr1 = debug_atomic_read(&redundant_softirqs_on),
-		     sr2 = debug_atomic_read(&redundant_softirqs_off);
-
-	seq_printf(m, " chain lookup misses:           %11u\n",
-		debug_atomic_read(&chain_lookup_misses));
-	seq_printf(m, " chain lookup hits:             %11u\n",
-		debug_atomic_read(&chain_lookup_hits));
-	seq_printf(m, " cyclic checks:                 %11u\n",
-		debug_atomic_read(&nr_cyclic_checks));
-	seq_printf(m, " find-mask forwards checks:     %11u\n",
-		debug_atomic_read(&nr_find_usage_forwards_checks));
-	seq_printf(m, " find-mask backwards checks:    %11u\n",
-		debug_atomic_read(&nr_find_usage_backwards_checks));
-
-	seq_printf(m, " hardirq on events:             %11u\n", hi1);
-	seq_printf(m, " hardirq off events:            %11u\n", hi2);
-	seq_printf(m, " redundant hardirq ons:         %11u\n", hr1);
-	seq_printf(m, " redundant hardirq offs:        %11u\n", hr2);
-	seq_printf(m, " softirq on events:             %11u\n", si1);
-	seq_printf(m, " softirq off events:            %11u\n", si2);
-	seq_printf(m, " redundant softirq ons:         %11u\n", sr1);
-	seq_printf(m, " redundant softirq offs:        %11u\n", sr2);
+	unsigned long long hi1 = debug_atomic_read(hardirqs_on_events),
+			   hi2 = debug_atomic_read(hardirqs_off_events),
+			   hr1 = debug_atomic_read(redundant_hardirqs_on),
+			   hr2 = debug_atomic_read(redundant_hardirqs_off),
+			   si1 = debug_atomic_read(softirqs_on_events),
+			   si2 = debug_atomic_read(softirqs_off_events),
+			   sr1 = debug_atomic_read(redundant_softirqs_on),
+			   sr2 = debug_atomic_read(redundant_softirqs_off);
+
+	seq_printf(m, " chain lookup misses:           %11llu\n",
+		debug_atomic_read(chain_lookup_misses));
+	seq_printf(m, " chain lookup hits:             %11llu\n",
+		debug_atomic_read(chain_lookup_hits));
+	seq_printf(m, " cyclic checks:                 %11llu\n",
+		debug_atomic_read(nr_cyclic_checks));
+	seq_printf(m, " find-mask forwards checks:     %11llu\n",
+		debug_atomic_read(nr_find_usage_forwards_checks));
+	seq_printf(m, " find-mask backwards checks:    %11llu\n",
+		debug_atomic_read(nr_find_usage_backwards_checks));
+
+	seq_printf(m, " hardirq on events:             %11llu\n", hi1);
+	seq_printf(m, " hardirq off events:            %11llu\n", hi2);
+	seq_printf(m, " redundant hardirq ons:         %11llu\n", hr1);
+	seq_printf(m, " redundant hardirq offs:        %11llu\n", hr2);
+	seq_printf(m, " softirq on events:             %11llu\n", si1);
+	seq_printf(m, " softirq off events:            %11llu\n", si2);
+	seq_printf(m, " redundant softirq ons:         %11llu\n", sr1);
+	seq_printf(m, " redundant softirq offs:        %11llu\n", sr2);
 #endif
 }
 
@@ -263,7 +263,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
 #endif
 	}
 #ifdef CONFIG_DEBUG_LOCKDEP
-	DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
+	DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
 #endif
 	seq_printf(m, " lock-classes:                  %11lu [max: %lu]\n",
 			nr_lock_classes, MAX_LOCKDEP_KEYS);
-- 
cgit v1.2.3


From 1527bc8b928dd1399c3d3467dd47d9ede210978a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 1 Feb 2010 15:03:07 +0100
Subject: bitops: Optimize hweight() by making use of compile-time evaluation

Rename the extisting runtime hweight() implementations to
__arch_hweight(), rename the compile-time versions to __const_hweight()
and then have hweight() pick between them.

Suggested-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100318111929.GB11152@aftab>
Acked-by: H. Peter Anvin <hpa@zytor.com>
LKML-Reference: <1265028224.24455.154.camel@laptop>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/alpha/include/asm/bitops.h            | 18 +++++++------
 arch/ia64/include/asm/bitops.h             | 11 ++++----
 arch/sparc/include/asm/bitops_64.h         | 11 ++++----
 include/asm-generic/bitops/arch_hweight.h  | 11 ++++++++
 include/asm-generic/bitops/const_hweight.h | 42 ++++++++++++++++++++++++++++++
 include/asm-generic/bitops/hweight.h       |  8 ++----
 include/linux/bitops.h                     | 25 ------------------
 lib/hweight.c                              | 19 +++++++-------
 8 files changed, 87 insertions(+), 58 deletions(-)
 create mode 100644 include/asm-generic/bitops/arch_hweight.h
 create mode 100644 include/asm-generic/bitops/const_hweight.h

diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h
index 15f3ae25c51..296da1d5ed5 100644
--- a/arch/alpha/include/asm/bitops.h
+++ b/arch/alpha/include/asm/bitops.h
@@ -405,29 +405,31 @@ static inline int fls(int x)
 
 #if defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67)
 /* Whee.  EV67 can calculate it directly.  */
-static inline unsigned long hweight64(unsigned long w)
+static inline unsigned long __arch_hweight64(unsigned long w)
 {
 	return __kernel_ctpop(w);
 }
 
-static inline unsigned int hweight32(unsigned int w)
+static inline unsigned int __arch_weight32(unsigned int w)
 {
-	return hweight64(w);
+	return __arch_hweight64(w);
 }
 
-static inline unsigned int hweight16(unsigned int w)
+static inline unsigned int __arch_hweight16(unsigned int w)
 {
-	return hweight64(w & 0xffff);
+	return __arch_hweight64(w & 0xffff);
 }
 
-static inline unsigned int hweight8(unsigned int w)
+static inline unsigned int __arch_hweight8(unsigned int w)
 {
-	return hweight64(w & 0xff);
+	return __arch_hweight64(w & 0xff);
 }
 #else
-#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/arch_hweight.h>
 #endif
 
+#include <asm-generic/bitops/const_hweight.h>
+
 #endif /* __KERNEL__ */
 
 #include <asm-generic/bitops/find.h>
diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h
index 6ebc229a1c5..9da3df6f1a5 100644
--- a/arch/ia64/include/asm/bitops.h
+++ b/arch/ia64/include/asm/bitops.h
@@ -437,17 +437,18 @@ __fls (unsigned long x)
  * hweightN: returns the hamming weight (i.e. the number
  * of bits set) of a N-bit word
  */
-static __inline__ unsigned long
-hweight64 (unsigned long x)
+static __inline__ unsigned long __arch_hweight64(unsigned long x)
 {
 	unsigned long result;
 	result = ia64_popcnt(x);
 	return result;
 }
 
-#define hweight32(x)	(unsigned int) hweight64((x) & 0xfffffffful)
-#define hweight16(x)	(unsigned int) hweight64((x) & 0xfffful)
-#define hweight8(x)	(unsigned int) hweight64((x) & 0xfful)
+#define __arch_hweight32(x) ((unsigned int) __arch_hweight64((x) & 0xfffffffful))
+#define __arch_hweight16(x) ((unsigned int) __arch_hweight64((x) & 0xfffful))
+#define __arch_hweight8(x)  ((unsigned int) __arch_hweight64((x) & 0xfful))
+
+#include <asm-generic/bitops/const_hweight.h>
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/sparc/include/asm/bitops_64.h b/arch/sparc/include/asm/bitops_64.h
index e72ac9cdfb9..766121a67a2 100644
--- a/arch/sparc/include/asm/bitops_64.h
+++ b/arch/sparc/include/asm/bitops_64.h
@@ -44,7 +44,7 @@ extern void change_bit(unsigned long nr, volatile unsigned long *addr);
 
 #ifdef ULTRA_HAS_POPULATION_COUNT
 
-static inline unsigned int hweight64(unsigned long w)
+static inline unsigned int __arch_hweight64(unsigned long w)
 {
 	unsigned int res;
 
@@ -52,7 +52,7 @@ static inline unsigned int hweight64(unsigned long w)
 	return res;
 }
 
-static inline unsigned int hweight32(unsigned int w)
+static inline unsigned int __arch_hweight32(unsigned int w)
 {
 	unsigned int res;
 
@@ -60,7 +60,7 @@ static inline unsigned int hweight32(unsigned int w)
 	return res;
 }
 
-static inline unsigned int hweight16(unsigned int w)
+static inline unsigned int __arch_hweight16(unsigned int w)
 {
 	unsigned int res;
 
@@ -68,7 +68,7 @@ static inline unsigned int hweight16(unsigned int w)
 	return res;
 }
 
-static inline unsigned int hweight8(unsigned int w)
+static inline unsigned int __arch_hweight8(unsigned int w)
 {
 	unsigned int res;
 
@@ -78,9 +78,10 @@ static inline unsigned int hweight8(unsigned int w)
 
 #else
 
-#include <asm-generic/bitops/hweight.h>
+#include <asm-generic/bitops/arch_hweight.h>
 
 #endif
+#include <asm-generic/bitops/const_hweight.h>
 #include <asm-generic/bitops/lock.h>
 #endif /* __KERNEL__ */
 
diff --git a/include/asm-generic/bitops/arch_hweight.h b/include/asm-generic/bitops/arch_hweight.h
new file mode 100644
index 00000000000..3a7be842cdc
--- /dev/null
+++ b/include/asm-generic/bitops/arch_hweight.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_GENERIC_BITOPS_ARCH_HWEIGHT_H_
+#define _ASM_GENERIC_BITOPS_ARCH_HWEIGHT_H_
+
+#include <asm/types.h>
+
+extern unsigned int __arch_hweight32(unsigned int w);
+extern unsigned int __arch_hweight16(unsigned int w);
+extern unsigned int __arch_hweight8(unsigned int w);
+extern unsigned long __arch_hweight64(__u64 w);
+
+#endif /* _ASM_GENERIC_BITOPS_HWEIGHT_H_ */
diff --git a/include/asm-generic/bitops/const_hweight.h b/include/asm-generic/bitops/const_hweight.h
new file mode 100644
index 00000000000..fa2a50b7ee6
--- /dev/null
+++ b/include/asm-generic/bitops/const_hweight.h
@@ -0,0 +1,42 @@
+#ifndef _ASM_GENERIC_BITOPS_CONST_HWEIGHT_H_
+#define _ASM_GENERIC_BITOPS_CONST_HWEIGHT_H_
+
+/*
+ * Compile time versions of __arch_hweightN()
+ */
+#define __const_hweight8(w)		\
+      (	(!!((w) & (1ULL << 0))) +	\
+	(!!((w) & (1ULL << 1))) +	\
+	(!!((w) & (1ULL << 2))) +	\
+	(!!((w) & (1ULL << 3))) +	\
+	(!!((w) & (1ULL << 4))) +	\
+	(!!((w) & (1ULL << 5))) +	\
+	(!!((w) & (1ULL << 6))) +	\
+	(!!((w) & (1ULL << 7)))	)
+
+#define __const_hweight16(w) (__const_hweight8(w)  + __const_hweight8((w)  >> 8 ))
+#define __const_hweight32(w) (__const_hweight16(w) + __const_hweight16((w) >> 16))
+#define __const_hweight64(w) (__const_hweight32(w) + __const_hweight32((w) >> 32))
+
+/*
+ * Generic interface.
+ */
+#define hweight8(w)  (__builtin_constant_p(w) ? __const_hweight8(w)  : __arch_hweight8(w))
+#define hweight16(w) (__builtin_constant_p(w) ? __const_hweight16(w) : __arch_hweight16(w))
+#define hweight32(w) (__builtin_constant_p(w) ? __const_hweight32(w) : __arch_hweight32(w))
+#define hweight64(w) (__builtin_constant_p(w) ? __const_hweight64(w) : __arch_hweight64(w))
+
+/*
+ * Interface for known constant arguments
+ */
+#define HWEIGHT8(w)  (BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + __const_hweight8(w))
+#define HWEIGHT16(w) (BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + __const_hweight16(w))
+#define HWEIGHT32(w) (BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + __const_hweight32(w))
+#define HWEIGHT64(w) (BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) + __const_hweight64(w))
+
+/*
+ * Type invariant interface to the compile time constant hweight functions.
+ */
+#define HWEIGHT(w)   HWEIGHT64((u64)w)
+
+#endif /* _ASM_GENERIC_BITOPS_CONST_HWEIGHT_H_ */
diff --git a/include/asm-generic/bitops/hweight.h b/include/asm-generic/bitops/hweight.h
index fbbc383771d..a94d6519c7e 100644
--- a/include/asm-generic/bitops/hweight.h
+++ b/include/asm-generic/bitops/hweight.h
@@ -1,11 +1,7 @@
 #ifndef _ASM_GENERIC_BITOPS_HWEIGHT_H_
 #define _ASM_GENERIC_BITOPS_HWEIGHT_H_
 
-#include <asm/types.h>
-
-extern unsigned int hweight32(unsigned int w);
-extern unsigned int hweight16(unsigned int w);
-extern unsigned int hweight8(unsigned int w);
-extern unsigned long hweight64(__u64 w);
+#include <asm-generic/bitops/arch_hweight.h>
+#include <asm-generic/bitops/const_hweight.h>
 
 #endif /* _ASM_GENERIC_BITOPS_HWEIGHT_H_ */
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index b7938987923..c55d5bc4ee5 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -47,31 +47,6 @@ static inline unsigned long hweight_long(unsigned long w)
 	return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
 }
 
-/*
- * Clearly slow versions of the hweightN() functions, their benefit is
- * of course compile time evaluation of constant arguments.
- */
-#define HWEIGHT8(w)					\
-      (	BUILD_BUG_ON_ZERO(!__builtin_constant_p(w)) +	\
-	(!!((w) & (1ULL << 0))) +			\
-	(!!((w) & (1ULL << 1))) +			\
-	(!!((w) & (1ULL << 2))) +			\
-	(!!((w) & (1ULL << 3))) +			\
-	(!!((w) & (1ULL << 4))) +			\
-	(!!((w) & (1ULL << 5))) +			\
-	(!!((w) & (1ULL << 6))) +			\
-	(!!((w) & (1ULL << 7)))	)
-
-#define HWEIGHT16(w) (HWEIGHT8(w)  + HWEIGHT8((w) >> 8))
-#define HWEIGHT32(w) (HWEIGHT16(w) + HWEIGHT16((w) >> 16))
-#define HWEIGHT64(w) (HWEIGHT32(w) + HWEIGHT32((w) >> 32))
-
-/*
- * Type invariant version that simply casts things to the
- * largest type.
- */
-#define HWEIGHT(w)   HWEIGHT64((u64)(w))
-
 /**
  * rol32 - rotate a 32-bit value left
  * @word: value to rotate
diff --git a/lib/hweight.c b/lib/hweight.c
index 63ee4eb1228..a6927e76840 100644
--- a/lib/hweight.c
+++ b/lib/hweight.c
@@ -9,7 +9,7 @@
  * The Hamming Weight of a number is the total number of bits set in it.
  */
 
-unsigned int hweight32(unsigned int w)
+unsigned int __arch_hweight32(unsigned int w)
 {
 #ifdef ARCH_HAS_FAST_MULTIPLIER
 	w -= (w >> 1) & 0x55555555;
@@ -24,29 +24,30 @@ unsigned int hweight32(unsigned int w)
 	return (res + (res >> 16)) & 0x000000FF;
 #endif
 }
-EXPORT_SYMBOL(hweight32);
+EXPORT_SYMBOL(__arch_hweight32);
 
-unsigned int hweight16(unsigned int w)
+unsigned int __arch_hweight16(unsigned int w)
 {
 	unsigned int res = w - ((w >> 1) & 0x5555);
 	res = (res & 0x3333) + ((res >> 2) & 0x3333);
 	res = (res + (res >> 4)) & 0x0F0F;
 	return (res + (res >> 8)) & 0x00FF;
 }
-EXPORT_SYMBOL(hweight16);
+EXPORT_SYMBOL(__arch_hweight16);
 
-unsigned int hweight8(unsigned int w)
+unsigned int __arch_hweight8(unsigned int w)
 {
 	unsigned int res = w - ((w >> 1) & 0x55);
 	res = (res & 0x33) + ((res >> 2) & 0x33);
 	return (res + (res >> 4)) & 0x0F;
 }
-EXPORT_SYMBOL(hweight8);
+EXPORT_SYMBOL(__arch_hweight8);
 
-unsigned long hweight64(__u64 w)
+unsigned long __arch_hweight64(__u64 w)
 {
 #if BITS_PER_LONG == 32
-	return hweight32((unsigned int)(w >> 32)) + hweight32((unsigned int)w);
+	return __arch_hweight32((unsigned int)(w >> 32)) +
+	       __arch_hweight32((unsigned int)w);
 #elif BITS_PER_LONG == 64
 #ifdef ARCH_HAS_FAST_MULTIPLIER
 	w -= (w >> 1) & 0x5555555555555555ul;
@@ -63,4 +64,4 @@ unsigned long hweight64(__u64 w)
 #endif
 #endif
 }
-EXPORT_SYMBOL(hweight64);
+EXPORT_SYMBOL(__arch_hweight64);
-- 
cgit v1.2.3


From d61931d89be506372d01a90d1755f6d0a9fafe2d Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Fri, 5 Mar 2010 17:34:46 +0100
Subject: x86: Add optimized popcnt variants

Add support for the hardware version of the Hamming weight function,
popcnt, present in CPUs which advertize it under CPUID, Function
0x0000_0001_ECX[23]. On CPUs which don't support it, we fallback to the
default lib/hweight.c sw versions.

A synthetic benchmark comparing popcnt with __sw_hweight64 showed almost
a 3x speedup on a F10h machine.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20100318112015.GC11152@aftab>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig                          |  5 +++
 arch/x86/include/asm/alternative.h        |  9 +++--
 arch/x86/include/asm/arch_hweight.h       | 59 +++++++++++++++++++++++++++++++
 arch/x86/include/asm/bitops.h             |  4 ++-
 include/asm-generic/bitops/arch_hweight.h | 22 +++++++++---
 lib/Makefile                              |  3 ++
 lib/hweight.c                             | 20 +++++------
 scripts/Makefile.lib                      |  4 +++
 8 files changed, 108 insertions(+), 18 deletions(-)
 create mode 100644 arch/x86/include/asm/arch_hweight.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0eacb1ffb42..89d8c54cdd3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -238,6 +238,11 @@ config X86_32_LAZY_GS
 	def_bool y
 	depends on X86_32 && !CC_STACKPROTECTOR
 
+config ARCH_HWEIGHT_CFLAGS
+	string
+	default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
+	default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
+
 config KTIME_SCALAR
 	def_bool X86_32
 source "init/Kconfig"
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index b09ec55650b..67dae51e7fd 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -39,9 +39,6 @@
 #define LOCK_PREFIX ""
 #endif
 
-/* This must be included *after* the definition of LOCK_PREFIX */
-#include <asm/cpufeature.h>
-
 struct alt_instr {
 	u8 *instr;		/* original instruction */
 	u8 *replacement;
@@ -95,6 +92,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
       "663:\n\t" newinstr "\n664:\n"		/* replacement     */	\
       ".previous"
 
+/*
+ * This must be included *after* the definition of ALTERNATIVE due to
+ * <asm/arch_hweight.h>
+ */
+#include <asm/cpufeature.h>
+
 /*
  * Alternative instructions for different CPU types or capabilities.
  *
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
new file mode 100644
index 00000000000..d1fc3c219ae
--- /dev/null
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -0,0 +1,59 @@
+#ifndef _ASM_X86_HWEIGHT_H
+#define _ASM_X86_HWEIGHT_H
+
+#ifdef CONFIG_64BIT
+/* popcnt %rdi, %rax */
+#define POPCNT ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
+#define REG_IN "D"
+#define REG_OUT "a"
+#else
+/* popcnt %eax, %eax */
+#define POPCNT ".byte 0xf3,0x0f,0xb8,0xc0"
+#define REG_IN "a"
+#define REG_OUT "a"
+#endif
+
+/*
+ * __sw_hweightXX are called from within the alternatives below
+ * and callee-clobbered registers need to be taken care of. See
+ * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
+ * compiler switches.
+ */
+static inline unsigned int __arch_hweight32(unsigned int w)
+{
+	unsigned int res = 0;
+
+	asm (ALTERNATIVE("call __sw_hweight32", POPCNT, X86_FEATURE_POPCNT)
+		     : "="REG_OUT (res)
+		     : REG_IN (w));
+
+	return res;
+}
+
+static inline unsigned int __arch_hweight16(unsigned int w)
+{
+	return __arch_hweight32(w & 0xffff);
+}
+
+static inline unsigned int __arch_hweight8(unsigned int w)
+{
+	return __arch_hweight32(w & 0xff);
+}
+
+static inline unsigned long __arch_hweight64(__u64 w)
+{
+	unsigned long res = 0;
+
+#ifdef CONFIG_X86_32
+	return  __arch_hweight32((u32)w) +
+		__arch_hweight32((u32)(w >> 32));
+#else
+	asm (ALTERNATIVE("call __sw_hweight64", POPCNT, X86_FEATURE_POPCNT)
+		     : "="REG_OUT (res)
+		     : REG_IN (w));
+#endif /* CONFIG_X86_32 */
+
+	return res;
+}
+
+#endif
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 02b47a603fc..545776efeb1 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -444,7 +444,9 @@ static inline int fls(int x)
 
 #define ARCH_HAS_FAST_MULTIPLIER 1
 
-#include <asm-generic/bitops/hweight.h>
+#include <asm/arch_hweight.h>
+
+#include <asm-generic/bitops/const_hweight.h>
 
 #endif /* __KERNEL__ */
 
diff --git a/include/asm-generic/bitops/arch_hweight.h b/include/asm-generic/bitops/arch_hweight.h
index 3a7be842cdc..9a81c1e9436 100644
--- a/include/asm-generic/bitops/arch_hweight.h
+++ b/include/asm-generic/bitops/arch_hweight.h
@@ -3,9 +3,23 @@
 
 #include <asm/types.h>
 
-extern unsigned int __arch_hweight32(unsigned int w);
-extern unsigned int __arch_hweight16(unsigned int w);
-extern unsigned int __arch_hweight8(unsigned int w);
-extern unsigned long __arch_hweight64(__u64 w);
+inline unsigned int __arch_hweight32(unsigned int w)
+{
+	return __sw_hweight32(w);
+}
 
+inline unsigned int __arch_hweight16(unsigned int w)
+{
+	return __sw_hweight16(w);
+}
+
+inline unsigned int __arch_hweight8(unsigned int w)
+{
+	return __sw_hweight8(w);
+}
+
+inline unsigned long __arch_hweight64(__u64 w)
+{
+	return __sw_hweight64(w);
+}
 #endif /* _ASM_GENERIC_BITOPS_HWEIGHT_H_ */
diff --git a/lib/Makefile b/lib/Makefile
index 2e152aed719..abe63a8ad14 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -39,7 +39,10 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_GENERIC_FIND_FIRST_BIT) += find_next_bit.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
 obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o
+
+CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
+
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
 obj-$(CONFIG_BTREE) += btree.o
 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
diff --git a/lib/hweight.c b/lib/hweight.c
index a6927e76840..3c79d50814c 100644
--- a/lib/hweight.c
+++ b/lib/hweight.c
@@ -9,7 +9,7 @@
  * The Hamming Weight of a number is the total number of bits set in it.
  */
 
-unsigned int __arch_hweight32(unsigned int w)
+unsigned int __sw_hweight32(unsigned int w)
 {
 #ifdef ARCH_HAS_FAST_MULTIPLIER
 	w -= (w >> 1) & 0x55555555;
@@ -24,30 +24,30 @@ unsigned int __arch_hweight32(unsigned int w)
 	return (res + (res >> 16)) & 0x000000FF;
 #endif
 }
-EXPORT_SYMBOL(__arch_hweight32);
+EXPORT_SYMBOL(__sw_hweight32);
 
-unsigned int __arch_hweight16(unsigned int w)
+unsigned int __sw_hweight16(unsigned int w)
 {
 	unsigned int res = w - ((w >> 1) & 0x5555);
 	res = (res & 0x3333) + ((res >> 2) & 0x3333);
 	res = (res + (res >> 4)) & 0x0F0F;
 	return (res + (res >> 8)) & 0x00FF;
 }
-EXPORT_SYMBOL(__arch_hweight16);
+EXPORT_SYMBOL(__sw_hweight16);
 
-unsigned int __arch_hweight8(unsigned int w)
+unsigned int __sw_hweight8(unsigned int w)
 {
 	unsigned int res = w - ((w >> 1) & 0x55);
 	res = (res & 0x33) + ((res >> 2) & 0x33);
 	return (res + (res >> 4)) & 0x0F;
 }
-EXPORT_SYMBOL(__arch_hweight8);
+EXPORT_SYMBOL(__sw_hweight8);
 
-unsigned long __arch_hweight64(__u64 w)
+unsigned long __sw_hweight64(__u64 w)
 {
 #if BITS_PER_LONG == 32
-	return __arch_hweight32((unsigned int)(w >> 32)) +
-	       __arch_hweight32((unsigned int)w);
+	return __sw_hweight32((unsigned int)(w >> 32)) +
+	       __sw_hweight32((unsigned int)w);
 #elif BITS_PER_LONG == 64
 #ifdef ARCH_HAS_FAST_MULTIPLIER
 	w -= (w >> 1) & 0x5555555555555555ul;
@@ -64,4 +64,4 @@ unsigned long __arch_hweight64(__u64 w)
 #endif
 #endif
 }
-EXPORT_SYMBOL(__arch_hweight64);
+EXPORT_SYMBOL(__sw_hweight64);
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index f9bdf264473..cbcd654215e 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -245,3 +245,7 @@ quiet_cmd_lzo = LZO    $@
 cmd_lzo = (cat $(filter-out FORCE,$^) | \
 	lzop -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
 	(rm -f $@ ; false)
+
+# misc stuff
+# ---------------------------------------------------------------------------
+quote:="
-- 
cgit v1.2.3


From 6e7ab4c649eb7ed403d970b8eda32ca3745e8024 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 5 Apr 2010 12:02:18 -0300
Subject: perf TUI: Show filters on the title and add help line about how to
 zoom out
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Suggested-by: Ingo Molnar <molnar@elte.hu>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 96 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 68 insertions(+), 28 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 6d6e022d770..c0e71aacfc1 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -411,7 +411,7 @@ static void hist_browser__delete(struct hist_browser *self)
 }
 
 static int hist_browser__populate(struct hist_browser *self, struct rb_root *hists,
-				  u64 nr_hists, u64 session_total)
+				  u64 nr_hists, u64 session_total, const char *title)
 {
 	int max_len = 0, idx, cols, rows;
 	struct ui_progress *progress;
@@ -476,7 +476,7 @@ static int hist_browser__populate(struct hist_browser *self, struct rb_root *his
 		newtListboxSetWidth(self->tree, max_len);
 
 	newtCenteredWindow(max_len + (symbol_conf.use_callchain ? 5 : 0),
-			   rows - 5, "Report");
+			   rows - 5, title);
 	self->form = newt_form__new();
 	if (self->form == NULL)
 		return -1;
@@ -495,7 +495,7 @@ enum hist_filter {
 	HIST_FILTER__THREAD,
 };
 
-static u64 hists__filter_by_dso(struct rb_root *hists, struct dso *dso,
+static u64 hists__filter_by_dso(struct rb_root *hists, const struct dso *dso,
 				u64 *session_total)
 {
 	struct rb_node *nd;
@@ -560,25 +560,47 @@ out:
 	return *(struct thread **)(self->selection + 1);
 }
 
+static int hist_browser__title(char *bf, size_t size, const char *input_name,
+			       const struct dso *dso, const struct thread *thread)
+{
+	int printed = 0;
+
+	if (thread)
+		printed += snprintf(bf + printed, size - printed,
+				    "Thread: %s(%d)",
+				    (thread->comm_set ?  thread->comm : ""),
+				    thread->pid);
+	if (dso)
+		printed += snprintf(bf + printed, size - printed,
+				    "%sDSO: %s", thread ? " " : "",
+				    dso->short_name);
+	return printed ?: snprintf(bf, size, "Report: %s", input_name);
+}
+
 int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 			       u64 session_total, const char *helpline,
 			       const char *input_name)
 {
+	struct hist_browser *browser = hist_browser__new();
+	const struct thread *thread_filter = NULL;
+	const struct dso *dso_filter = NULL;
 	struct newtExitStruct es;
-	bool dso_filtered = false, thread_filtered = false;
+	char msg[160];
 	int err = -1;
-	struct hist_browser *browser = hist_browser__new();
 
 	if (browser == NULL)
 		return -1;
 
 	newtPushHelpLine(helpline);
 
-	if (hist_browser__populate(browser, hists, nr_hists, session_total) < 0)
+	hist_browser__title(msg, sizeof(msg), input_name,
+			    dso_filter, thread_filter);
+	if (hist_browser__populate(browser, hists, nr_hists, session_total, msg) < 0)
 		goto out;
 
 	while (1) {
 		const struct thread *thread;
+		const struct dso *dso;
 		char *options[16];
 		int nr_options = 0, choice = 0, i,
 		    annotate = -2, zoom_dso = -2, zoom_thread = -2;
@@ -602,20 +624,21 @@ int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 			     browser->selection->sym->name) > 0)
 			annotate = nr_options++;
 
-		if (browser->selection->map != NULL &&
-		    asprintf(&options[nr_options], "Zoom %s %s DSO",
-			     dso_filtered ? "out of" : "into",
-			     (browser->selection->map->dso->kernel ? "the Kernel" :
-			      browser->selection->map->dso->short_name)) > 0)
-			zoom_dso = nr_options++;
-
 		thread = hist_browser__selected_thread(browser);
 		if (thread != NULL &&
 		    asprintf(&options[nr_options], "Zoom %s %s(%d) thread",
-			     (thread_filtered ? "out of" : "into"),
-			     (thread->comm_set ?  thread->comm : ""), thread->pid) > 0)
+			     (thread_filter ? "out of" : "into"),
+			     (thread->comm_set ? thread->comm : ""),
+			     thread->pid) > 0)
 			zoom_thread = nr_options++;
 
+		dso = browser->selection->map ? browser->selection->map->dso : NULL;
+		if (dso != NULL &&
+		    asprintf(&options[nr_options], "Zoom %s %s DSO",
+			     (dso_filter ? "out of" : "into"),
+			     (dso->kernel ? "the Kernel" : dso->short_name)) > 0)
+			zoom_dso = nr_options++;
+
 		options[nr_options++] = (char *)"Exit";
 
 		choice = popup_menu(nr_options, options);
@@ -637,22 +660,39 @@ do_annotate:
 						 "kallsyms file");
 				continue;
 			}
-			map_symbol__annotate_browser(browser->selection,
-						     input_name);
+			map_symbol__annotate_browser(browser->selection, input_name);
 		} else if (choice == zoom_dso) {
-			nr_hists = hists__filter_by_dso(hists,
-							(dso_filtered ? NULL :
-							 browser->selection->map->dso),
-							&session_total);
-			dso_filtered = !dso_filtered;
-			if (hist_browser__populate(browser, hists, nr_hists, session_total) < 0)
+			if (dso_filter) {
+				newtPopHelpLine();
+				dso_filter = NULL;
+			} else {
+				snprintf(msg, sizeof(msg),
+					 "To zoom out press -> + \"Zoom out of %s DSO\"",
+					 dso->kernel ? "the Kernel" : dso->short_name);
+				newtPushHelpLine(msg);
+				dso_filter = dso;
+			}
+			nr_hists = hists__filter_by_dso(hists, dso_filter, &session_total);
+			hist_browser__title(msg, sizeof(msg), input_name,
+					    dso_filter, thread_filter);
+			if (hist_browser__populate(browser, hists, nr_hists, session_total, msg) < 0)
 				goto out;
 		} else if (choice == zoom_thread) {
-			nr_hists = hists__filter_by_thread(hists,
-							   (thread_filtered ? NULL : thread),
-							   &session_total);
-			thread_filtered = !thread_filtered;
-			if (hist_browser__populate(browser, hists, nr_hists, session_total) < 0)
+			if (thread_filter) {
+				newtPopHelpLine();
+				thread_filter = NULL;
+			} else {
+				snprintf(msg, sizeof(msg),
+					 "To zoom out press -> + \"Zoom out of %s(%d) thread\"",
+					 (thread->comm_set ? thread->comm : ""),
+					 thread->pid);
+				newtPushHelpLine(msg);
+				thread_filter = thread;
+			}
+			nr_hists = hists__filter_by_thread(hists, thread_filter, &session_total);
+			hist_browser__title(msg, sizeof(msg), input_name,
+					    dso_filter, thread_filter);
+			if (hist_browser__populate(browser, hists, nr_hists, session_total, msg) < 0)
 				goto out;
 		}
 	}
-- 
cgit v1.2.3


From c0ed55d2e4f600335193612725c0d7c8211a7a4a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 5 Apr 2010 12:04:23 -0300
Subject: perf TUI: Move "Yes" button to before "No"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Esc + Enter should be enough warning to avoid accidentaly exiting from
the browser.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index c0e71aacfc1..7a123a94e3f 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -132,7 +132,7 @@ static bool dialog_yesno(const char *msg)
 {
 	/* newtWinChoice should really be accepting const char pointers... */
 	char yes[] = "Yes", no[] = "No";
-	return newtWinChoice(NULL, no, yes, (char *)msg) == 2;
+	return newtWinChoice(NULL, yes, no, (char *)msg) == 1;
 }
 
 /*
-- 
cgit v1.2.3


From eed05fe70f96b04ebeb218b07ae8898e605f9b23 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 5 Apr 2010 12:53:45 -0300
Subject: perf tools: Reorganize some structs to save space
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using 'pahole --packable' I found some structs that could be reorganized
to eliminate alignment holes, in some cases getting them to be cacheline
multiples.

[acme@doppio linux-2.6-tip]$ codiff perf.old ~/bin/perf
builtin-annotate.c:
  struct perf_session    |   -8
  struct perf_header     |   -8
 2 structs changed

builtin-diff.c:
  struct sample_data         |   -8
 1 struct changed
  diff__process_sample_event |   -8
 1 function changed, 8 bytes removed, diff: -8

builtin-sched.c:
  struct sched_atom      |   -8
 1 struct changed

builtin-timechart.c:
  struct per_pid         |   -8
 1 struct changed
  cmd_timechart          |  -16
 1 function changed, 16 bytes removed, diff: -16

builtin-probe.c:
  struct perf_probe_point |   -8
  struct perf_probe_event |   -8
 2 structs changed
  opt_add_probe_event     |   -3
 1 function changed, 3 bytes removed, diff: -3

util/probe-finder.c:
  struct probe_finder      |   -8
 1 struct changed
  find_kprobe_trace_events |  -16
 1 function changed, 16 bytes removed, diff: -16

/home/acme/bin/perf:
 4 functions changed, 43 bytes removed, diff: -43
[acme@doppio linux-2.6-tip]$

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-sched.c     | 2 +-
 tools/perf/builtin-timechart.c | 2 --
 tools/perf/util/event.h        | 4 ++--
 tools/perf/util/header.h       | 2 +-
 tools/perf/util/probe-event.h  | 2 +-
 tools/perf/util/probe-finder.h | 4 ++--
 6 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 4f5a03e4344..5e59c0c40c4 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -68,10 +68,10 @@ enum sched_event_type {
 
 struct sched_atom {
 	enum sched_event_type	type;
+	int			specific_wait;
 	u64			timestamp;
 	u64			duration;
 	unsigned long		nr;
-	int			specific_wait;
 	sem_t			*wait_sem;
 	struct task_desc	*wakee;
 };
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 266e7aa996d..369c1b490a9 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -77,8 +77,6 @@ struct per_pid {
 
 	struct per_pidcomm *all;
 	struct per_pidcomm *current;
-
-	int painted;
 };
 
 
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index a33b94952e3..7f7cf8539cf 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -68,11 +68,11 @@ struct sample_data {
 	u64 addr;
 	u64 id;
 	u64 stream_id;
-	u32 cpu;
 	u64 period;
-	struct ip_callchain *callchain;
+	u32 cpu;
 	u32 raw_size;
 	void *raw_data;
+	struct ip_callchain *callchain;
 };
 
 #define BUILD_ID_SIZE 20
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 82a6af72d4c..c059f08cf87 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -47,13 +47,13 @@ int perf_file_header__read(struct perf_file_header *self,
 struct perf_header {
 	int			frozen;
 	int			attrs, size;
+	bool			needs_swap;
 	struct perf_header_attr **attr;
 	s64			attr_offset;
 	u64			data_offset;
 	u64			data_size;
 	u64			event_offset;
 	u64			event_size;
-	bool			needs_swap;
 	DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS);
 };
 
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index cd308b0a4d9..9d99fc24c4f 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -40,9 +40,9 @@ struct perf_probe_point {
 	char		*file;		/* File path */
 	char		*function;	/* Function name */
 	int		line;		/* Line number */
+	bool		retprobe;	/* Return probe flag */
 	char		*lazy_line;	/* Lazy matching pattern */
 	unsigned long	offset;		/* Offset from function entry */
-	bool		retprobe;	/* Return probe flag */
 };
 
 /* Perf probe probing argument field chain */
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index 3564f22954f..2a271321944 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -31,13 +31,13 @@ extern int find_line_range(int fd, struct line_range *lr);
 
 struct probe_finder {
 	struct perf_probe_event	*pev;		/* Target probe event */
-	int			ntevs;		/* number of trace events */
 	struct kprobe_trace_event *tevs;	/* Result trace events */
+	int			ntevs;		/* number of trace events */
 
 	/* For function searching */
+	int			lno;		/* Line number */
 	Dwarf_Addr		addr;		/* Address */
 	const char		*fname;		/* Real file name */
-	int			lno;		/* Line number */
 	Dwarf_Die		cu_die;		/* Current CU */
 	struct list_head	lcache;		/* Line cache for lazy match */
 
-- 
cgit v1.2.3


From f0e9c4fcefa42b5d28b915768dab81a7fdbbc00c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 31 Mar 2010 11:30:56 -0700
Subject: perf bench: fix spello

Fix spello in user message.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
Cc: Paul Mackerra <paulus@samba.org>s
LKML-Reference: <20100331113056.2c7df509.randy.dunlap@oracle.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
---
 tools/perf/bench/sched-pipe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c
index 4f77c7c2764..d9ab3ce446a 100644
--- a/tools/perf/bench/sched-pipe.c
+++ b/tools/perf/bench/sched-pipe.c
@@ -93,7 +93,7 @@ int bench_sched_pipe(int argc, const char **argv,
 
 	switch (bench_format) {
 	case BENCH_FORMAT_DEFAULT:
-		printf("# Extecuted %d pipe operations between two tasks\n\n",
+		printf("# Executed %d pipe operations between two tasks\n\n",
 			loops);
 
 		result_usec = diff.tv_sec * 1000000;
-- 
cgit v1.2.3


From 854c5548dfad017920a36201d40449fdbad90bfb Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 31 Mar 2010 11:31:00 -0700
Subject: perf: cleanup some Documentation

Correct typos in perf bench & perf sched help text.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>,
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20100331113100.cc898487.randy.dunlap@oracle.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
---
 tools/perf/Documentation/perf-bench.txt | 6 +++---
 tools/perf/Documentation/perf-sched.txt | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt
index ae525ac5a2c..0181dddf6b6 100644
--- a/tools/perf/Documentation/perf-bench.txt
+++ b/tools/perf/Documentation/perf-bench.txt
@@ -19,12 +19,12 @@ COMMON OPTIONS
 -f::
 --format=::
 Specify format style.
-Current available format styles are,
+Current available format styles are:
 
 'default'::
 Default style. This is mainly for human reading.
 ---------------------
-% perf bench sched pipe                      # with no style specify
+% perf bench sched pipe                      # with no style specified
 (executing 1000000 pipe operations between two tasks)
         Total time:5.855 sec
                 5.855061 usecs/op
@@ -79,7 +79,7 @@ options (20 sender and receiver processes per group)
 
       Total time:0.308 sec
 
-% perf bench sched messaging -t -g 20        # be multi-thread,with 20 groups
+% perf bench sched messaging -t -g 20        # be multi-thread, with 20 groups
 (20 sender and receiver threads per group)
 (20 groups == 800 threads run)
 
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 1ce79198997..8417644a616 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -12,7 +12,7 @@ SYNOPSIS
 
 DESCRIPTION
 -----------
-There's four variants of perf sched:
+There are four variants of perf sched:
 
   'perf sched record <command>' to record the scheduling events
   of an arbitrary workload.
@@ -27,7 +27,7 @@ There's four variants of perf sched:
   via perf sched record. (this is done by starting up mockup threads
   that mimic the workload based on the events in the trace. These
   threads can then replay the timings (CPU runtime and sleep patterns)
-  of the workload as it occured when it was recorded - and can repeat
+  of the workload as it occurred when it was recorded - and can repeat
   it a number of times, measuring its performance.)
 
 OPTIONS
-- 
cgit v1.2.3


From e9e94e3bd862d31777335722e747e97d9821bc1d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 5 Apr 2010 18:01:10 -0300
Subject: perf trace: Ignore "overwrite" field if present in
 /events/header_page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

That is not used in perf where we have the LOST events.

Without this patch we get:

[root@doppio ~]# perf lock report | head -3
  Warning: Error: expected 'data' but read 'overwrite'

So, to make the same perf command work with kernels with and without
this field, introduce variants for the parsing routines to not warn the
user in such case.

Discussed-with: Steven Rostedt <rostedt@goodmis.org>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/trace-event-parse.c | 49 +++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 15 deletions(-)

diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 9b3c20f42f9..3b81250ffed 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -37,6 +37,8 @@ int header_page_ts_offset;
 int header_page_ts_size;
 int header_page_size_offset;
 int header_page_size_size;
+int header_page_overwrite_offset;
+int header_page_overwrite_size;
 int header_page_data_offset;
 int header_page_data_size;
 
@@ -628,23 +630,32 @@ static int test_type(enum event_type type, enum event_type expect)
 	return 0;
 }
 
-static int test_type_token(enum event_type type, char *token,
-		    enum event_type expect, const char *expect_tok)
+static int __test_type_token(enum event_type type, char *token,
+			     enum event_type expect, const char *expect_tok,
+			     bool warn)
 {
 	if (type != expect) {
-		warning("Error: expected type %d but read %d",
-		    expect, type);
+		if (warn)
+			warning("Error: expected type %d but read %d",
+				expect, type);
 		return -1;
 	}
 
 	if (strcmp(token, expect_tok) != 0) {
-		warning("Error: expected '%s' but read '%s'",
-		    expect_tok, token);
+		if (warn)
+			warning("Error: expected '%s' but read '%s'",
+				expect_tok, token);
 		return -1;
 	}
 	return 0;
 }
 
+static int test_type_token(enum event_type type, char *token,
+			   enum event_type expect, const char *expect_tok)
+{
+	return __test_type_token(type, token, expect, expect_tok, true);
+}
+
 static int __read_expect_type(enum event_type expect, char **tok, int newline_ok)
 {
 	enum event_type type;
@@ -661,7 +672,8 @@ static int read_expect_type(enum event_type expect, char **tok)
 	return __read_expect_type(expect, tok, 1);
 }
 
-static int __read_expected(enum event_type expect, const char *str, int newline_ok)
+static int __read_expected(enum event_type expect, const char *str,
+			   int newline_ok, bool warn)
 {
 	enum event_type type;
 	char *token;
@@ -672,21 +684,26 @@ static int __read_expected(enum event_type expect, const char *str, int newline_
 	else
 		type = read_token_item(&token);
 
-	ret = test_type_token(type, token, expect, str);
+	ret = __test_type_token(type, token, expect, str, warn);
 
 	free_token(token);
 
 	return ret;
 }
 
+static int read_expected_warn(enum event_type expect, const char *str, bool warn)
+{
+	return __read_expected(expect, str, 1, warn);
+}
+
 static int read_expected(enum event_type expect, const char *str)
 {
-	return __read_expected(expect, str, 1);
+	return __read_expected(expect, str, 1, true);
 }
 
 static int read_expected_item(enum event_type expect, const char *str)
 {
-	return __read_expected(expect, str, 0);
+	return __read_expected(expect, str, 0, true);
 }
 
 static char *event_read_name(void)
@@ -3088,7 +3105,7 @@ static void print_args(struct print_arg *args)
 }
 
 static void parse_header_field(const char *field,
-			       int *offset, int *size)
+			       int *offset, int *size, bool warn)
 {
 	char *token;
 	int type;
@@ -3103,7 +3120,7 @@ static void parse_header_field(const char *field,
 		goto fail;
 	free_token(token);
 
-	if (read_expected(EVENT_ITEM, field) < 0)
+	if (read_expected_warn(EVENT_ITEM, field, warn) < 0)
 		return;
 	if (read_expected(EVENT_OP, ";") < 0)
 		return;
@@ -3160,11 +3177,13 @@ int parse_header_page(char *buf, unsigned long size)
 	init_input_buf(buf, size);
 
 	parse_header_field("timestamp", &header_page_ts_offset,
-			   &header_page_ts_size);
+			   &header_page_ts_size, true);
 	parse_header_field("commit", &header_page_size_offset,
-			   &header_page_size_size);
+			   &header_page_size_size, true);
+	parse_header_field("overwrite", &header_page_overwrite_offset,
+			   &header_page_overwrite_size, false);
 	parse_header_field("data", &header_page_data_offset,
-			   &header_page_data_size);
+			   &header_page_data_size, true);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 53e5b5c215ce8372250e227f2c9acf9892de8434 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 9 Apr 2010 13:33:54 -0300
Subject: perf tools: Fix perl support installation when O= is used
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We need to create the $O/scripts/perl/Perf-Trace-Util/ directory too.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index f578b05a30e..57b3569716d 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -158,6 +158,7 @@ all::
 # Define NO_DWARF if you do not want debug-info analysis feature at all.
 
 $(shell sh -c 'mkdir -p $(OUTPUT)scripts/python/Perf-Trace-Util/' 2> /dev/null)
+$(shell sh -c 'mkdir -p $(OUTPUT)scripts/perl/Perf-Trace-Util/' 2> /dev/null)
 $(shell sh -c 'mkdir -p $(OUTPUT)util/scripting-engines/' 2> /dev/null)
 $(shell sh -c 'mkdir $(OUTPUT)bench' 2> /dev/null)
 
-- 
cgit v1.2.3


From 5958f1d5d722df7a9e5d129676614a8e5219bacd Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 31 Mar 2010 21:56:41 +0200
Subject: x86, cpu: Add AMD core boosting feature flag to /proc/cpuinfo

By semi-popular demand, this adds the Core Performance Boost feature
flag to /proc/cpuinfo. Possible use case for this is userspace tools
like cpufreq-aperf, for example, so that they don't have to jump through
hoops of accessing "/dev/cpu/%d/cpuid" in order to check for CPB hw
support, or call cpuid from userspace.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-2-git-send-email-bp@amd64.org>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/cpufeature.h          | 1 +
 arch/x86/kernel/cpu/addon_cpuid_features.c | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 0cd82d06861..630e623f61e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -161,6 +161,7 @@
  */
 #define X86_FEATURE_IDA		(7*32+ 0) /* Intel Dynamic Acceleration */
 #define X86_FEATURE_ARAT	(7*32+ 1) /* Always Running APIC Timer */
+#define X86_FEATURE_CPB		(7*32+ 2) /* AMD Core Performance Boost */
 
 /* Virtualization flags: Linux defined */
 #define X86_FEATURE_TPR_SHADOW  (8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 97ad79cdf68..ead2a1cfa57 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -30,8 +30,9 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	const struct cpuid_bit *cb;
 
 	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
-		{ X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
-		{ X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
+		{ X86_FEATURE_IDA,   CR_EAX, 1, 0x00000006 },
+		{ X86_FEATURE_ARAT,  CR_EAX, 2, 0x00000006 },
+		{ X86_FEATURE_CPB,   CR_EDX, 9, 0x80000007 },
 		{ X86_FEATURE_NPT,   CR_EDX, 0, 0x8000000a },
 		{ X86_FEATURE_LBRV,  CR_EDX, 1, 0x8000000a },
 		{ X86_FEATURE_SVML,  CR_EDX, 2, 0x8000000a },
-- 
cgit v1.2.3


From 73860c6b2fd159a35637e233d735e36887c266ad Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 31 Mar 2010 21:56:42 +0200
Subject: powernow-k8: Add core performance boost support

Starting with F10h, revE, AMD processors add support for a dynamic
core boosting feature called Core Performance Boost. When a specific
condition is present, a subset of the cores on a system are boosted
beyond their P0 operating frequency to speed up the performance of
single-threaded applications.

In the normal case, the system comes out of reset with core boosting
enabled. This patch adds a sysfs knob with which core boosting can be
switched on or off for benchmarking purposes.

While at it, make the CPB code hotplug-aware so that taking cores
offline wouldn't interfere with boosting the remaining online cores.
Furthermore, add cpu_online_mask hotplug protection as suggested by
Andrew.

Finally, cleanup the driver init codepath and update copyrights.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-3-git-send-email-bp@amd64.org>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 161 ++++++++++++++++++++++++++++--
 arch/x86/kernel/cpu/cpufreq/powernow-k8.h |   2 -
 2 files changed, 151 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index d360b56e982..74ca34b5c00 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,6 +1,5 @@
-
 /*
- *   (c) 2003-2006 Advanced Micro Devices, Inc.
+ *   (c) 2003-2010 Advanced Micro Devices, Inc.
  *  Your use of this code is subject to the terms and conditions of the
  *  GNU general public license version 2. See "COPYING" or
  *  http://www.gnu.org/licenses/gpl.html
@@ -54,6 +53,10 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
 
 static int cpu_family = CPU_OPTERON;
 
+/* core performance boost */
+static bool cpb_capable, cpb_enabled;
+static struct msr __percpu *msrs;
+
 #ifndef CONFIG_SMP
 static inline const struct cpumask *cpu_core_mask(int cpu)
 {
@@ -1393,8 +1396,77 @@ out:
 	return khz;
 }
 
+static void _cpb_toggle_msrs(bool t)
+{
+	int cpu;
+
+	get_online_cpus();
+
+	rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+	for_each_cpu(cpu, cpu_online_mask) {
+		struct msr *reg = per_cpu_ptr(msrs, cpu);
+		if (t)
+			reg->l &= ~BIT(25);
+		else
+			reg->l |= BIT(25);
+	}
+	wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+	put_online_cpus();
+}
+
+/*
+ * Switch on/off core performance boosting.
+ *
+ * 0=disable
+ * 1=enable.
+ */
+static void cpb_toggle(bool t)
+{
+	if (!cpb_capable)
+		return;
+
+	if (t && !cpb_enabled) {
+		cpb_enabled = true;
+		_cpb_toggle_msrs(t);
+		printk(KERN_INFO PFX "Core Boosting enabled.\n");
+	} else if (!t && cpb_enabled) {
+		cpb_enabled = false;
+		_cpb_toggle_msrs(t);
+		printk(KERN_INFO PFX "Core Boosting disabled.\n");
+	}
+}
+
+static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
+				 size_t count)
+{
+	int ret = -EINVAL;
+	unsigned long val = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (!ret && (val == 0 || val == 1) && cpb_capable)
+		cpb_toggle(val);
+	else
+		return -EINVAL;
+
+	return count;
+}
+
+static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
+{
+	return sprintf(buf, "%u\n", cpb_enabled);
+}
+
+#define define_one_rw(_name) \
+static struct freq_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+define_one_rw(cpb);
+
 static struct freq_attr *powernow_k8_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
+	&cpb,
 	NULL,
 };
 
@@ -1410,10 +1482,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
 	.attr		= powernow_k8_attr,
 };
 
+/*
+ * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
+ * cannot block the remaining ones from boosting. On the CPU_UP path we
+ * simply keep the boost-disable flag in sync with the current global
+ * state.
+ */
+static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
+				void *hcpu)
+{
+	unsigned cpu = (long)hcpu;
+	u32 lo, hi;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+
+		if (!cpb_enabled) {
+			rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
+			lo |= BIT(25);
+			wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
+		}
+		break;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
+		lo &= ~BIT(25);
+		wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
+		break;
+
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata cpb_nb = {
+	.notifier_call		= cpb_notify,
+};
+
 /* driver entry point for init */
 static int __cpuinit powernowk8_init(void)
 {
-	unsigned int i, supported_cpus = 0;
+	unsigned int i, supported_cpus = 0, cpu;
 
 	for_each_online_cpu(i) {
 		int rc;
@@ -1422,15 +1535,36 @@ static int __cpuinit powernowk8_init(void)
 			supported_cpus++;
 	}
 
-	if (supported_cpus == num_online_cpus()) {
-		printk(KERN_INFO PFX "Found %d %s "
-			"processors (%d cpu cores) (" VERSION ")\n",
-			num_online_nodes(),
-			boot_cpu_data.x86_model_id, supported_cpus);
-		return cpufreq_register_driver(&cpufreq_amd64_driver);
+	if (supported_cpus != num_online_cpus())
+		return -ENODEV;
+
+	printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
+		num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
+
+	if (boot_cpu_has(X86_FEATURE_CPB)) {
+
+		cpb_capable = true;
+
+		register_cpu_notifier(&cpb_nb);
+
+		msrs = msrs_alloc();
+		if (!msrs) {
+			printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
+			return -ENOMEM;
+		}
+
+		rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+		for_each_cpu(cpu, cpu_online_mask) {
+			struct msr *reg = per_cpu_ptr(msrs, cpu);
+			cpb_enabled |= !(!!(reg->l & BIT(25)));
+		}
+
+		printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
+			(cpb_enabled ? "on" : "off"));
 	}
 
-	return -ENODEV;
+	return cpufreq_register_driver(&cpufreq_amd64_driver);
 }
 
 /* driver entry point for term */
@@ -1438,6 +1572,13 @@ static void __exit powernowk8_exit(void)
 {
 	dprintk("exit\n");
 
+	if (boot_cpu_has(X86_FEATURE_CPB)) {
+		msrs_free(msrs);
+		msrs = NULL;
+
+		unregister_cpu_notifier(&cpb_nb);
+	}
+
 	cpufreq_unregister_driver(&cpufreq_amd64_driver);
 }
 
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 02ce824073c..df3529b1c02 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,7 +5,6 @@
  *  http://www.gnu.org/licenses/gpl.html
  */
 
-
 enum pstate {
 	HW_PSTATE_INVALID = 0xff,
 	HW_PSTATE_0 = 0,
@@ -55,7 +54,6 @@ struct powernow_k8_data {
 	struct cpumask *available_cores;
 };
 
-
 /* processor's cpuid instruction support */
 #define CPUID_PROCESSOR_SIGNATURE	1	/* function 1 */
 #define CPUID_XFAM			0x0ff00000	/* extended family */
-- 
cgit v1.2.3


From d65ad45cd82a0db9544469b8c54f5dc5cafbb2d8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 31 Mar 2010 21:56:43 +0200
Subject: x86: Unify APERF/MPERF support

Initialize this CPUID flag feature in common code. It could be made a
standalone function later, maybe, if more functionality is duplicated.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-4-git-send-email-bp@amd64.org>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c | 8 ++++++++
 arch/x86/kernel/cpu/intel.c                | 6 ------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index ead2a1cfa57..fd1fc1902a4 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -54,6 +54,14 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		if (regs[cb->reg] & (1 << cb->bit))
 			set_cpu_cap(c, cb->feature);
 	}
+
+	/*
+	 * common AMD/Intel features
+	 */
+	if (c->cpuid_level >= 6) {
+		if (cpuid_ecx(6) & 0x1)
+			set_cpu_cap(c, X86_FEATURE_APERFMPERF);
+	}
 }
 
 /* leaf 0xb SMT level */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7e1cca13af3..3830258a5f5 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -352,12 +352,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
 	}
 
-	if (c->cpuid_level > 6) {
-		unsigned ecx = cpuid_ecx(6);
-		if (ecx & 0x01)
-			set_cpu_cap(c, X86_FEATURE_APERFMPERF);
-	}
-
 	if (cpu_has_xmm2)
 		set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
 	if (cpu_has_ds) {
-- 
cgit v1.2.3


From a2fed573f065e526bfd5cbf26e5491973d9e9aaa Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Thu, 18 Mar 2010 18:41:46 +0100
Subject: x86, cpufreq: Add APERF/MPERF support for AMD processors

Starting with model 10 of Family 0x10, AMD processors may have
support for APERF/MPERF.  Add support for identifying it and using
it within cpufreq.  Move the APERF/MPERF functions out of the
acpi-cpufreq code and into their own file so they can easily be
shared.

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <20100401141956.GA1930@aftab>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/cpufreq/Makefile       |  4 +--
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 44 ++------------------------
 arch/x86/kernel/cpu/cpufreq/mperf.c        | 51 ++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/cpufreq/mperf.h        |  9 ++++++
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c  |  8 +++++
 5 files changed, 72 insertions(+), 44 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/cpufreq/mperf.c
 create mode 100644 arch/x86/kernel/cpu/cpufreq/mperf.h

diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 1840c0a5170..bd54bf67e6f 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -2,8 +2,8 @@
 # K8 systems. ACPI is preferred to all other hardware-specific drivers.
 # speedstep-* is preferred over p4-clockmod.
 
-obj-$(CONFIG_X86_POWERNOW_K8)		+= powernow-k8.o
-obj-$(CONFIG_X86_ACPI_CPUFREQ)		+= acpi-cpufreq.o
+obj-$(CONFIG_X86_POWERNOW_K8)		+= powernow-k8.o mperf.o
+obj-$(CONFIG_X86_ACPI_CPUFREQ)		+= acpi-cpufreq.o mperf.o
 obj-$(CONFIG_X86_PCC_CPUFREQ)		+= pcc-cpufreq.o
 obj-$(CONFIG_X86_POWERNOW_K6)		+= powernow-k6.o
 obj-$(CONFIG_X86_POWERNOW_K7)		+= powernow-k7.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1b1920fa7c8..dc68e5c2c07 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -45,6 +45,7 @@
 #include <asm/msr.h>
 #include <asm/processor.h>
 #include <asm/cpufeature.h>
+#include "mperf.h"
 
 #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
 		"acpi-cpufreq", msg)
@@ -70,8 +71,6 @@ struct acpi_cpufreq_data {
 
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
 
-static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
-
 /* acpi_perf_data is a pointer to percpu data. */
 static struct acpi_processor_performance *acpi_perf_data;
 
@@ -239,45 +238,6 @@ static u32 get_cur_val(const struct cpumask *mask)
 	return cmd.val;
 }
 
-/* Called via smp_call_function_single(), on the target CPU */
-static void read_measured_perf_ctrs(void *_cur)
-{
-	struct aperfmperf *am = _cur;
-
-	get_aperfmperf(am);
-}
-
-/*
- * Return the measured active (C0) frequency on this CPU since last call
- * to this function.
- * Input: cpu number
- * Return: Average CPU frequency in terms of max frequency (zero on error)
- *
- * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
- * over a period of time, while CPU is in C0 state.
- * IA32_MPERF counts at the rate of max advertised frequency
- * IA32_APERF counts at the rate of actual CPU frequency
- * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
- * no meaning should be associated with absolute values of these MSRs.
- */
-static unsigned int get_measured_perf(struct cpufreq_policy *policy,
-				      unsigned int cpu)
-{
-	struct aperfmperf perf;
-	unsigned long ratio;
-	unsigned int retval;
-
-	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
-		return 0;
-
-	ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
-	per_cpu(acfreq_old_perf, cpu) = perf;
-
-	retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
-
-	return retval;
-}
-
 static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 {
 	struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
@@ -701,7 +661,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	/* Check for APERF/MPERF support in hardware */
 	if (cpu_has(c, X86_FEATURE_APERFMPERF))
-		acpi_cpufreq_driver.getavg = get_measured_perf;
+		acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
 
 	dprintk("CPU%u - ACPI performance management activated.\n", cpu);
 	for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
new file mode 100644
index 00000000000..911e193018a
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.c
@@ -0,0 +1,51 @@
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/slab.h>
+
+#include "mperf.h"
+
+static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
+
+/* Called via smp_call_function_single(), on the target CPU */
+static void read_measured_perf_ctrs(void *_cur)
+{
+	struct aperfmperf *am = _cur;
+
+	get_aperfmperf(am);
+}
+
+/*
+ * Return the measured active (C0) frequency on this CPU since last call
+ * to this function.
+ * Input: cpu number
+ * Return: Average CPU frequency in terms of max frequency (zero on error)
+ *
+ * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
+ * over a period of time, while CPU is in C0 state.
+ * IA32_MPERF counts at the rate of max advertised frequency
+ * IA32_APERF counts at the rate of actual CPU frequency
+ * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
+ * no meaning should be associated with absolute values of these MSRs.
+ */
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+					unsigned int cpu)
+{
+	struct aperfmperf perf;
+	unsigned long ratio;
+	unsigned int retval;
+
+	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
+		return 0;
+
+	ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
+	per_cpu(acfreq_old_perf, cpu) = perf;
+
+	retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
new file mode 100644
index 00000000000..5dbf2950dc2
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.h
@@ -0,0 +1,9 @@
+/*
+ *  (c) 2010 Advanced Micro Devices, Inc.
+ *  Your use of this code is subject to the terms and conditions of the
+ *  GNU general public license version 2. See "COPYING" or
+ *  http://www.gnu.org/licenses/gpl.html
+ */
+
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+					unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 74ca34b5c00..52fce638f44 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -45,6 +45,7 @@
 #define PFX "powernow-k8: "
 #define VERSION "version 2.20.00"
 #include "powernow-k8.h"
+#include "mperf.h"
 
 /* serialize freq changes  */
 static DEFINE_MUTEX(fidvid_mutex);
@@ -57,6 +58,8 @@ static int cpu_family = CPU_OPTERON;
 static bool cpb_capable, cpb_enabled;
 static struct msr __percpu *msrs;
 
+static struct cpufreq_driver cpufreq_amd64_driver;
+
 #ifndef CONFIG_SMP
 static inline const struct cpumask *cpu_core_mask(int cpu)
 {
@@ -1251,6 +1254,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 	struct powernow_k8_data *data;
 	struct init_on_cpu init_on_cpu;
 	int rc;
+	struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
 
 	if (!cpu_online(pol->cpu))
 		return -ENODEV;
@@ -1325,6 +1329,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 		return -EINVAL;
 	}
 
+	/* Check for APERF/MPERF support in hardware */
+	if (cpu_has(c, X86_FEATURE_APERFMPERF))
+		cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
+
 	cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
 
 	if (cpu_family == CPU_HW_PSTATE)
-- 
cgit v1.2.3


From 679370641e3675633cad222449262abbe93a4a2a Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Wed, 31 Mar 2010 21:56:45 +0200
Subject: powernow-k8: Fix frequency reporting

With F10, model 10, all valid frequencies are in the ACPI _PST table.

Cc: <stable@kernel.org> # 33.x 32.x
Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <1270065406-1814-6-git-send-email-bp@amd64.org>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 52fce638f44..6f3dc8fbbfd 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -935,7 +935,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
 		powernow_table[i].index = index;
 
 		/* Frequency may be rounded for these */
-		if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) {
+		if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
+				 || boot_cpu_data.x86 == 0x11) {
 			powernow_table[i].frequency =
 				freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
 		} else
-- 
cgit v1.2.3


From 6dad2a29646ce3792c40cfc52d77e9b65a7bb143 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 31 Mar 2010 21:56:46 +0200
Subject: cpufreq: Unify sysfs attribute definition macros

Multiple modules used to define those which are with identical
functionality and were needlessly replicated among the different cpufreq
drivers. Push them into the header and remove duplication.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-7-git-send-email-bp@amd64.org>
Reviewed-by: Thomas Renninger <trenn@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 drivers/cpufreq/cpufreq.c              | 40 ++++++++++------------------
 drivers/cpufreq/cpufreq_conservative.c | 48 ++++++++++++----------------------
 drivers/cpufreq/cpufreq_ondemand.c     | 40 +++++++++-------------------
 include/linux/cpufreq.h                | 30 +++++++++++++++++++++
 4 files changed, 72 insertions(+), 86 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 2d5d575e889..e02e4174c2c 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -662,32 +662,20 @@ static ssize_t show_bios_limit(struct cpufreq_policy *policy, char *buf)
 	return sprintf(buf, "%u\n", policy->cpuinfo.max_freq);
 }
 
-#define define_one_ro(_name) \
-static struct freq_attr _name = \
-__ATTR(_name, 0444, show_##_name, NULL)
-
-#define define_one_ro0400(_name) \
-static struct freq_attr _name = \
-__ATTR(_name, 0400, show_##_name, NULL)
-
-#define define_one_rw(_name) \
-static struct freq_attr _name = \
-__ATTR(_name, 0644, show_##_name, store_##_name)
-
-define_one_ro0400(cpuinfo_cur_freq);
-define_one_ro(cpuinfo_min_freq);
-define_one_ro(cpuinfo_max_freq);
-define_one_ro(cpuinfo_transition_latency);
-define_one_ro(scaling_available_governors);
-define_one_ro(scaling_driver);
-define_one_ro(scaling_cur_freq);
-define_one_ro(bios_limit);
-define_one_ro(related_cpus);
-define_one_ro(affected_cpus);
-define_one_rw(scaling_min_freq);
-define_one_rw(scaling_max_freq);
-define_one_rw(scaling_governor);
-define_one_rw(scaling_setspeed);
+cpufreq_freq_attr_ro_perm(cpuinfo_cur_freq, 0400);
+cpufreq_freq_attr_ro(cpuinfo_min_freq);
+cpufreq_freq_attr_ro(cpuinfo_max_freq);
+cpufreq_freq_attr_ro(cpuinfo_transition_latency);
+cpufreq_freq_attr_ro(scaling_available_governors);
+cpufreq_freq_attr_ro(scaling_driver);
+cpufreq_freq_attr_ro(scaling_cur_freq);
+cpufreq_freq_attr_ro(bios_limit);
+cpufreq_freq_attr_ro(related_cpus);
+cpufreq_freq_attr_ro(affected_cpus);
+cpufreq_freq_attr_rw(scaling_min_freq);
+cpufreq_freq_attr_rw(scaling_max_freq);
+cpufreq_freq_attr_rw(scaling_governor);
+cpufreq_freq_attr_rw(scaling_setspeed);
 
 static struct attribute *default_attrs[] = {
 	&cpuinfo_min_freq.attr,
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 599a40b25cb..ce5248e0421 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -178,12 +178,8 @@ static ssize_t show_sampling_rate_min(struct kobject *kobj,
 	return sprintf(buf, "%u\n", min_sampling_rate);
 }
 
-#define define_one_ro(_name)		\
-static struct global_attr _name =	\
-__ATTR(_name, 0444, show_##_name, NULL)
-
-define_one_ro(sampling_rate_max);
-define_one_ro(sampling_rate_min);
+define_one_global_ro(sampling_rate_max);
+define_one_global_ro(sampling_rate_min);
 
 /* cpufreq_conservative Governor Tunables */
 #define show_one(file_name, object)					\
@@ -221,12 +217,8 @@ show_one_old(freq_step);
 show_one_old(sampling_rate_min);
 show_one_old(sampling_rate_max);
 
-#define define_one_ro_old(object, _name)	\
-static struct freq_attr object =		\
-__ATTR(_name, 0444, show_##_name##_old, NULL)
-
-define_one_ro_old(sampling_rate_min_old, sampling_rate_min);
-define_one_ro_old(sampling_rate_max_old, sampling_rate_max);
+cpufreq_freq_attr_ro_old(sampling_rate_min);
+cpufreq_freq_attr_ro_old(sampling_rate_max);
 
 /*** delete after deprecation time ***/
 
@@ -364,16 +356,12 @@ static ssize_t store_freq_step(struct kobject *a, struct attribute *b,
 	return count;
 }
 
-#define define_one_rw(_name) \
-static struct global_attr _name = \
-__ATTR(_name, 0644, show_##_name, store_##_name)
-
-define_one_rw(sampling_rate);
-define_one_rw(sampling_down_factor);
-define_one_rw(up_threshold);
-define_one_rw(down_threshold);
-define_one_rw(ignore_nice_load);
-define_one_rw(freq_step);
+define_one_global_rw(sampling_rate);
+define_one_global_rw(sampling_down_factor);
+define_one_global_rw(up_threshold);
+define_one_global_rw(down_threshold);
+define_one_global_rw(ignore_nice_load);
+define_one_global_rw(freq_step);
 
 static struct attribute *dbs_attributes[] = {
 	&sampling_rate_max.attr,
@@ -409,16 +397,12 @@ write_one_old(down_threshold);
 write_one_old(ignore_nice_load);
 write_one_old(freq_step);
 
-#define define_one_rw_old(object, _name)	\
-static struct freq_attr object =		\
-__ATTR(_name, 0644, show_##_name##_old, store_##_name##_old)
-
-define_one_rw_old(sampling_rate_old, sampling_rate);
-define_one_rw_old(sampling_down_factor_old, sampling_down_factor);
-define_one_rw_old(up_threshold_old, up_threshold);
-define_one_rw_old(down_threshold_old, down_threshold);
-define_one_rw_old(ignore_nice_load_old, ignore_nice_load);
-define_one_rw_old(freq_step_old, freq_step);
+cpufreq_freq_attr_rw_old(sampling_rate);
+cpufreq_freq_attr_rw_old(sampling_down_factor);
+cpufreq_freq_attr_rw_old(up_threshold);
+cpufreq_freq_attr_rw_old(down_threshold);
+cpufreq_freq_attr_rw_old(ignore_nice_load);
+cpufreq_freq_attr_rw_old(freq_step);
 
 static struct attribute *dbs_attributes_old[] = {
 	&sampling_rate_max_old.attr,
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index bd444dc93cf..c00b25f4d24 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -234,12 +234,8 @@ static ssize_t show_sampling_rate_min(struct kobject *kobj,
 	return sprintf(buf, "%u\n", min_sampling_rate);
 }
 
-#define define_one_ro(_name)		\
-static struct global_attr _name =	\
-__ATTR(_name, 0444, show_##_name, NULL)
-
-define_one_ro(sampling_rate_max);
-define_one_ro(sampling_rate_min);
+define_one_global_ro(sampling_rate_max);
+define_one_global_ro(sampling_rate_min);
 
 /* cpufreq_ondemand Governor Tunables */
 #define show_one(file_name, object)					\
@@ -274,12 +270,8 @@ show_one_old(powersave_bias);
 show_one_old(sampling_rate_min);
 show_one_old(sampling_rate_max);
 
-#define define_one_ro_old(object, _name)       \
-static struct freq_attr object =               \
-__ATTR(_name, 0444, show_##_name##_old, NULL)
-
-define_one_ro_old(sampling_rate_min_old, sampling_rate_min);
-define_one_ro_old(sampling_rate_max_old, sampling_rate_max);
+cpufreq_freq_attr_ro_old(sampling_rate_min);
+cpufreq_freq_attr_ro_old(sampling_rate_max);
 
 /*** delete after deprecation time ***/
 
@@ -376,14 +368,10 @@ static ssize_t store_powersave_bias(struct kobject *a, struct attribute *b,
 	return count;
 }
 
-#define define_one_rw(_name) \
-static struct global_attr _name = \
-__ATTR(_name, 0644, show_##_name, store_##_name)
-
-define_one_rw(sampling_rate);
-define_one_rw(up_threshold);
-define_one_rw(ignore_nice_load);
-define_one_rw(powersave_bias);
+define_one_global_rw(sampling_rate);
+define_one_global_rw(up_threshold);
+define_one_global_rw(ignore_nice_load);
+define_one_global_rw(powersave_bias);
 
 static struct attribute *dbs_attributes[] = {
 	&sampling_rate_max.attr,
@@ -415,14 +403,10 @@ write_one_old(up_threshold);
 write_one_old(ignore_nice_load);
 write_one_old(powersave_bias);
 
-#define define_one_rw_old(object, _name)       \
-static struct freq_attr object =               \
-__ATTR(_name, 0644, show_##_name##_old, store_##_name##_old)
-
-define_one_rw_old(sampling_rate_old, sampling_rate);
-define_one_rw_old(up_threshold_old, up_threshold);
-define_one_rw_old(ignore_nice_load_old, ignore_nice_load);
-define_one_rw_old(powersave_bias_old, powersave_bias);
+cpufreq_freq_attr_rw_old(sampling_rate);
+cpufreq_freq_attr_rw_old(up_threshold);
+cpufreq_freq_attr_rw_old(ignore_nice_load);
+cpufreq_freq_attr_rw_old(powersave_bias);
 
 static struct attribute *dbs_attributes_old[] = {
        &sampling_rate_max_old.attr,
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 4de02b10007..9f15150ce8d 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -278,6 +278,27 @@ struct freq_attr {
 	ssize_t (*store)(struct cpufreq_policy *, const char *, size_t count);
 };
 
+#define cpufreq_freq_attr_ro(_name)		\
+static struct freq_attr _name =			\
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define cpufreq_freq_attr_ro_perm(_name, _perm)	\
+static struct freq_attr _name =			\
+__ATTR(_name, _perm, show_##_name, NULL)
+
+#define cpufreq_freq_attr_ro_old(_name)		\
+static struct freq_attr _name##_old =		\
+__ATTR(_name, 0444, show_##_name##_old, NULL)
+
+#define cpufreq_freq_attr_rw(_name)		\
+static struct freq_attr _name =			\
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+#define cpufreq_freq_attr_rw_old(_name)		\
+static struct freq_attr _name##_old =		\
+__ATTR(_name, 0644, show_##_name##_old, store_##_name##_old)
+
+
 struct global_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct kobject *kobj,
@@ -286,6 +307,15 @@ struct global_attr {
 			 const char *c, size_t count);
 };
 
+#define define_one_global_ro(_name)		\
+static struct global_attr _name =		\
+__ATTR(_name, 0444, show_##_name, NULL)
+
+#define define_one_global_rw(_name)		\
+static struct global_attr _name =		\
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+
 /*********************************************************************
  *                        CPUFREQ 2.6. INTERFACE                     *
  *********************************************************************/
-- 
cgit v1.2.3


From 5534ecb2dda04345e8243901e0e49599228b4273 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 27 Feb 2010 19:49:37 +0100
Subject: ptrace: kill BKL in ptrace syscall

The comment suggests that this usage is stale. There is no bkl in the
exec path so if there is a race lurking there, the bkl in ptrace is
not going to help in this regard.

Overview of the possibility of "accidental" races this bkl might
protect:

- ptrace_traceme() is protected against task removal and concurrent
read/write on current->ptrace as it locks write tasklist_lock.

- arch_ptrace_attach() is serialized by ptrace_traceme() against
concurrent PTRACE_TRACEME or PTRACE_ATTACH

- ptrace_attach() is protected the same way ptrace_traceme() and
in turn serializes arch_ptrace_attach()

- ptrace_check_attach() does its own well described serializing too.

There is no obvious race here.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Roland McGrath <roland@redhat.com>
---
 kernel/ptrace.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42ad8ae729a..53575020f82 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -666,10 +666,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
 	struct task_struct *child;
 	long ret;
 
-	/*
-	 * This lock_kernel fixes a subtle race with suid exec
-	 */
-	lock_kernel();
 	if (request == PTRACE_TRACEME) {
 		ret = ptrace_traceme();
 		if (!ret)
@@ -703,7 +699,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
  out_put_task_struct:
 	put_task_struct(child);
  out:
-	unlock_kernel();
 	return ret;
 }
 
@@ -813,10 +808,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
 	struct task_struct *child;
 	long ret;
 
-	/*
-	 * This lock_kernel fixes a subtle race with suid exec
-	 */
-	lock_kernel();
 	if (request == PTRACE_TRACEME) {
 		ret = ptrace_traceme();
 		goto out;
@@ -846,7 +837,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
  out_put_task_struct:
 	put_task_struct(child);
  out:
-	unlock_kernel();
 	return ret;
 }
 #endif	/* CONFIG_COMPAT */
-- 
cgit v1.2.3


From c05556421742eb47f80301767653a4bcb19de9de Mon Sep 17 00:00:00 2001
From: Ian Munsie <imunsie@au.ibm.com>
Date: Tue, 13 Apr 2010 18:37:33 +1000
Subject: perf: Fix endianness argument compatibility with OPT_BOOLEAN() and
 introduce OPT_INCR()

Parsing an option from the command line with OPT_BOOLEAN on a
bool data type would not work on a big-endian machine due to the
manner in which the boolean was being cast into an int and
incremented. For example, running 'perf probe --list' on a
PowerPC machine would fail to properly set the list_events bool
and would therefore print out the usage information and
terminate.

This patch makes OPT_BOOLEAN work as expected with a bool
datatype. For cases where the original OPT_BOOLEAN was
intentionally being used to increment an int each time it was
passed in on the command line, this patch introduces OPT_INCR
with the old behaviour of OPT_BOOLEAN (the verbose variable is
currently the only such example of this).

I have reviewed every use of OPT_BOOLEAN to verify that a true
C99 bool was passed. Where integers were used, I verified that
they were only being used for boolean logic and changed them to
bools to ensure that they would not be mistakenly used as ints.
The major exception was the verbose variable which now uses
OPT_INCR instead of OPT_BOOLEAN.

Signed-off-by: Ian Munsie <imunsie@au.ibm.com>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: <stable@kernel.org> # NOTE: wont apply to .3[34].x cleanly, please backport
Cc: Git development list <git@vger.kernel.org>
Cc: Ian Munsie <imunsie@au1.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eric B Munson <ebmunson@us.ibm.com>
Cc: Valdis.Kletnieks@vt.edu
Cc: WANG Cong <amwang@redhat.com>
Cc: Thiago Farina <tfransosi@gmail.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Cc: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: John Kacur <jkacur@redhat.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1271147857-11604-1-git-send-email-imunsie@au.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/bench/mem-memcpy.c       |  2 +-
 tools/perf/bench/sched-messaging.c  |  4 ++--
 tools/perf/builtin-annotate.c       |  8 ++++----
 tools/perf/builtin-buildid-cache.c  |  2 +-
 tools/perf/builtin-buildid-list.c   |  4 ++--
 tools/perf/builtin-diff.c           |  4 ++--
 tools/perf/builtin-help.c           |  2 +-
 tools/perf/builtin-lock.c           |  2 +-
 tools/perf/builtin-probe.c          |  2 +-
 tools/perf/builtin-record.c         | 24 ++++++++++++------------
 tools/perf/builtin-report.c         |  6 +++---
 tools/perf/builtin-sched.c          |  6 +++---
 tools/perf/builtin-stat.c           | 10 +++++-----
 tools/perf/builtin-timechart.c      |  2 +-
 tools/perf/builtin-top.c            | 14 +++++++-------
 tools/perf/builtin-trace.c          |  2 +-
 tools/perf/util/debug.c             |  2 +-
 tools/perf/util/debug.h             |  3 ++-
 tools/perf/util/parse-options.c     |  6 ++++++
 tools/perf/util/parse-options.h     |  4 +++-
 tools/perf/util/trace-event-parse.c |  2 +-
 tools/perf/util/trace-event.h       |  3 ++-
 22 files changed, 62 insertions(+), 52 deletions(-)

diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index 52e646e3e87..38dae746514 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -23,7 +23,7 @@
 
 static const char	*length_str	= "1MB";
 static const char	*routine	= "default";
-static int		use_clock	= 0;
+static bool		use_clock	= false;
 static int		clock_fd;
 
 static const struct option options[] = {
diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c
index 81cee78181f..da1b2e9f01f 100644
--- a/tools/perf/bench/sched-messaging.c
+++ b/tools/perf/bench/sched-messaging.c
@@ -31,9 +31,9 @@
 
 #define DATASIZE 100
 
-static int use_pipes = 0;
+static bool use_pipes = false;
 static unsigned int loops = 100;
-static unsigned int thread_mode = 0;
+static bool thread_mode = false;
 static unsigned int num_groups = 10;
 
 struct sender_context {
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index ee0d9172699..06eaebe10d0 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -28,11 +28,11 @@
 
 static char		const *input_name = "perf.data";
 
-static int		force;
+static bool		force;
 
-static int		full_paths;
+static bool		full_paths;
 
-static int		print_line;
+static bool		print_line;
 
 struct sym_hist {
 	u64		sum;
@@ -595,7 +595,7 @@ static const struct option options[] = {
 	OPT_STRING('s', "symbol", &sym_hist_filter, "symbol",
 		    "symbol to annotate"),
 	OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
-	OPT_BOOLEAN('v', "verbose", &verbose,
+	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c
index 30a05f552c9..f8e3d185202 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -27,7 +27,7 @@ static const struct option buildid_cache_options[] = {
 		   "file list", "file(s) to add"),
 	OPT_STRING('r', "remove", &remove_name_list_str, "file list",
 		    "file(s) to remove"),
-	OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose"),
+	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
 	OPT_END()
 };
 
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index d0675c02f81..af2ad8b92f7 100644
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -16,7 +16,7 @@
 #include "util/symbol.h"
 
 static char const *input_name = "perf.data";
-static int force;
+static bool force;
 static bool with_hits;
 
 static const char * const buildid_list_usage[] = {
@@ -29,7 +29,7 @@ static const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
 	OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
-	OPT_BOOLEAN('v', "verbose", &verbose,
+	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose"),
 	OPT_END()
 };
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 1ea15d8aeed..3a1d94d75dc 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -19,7 +19,7 @@
 static char const *input_old = "perf.data.old",
 		  *input_new = "perf.data";
 static char	  diff__default_sort_order[] = "dso,symbol";
-static int  force;
+static bool  force;
 static bool show_displacement;
 
 static int perf_session__add_hist_entry(struct perf_session *self,
@@ -188,7 +188,7 @@ static const char * const diff_usage[] = {
 };
 
 static const struct option options[] = {
-	OPT_BOOLEAN('v', "verbose", &verbose,
+	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('m', "displacement", &show_displacement,
 		    "Show position displacement relative to baseline"),
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index 215b584007b..81e3ecc40fc 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -29,7 +29,7 @@ enum help_format {
 	HELP_FORMAT_WEB,
 };
 
-static int show_all = 0;
+static bool show_all = false;
 static enum help_format help_format = HELP_FORMAT_MAN;
 static struct option builtin_help_options[] = {
 	OPT_BOOLEAN('a', "all", &show_all, "print all available commands"),
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index e12c844df1e..6c38e4febf9 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -744,7 +744,7 @@ static const char * const lock_usage[] = {
 
 static const struct option lock_options[] = {
 	OPT_STRING('i', "input", &input_name, "file", "input file name"),
-	OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"),
+	OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"),
 	OPT_END()
 };
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index b3ba25a910f..bfc47fff9c5 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -133,7 +133,7 @@ static const char * const probe_usage[] = {
 };
 
 static const struct option options[] = {
-	OPT_BOOLEAN('v', "verbose", &verbose,
+	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show parsed arguments, etc)"),
 	OPT_BOOLEAN('l', "list", &params.list_events,
 		    "list up current probe events"),
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index dc61f1b68b4..9a951368723 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -38,22 +38,22 @@ static int			output;
 static const char		*output_name			= "perf.data";
 static int			group				=      0;
 static unsigned int		realtime_prio			=      0;
-static int			raw_samples			=      0;
-static int			system_wide			=      0;
+static bool			raw_samples			=  false;
+static bool			system_wide			=  false;
 static int			profile_cpu			=     -1;
 static pid_t			target_pid			=     -1;
 static pid_t			target_tid			=     -1;
 static pid_t			*all_tids			=      NULL;
 static int			thread_num			=      0;
 static pid_t			child_pid			=     -1;
-static int			inherit				=      1;
-static int			force				=      0;
-static int			append_file			=      0;
-static int			call_graph			=      0;
-static int			inherit_stat			=      0;
-static int			no_samples			=      0;
-static int			sample_address			=      0;
-static int			multiplex			=      0;
+static bool			inherit				=   true;
+static bool			force				=  false;
+static bool			append_file			=  false;
+static bool			call_graph			=  false;
+static bool			inherit_stat			=  false;
+static bool			no_samples			=  false;
+static bool			sample_address			=  false;
+static bool			multiplex			=  false;
 static int			multiplex_fd			=     -1;
 
 static long			samples				=      0;
@@ -465,7 +465,7 @@ static int __cmd_record(int argc, const char **argv)
 			rename(output_name, oldname);
 		}
 	} else {
-		append_file = 0;
+		append_file = false;
 	}
 
 	flags = O_CREAT|O_RDWR;
@@ -701,7 +701,7 @@ static const struct option options[] = {
 		    "number of mmap data pages"),
 	OPT_BOOLEAN('g', "call-graph", &call_graph,
 		    "do call-graph (stack chain/backtrace) recording"),
-	OPT_BOOLEAN('v', "verbose", &verbose,
+	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show counter open errors, etc)"),
 	OPT_BOOLEAN('s', "stat", &inherit_stat,
 		    "per thread counts"),
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index e93c69a8e72..daee082ab42 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -32,11 +32,11 @@
 
 static char		const *input_name = "perf.data";
 
-static int		force;
+static bool		force;
 static bool		hide_unresolved;
 static bool		dont_use_callchains;
 
-static int		show_threads;
+static bool		show_threads;
 static struct perf_read_values	show_threads_values;
 
 static char		default_pretty_printing_style[] = "normal";
@@ -418,7 +418,7 @@ static const char * const report_usage[] = {
 static const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
-	OPT_BOOLEAN('v', "verbose", &verbose,
+	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 5e59c0c40c4..09ddc8e6d8e 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1790,7 +1790,7 @@ static const char * const sched_usage[] = {
 static const struct option sched_options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
-	OPT_BOOLEAN('v', "verbose", &verbose,
+	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
@@ -1805,7 +1805,7 @@ static const char * const latency_usage[] = {
 static const struct option latency_options[] = {
 	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
 		   "sort by key(s): runtime, switch, avg, max"),
-	OPT_BOOLEAN('v', "verbose", &verbose,
+	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_INTEGER('C', "CPU", &profile_cpu,
 		    "CPU to profile on"),
@@ -1822,7 +1822,7 @@ static const char * const replay_usage[] = {
 static const struct option replay_options[] = {
 	OPT_INTEGER('r', "repeat", &replay_repeat,
 		    "repeat the workload replay N times (-1: infinite)"),
-	OPT_BOOLEAN('v', "verbose", &verbose,
+	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 1036ca739e6..e619ac89dff 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -67,19 +67,19 @@ static struct perf_event_attr default_attrs[] = {
 
 };
 
-static int			system_wide			=  0;
+static bool			system_wide			=  false;
 static unsigned int		nr_cpus				=  0;
 static int			run_idx				=  0;
 
 static int			run_count			=  1;
-static int			inherit				=  1;
-static int			scale				=  1;
+static bool			inherit				=  true;
+static bool			scale				=  true;
 static pid_t			target_pid			= -1;
 static pid_t			target_tid			= -1;
 static pid_t			*all_tids			=  NULL;
 static int			thread_num			=  0;
 static pid_t			child_pid			= -1;
-static int			null_run			=  0;
+static bool			null_run			=  false;
 
 static int			*fd[MAX_NR_CPUS][MAX_COUNTERS];
 
@@ -528,7 +528,7 @@ static const struct option options[] = {
 		    "system-wide collection from all CPUs"),
 	OPT_BOOLEAN('c', "scale", &scale,
 		    "scale/normalize counters"),
-	OPT_BOOLEAN('v', "verbose", &verbose,
+	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show counter open errors, etc)"),
 	OPT_INTEGER('r', "repeat", &run_count,
 		    "repeat command and print average + stddev (max: 100)"),
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 369c1b490a9..96f4a092df3 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -42,7 +42,7 @@ static u64		turbo_frequency;
 
 static u64		first_time, last_time;
 
-static int		power_only;
+static bool		power_only;
 
 
 struct per_pid;
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 4abdd9b646b..40f24dd46ef 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -57,7 +57,7 @@
 
 static int			*fd[MAX_NR_CPUS][MAX_COUNTERS];
 
-static int			system_wide			=      0;
+static bool			system_wide			=  false;
 
 static int			default_interval		=      0;
 
@@ -68,18 +68,18 @@ static int			target_pid			=     -1;
 static int			target_tid			=     -1;
 static pid_t			*all_tids			=      NULL;
 static int			thread_num			=      0;
-static int			inherit				=      0;
+static bool			inherit				=  false;
 static int			profile_cpu			=     -1;
 static int			nr_cpus				=      0;
 static unsigned int		realtime_prio			=      0;
-static int			group				=      0;
+static bool			group				=  false;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			=     16;
 static int			freq				=   1000; /* 1 KHz */
 
 static int			delay_secs			=      2;
-static int			zero                            =      0;
-static int			dump_symtab                     =      0;
+static bool			zero                            =  false;
+static bool			dump_symtab                     =  false;
 
 static bool			hide_kernel_symbols		=  false;
 static bool			hide_user_symbols		=  false;
@@ -854,7 +854,7 @@ static void handle_keypress(int c)
 			display_weighted = ~display_weighted;
 			break;
 		case 'z':
-			zero = ~zero;
+			zero = !zero;
 			break;
 		default:
 			break;
@@ -1335,7 +1335,7 @@ static const struct option options[] = {
 		    "display this many functions"),
 	OPT_BOOLEAN('U', "hide_user_symbols", &hide_user_symbols,
 		    "hide user symbols"),
-	OPT_BOOLEAN('v', "verbose", &verbose,
+	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show counter open errors, etc)"),
 	OPT_END()
 };
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 407041d20de..8fc50d83154 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -505,7 +505,7 @@ static const char * const trace_usage[] = {
 static const struct option options[] = {
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
-	OPT_BOOLEAN('v', "verbose", &verbose,
+	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('L', "Latency", &latency_format,
 		    "show latency attributes (irqs/preemption disabled, etc)"),
diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c
index 033d66db863..dd824cf3b62 100644
--- a/tools/perf/util/debug.c
+++ b/tools/perf/util/debug.c
@@ -13,7 +13,7 @@
 #include "util.h"
 
 int verbose = 0;
-int dump_trace = 0;
+bool dump_trace = false;
 
 int eprintf(int level, const char *fmt, ...)
 {
diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h
index 5cb0a1b1401..047ac3324eb 100644
--- a/tools/perf/util/debug.h
+++ b/tools/perf/util/debug.h
@@ -2,10 +2,11 @@
 #ifndef __PERF_DEBUG_H
 #define __PERF_DEBUG_H
 
+#include <stdbool.h>
 #include "event.h"
 
 extern int verbose;
-extern int dump_trace;
+extern bool dump_trace;
 
 int dump_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
 void trace_event(event_t *event);
diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c
index 79dfa0c34b3..ed887642460 100644
--- a/tools/perf/util/parse-options.c
+++ b/tools/perf/util/parse-options.c
@@ -49,6 +49,7 @@ static int get_value(struct parse_opt_ctx_t *p,
 				break;
 			/* FALLTHROUGH */
 		case OPTION_BOOLEAN:
+		case OPTION_INCR:
 		case OPTION_BIT:
 		case OPTION_SET_INT:
 		case OPTION_SET_PTR:
@@ -73,6 +74,10 @@ static int get_value(struct parse_opt_ctx_t *p,
 		return 0;
 
 	case OPTION_BOOLEAN:
+		*(bool *)opt->value = unset ? false : true;
+		return 0;
+
+	case OPTION_INCR:
 		*(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
 		return 0;
 
@@ -478,6 +483,7 @@ int usage_with_options_internal(const char * const *usagestr,
 		case OPTION_GROUP:
 		case OPTION_BIT:
 		case OPTION_BOOLEAN:
+		case OPTION_INCR:
 		case OPTION_SET_INT:
 		case OPTION_SET_PTR:
 		case OPTION_LONG:
diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h
index 948805af43c..b2da725f102 100644
--- a/tools/perf/util/parse-options.h
+++ b/tools/perf/util/parse-options.h
@@ -8,7 +8,8 @@ enum parse_opt_type {
 	OPTION_GROUP,
 	/* options with no arguments */
 	OPTION_BIT,
-	OPTION_BOOLEAN, /* _INCR would have been a better name */
+	OPTION_BOOLEAN,
+	OPTION_INCR,
 	OPTION_SET_INT,
 	OPTION_SET_PTR,
 	/* options with arguments (usually) */
@@ -95,6 +96,7 @@ struct option {
 #define OPT_GROUP(h)                { .type = OPTION_GROUP, .help = (h) }
 #define OPT_BIT(s, l, v, h, b)      { .type = OPTION_BIT, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (b) }
 #define OPT_BOOLEAN(s, l, v, h)     { .type = OPTION_BOOLEAN, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
+#define OPT_INCR(s, l, v, h)        { .type = OPTION_INCR, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
 #define OPT_SET_INT(s, l, v, h, i)  { .type = OPTION_SET_INT, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (i) }
 #define OPT_SET_PTR(s, l, v, h, p)  { .type = OPTION_SET_PTR, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (p) }
 #define OPT_INTEGER(s, l, v, h)     { .type = OPTION_INTEGER, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 3b81250ffed..17d6d66ed76 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -42,7 +42,7 @@ int header_page_overwrite_size;
 int header_page_data_offset;
 int header_page_data_size;
 
-int latency_format;
+bool latency_format;
 
 static char *input_buf;
 static unsigned long long input_buf_ptr;
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index c3269b937db..81f2fd20a0e 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -1,6 +1,7 @@
 #ifndef __PERF_TRACE_EVENTS_H
 #define __PERF_TRACE_EVENTS_H
 
+#include <stdbool.h>
 #include "parse-events.h"
 
 #define __unused __attribute__((unused))
@@ -241,7 +242,7 @@ extern int header_page_size_size;
 extern int header_page_data_offset;
 extern int header_page_data_size;
 
-extern int latency_format;
+extern bool latency_format;
 
 int parse_header_page(char *buf, unsigned long size);
 int trace_parse_common_type(void *data);
-- 
cgit v1.2.3


From 8dc58101f2c838355d44402aa77646649d10dbec Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Thu, 1 Apr 2010 23:59:15 -0500
Subject: perf: Add pipe-specific header read/write and event processing code

This patch makes several changes to allow the perf event stream
to be sent and received over a pipe:

- adds pipe-specific versions of the header read/write code

- adds pipe-specific version of the event processing code

- adds a range of event types to be used for header or other
  pseudo events, above the range used by the kernel

- checks the return value of event handlers, which they can use
  to skip over large events during event processing rather than actually
  reading them into event objects.

- unifies the multiple do_read() functions and updates its
  users.

Note that none of these changes affect the existing perf data
file format or processing - this code only comes into play if
perf output is sent to stdout (or is read from stdin).

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
Cc: k-keiichi@bx.jp.nec.com
Cc: acme@ghostprotocols.net
LKML-Reference: <1270184365-8281-2-git-send-email-tzanussi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c |   2 +-
 tools/perf/util/event.h     |   4 ++
 tools/perf/util/header.c    |  78 +++++++++++++++++++------
 tools/perf/util/header.h    |   8 ++-
 tools/perf/util/session.c   | 135 +++++++++++++++++++++++++++++++++++++++++---
 tools/perf/util/session.h   |   4 ++
 6 files changed, 202 insertions(+), 29 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 9a951368723..d060fc50c8a 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -487,7 +487,7 @@ static int __cmd_record(int argc, const char **argv)
 	}
 
 	if (!file_new) {
-		err = perf_header__read(&session->header, output);
+		err = perf_header__read(session, output);
 		if (err < 0)
 			return err;
 	}
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 7f7cf8539cf..5c1eba67130 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -83,6 +83,10 @@ struct build_id_event {
 	char			 filename[];
 };
 
+enum perf_header_event_type { /* above any possible kernel type */
+	PERF_RECORD_HEADER_MAX			= 64,
+};
+
 typedef union event_union {
 	struct perf_event_header	header;
 	struct ip_event			ip;
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 6c9aa16ee51..8d05337d1a5 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -427,6 +427,25 @@ out_free:
 	return err;
 }
 
+int perf_header__write_pipe(int fd)
+{
+	struct perf_pipe_file_header f_header;
+	int err;
+
+	f_header = (struct perf_pipe_file_header){
+		.magic	   = PERF_MAGIC,
+		.size	   = sizeof(f_header),
+	};
+
+	err = do_write(fd, &f_header, sizeof(f_header));
+	if (err < 0) {
+		pr_debug("failed to write perf pipe header\n");
+		return err;
+	}
+
+	return 0;
+}
+
 int perf_header__write(struct perf_header *self, int fd, bool at_exit)
 {
 	struct perf_file_header f_header;
@@ -518,25 +537,10 @@ int perf_header__write(struct perf_header *self, int fd, bool at_exit)
 	return 0;
 }
 
-static int do_read(int fd, void *buf, size_t size)
-{
-	while (size) {
-		int ret = read(fd, buf, size);
-
-		if (ret <= 0)
-			return -1;
-
-		size -= ret;
-		buf += ret;
-	}
-
-	return 0;
-}
-
 static int perf_header__getbuffer64(struct perf_header *self,
 				    int fd, void *buf, size_t size)
 {
-	if (do_read(fd, buf, size))
+	if (do_read(fd, buf, size) <= 0)
 		return -1;
 
 	if (self->needs_swap)
@@ -592,7 +596,7 @@ int perf_file_header__read(struct perf_file_header *self,
 {
 	lseek(fd, 0, SEEK_SET);
 
-	if (do_read(fd, self, sizeof(*self)) ||
+	if (do_read(fd, self, sizeof(*self)) <= 0 ||
 	    memcmp(&self->magic, __perf_magic, sizeof(self->magic)))
 		return -1;
 
@@ -662,13 +666,51 @@ static int perf_file_section__process(struct perf_file_section *self,
 	return 0;
 }
 
-int perf_header__read(struct perf_header *self, int fd)
+static int perf_file_header__read_pipe(struct perf_pipe_file_header *self,
+				       struct perf_header *ph, int fd)
 {
+	if (do_read(fd, self, sizeof(*self)) <= 0 ||
+	    memcmp(&self->magic, __perf_magic, sizeof(self->magic)))
+		return -1;
+
+	if (self->size != sizeof(*self)) {
+		u64 size = bswap_64(self->size);
+
+		if (size != sizeof(*self))
+			return -1;
+
+		ph->needs_swap = true;
+	}
+
+	return 0;
+}
+
+static int perf_header__read_pipe(struct perf_session *session, int fd)
+{
+	struct perf_header *self = &session->header;
+	struct perf_pipe_file_header f_header;
+
+	if (perf_file_header__read_pipe(&f_header, self, fd) < 0) {
+		pr_debug("incompatible file format\n");
+		return -EINVAL;
+	}
+
+	session->fd = fd;
+
+	return 0;
+}
+
+int perf_header__read(struct perf_session *session, int fd)
+{
+	struct perf_header *self = &session->header;
 	struct perf_file_header	f_header;
 	struct perf_file_attr	f_attr;
 	u64			f_id;
 	int nr_attrs, nr_ids, i, j;
 
+	if (session->fd_pipe)
+		return perf_header__read_pipe(session, fd);
+
 	if (perf_file_header__read(&f_header, self, fd) < 0) {
 		pr_debug("incompatible file format\n");
 		return -EINVAL;
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index c059f08cf87..6562ece6706 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -39,6 +39,11 @@ struct perf_file_header {
 	DECLARE_BITMAP(adds_features, HEADER_FEAT_BITS);
 };
 
+struct perf_pipe_file_header {
+	u64				magic;
+	u64				size;
+};
+
 struct perf_header;
 
 int perf_file_header__read(struct perf_file_header *self,
@@ -60,8 +65,9 @@ struct perf_header {
 int perf_header__init(struct perf_header *self);
 void perf_header__exit(struct perf_header *self);
 
-int perf_header__read(struct perf_header *self, int fd);
+int perf_header__read(struct perf_session *session, int fd);
 int perf_header__write(struct perf_header *self, int fd, bool at_exit);
+int perf_header__write_pipe(int fd);
 
 int perf_header__add_attr(struct perf_header *self,
 			  struct perf_header_attr *attr);
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index ddf288fca3e..2c1277cb4ae 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -14,6 +14,16 @@ static int perf_session__open(struct perf_session *self, bool force)
 {
 	struct stat input_stat;
 
+	if (!strcmp(self->filename, "-")) {
+		self->fd_pipe = true;
+		self->fd = STDIN_FILENO;
+
+		if (perf_header__read(self, self->fd) < 0)
+			pr_err("incompatible file format");
+
+		return 0;
+	}
+
 	self->fd = open(self->filename, O_RDONLY);
 	if (self->fd < 0) {
 		pr_err("failed to open file: %s", self->filename);
@@ -38,7 +48,7 @@ static int perf_session__open(struct perf_session *self, bool force)
 		goto out_close;
 	}
 
-	if (perf_header__read(&self->header, self->fd) < 0) {
+	if (perf_header__read(self, self->fd) < 0) {
 		pr_err("incompatible file format");
 		goto out_close;
 	}
@@ -52,6 +62,11 @@ out_close:
 	return -1;
 }
 
+void perf_session__update_sample_type(struct perf_session *self)
+{
+	self->sample_type = perf_header__sample_type(&self->header);
+}
+
 struct perf_session *perf_session__new(const char *filename, int mode, bool force)
 {
 	size_t len = filename ? strlen(filename) + 1 : 0;
@@ -85,7 +100,7 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
 			goto out_delete;
 	}
 
-	self->sample_type = perf_header__sample_type(&self->header);
+	perf_session__update_sample_type(self);
 out:
 	return self;
 out_free:
@@ -200,14 +215,17 @@ static const char *event__name[] = {
 	[PERF_RECORD_SAMPLE]	 = "SAMPLE",
 };
 
-unsigned long event__total[PERF_RECORD_MAX];
+unsigned long event__total[PERF_RECORD_HEADER_MAX];
 
 void event__print_totals(void)
 {
 	int i;
-	for (i = 0; i < PERF_RECORD_MAX; ++i)
+	for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) {
+		if (!event__name[i])
+			continue;
 		pr_info("%10s events: %10ld\n",
 			event__name[i], event__total[i]);
+	}
 }
 
 void mem_bswap_64(void *src, int byte_size)
@@ -271,7 +289,7 @@ static event__swap_op event__swap_ops[] = {
 	[PERF_RECORD_LOST]   = event__all64_swap,
 	[PERF_RECORD_READ]   = event__read_swap,
 	[PERF_RECORD_SAMPLE] = event__all64_swap,
-	[PERF_RECORD_MAX]    = NULL,
+	[PERF_RECORD_HEADER_MAX]    = NULL,
 };
 
 static int perf_session__process_event(struct perf_session *self,
@@ -281,7 +299,7 @@ static int perf_session__process_event(struct perf_session *self,
 {
 	trace_event(event);
 
-	if (event->header.type < PERF_RECORD_MAX) {
+	if (event->header.type < PERF_RECORD_HEADER_MAX) {
 		dump_printf("%#Lx [%#x]: PERF_RECORD_%s",
 			    offset + head, event->header.size,
 			    event__name[event->header.type]);
@@ -376,6 +394,101 @@ static struct thread *perf_session__register_idle_thread(struct perf_session *se
 	return thread;
 }
 
+int do_read(int fd, void *buf, size_t size)
+{
+	void *buf_start = buf;
+
+	while (size) {
+		int ret = read(fd, buf, size);
+
+		if (ret <= 0)
+			return ret;
+
+		size -= ret;
+		buf += ret;
+	}
+
+	return buf - buf_start;
+}
+
+#define session_done()	(*(volatile int *)(&session_done))
+volatile int session_done;
+
+static int __perf_session__process_pipe_events(struct perf_session *self,
+					       struct perf_event_ops *ops)
+{
+	event_t event;
+	uint32_t size;
+	int skip = 0;
+	u64 head;
+	int err;
+	void *p;
+
+	perf_event_ops__fill_defaults(ops);
+
+	head = 0;
+more:
+	err = do_read(self->fd, &event, sizeof(struct perf_event_header));
+	if (err <= 0) {
+		if (err == 0)
+			goto done;
+
+		pr_err("failed to read event header\n");
+		goto out_err;
+	}
+
+	if (self->header.needs_swap)
+		perf_event_header__bswap(&event.header);
+
+	size = event.header.size;
+	if (size == 0)
+		size = 8;
+
+	p = &event;
+	p += sizeof(struct perf_event_header);
+
+	err = do_read(self->fd, p, size - sizeof(struct perf_event_header));
+	if (err <= 0) {
+		if (err == 0) {
+			pr_err("unexpected end of event stream\n");
+			goto done;
+		}
+
+		pr_err("failed to read event data\n");
+		goto out_err;
+	}
+
+	if (size == 0 ||
+	    (skip = perf_session__process_event(self, &event, ops,
+						0, head)) < 0) {
+		dump_printf("%#Lx [%#x]: skipping unknown header type: %d\n",
+			    head, event.header.size, event.header.type);
+		/*
+		 * assume we lost track of the stream, check alignment, and
+		 * increment a single u64 in the hope to catch on again 'soon'.
+		 */
+		if (unlikely(head & 7))
+			head &= ~7ULL;
+
+		size = 8;
+	}
+
+	head += size;
+
+	dump_printf("\n%#Lx [%#x]: event: %d\n",
+		    head, event.header.size, event.header.type);
+
+	if (skip > 0)
+		head += skip;
+
+	if (!session_done())
+		goto more;
+done:
+	err = 0;
+out_err:
+	return err;
+}
+
 int __perf_session__process_events(struct perf_session *self,
 				   u64 data_offset, u64 data_size,
 				   u64 file_size, struct perf_event_ops *ops)
@@ -499,9 +612,13 @@ out_getcwd_err:
 		self->cwdlen = strlen(self->cwd);
 	}
 
-	err = __perf_session__process_events(self, self->header.data_offset,
-					     self->header.data_size,
-					     self->size, ops);
+	if (!self->fd_pipe)
+		err = __perf_session__process_events(self,
+						     self->header.data_offset,
+						     self->header.data_size,
+						     self->size, ops);
+	else
+		err = __perf_session__process_pipe_events(self, ops);
 out_err:
 	return err;
 }
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 27f4c2dc715..5f789113665 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -27,6 +27,7 @@ struct perf_session {
 	u64			sample_type;
 	struct ref_reloc_sym	ref_reloc_sym;
 	int			fd;
+	bool			fd_pipe;
 	int			cwdlen;
 	char			*cwd;
 	char filename[0];
@@ -92,6 +93,9 @@ static inline struct map *
 	return map_groups__new_module(&self->kmaps, start, filename);
 }
 
+int do_read(int fd, void *buf, size_t size);
+void perf_session__update_sample_type(struct perf_session *self);
+
 #ifdef NO_NEWT_SUPPORT
 static inline int perf_session__browse_hists(struct rb_root *hists __used,
 					      u64 nr_hists __used,
-- 
cgit v1.2.3


From 529870e37473a9fc609078f03cc5b4148cf06a87 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Thu, 1 Apr 2010 23:59:16 -0500
Subject: perf record: Introduce special handling for pipe output

Adds special treatment for stdout - if the user specifies '-o -'
to perf record, the intent is that the event stream be written
to stdout rather than to a disk file.

Also, redirect stdout of forked child to stderr - in pipe mode,
stdout of the forked child interferes with the stdout perf
stream, so redirect it to stderr where it can still be seen but
won't be mixed in with the perf output.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
Cc: k-keiichi@bx.jp.nec.com
Cc: acme@ghostprotocols.net
LKML-Reference: <1270184365-8281-3-git-send-email-tzanussi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index d060fc50c8a..d4464f7fcea 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -35,6 +35,7 @@ static unsigned int		page_size;
 static unsigned int		mmap_pages			=    128;
 static int			freq				=   1000;
 static int			output;
+static int			pipe_output			=      0;
 static const char		*output_name			= "perf.data";
 static int			group				=      0;
 static unsigned int		realtime_prio			=      0;
@@ -449,7 +450,9 @@ static int __cmd_record(int argc, const char **argv)
 		exit(-1);
 	}
 
-	if (!stat(output_name, &st) && st.st_size) {
+	if (!strcmp(output_name, "-"))
+		pipe_output = 1;
+	else if (!stat(output_name, &st) && st.st_size) {
 		if (!force) {
 			if (!append_file) {
 				pr_err("Error, output file %s exists, use -A "
@@ -474,7 +477,10 @@ static int __cmd_record(int argc, const char **argv)
 	else
 		flags |= O_TRUNC;
 
-	output = open(output_name, flags, S_IRUSR|S_IWUSR);
+	if (pipe_output)
+		output = STDOUT_FILENO;
+	else
+		output = open(output_name, flags, S_IRUSR | S_IWUSR);
 	if (output < 0) {
 		perror("failed to create output file");
 		exit(-1);
@@ -513,6 +519,8 @@ static int __cmd_record(int argc, const char **argv)
 		}
 
 		if (!child_pid) {
+			if (pipe_output)
+				dup2(2, 1);
 			close(child_ready_pipe[0]);
 			close(go_pipe[1]);
 			fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
@@ -564,7 +572,11 @@ static int __cmd_record(int argc, const char **argv)
 			open_counters(cpumap[i]);
 	}
 
-	if (file_new) {
+	if (pipe_output) {
+		err = perf_header__write_pipe(output);
+		if (err < 0)
+			return err;
+	} else if (file_new) {
 		err = perf_header__write(&session->header, output, false);
 		if (err < 0)
 			return err;
-- 
cgit v1.2.3


From 46656ac7fb3252f8a3db29b18638e0e8067849ba Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Thu, 1 Apr 2010 23:59:17 -0500
Subject: perf report: Introduce special handling for pipe input

Adds special treatment for stdin - if the user specifies '-i -'
to perf report, the intent is that the event stream be written
to stdin rather than from a disk file.

The actual handling of the '-' filename is done by the session;
this just adds a signal handler to stop reporting, and turns off
interference by the pager.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
Cc: k-keiichi@bx.jp.nec.com
Cc: acme@ghostprotocols.net
LKML-Reference: <1270184365-8281-4-git-send-email-tzanussi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index daee082ab42..00b358ff135 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -269,6 +269,13 @@ static struct perf_event_ops event_ops = {
 	.read	= process_read_event,
 };
 
+extern volatile int session_done;
+
+static void sig_handler(int sig __attribute__((__unused__)))
+{
+	session_done = 1;
+}
+
 static int __cmd_report(void)
 {
 	int ret = -EINVAL;
@@ -276,6 +283,8 @@ static int __cmd_report(void)
 	struct rb_node *next;
 	const char *help = "For a higher level overview, try: perf report --sort comm,dso";
 
+	signal(SIGINT, sig_handler);
+
 	session = perf_session__new(input_name, O_RDONLY, force);
 	if (session == NULL)
 		return -ENOMEM;
@@ -465,7 +474,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 {
 	argc = parse_options(argc, argv, options, report_usage, 0);
 
-	setup_browser();
+	if (strcmp(input_name, "-") != 0)
+		setup_browser();
 
 	if (symbol__init() < 0)
 		return -1;
-- 
cgit v1.2.3


From c239da3b4b55dbb8f30bcb8d1a0d63fc44a567c3 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Thu, 1 Apr 2010 23:59:18 -0500
Subject: perf trace: Introduce special handling for pipe input

Adds special treatment for stdin - if the user specifies '-i -'
to perf trace, the intent is that the event stream be read from
stdin rather than from a disk file.

The actual handling of the '-' filename is done by the session;
this just adds a signal handler to stop reporting, and turns off
interference by the pager.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
Cc: k-keiichi@bx.jp.nec.com
Cc: acme@ghostprotocols.net
LKML-Reference: <1270184365-8281-5-git-send-email-tzanussi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-trace.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 8fc50d83154..c681e85a912 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -106,8 +106,17 @@ static struct perf_event_ops event_ops = {
 	.comm	= event__process_comm,
 };
 
+extern volatile int session_done;
+
+static void sig_handler(int sig __unused)
+{
+	session_done = 1;
+}
+
 static int __cmd_trace(struct perf_session *session)
 {
+	signal(SIGINT, sig_handler);
+
 	return perf_session__process_events(session, &event_ops);
 }
 
@@ -580,7 +589,8 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used)
 	if (session == NULL)
 		return -ENOMEM;
 
-	if (!perf_session__has_traces(session, "record -R"))
+	if (strcmp(input_name, "-") &&
+	    !perf_session__has_traces(session, "record -R"))
 		return -EINVAL;
 
 	if (generate_script_lang) {
-- 
cgit v1.2.3


From 2c46dbb517a10b18d459e6ceffefde5bfb290cf6 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Thu, 1 Apr 2010 23:59:19 -0500
Subject: perf: Convert perf header attrs into attr events

Bypasses the attr perf header code and replaces it with a
synthesized event and processing function that accomplishes the
same thing, used when reading/writing perf data to/from a pipe.

Making the attrs into events allows them to be streamed over a
pipe along with the rest of the header data (in later patches).
It also paves the way to allowing events to be added and removed
from perf sessions dynamically.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
Cc: k-keiichi@bx.jp.nec.com
Cc: acme@ghostprotocols.net
LKML-Reference: <1270184365-8281-6-git-send-email-tzanussi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 10 ++++++
 tools/perf/builtin-report.c |  1 +
 tools/perf/builtin-trace.c  |  1 +
 tools/perf/util/event.h     | 10 +++++-
 tools/perf/util/header.c    | 79 +++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/header.h    |  8 +++++
 tools/perf/util/session.c   | 26 +++++++++++++++
 tools/perf/util/session.h   |  3 +-
 8 files changed, 136 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index d4464f7fcea..289d9cf3bf7 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -584,6 +584,16 @@ static int __cmd_record(int argc, const char **argv)
 
 	post_processing_offset = lseek(output, 0, SEEK_CUR);
 
+	if (pipe_output) {
+		err = event__synthesize_attrs(&session->header,
+					      process_synthesized_event,
+					      session);
+		if (err < 0) {
+			pr_err("Couldn't synthesize attrs.\n");
+			return err;
+		}
+	}
+
 	err = event__synthesize_kernel_mmap(process_synthesized_event,
 					    session, "_text");
 	if (err < 0)
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 00b358ff135..f0486ce591a 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -267,6 +267,7 @@ static struct perf_event_ops event_ops = {
 	.fork	= event__process_task,
 	.lost	= event__process_lost,
 	.read	= process_read_event,
+	.attr	= event__process_attr,
 };
 
 extern volatile int session_done;
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index c681e85a912..e30eac6af54 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -104,6 +104,7 @@ static int process_sample_event(event_t *event, struct perf_session *session)
 static struct perf_event_ops event_ops = {
 	.sample	= process_sample_event,
 	.comm	= event__process_comm,
+	.attr	= event__process_attr,
 };
 
 extern volatile int session_done;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 5c1eba67130..b4fbf25078b 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -84,7 +84,14 @@ struct build_id_event {
 };
 
 enum perf_header_event_type { /* above any possible kernel type */
-	PERF_RECORD_HEADER_MAX			= 64,
+	PERF_RECORD_HEADER_ATTR			= 64,
+	PERF_RECORD_HEADER_MAX
+};
+
+struct attr_event {
+	struct perf_event_header header;
+	struct perf_event_attr attr;
+	u64 id[];
 };
 
 typedef union event_union {
@@ -96,6 +103,7 @@ typedef union event_union {
 	struct lost_event		lost;
 	struct read_event		read;
 	struct sample_event		sample;
+	struct attr_event		attr;
 } event_t;
 
 struct events_stats {
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 8d05337d1a5..e36173934e8 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -807,3 +807,82 @@ perf_header__find_attr(u64 id, struct perf_header *header)
 
 	return NULL;
 }
+
+int event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
+			   event__handler_t process,
+			   struct perf_session *session)
+{
+	event_t *ev;
+	size_t size;
+	int err;
+
+	size = sizeof(struct perf_event_attr);
+	size = ALIGN(size, sizeof(u64));
+	size += sizeof(struct perf_event_header);
+	size += ids * sizeof(u64);
+
+	ev = malloc(size);
+
+	ev->attr.attr = *attr;
+	memcpy(ev->attr.id, id, ids * sizeof(u64));
+
+	ev->attr.header.type = PERF_RECORD_HEADER_ATTR;
+	ev->attr.header.size = size;
+
+	err = process(ev, session);
+
+	free(ev);
+
+	return err;
+}
+
+int event__synthesize_attrs(struct perf_header *self,
+			    event__handler_t process,
+			    struct perf_session *session)
+{
+	struct perf_header_attr	*attr;
+	int i, err = 0;
+
+	for (i = 0; i < self->attrs; i++) {
+		attr = self->attr[i];
+
+		err = event__synthesize_attr(&attr->attr, attr->ids, attr->id,
+					     process, session);
+		if (err) {
+			pr_debug("failed to create perf header attribute\n");
+			return err;
+		}
+	}
+
+	return err;
+}
+
+int event__process_attr(event_t *self, struct perf_session *session)
+{
+	struct perf_header_attr *attr;
+	unsigned int i, ids, n_ids;
+
+	attr = perf_header_attr__new(&self->attr.attr);
+	if (attr == NULL)
+		return -ENOMEM;
+
+	ids = self->header.size;
+	ids -= (void *)&self->attr.id - (void *)self;
+	n_ids = ids / sizeof(u64);
+
+	for (i = 0; i < n_ids; i++) {
+		if (perf_header_attr__add_id(attr, self->attr.id[i]) < 0) {
+			perf_header_attr__delete(attr);
+			return -ENOMEM;
+		}
+	}
+
+	if (perf_header__add_attr(&session->header, attr) < 0) {
+		perf_header_attr__delete(attr);
+		return -ENOMEM;
+	}
+
+	perf_session__update_sample_type(session);
+
+	return 0;
+}
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 6562ece6706..e916ac509a6 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -95,4 +95,12 @@ int build_id_cache__add_s(const char *sbuild_id, const char *debugdir,
 			  const char *name, bool is_kallsyms);
 int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir);
 
+int event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id,
+			   event__handler_t process,
+			   struct perf_session *session);
+int event__synthesize_attrs(struct perf_header *self,
+			    event__handler_t process,
+			    struct perf_session *session);
+int event__process_attr(event_t *self, struct perf_session *session);
+
 #endif /* __PERF_HEADER_H */
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 2c1277cb4ae..bc81864cd04 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -200,6 +200,8 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
 		handler->throttle = process_event_stub;
 	if (handler->unthrottle == NULL)
 		handler->unthrottle = process_event_stub;
+	if (handler->attr == NULL)
+		handler->attr = process_event_stub;
 }
 
 static const char *event__name[] = {
@@ -213,6 +215,7 @@ static const char *event__name[] = {
 	[PERF_RECORD_FORK]	 = "FORK",
 	[PERF_RECORD_READ]	 = "READ",
 	[PERF_RECORD_SAMPLE]	 = "SAMPLE",
+	[PERF_RECORD_HEADER_ATTR]	 = "ATTR",
 };
 
 unsigned long event__total[PERF_RECORD_HEADER_MAX];
@@ -279,6 +282,26 @@ static void event__read_swap(event_t *self)
 	self->read.id		= bswap_64(self->read.id);
 }
 
+static void event__attr_swap(event_t *self)
+{
+	size_t size;
+
+	self->attr.attr.type		= bswap_32(self->attr.attr.type);
+	self->attr.attr.size		= bswap_32(self->attr.attr.size);
+	self->attr.attr.config		= bswap_64(self->attr.attr.config);
+	self->attr.attr.sample_period	= bswap_64(self->attr.attr.sample_period);
+	self->attr.attr.sample_type	= bswap_64(self->attr.attr.sample_type);
+	self->attr.attr.read_format	= bswap_64(self->attr.attr.read_format);
+	self->attr.attr.wakeup_events	= bswap_32(self->attr.attr.wakeup_events);
+	self->attr.attr.bp_type		= bswap_32(self->attr.attr.bp_type);
+	self->attr.attr.bp_addr		= bswap_64(self->attr.attr.bp_addr);
+	self->attr.attr.bp_len		= bswap_64(self->attr.attr.bp_len);
+
+	size = self->header.size;
+	size -= (void *)&self->attr.id - (void *)self;
+	mem_bswap_64(self->attr.id, size);
+}
+
 typedef void (*event__swap_op)(event_t *self);
 
 static event__swap_op event__swap_ops[] = {
@@ -289,6 +312,7 @@ static event__swap_op event__swap_ops[] = {
 	[PERF_RECORD_LOST]   = event__all64_swap,
 	[PERF_RECORD_READ]   = event__read_swap,
 	[PERF_RECORD_SAMPLE] = event__all64_swap,
+	[PERF_RECORD_HEADER_ATTR]   = event__attr_swap,
 	[PERF_RECORD_HEADER_MAX]    = NULL,
 };
 
@@ -329,6 +353,8 @@ static int perf_session__process_event(struct perf_session *self,
 		return ops->throttle(event, self);
 	case PERF_RECORD_UNTHROTTLE:
 		return ops->unthrottle(event, self);
+	case PERF_RECORD_HEADER_ATTR:
+		return ops->attr(event, self);
 	default:
 		self->unknown_events++;
 		return -1;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 5f789113665..45a13741351 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -44,7 +44,8 @@ struct perf_event_ops {
 		 lost,
 		 read,
 		 throttle,
-		 unthrottle;
+		 unthrottle,
+		 attr;
 };
 
 struct perf_session *perf_session__new(const char *filename, int mode, bool force);
-- 
cgit v1.2.3


From cd19a035f3b63fee6dcbdb5371c4b22276f7dc8c Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Thu, 1 Apr 2010 23:59:20 -0500
Subject: perf: Convert perf event types into event type events

Bypasses the event type perf header code and replaces it with a
synthesized event and processing function that accomplishes the
same thing, used when reading/writing perf data to/from a pipe.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
Cc: k-keiichi@bx.jp.nec.com
Cc: acme@ghostprotocols.net
LKML-Reference: <1270184365-8281-7-git-send-email-tzanussi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c |  7 +++++
 tools/perf/builtin-report.c |  1 +
 tools/perf/builtin-trace.c  |  1 +
 tools/perf/util/event.h     | 14 ++++++++++
 tools/perf/util/header.c    | 62 ++++++++++++++++++++++++++++++++++++++++-----
 tools/perf/util/header.h    |  9 +++++++
 tools/perf/util/session.c   | 12 +++++++++
 tools/perf/util/session.h   |  3 ++-
 8 files changed, 101 insertions(+), 8 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 289d9cf3bf7..c4c132205ed 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -592,6 +592,13 @@ static int __cmd_record(int argc, const char **argv)
 			pr_err("Couldn't synthesize attrs.\n");
 			return err;
 		}
+
+		err = event__synthesize_event_types(process_synthesized_event,
+						    session);
+		if (err < 0) {
+			pr_err("Couldn't synthesize event_types.\n");
+			return err;
+		}
 	}
 
 	err = event__synthesize_kernel_mmap(process_synthesized_event,
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f0486ce591a..e59d0127d5e 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -268,6 +268,7 @@ static struct perf_event_ops event_ops = {
 	.lost	= event__process_lost,
 	.read	= process_read_event,
 	.attr	= event__process_attr,
+	.event_type = event__process_event_type,
 };
 
 extern volatile int session_done;
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index e30eac6af54..eb884a7dc24 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -105,6 +105,7 @@ static struct perf_event_ops event_ops = {
 	.sample	= process_sample_event,
 	.comm	= event__process_comm,
 	.attr	= event__process_attr,
+	.event_type = event__process_event_type,
 };
 
 extern volatile int session_done;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index b4fbf25078b..c720fe06f8d 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -85,6 +85,7 @@ struct build_id_event {
 
 enum perf_header_event_type { /* above any possible kernel type */
 	PERF_RECORD_HEADER_ATTR			= 64,
+	PERF_RECORD_HEADER_EVENT_TYPE		= 65,
 	PERF_RECORD_HEADER_MAX
 };
 
@@ -94,6 +95,18 @@ struct attr_event {
 	u64 id[];
 };
 
+#define MAX_EVENT_NAME 64
+
+struct perf_trace_event_type {
+	u64	event_id;
+	char	name[MAX_EVENT_NAME];
+};
+
+struct event_type_event {
+	struct perf_event_header header;
+	struct perf_trace_event_type event_type;
+};
+
 typedef union event_union {
 	struct perf_event_header	header;
 	struct ip_event			ip;
@@ -104,6 +117,7 @@ typedef union event_union {
 	struct read_event		read;
 	struct sample_event		sample;
 	struct attr_event		attr;
+	struct event_type_event		event_type;
 } event_t;
 
 struct events_stats {
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index e36173934e8..44637999dbc 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -99,13 +99,6 @@ int perf_header__add_attr(struct perf_header *self,
 	return 0;
 }
 
-#define MAX_EVENT_NAME 64
-
-struct perf_trace_event_type {
-	u64	event_id;
-	char	name[MAX_EVENT_NAME];
-};
-
 static int event_count;
 static struct perf_trace_event_type *events;
 
@@ -886,3 +879,58 @@ int event__process_attr(event_t *self, struct perf_session *session)
 
 	return 0;
 }
+
+int event__synthesize_event_type(u64 event_id, char *name,
+				 event__handler_t process,
+				 struct perf_session *session)
+{
+	event_t ev;
+	size_t size = 0;
+	int err = 0;
+
+	memset(&ev, 0, sizeof(ev));
+
+	ev.event_type.event_type.event_id = event_id;
+	memset(ev.event_type.event_type.name, 0, MAX_EVENT_NAME);
+	strncpy(ev.event_type.event_type.name, name, MAX_EVENT_NAME - 1);
+
+	ev.event_type.header.type = PERF_RECORD_HEADER_EVENT_TYPE;
+	size = strlen(name);
+	size = ALIGN(size, sizeof(u64));
+	ev.event_type.header.size = sizeof(ev.event_type) -
+		(sizeof(ev.event_type.event_type.name) - size);
+
+	err = process(&ev, session);
+
+	return err;
+}
+
+int event__synthesize_event_types(event__handler_t process,
+				  struct perf_session *session)
+{
+	struct perf_trace_event_type *type;
+	int i, err = 0;
+
+	for (i = 0; i < event_count; i++) {
+		type = &events[i];
+
+		err = event__synthesize_event_type(type->event_id, type->name,
+						   process, session);
+		if (err) {
+			pr_debug("failed to create perf header event type\n");
+			return err;
+		}
+	}
+
+	return err;
+}
+
+int event__process_event_type(event_t *self,
+			      struct perf_session *session __unused)
+{
+	if (perf_header__push_event(self->event_type.event_type.event_id,
+				    self->event_type.event_type.name) < 0)
+		return -ENOMEM;
+
+	return 0;
+}
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index e916ac509a6..afeb6188376 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -103,4 +103,13 @@ int event__synthesize_attrs(struct perf_header *self,
 			    struct perf_session *session);
 int event__process_attr(event_t *self, struct perf_session *session);
 
+int event__synthesize_event_type(u64 event_id, char *name,
+				 event__handler_t process,
+				 struct perf_session *session);
+int event__synthesize_event_types(event__handler_t process,
+				  struct perf_session *session);
+int event__process_event_type(event_t *self,
+			      struct perf_session *session);
+
+
 #endif /* __PERF_HEADER_H */
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index bc81864cd04..96c4629b774 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -202,6 +202,8 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
 		handler->unthrottle = process_event_stub;
 	if (handler->attr == NULL)
 		handler->attr = process_event_stub;
+	if (handler->event_type == NULL)
+		handler->event_type = process_event_stub;
 }
 
 static const char *event__name[] = {
@@ -216,6 +218,7 @@ static const char *event__name[] = {
 	[PERF_RECORD_READ]	 = "READ",
 	[PERF_RECORD_SAMPLE]	 = "SAMPLE",
 	[PERF_RECORD_HEADER_ATTR]	 = "ATTR",
+	[PERF_RECORD_HEADER_EVENT_TYPE]	 = "EVENT_TYPE",
 };
 
 unsigned long event__total[PERF_RECORD_HEADER_MAX];
@@ -302,6 +305,12 @@ static void event__attr_swap(event_t *self)
 	mem_bswap_64(self->attr.id, size);
 }
 
+static void event__event_type_swap(event_t *self)
+{
+	self->event_type.event_type.event_id =
+		bswap_64(self->event_type.event_type.event_id);
+}
+
 typedef void (*event__swap_op)(event_t *self);
 
 static event__swap_op event__swap_ops[] = {
@@ -313,6 +322,7 @@ static event__swap_op event__swap_ops[] = {
 	[PERF_RECORD_READ]   = event__read_swap,
 	[PERF_RECORD_SAMPLE] = event__all64_swap,
 	[PERF_RECORD_HEADER_ATTR]   = event__attr_swap,
+	[PERF_RECORD_HEADER_EVENT_TYPE]   = event__event_type_swap,
 	[PERF_RECORD_HEADER_MAX]    = NULL,
 };
 
@@ -355,6 +365,8 @@ static int perf_session__process_event(struct perf_session *self,
 		return ops->unthrottle(event, self);
 	case PERF_RECORD_HEADER_ATTR:
 		return ops->attr(event, self);
+	case PERF_RECORD_HEADER_EVENT_TYPE:
+		return ops->event_type(event, self);
 	default:
 		self->unknown_events++;
 		return -1;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 45a13741351..0dac1f4457d 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -45,7 +45,8 @@ struct perf_event_ops {
 		 read,
 		 throttle,
 		 unthrottle,
-		 attr;
+		 attr,
+		 event_type;
 };
 
 struct perf_session *perf_session__new(const char *filename, int mode, bool force);
-- 
cgit v1.2.3


From 9215545e99d8c0b27323df2de504f4294bf5e407 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Thu, 1 Apr 2010 23:59:21 -0500
Subject: perf: Convert perf tracing data into a tracing_data event

Bypasses the tracing_data perf header code and replaces it with
a synthesized event and processing function that accomplishes
the same thing, used when reading/writing perf data to/from a
pipe.

The tracing data is pretty large, and this patch doesn't attempt
to break it down into component events.  The tracing_data event
itself doesn't actually contain the tracing data, rather it
arranges for the event processing code to skip over it after
it's read, using the skip return value added to the event
processing loop in a previous patch.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
Cc: k-keiichi@bx.jp.nec.com
Cc: acme@ghostprotocols.net
LKML-Reference: <1270184365-8281-8-git-send-email-tzanussi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c        | 16 +++++++
 tools/perf/builtin-report.c        |  1 +
 tools/perf/builtin-trace.c         |  1 +
 tools/perf/util/event.h            |  7 +++
 tools/perf/util/header.c           | 52 ++++++++++++++++++++++
 tools/perf/util/header.h           |  6 +++
 tools/perf/util/session.c          | 13 ++++++
 tools/perf/util/session.h          |  3 +-
 tools/perf/util/trace-event-info.c | 24 ++++++++++
 tools/perf/util/trace-event-read.c | 89 +++++++++++++++++++-------------------
 tools/perf/util/trace-event.h      |  4 +-
 11 files changed, 170 insertions(+), 46 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index c4c132205ed..3775abe2af7 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -104,6 +104,11 @@ static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
 	pc->data_tail = tail;
 }
 
+static void advance_output(size_t size)
+{
+	bytes_written += size;
+}
+
 static void write_output(void *buf, size_t size)
 {
 	while (size) {
@@ -599,6 +604,17 @@ static int __cmd_record(int argc, const char **argv)
 			pr_err("Couldn't synthesize event_types.\n");
 			return err;
 		}
+
+		err = event__synthesize_tracing_data(output, attrs,
+						     nr_counters,
+						     process_synthesized_event,
+						     session);
+		if (err <= 0) {
+			pr_err("Couldn't record tracing data.\n");
+			return err;
+		}
+
+		advance_output(err);
 	}
 
 	err = event__synthesize_kernel_mmap(process_synthesized_event,
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index e59d0127d5e..76f03a70aac 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -269,6 +269,7 @@ static struct perf_event_ops event_ops = {
 	.read	= process_read_event,
 	.attr	= event__process_attr,
 	.event_type = event__process_event_type,
+	.tracing_data = event__process_tracing_data,
 };
 
 extern volatile int session_done;
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index eb884a7dc24..1509744429c 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -106,6 +106,7 @@ static struct perf_event_ops event_ops = {
 	.comm	= event__process_comm,
 	.attr	= event__process_attr,
 	.event_type = event__process_event_type,
+	.tracing_data = event__process_tracing_data,
 };
 
 extern volatile int session_done;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index c720fe06f8d..b896a177ea4 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -86,6 +86,7 @@ struct build_id_event {
 enum perf_header_event_type { /* above any possible kernel type */
 	PERF_RECORD_HEADER_ATTR			= 64,
 	PERF_RECORD_HEADER_EVENT_TYPE		= 65,
+	PERF_RECORD_HEADER_TRACING_DATA		= 66,
 	PERF_RECORD_HEADER_MAX
 };
 
@@ -107,6 +108,11 @@ struct event_type_event {
 	struct perf_trace_event_type event_type;
 };
 
+struct tracing_data_event {
+	struct perf_event_header header;
+	u32 size;
+};
+
 typedef union event_union {
 	struct perf_event_header	header;
 	struct ip_event			ip;
@@ -118,6 +124,7 @@ typedef union event_union {
 	struct sample_event		sample;
 	struct attr_event		attr;
 	struct event_type_event		event_type;
+	struct tracing_data_event	tracing_data;
 } event_t;
 
 struct events_stats {
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 44637999dbc..c6874ecc90b 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -934,3 +934,55 @@ int event__process_event_type(event_t *self,
 
 	return 0;
 }
+
+int event__synthesize_tracing_data(int fd, struct perf_event_attr *pattrs,
+				   int nb_events,
+				   event__handler_t process,
+				   struct perf_session *session __unused)
+{
+	event_t ev;
+	ssize_t size = 0, aligned_size = 0, padding;
+	int err = 0;
+
+	memset(&ev, 0, sizeof(ev));
+
+	ev.tracing_data.header.type = PERF_RECORD_HEADER_TRACING_DATA;
+	size = read_tracing_data_size(fd, pattrs, nb_events);
+	if (size <= 0)
+		return size;
+	aligned_size = ALIGN(size, sizeof(u64));
+	padding = aligned_size - size;
+	ev.tracing_data.header.size = sizeof(ev.tracing_data);
+	ev.tracing_data.size = aligned_size;
+
+	process(&ev, session);
+
+	err = read_tracing_data(fd, pattrs, nb_events);
+	write_padded(fd, NULL, 0, padding);
+
+	return aligned_size;
+}
+
+int event__process_tracing_data(event_t *self,
+				struct perf_session *session)
+{
+	ssize_t size_read, padding, size = self->tracing_data.size;
+	off_t offset = lseek(session->fd, 0, SEEK_CUR);
+	char buf[BUFSIZ];
+
+	/* setup for reading amidst mmap */
+	lseek(session->fd, offset + sizeof(struct tracing_data_event),
+	      SEEK_SET);
+
+	size_read = trace_report(session->fd);
+
+	padding = ALIGN(size_read, sizeof(u64)) - size_read;
+
+	if (read(session->fd, buf, padding) < 0)
+		die("reading input file");
+
+	if (size_read + padding != size)
+		die("tracing data size mismatch");
+
+	return size_read + padding;
+}
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index afeb6188376..3ed3d98c81d 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -111,5 +111,11 @@ int event__synthesize_event_types(event__handler_t process,
 int event__process_event_type(event_t *self,
 			      struct perf_session *session);
 
+int event__synthesize_tracing_data(int fd, struct perf_event_attr *pattrs,
+				   int nb_events,
+				   event__handler_t process,
+				   struct perf_session *session);
+int event__process_tracing_data(event_t *self,
+				struct perf_session *session);
 
 #endif /* __PERF_HEADER_H */
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 96c4629b774..1516c40d47a 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -204,6 +204,8 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
 		handler->attr = process_event_stub;
 	if (handler->event_type == NULL)
 		handler->event_type = process_event_stub;
+	if (handler->tracing_data == NULL)
+		handler->tracing_data = process_event_stub;
 }
 
 static const char *event__name[] = {
@@ -219,6 +221,7 @@ static const char *event__name[] = {
 	[PERF_RECORD_SAMPLE]	 = "SAMPLE",
 	[PERF_RECORD_HEADER_ATTR]	 = "ATTR",
 	[PERF_RECORD_HEADER_EVENT_TYPE]	 = "EVENT_TYPE",
+	[PERF_RECORD_HEADER_TRACING_DATA]	 = "TRACING_DATA",
 };
 
 unsigned long event__total[PERF_RECORD_HEADER_MAX];
@@ -311,6 +314,11 @@ static void event__event_type_swap(event_t *self)
 		bswap_64(self->event_type.event_type.event_id);
 }
 
+static void event__tracing_data_swap(event_t *self)
+{
+	self->tracing_data.size = bswap_32(self->tracing_data.size);
+}
+
 typedef void (*event__swap_op)(event_t *self);
 
 static event__swap_op event__swap_ops[] = {
@@ -323,6 +331,7 @@ static event__swap_op event__swap_ops[] = {
 	[PERF_RECORD_SAMPLE] = event__all64_swap,
 	[PERF_RECORD_HEADER_ATTR]   = event__attr_swap,
 	[PERF_RECORD_HEADER_EVENT_TYPE]   = event__event_type_swap,
+	[PERF_RECORD_HEADER_TRACING_DATA]   = event__tracing_data_swap,
 	[PERF_RECORD_HEADER_MAX]    = NULL,
 };
 
@@ -367,6 +376,10 @@ static int perf_session__process_event(struct perf_session *self,
 		return ops->attr(event, self);
 	case PERF_RECORD_HEADER_EVENT_TYPE:
 		return ops->event_type(event, self);
+	case PERF_RECORD_HEADER_TRACING_DATA:
+		/* setup for reading amidst mmap */
+		lseek(self->fd, offset + head, SEEK_SET);
+		return ops->tracing_data(event, self);
 	default:
 		self->unknown_events++;
 		return -1;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 0dac1f4457d..0739ebbbf9f 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -46,7 +46,8 @@ struct perf_event_ops {
 		 throttle,
 		 unthrottle,
 		 attr,
-		 event_type;
+		 event_type,
+		 tracing_data;
 };
 
 struct perf_session *perf_session__new(const char *filename, int mode, bool force);
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index 5ea8973ad33..30cd9b57595 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -154,10 +154,17 @@ static void put_tracing_file(char *file)
 	free(file);
 }
 
+static ssize_t calc_data_size;
+
 static ssize_t write_or_die(const void *buf, size_t len)
 {
 	int ret;
 
+	if (calc_data_size) {
+		calc_data_size += len;
+		return len;
+	}
+
 	ret = write(output_fd, buf, len);
 	if (ret < 0)
 		die("writing to '%s'", output_file);
@@ -526,3 +533,20 @@ int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events)
 
 	return 0;
 }
+
+ssize_t read_tracing_data_size(int fd, struct perf_event_attr *pattrs,
+			       int nb_events)
+{
+	ssize_t size;
+	int err = 0;
+
+	calc_data_size = 1;
+	err = read_tracing_data(fd, pattrs, nb_events);
+	size = calc_data_size - 1;
+	calc_data_size = 0;
+
+	if (err < 0)
+		return err;
+
+	return size;
+}
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index 7cd1193918c..44889c9b563 100644
--- a/tools/perf/util/trace-event-read.c
+++ b/tools/perf/util/trace-event-read.c
@@ -50,14 +50,37 @@ static int long_size;
 
 static unsigned long	page_size;
 
+static ssize_t calc_data_size;
+
+static int do_read(int fd, void *buf, int size)
+{
+	int rsize = size;
+
+	while (size) {
+		int ret = read(fd, buf, size);
+
+		if (ret <= 0)
+			return -1;
+
+		size -= ret;
+		buf += ret;
+	}
+
+	return rsize;
+}
+
 static int read_or_die(void *data, int size)
 {
 	int r;
 
-	r = read(input_fd, data, size);
-	if (r != size)
+	r = do_read(input_fd, data, size);
+	if (r <= 0)
 		die("reading input file (size expected=%d received=%d)",
 		    size, r);
+
+	if (calc_data_size)
+		calc_data_size += r;
+
 	return r;
 }
 
@@ -82,56 +105,28 @@ static char *read_string(void)
 	char buf[BUFSIZ];
 	char *str = NULL;
 	int size = 0;
-	int i;
 	off_t r;
+	char c;
 
 	for (;;) {
-		r = read(input_fd, buf, BUFSIZ);
+		r = read(input_fd, &c, 1);
 		if (r < 0)
 			die("reading input file");
 
 		if (!r)
 			die("no data");
 
-		for (i = 0; i < r; i++) {
-			if (!buf[i])
-				break;
-		}
-		if (i < r)
-			break;
+		buf[size++] = c;
 
-		if (str) {
-			size += BUFSIZ;
-			str = realloc(str, size);
-			if (!str)
-				die("malloc of size %d", size);
-			memcpy(str + (size - BUFSIZ), buf, BUFSIZ);
-		} else {
-			size = BUFSIZ;
-			str = malloc_or_die(size);
-			memcpy(str, buf, size);
-		}
+		if (!c)
+			break;
 	}
 
-	/* trailing \0: */
-	i++;
-
-	/* move the file descriptor to the end of the string */
-	r = lseek(input_fd, -(r - i), SEEK_CUR);
-	if (r == (off_t)-1)
-		die("lseek");
-
-	if (str) {
-		size += i;
-		str = realloc(str, size);
-		if (!str)
-			die("malloc of size %d", size);
-		memcpy(str + (size - i), buf, i);
-	} else {
-		size = i;
-		str = malloc_or_die(i);
-		memcpy(str, buf, i);
-	}
+	if (calc_data_size)
+		calc_data_size += size;
+
+	str = malloc_or_die(size);
+	memcpy(str, buf, size);
 
 	return str;
 }
@@ -459,7 +454,7 @@ struct record *trace_read_data(int cpu)
 	return data;
 }
 
-void trace_report(int fd)
+ssize_t trace_report(int fd)
 {
 	char buf[BUFSIZ];
 	char test[] = { 23, 8, 68 };
@@ -467,6 +462,9 @@ void trace_report(int fd)
 	int show_version = 0;
 	int show_funcs = 0;
 	int show_printk = 0;
+	ssize_t size;
+
+	calc_data_size = 1;
 
 	input_fd = fd;
 
@@ -499,14 +497,17 @@ void trace_report(int fd)
 	read_proc_kallsyms();
 	read_ftrace_printk();
 
+	size = calc_data_size - 1;
+	calc_data_size = 0;
+
 	if (show_funcs) {
 		print_funcs();
-		return;
+		return size;
 	}
 	if (show_printk) {
 		print_printk();
-		return;
+		return size;
 	}
 
-	return;
+	return size;
 }
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 81f2fd20a0e..1f45d468fd9 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -163,7 +163,7 @@ struct record *trace_read_data(int cpu);
 
 void parse_set_info(int nr_cpus, int long_sz);
 
-void trace_report(int fd);
+ssize_t trace_report(int fd);
 
 void *malloc_or_die(unsigned int size);
 
@@ -259,6 +259,8 @@ void *raw_field_ptr(struct event *event, const char *name, void *data);
 unsigned long long eval_flag(const char *flag);
 
 int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events);
+ssize_t read_tracing_data_size(int fd, struct perf_event_attr *pattrs,
+			       int nb_events);
 
 /* taken from kernel/trace/trace.h */
 enum trace_flag_type {
-- 
cgit v1.2.3


From c7929e4727e8ff2d6fc8327188820e3b1c2f1dc3 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Thu, 1 Apr 2010 23:59:22 -0500
Subject: perf: Convert perf header build_ids into build_id events

Bypasses the build_id perf header code and replaces it with a
synthesized event and processing function that accomplishes the
same thing, used when reading/writing perf data to/from a pipe.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
Cc: k-keiichi@bx.jp.nec.com
Cc: acme@ghostprotocols.net
LKML-Reference: <1270184365-8281-9-git-send-email-tzanussi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 15 ++++++--
 tools/perf/builtin-report.c |  1 +
 tools/perf/builtin-trace.c  |  1 +
 tools/perf/util/event.h     |  2 +
 tools/perf/util/header.c    | 90 +++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/header.h    |  7 ++++
 tools/perf/util/session.c   |  6 +++
 tools/perf/util/session.h   |  3 +-
 8 files changed, 121 insertions(+), 4 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 3775abe2af7..0bde31bc8e2 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -426,10 +426,19 @@ static int process_buildids(void)
 
 static void atexit_header(void)
 {
-	session->header.data_size += bytes_written;
+	if (!pipe_output) {
+		session->header.data_size += bytes_written;
 
-	process_buildids();
-	perf_header__write(&session->header, output, true);
+		process_buildids();
+		perf_header__write(&session->header, output, true);
+	} else {
+		int err;
+
+		err = event__synthesize_build_ids(process_synthesized_event,
+						  session);
+		if (err < 0)
+			pr_err("Couldn't synthesize build ids.\n");
+	}
 }
 
 static int __cmd_record(int argc, const char **argv)
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 76f03a70aac..7da5fb36526 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -270,6 +270,7 @@ static struct perf_event_ops event_ops = {
 	.attr	= event__process_attr,
 	.event_type = event__process_event_type,
 	.tracing_data = event__process_tracing_data,
+	.build_id = event__process_build_id,
 };
 
 extern volatile int session_done;
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 1509744429c..1ee1e300664 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -107,6 +107,7 @@ static struct perf_event_ops event_ops = {
 	.attr	= event__process_attr,
 	.event_type = event__process_event_type,
 	.tracing_data = event__process_tracing_data,
+	.build_id = event__process_build_id,
 };
 
 extern volatile int session_done;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index b896a177ea4..e5740ea140a 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -87,6 +87,7 @@ enum perf_header_event_type { /* above any possible kernel type */
 	PERF_RECORD_HEADER_ATTR			= 64,
 	PERF_RECORD_HEADER_EVENT_TYPE		= 65,
 	PERF_RECORD_HEADER_TRACING_DATA		= 66,
+	PERF_RECORD_HEADER_BUILD_ID		= 67,
 	PERF_RECORD_HEADER_MAX
 };
 
@@ -125,6 +126,7 @@ typedef union event_union {
 	struct attr_event		attr;
 	struct event_type_event		event_type;
 	struct tracing_data_event	tracing_data;
+	struct build_id_event		build_id;
 } event_t;
 
 struct events_stats {
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index c6874ecc90b..628173ba689 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -986,3 +986,93 @@ int event__process_tracing_data(event_t *self,
 
 	return size_read + padding;
 }
+
+int event__synthesize_build_id(struct dso *pos, u16 misc,
+			       event__handler_t process,
+			       struct perf_session *session)
+{
+	event_t ev;
+	size_t len;
+	int err = 0;
+
+	if (!pos->hit)
+		return err;
+
+	memset(&ev, 0, sizeof(ev));
+
+	len = pos->long_name_len + 1;
+	len = ALIGN(len, NAME_ALIGN);
+	memcpy(&ev.build_id.build_id, pos->build_id, sizeof(pos->build_id));
+	ev.build_id.header.type = PERF_RECORD_HEADER_BUILD_ID;
+	ev.build_id.header.misc = misc;
+	ev.build_id.header.size = sizeof(ev.build_id) + len;
+	memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len);
+
+	err = process(&ev, session);
+
+	return err;
+}
+
+static int __event_synthesize_build_ids(struct list_head *head, u16 misc,
+					event__handler_t process,
+					struct perf_session *session)
+{
+	struct dso *pos;
+
+	dsos__for_each_with_build_id(pos, head) {
+		int err;
+		if (!pos->hit)
+			continue;
+
+		err = event__synthesize_build_id(pos, misc, process, session);
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
+int event__synthesize_build_ids(event__handler_t process,
+				struct perf_session *session)
+{
+	int err;
+
+	if (!dsos__read_build_ids(true))
+		return 0;
+
+	err = __event_synthesize_build_ids(&dsos__kernel,
+					   PERF_RECORD_MISC_KERNEL,
+					   process, session);
+	if (err == 0)
+		err = __event_synthesize_build_ids(&dsos__user,
+						   PERF_RECORD_MISC_USER,
+						   process, session);
+
+	if (err < 0) {
+		pr_debug("failed to synthesize build ids\n");
+		return err;
+	}
+
+	dsos__cache_build_ids();
+
+	return 0;
+}
+
+int event__process_build_id(event_t *self,
+			    struct perf_session *session __unused)
+{
+	struct list_head *head = &dsos__user;
+	struct dso *dso;
+
+	if (self->build_id.header.misc & PERF_RECORD_MISC_KERNEL)
+		head = &dsos__kernel;
+
+	dso = __dsos__findnew(head, self->build_id.filename);
+	if (dso != NULL) {
+		dso__set_build_id(dso, &self->build_id.build_id);
+		if (head == &dsos__kernel && self->build_id.filename[0] == '[')
+			dso->kernel = 1;
+	}
+
+	return 0;
+}
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 3ed3d98c81d..4214e237565 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -118,4 +118,11 @@ int event__synthesize_tracing_data(int fd, struct perf_event_attr *pattrs,
 int event__process_tracing_data(event_t *self,
 				struct perf_session *session);
 
+int event__synthesize_build_id(struct dso *pos, u16 misc,
+			       event__handler_t process,
+			       struct perf_session *session);
+int event__synthesize_build_ids(event__handler_t process,
+				struct perf_session *session);
+int event__process_build_id(event_t *self, struct perf_session *session);
+
 #endif /* __PERF_HEADER_H */
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 1516c40d47a..0fdf3ebef1e 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -206,6 +206,8 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
 		handler->event_type = process_event_stub;
 	if (handler->tracing_data == NULL)
 		handler->tracing_data = process_event_stub;
+	if (handler->build_id == NULL)
+		handler->build_id = process_event_stub;
 }
 
 static const char *event__name[] = {
@@ -222,6 +224,7 @@ static const char *event__name[] = {
 	[PERF_RECORD_HEADER_ATTR]	 = "ATTR",
 	[PERF_RECORD_HEADER_EVENT_TYPE]	 = "EVENT_TYPE",
 	[PERF_RECORD_HEADER_TRACING_DATA]	 = "TRACING_DATA",
+	[PERF_RECORD_HEADER_BUILD_ID]	 = "BUILD_ID",
 };
 
 unsigned long event__total[PERF_RECORD_HEADER_MAX];
@@ -332,6 +335,7 @@ static event__swap_op event__swap_ops[] = {
 	[PERF_RECORD_HEADER_ATTR]   = event__attr_swap,
 	[PERF_RECORD_HEADER_EVENT_TYPE]   = event__event_type_swap,
 	[PERF_RECORD_HEADER_TRACING_DATA]   = event__tracing_data_swap,
+	[PERF_RECORD_HEADER_BUILD_ID]   = NULL,
 	[PERF_RECORD_HEADER_MAX]    = NULL,
 };
 
@@ -380,6 +384,8 @@ static int perf_session__process_event(struct perf_session *self,
 		/* setup for reading amidst mmap */
 		lseek(self->fd, offset + head, SEEK_SET);
 		return ops->tracing_data(event, self);
+	case PERF_RECORD_HEADER_BUILD_ID:
+		return ops->build_id(event, self);
 	default:
 		self->unknown_events++;
 		return -1;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 0739ebbbf9f..0ac14d42dc2 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -47,7 +47,8 @@ struct perf_event_ops {
 		 unthrottle,
 		 attr,
 		 event_type,
-		 tracing_data;
+		 tracing_data,
+		 build_id;
 };
 
 struct perf_session *perf_session__new(const char *filename, int mode, bool force);
-- 
cgit v1.2.3


From 47902f3611b392209e2a412bf7ec02dca95e666d Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Thu, 1 Apr 2010 23:59:23 -0500
Subject: perf trace/scripting: Add rwtop and sctop scripts

A couple of scripts, one in Python and the other in Perl, that
demonstrate 'live mode' tracing.  For each, the output of the
perf event stream is fed continuously to the script, which
continuously aggregates the data and reports the current results
every 3 seconds, or at the optionally specified interval.  After
the current results are displayed, the aggregations are cleared
and the cycle begins anew.

To run the scripts, simply pipe the output of the 'perf trace
record' step as input to the corresponding 'perf trace report'
step, using '-' as the filename to -o and -i:

 $ perf trace record sctop -o - | perf trace report sctop -i -

Also adds clear_term() utility functions to the Util.pm and
Util.py utility modules, for use by any script to clear the
screen.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
Cc: k-keiichi@bx.jp.nec.com
Cc: acme@ghostprotocols.net
LKML-Reference: <1270184365-8281-10-git-send-email-tzanussi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 .../perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm    |   6 +
 tools/perf/scripts/perl/bin/rwtop-record           |   2 +
 tools/perf/scripts/perl/bin/rwtop-report           |  23 +++
 tools/perf/scripts/perl/rwtop.pl                   | 177 +++++++++++++++++++++
 .../python/Perf-Trace-Util/lib/Perf/Trace/Util.py  |   3 +
 tools/perf/scripts/python/bin/sctop-record         |   2 +
 tools/perf/scripts/python/bin/sctop-report         |  24 +++
 tools/perf/scripts/python/sctop.py                 |  78 +++++++++
 8 files changed, 315 insertions(+)
 create mode 100644 tools/perf/scripts/perl/bin/rwtop-record
 create mode 100644 tools/perf/scripts/perl/bin/rwtop-report
 create mode 100644 tools/perf/scripts/perl/rwtop.pl
 create mode 100644 tools/perf/scripts/python/bin/sctop-record
 create mode 100644 tools/perf/scripts/python/bin/sctop-report
 create mode 100644 tools/perf/scripts/python/sctop.py

diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm
index f869c48dc9b..d94b40c8ac8 100644
--- a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm
+++ b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm
@@ -15,6 +15,7 @@ our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
 
 our @EXPORT = qw(
 avg nsecs nsecs_secs nsecs_nsecs nsecs_usecs print_nsecs
+clear_term
 );
 
 our $VERSION = '0.01';
@@ -55,6 +56,11 @@ sub nsecs_str {
     return $str;
 }
 
+sub clear_term
+{
+    print "\x1b[H\x1b[2J";
+}
+
 1;
 __END__
 =head1 NAME
diff --git a/tools/perf/scripts/perl/bin/rwtop-record b/tools/perf/scripts/perl/bin/rwtop-record
new file mode 100644
index 00000000000..63976bf11e8
--- /dev/null
+++ b/tools/perf/scripts/perl/bin/rwtop-record
@@ -0,0 +1,2 @@
+#!/bin/bash
+perf record -c 1 -f -a -M -R -e syscalls:sys_enter_read -e syscalls:sys_exit_read -e syscalls:sys_enter_write -e syscalls:sys_exit_write $@
diff --git a/tools/perf/scripts/perl/bin/rwtop-report b/tools/perf/scripts/perl/bin/rwtop-report
new file mode 100644
index 00000000000..93e698cd3f3
--- /dev/null
+++ b/tools/perf/scripts/perl/bin/rwtop-report
@@ -0,0 +1,23 @@
+#!/bin/bash
+# description: system-wide r/w top
+# args: [interval]
+n_args=0
+for i in "$@"
+do
+    if expr match "$i" "-" > /dev/null ; then
+	break
+    fi
+    n_args=$(( $n_args + 1 ))
+done
+if [ "$n_args" -gt 1 ] ; then
+    echo "usage: rwtop-report [interval]"
+    exit
+fi
+if [ "$n_args" -gt 0 ] ; then
+    interval=$1
+    shift
+fi
+perf trace $@ -s ~/libexec/perf-core/scripts/perl/rwtop.pl $interval
+
+
+
diff --git a/tools/perf/scripts/perl/rwtop.pl b/tools/perf/scripts/perl/rwtop.pl
new file mode 100644
index 00000000000..ec2ab49a6f2
--- /dev/null
+++ b/tools/perf/scripts/perl/rwtop.pl
@@ -0,0 +1,177 @@
+#!/usr/bin/perl -w
+# (c) 2010, Tom Zanussi <tzanussi@gmail.com>
+# Licensed under the terms of the GNU GPL License version 2
+
+# read/write top
+#
+# Periodically displays system-wide r/w call activity, broken down by
+# pid.  If an [interval] arg is specified, the display will be
+# refreshed every [interval] seconds.  The default interval is 3
+# seconds.
+
+use 5.010000;
+use strict;
+use warnings;
+
+use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
+use lib "./Perf-Trace-Util/lib";
+use Perf::Trace::Core;
+use Perf::Trace::Util;
+
+my $default_interval = 3;
+my $nlines = 20;
+my $print_thread;
+
+my %reads;
+my %writes;
+
+my $interval = shift;
+if (!$interval) {
+    $interval = $default_interval;
+}
+
+sub syscalls::sys_exit_read
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+	$common_pid, $common_comm,
+	$nr, $ret) = @_;
+
+    if ($ret > 0) {
+	$reads{$common_pid}{bytes_read} += $ret;
+    } else {
+	if (!defined ($reads{$common_pid}{bytes_read})) {
+	    $reads{$common_pid}{bytes_read} = 0;
+	}
+	$reads{$common_pid}{errors}{$ret}++;
+    }
+}
+
+sub syscalls::sys_enter_read
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+	$common_pid, $common_comm,
+	$nr, $fd, $buf, $count) = @_;
+
+    $reads{$common_pid}{bytes_requested} += $count;
+    $reads{$common_pid}{total_reads}++;
+    $reads{$common_pid}{comm} = $common_comm;
+}
+
+sub syscalls::sys_exit_write
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+	$common_pid, $common_comm,
+	$nr, $ret) = @_;
+
+    if ($ret <= 0) {
+	$writes{$common_pid}{errors}{$ret}++;
+    }
+}
+
+sub syscalls::sys_enter_write
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+	$common_pid, $common_comm,
+	$nr, $fd, $buf, $count) = @_;
+
+    $writes{$common_pid}{bytes_written} += $count;
+    $writes{$common_pid}{total_writes}++;
+    $writes{$common_pid}{comm} = $common_comm;
+}
+
+sub trace_begin
+{
+    $SIG{ALRM} = \&print_totals;
+    alarm 1;
+}
+
+sub trace_end
+{
+    print_unhandled();
+    print_totals();
+}
+
+sub print_totals
+{
+    my $count;
+
+    $count = 0;
+
+    clear_term();
+
+    printf("\nread counts by pid:\n\n");
+
+    printf("%6s  %20s  %10s  %10s  %10s\n", "pid", "comm",
+	   "# reads", "bytes_req", "bytes_read");
+    printf("%6s  %-20s  %10s  %10s  %10s\n", "------", "--------------------",
+	   "----------", "----------", "----------");
+
+    foreach my $pid (sort {$reads{$b}{bytes_read} <=>
+			       $reads{$a}{bytes_read}} keys %reads) {
+	my $comm = $reads{$pid}{comm};
+	my $total_reads = $reads{$pid}{total_reads};
+	my $bytes_requested = $reads{$pid}{bytes_requested};
+	my $bytes_read = $reads{$pid}{bytes_read};
+
+	printf("%6s  %-20s  %10s  %10s  %10s\n", $pid, $comm,
+	       $total_reads, $bytes_requested, $bytes_read);
+
+	if (++$count == $nlines) {
+	    last;
+	}
+    }
+
+    $count = 0;
+
+    printf("\nwrite counts by pid:\n\n");
+
+    printf("%6s  %20s  %10s  %13s\n", "pid", "comm",
+	   "# writes", "bytes_written");
+    printf("%6s  %-20s  %10s  %13s\n", "------", "--------------------",
+	   "----------", "-------------");
+
+    foreach my $pid (sort {$writes{$b}{bytes_written} <=>
+			       $writes{$a}{bytes_written}} keys %writes) {
+	my $comm = $writes{$pid}{comm};
+	my $total_writes = $writes{$pid}{total_writes};
+	my $bytes_written = $writes{$pid}{bytes_written};
+
+	printf("%6s  %-20s  %10s  %13s\n", $pid, $comm,
+	       $total_writes, $bytes_written);
+
+	if (++$count == $nlines) {
+	    last;
+	}
+    }
+
+    %reads = ();
+    %writes = ();
+    alarm $interval;
+}
+
+my %unhandled;
+
+sub print_unhandled
+{
+    if ((scalar keys %unhandled) == 0) {
+	return;
+    }
+
+    print "\nunhandled events:\n\n";
+
+    printf("%-40s  %10s\n", "event", "count");
+    printf("%-40s  %10s\n", "----------------------------------------",
+	   "-----------");
+
+    foreach my $event_name (keys %unhandled) {
+	printf("%-40s  %10d\n", $event_name, $unhandled{$event_name});
+    }
+}
+
+sub trace_unhandled
+{
+    my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+	$common_pid, $common_comm) = @_;
+
+    $unhandled{$event_name}++;
+}
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
index 83e91435ed0..9689bc0acd9 100644
--- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
+++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
@@ -23,3 +23,6 @@ def nsecs_nsecs(nsecs):
 def nsecs_str(nsecs):
     str = "%5u.%09u" % (nsecs_secs(nsecs), nsecs_nsecs(nsecs)),
     return str
+
+def clear_term():
+    print("\x1b[H\x1b[2J")
diff --git a/tools/perf/scripts/python/bin/sctop-record b/tools/perf/scripts/python/bin/sctop-record
new file mode 100644
index 00000000000..27ccffa26ab
--- /dev/null
+++ b/tools/perf/scripts/python/bin/sctop-record
@@ -0,0 +1,2 @@
+#!/bin/bash
+perf record -c 1 -f -a -M -R -e raw_syscalls:sys_enter $@
diff --git a/tools/perf/scripts/python/bin/sctop-report b/tools/perf/scripts/python/bin/sctop-report
new file mode 100644
index 00000000000..b01c842ae7b
--- /dev/null
+++ b/tools/perf/scripts/python/bin/sctop-report
@@ -0,0 +1,24 @@
+#!/bin/bash
+# description: syscall top
+# args: [comm] [interval]
+n_args=0
+for i in "$@"
+do
+    if expr match "$i" "-" > /dev/null ; then
+	break
+    fi
+    n_args=$(( $n_args + 1 ))
+done
+if [ "$n_args" -gt 2 ] ; then
+    echo "usage: sctop-report [comm] [interval]"
+    exit
+fi
+if [ "$n_args" -gt 1 ] ; then
+    comm=$1
+    interval=$2
+    shift 2
+elif [ "$n_args" -gt 0 ] ; then
+    interval=$1
+    shift
+fi
+perf trace $@ -s ~/libexec/perf-core/scripts/python/sctop.py $comm $interval
diff --git a/tools/perf/scripts/python/sctop.py b/tools/perf/scripts/python/sctop.py
new file mode 100644
index 00000000000..6cafad40c29
--- /dev/null
+++ b/tools/perf/scripts/python/sctop.py
@@ -0,0 +1,78 @@
+# system call top
+# (c) 2010, Tom Zanussi <tzanussi@gmail.com>
+# Licensed under the terms of the GNU GPL License version 2
+#
+# Periodically displays system-wide system call totals, broken down by
+# syscall.  If a [comm] arg is specified, only syscalls called by
+# [comm] are displayed. If an [interval] arg is specified, the display
+# will be refreshed every [interval] seconds.  The default interval is
+# 3 seconds.
+
+import thread
+import time
+import os
+import sys
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+	'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+from Util import *
+
+usage = "perf trace -s syscall-counts.py [comm] [interval]\n";
+
+for_comm = None
+default_interval = 3
+interval = default_interval
+
+if len(sys.argv) > 3:
+	sys.exit(usage)
+
+if len(sys.argv) > 2:
+	for_comm = sys.argv[1]
+	interval = int(sys.argv[2])
+elif len(sys.argv) > 1:
+	try:
+		interval = int(sys.argv[1])
+	except ValueError:
+		for_comm = sys.argv[1]
+		interval = default_interval
+
+syscalls = autodict()
+
+def trace_begin():
+	thread.start_new_thread(print_syscall_totals, (interval,))
+	pass
+
+def raw_syscalls__sys_enter(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	id, args):
+	if for_comm is not None:
+		if common_comm != for_comm:
+			return
+	try:
+		syscalls[id] += 1
+	except TypeError:
+		syscalls[id] = 1
+
+def print_syscall_totals(interval):
+	while 1:
+		clear_term()
+		if for_comm is not None:
+			print "\nsyscall events for %s:\n\n" % (for_comm),
+		else:
+			print "\nsyscall events:\n\n",
+
+		print "%-40s  %10s\n" % ("event", "count"),
+		print "%-40s  %10s\n" % ("----------------------------------------", \
+						 "----------"),
+
+		for id, val in sorted(syscalls.iteritems(), key = lambda(k, v): (v, k), \
+					      reverse = True):
+			try:
+				print "%-40d  %10d\n" % (id, val),
+			except TypeError:
+				pass
+		syscalls.clear()
+		time.sleep(interval)
-- 
cgit v1.2.3


From 00b21a01935892a2b97613f10300434998f45093 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Thu, 1 Apr 2010 23:59:24 -0500
Subject: perf trace/scripting: Enable scripting shell scripts for live mode

It should be possible to run any perf trace script in 'live
mode'. This requires being able to pass in e.g. '-i -' or other
args, which the current shell scripts aren't equipped to handle.
 In a few cases, there are required or optional args that also
need special handling. This patch makes changes the current set
of shell scripts as necessary.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
Cc: k-keiichi@bx.jp.nec.com
Cc: acme@ghostprotocols.net
LKML-Reference: <1270184365-8281-11-git-send-email-tzanussi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/scripts/perl/bin/failed-syscalls-record          | 2 +-
 tools/perf/scripts/perl/bin/failed-syscalls-report          | 8 +++++++-
 tools/perf/scripts/perl/bin/rw-by-file-record               | 3 ++-
 tools/perf/scripts/perl/bin/rw-by-file-report               | 8 +++++++-
 tools/perf/scripts/perl/bin/rw-by-pid-record                | 2 +-
 tools/perf/scripts/perl/bin/rw-by-pid-report                | 2 +-
 tools/perf/scripts/perl/bin/wakeup-latency-record           | 2 +-
 tools/perf/scripts/perl/bin/wakeup-latency-report           | 2 +-
 tools/perf/scripts/perl/bin/workqueue-stats-record          | 2 +-
 tools/perf/scripts/perl/bin/workqueue-stats-report          | 2 +-
 tools/perf/scripts/python/bin/failed-syscalls-by-pid-record | 2 +-
 tools/perf/scripts/python/bin/failed-syscalls-by-pid-report | 8 +++++++-
 tools/perf/scripts/python/bin/syscall-counts-by-pid-record  | 2 +-
 tools/perf/scripts/python/bin/syscall-counts-by-pid-report  | 8 +++++++-
 tools/perf/scripts/python/bin/syscall-counts-record         | 2 +-
 tools/perf/scripts/python/bin/syscall-counts-report         | 8 +++++++-
 16 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/tools/perf/scripts/perl/bin/failed-syscalls-record b/tools/perf/scripts/perl/bin/failed-syscalls-record
index f8885d389e6..6ad9b8f5f00 100644
--- a/tools/perf/scripts/perl/bin/failed-syscalls-record
+++ b/tools/perf/scripts/perl/bin/failed-syscalls-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_exit
+perf record -c 1 -f -a -M -R -e raw_syscalls:sys_exit $@
diff --git a/tools/perf/scripts/perl/bin/failed-syscalls-report b/tools/perf/scripts/perl/bin/failed-syscalls-report
index 8bfc660e505..f6346082a8f 100644
--- a/tools/perf/scripts/perl/bin/failed-syscalls-report
+++ b/tools/perf/scripts/perl/bin/failed-syscalls-report
@@ -1,4 +1,10 @@
 #!/bin/bash
 # description: system-wide failed syscalls
 # args: [comm]
-perf trace -s ~/libexec/perf-core/scripts/perl/failed-syscalls.pl $1
+if [ $# -gt 0 ] ; then
+    if ! expr match "$1" "-" ; then
+	comm=$1
+	shift
+    fi
+fi
+perf trace $@ -s ~/libexec/perf-core/scripts/perl/failed-syscalls.pl $comm
diff --git a/tools/perf/scripts/perl/bin/rw-by-file-record b/tools/perf/scripts/perl/bin/rw-by-file-record
index b25056ebf96..a828679837a 100644
--- a/tools/perf/scripts/perl/bin/rw-by-file-record
+++ b/tools/perf/scripts/perl/bin/rw-by-file-record
@@ -1,2 +1,3 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e syscalls:sys_enter_read -e syscalls:sys_enter_write
+perf record -c 1 -f -a -M -R -e syscalls:sys_enter_read -e syscalls:sys_enter_write $@
+
diff --git a/tools/perf/scripts/perl/bin/rw-by-file-report b/tools/perf/scripts/perl/bin/rw-by-file-report
index eddb9ccce6a..d83070b7eeb 100644
--- a/tools/perf/scripts/perl/bin/rw-by-file-report
+++ b/tools/perf/scripts/perl/bin/rw-by-file-report
@@ -1,7 +1,13 @@
 #!/bin/bash
 # description: r/w activity for a program, by file
 # args: <comm>
-perf trace -s ~/libexec/perf-core/scripts/perl/rw-by-file.pl $1
+if [ $# -lt 1 ] ; then
+    echo "usage: rw-by-file <comm>"
+    exit
+fi
+comm=$1
+shift
+perf trace $@ -s ~/libexec/perf-core/scripts/perl/rw-by-file.pl $comm
 
 
diff --git a/tools/perf/scripts/perl/bin/rw-by-pid-record b/tools/perf/scripts/perl/bin/rw-by-pid-record
index 8903979c5b6..63976bf11e8 100644
--- a/tools/perf/scripts/perl/bin/rw-by-pid-record
+++ b/tools/perf/scripts/perl/bin/rw-by-pid-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e syscalls:sys_enter_read -e syscalls:sys_exit_read -e syscalls:sys_enter_write -e syscalls:sys_exit_write
+perf record -c 1 -f -a -M -R -e syscalls:sys_enter_read -e syscalls:sys_exit_read -e syscalls:sys_enter_write -e syscalls:sys_exit_write $@
diff --git a/tools/perf/scripts/perl/bin/rw-by-pid-report b/tools/perf/scripts/perl/bin/rw-by-pid-report
index 7f44c25cc85..7ef46983f62 100644
--- a/tools/perf/scripts/perl/bin/rw-by-pid-report
+++ b/tools/perf/scripts/perl/bin/rw-by-pid-report
@@ -1,6 +1,6 @@
 #!/bin/bash
 # description: system-wide r/w activity
-perf trace -s ~/libexec/perf-core/scripts/perl/rw-by-pid.pl
+perf trace $@ -s ~/libexec/perf-core/scripts/perl/rw-by-pid.pl
 
 
diff --git a/tools/perf/scripts/perl/bin/wakeup-latency-record b/tools/perf/scripts/perl/bin/wakeup-latency-record
index 6abedda911a..9c0cf588ff8 100644
--- a/tools/perf/scripts/perl/bin/wakeup-latency-record
+++ b/tools/perf/scripts/perl/bin/wakeup-latency-record
@@ -1,5 +1,5 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e sched:sched_switch -e sched:sched_wakeup
+perf record -c 1 -f -a -M -R -e sched:sched_switch -e sched:sched_wakeup $@
 
 
diff --git a/tools/perf/scripts/perl/bin/wakeup-latency-report b/tools/perf/scripts/perl/bin/wakeup-latency-report
index fce3adcb324..a0d898f9ca1 100644
--- a/tools/perf/scripts/perl/bin/wakeup-latency-report
+++ b/tools/perf/scripts/perl/bin/wakeup-latency-report
@@ -1,6 +1,6 @@
 #!/bin/bash
 # description: system-wide min/max/avg wakeup latency
-perf trace -s ~/libexec/perf-core/scripts/perl/wakeup-latency.pl
+perf trace $@ -s ~/libexec/perf-core/scripts/perl/wakeup-latency.pl
 
 
diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-record b/tools/perf/scripts/perl/bin/workqueue-stats-record
index fce6637b19b..c2a1a942113 100644
--- a/tools/perf/scripts/perl/bin/workqueue-stats-record
+++ b/tools/perf/scripts/perl/bin/workqueue-stats-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e workqueue:workqueue_creation -e workqueue:workqueue_destruction -e workqueue:workqueue_execution -e workqueue:workqueue_insertion
+perf record -c 1 -f -a -M -R -e workqueue:workqueue_creation -e workqueue:workqueue_destruction -e workqueue:workqueue_execution -e workqueue:workqueue_insertion $@
diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-report b/tools/perf/scripts/perl/bin/workqueue-stats-report
index 71cfbd182fb..35081132ef9 100644
--- a/tools/perf/scripts/perl/bin/workqueue-stats-report
+++ b/tools/perf/scripts/perl/bin/workqueue-stats-report
@@ -1,6 +1,6 @@
 #!/bin/bash
 # description: workqueue stats (ins/exe/create/destroy)
-perf trace -s ~/libexec/perf-core/scripts/perl/workqueue-stats.pl
+perf trace $@ -s ~/libexec/perf-core/scripts/perl/workqueue-stats.pl
 
 
diff --git a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-record b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-record
index f8885d389e6..6ad9b8f5f00 100644
--- a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-record
+++ b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_exit
+perf record -c 1 -f -a -M -R -e raw_syscalls:sys_exit $@
diff --git a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report
index 1e0c0a860c8..8c128eff9c0 100644
--- a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report
+++ b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report
@@ -1,4 +1,10 @@
 #!/bin/bash
 # description: system-wide failed syscalls, by pid
 # args: [comm]
-perf trace -s ~/libexec/perf-core/scripts/python/failed-syscalls-by-pid.py $1
+if [ $# -gt 0 ] ; then
+    if ! expr match "$1" "-" ; then
+	comm=$1
+	shift
+    fi
+fi
+perf trace $@ -s ~/libexec/perf-core/scripts/python/failed-syscalls-by-pid.py $comm
diff --git a/tools/perf/scripts/python/bin/syscall-counts-by-pid-record b/tools/perf/scripts/python/bin/syscall-counts-by-pid-record
index 45a8c50359d..27ccffa26ab 100644
--- a/tools/perf/scripts/python/bin/syscall-counts-by-pid-record
+++ b/tools/perf/scripts/python/bin/syscall-counts-by-pid-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_enter
+perf record -c 1 -f -a -M -R -e raw_syscalls:sys_enter $@
diff --git a/tools/perf/scripts/python/bin/syscall-counts-by-pid-report b/tools/perf/scripts/python/bin/syscall-counts-by-pid-report
index f8044d19227..c53362e4860 100644
--- a/tools/perf/scripts/python/bin/syscall-counts-by-pid-report
+++ b/tools/perf/scripts/python/bin/syscall-counts-by-pid-report
@@ -1,4 +1,10 @@
 #!/bin/bash
 # description: system-wide syscall counts, by pid
 # args: [comm]
-perf trace -s ~/libexec/perf-core/scripts/python/syscall-counts-by-pid.py $1
+if [ $# -gt 0 ] ; then
+    if ! expr match "$1" "-" ; then
+	comm=$1
+	shift
+    fi
+fi
+perf trace $@ -s ~/libexec/perf-core/scripts/python/syscall-counts-by-pid.py $comm
diff --git a/tools/perf/scripts/python/bin/syscall-counts-record b/tools/perf/scripts/python/bin/syscall-counts-record
index 45a8c50359d..27ccffa26ab 100644
--- a/tools/perf/scripts/python/bin/syscall-counts-record
+++ b/tools/perf/scripts/python/bin/syscall-counts-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_enter
+perf record -c 1 -f -a -M -R -e raw_syscalls:sys_enter $@
diff --git a/tools/perf/scripts/python/bin/syscall-counts-report b/tools/perf/scripts/python/bin/syscall-counts-report
index a366aa61612..8c21552b3cd 100644
--- a/tools/perf/scripts/python/bin/syscall-counts-report
+++ b/tools/perf/scripts/python/bin/syscall-counts-report
@@ -1,4 +1,10 @@
 #!/bin/bash
 # description: system-wide syscall counts
 # args: [comm]
-perf trace -s ~/libexec/perf-core/scripts/python/syscall-counts.py $1
+if [ $# -gt 0 ] ; then
+    if ! expr match "$1" "-" ; then
+	comm=$1
+	shift
+    fi
+fi
+perf trace $@ -s ~/libexec/perf-core/scripts/python/syscall-counts.py $comm
-- 
cgit v1.2.3


From a0cccc2e8e9fb16cbed3a117b30e3fbac3092ee3 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Thu, 1 Apr 2010 23:59:25 -0500
Subject: perf trace: Invoke live mode automatically if record/report not
 specified

Currently, live mode is invoked by explicitly invoking the
record and report sides and connecting them with a pipe e.g.

 $ perf trace record rwtop -o - | perf trace report rwtop 5 -i -

In terms of usability, it's not that bad, but it does require
the user to type and remember more than necessary.

This patch allows the user to accomplish the same thing without
specifying the separate record/report steps or the pipe.  So the
same command as above can be accomplished more simply as:

 $ perf trace rwtop 5

Notice that the '-i -' and '-o -' aren't required in this case -
they're added internally, and that any extra arguments are
passed along to the report script (but not to the record
script).

The overall effect is that any of the scripts listed in 'perf
trace -l' can now be used directly in live mode, with the
expected arguments, by simply specifying the script and args to
'perf trace'.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
Cc: k-keiichi@bx.jp.nec.com
Cc: acme@ghostprotocols.net
LKML-Reference: <1270184365-8281-12-git-send-email-tzanussi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-trace.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 1ee1e300664..2eefb33c967 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -561,6 +561,65 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used)
 		suffix = REPORT_SUFFIX;
 	}
 
+	if (!suffix && argc >= 2 && strncmp(argv[1], "-", strlen("-")) != 0) {
+		char *record_script_path, *report_script_path;
+		int live_pipe[2];
+		pid_t pid;
+
+		record_script_path = get_script_path(argv[1], RECORD_SUFFIX);
+		if (!record_script_path) {
+			fprintf(stderr, "record script not found\n");
+			return -1;
+		}
+
+		report_script_path = get_script_path(argv[1], REPORT_SUFFIX);
+		if (!report_script_path) {
+			fprintf(stderr, "report script not found\n");
+			return -1;
+		}
+
+		if (pipe(live_pipe) < 0) {
+			perror("failed to create pipe");
+			exit(-1);
+		}
+
+		pid = fork();
+		if (pid < 0) {
+			perror("failed to fork");
+			exit(-1);
+		}
+
+		if (!pid) {
+			dup2(live_pipe[1], 1);
+			close(live_pipe[0]);
+
+			__argv = malloc(5 * sizeof(const char *));
+			__argv[0] = "/bin/sh";
+			__argv[1] = record_script_path;
+			__argv[2] = "-o";
+			__argv[3] = "-";
+			__argv[4] = NULL;
+
+			execvp("/bin/sh", (char **)__argv);
+			exit(-1);
+		}
+
+		dup2(live_pipe[0], 0);
+		close(live_pipe[1]);
+
+		__argv = malloc((argc + 3) * sizeof(const char *));
+		__argv[0] = "/bin/sh";
+		__argv[1] = report_script_path;
+		for (i = 2; i < argc; i++)
+			__argv[i] = argv[i];
+		__argv[i++] = "-i";
+		__argv[i++] = "-";
+		__argv[i++] = NULL;
+
+		execvp("/bin/sh", (char **)__argv);
+		exit(-1);
+	}
+
 	if (suffix) {
 		script_path = get_script_path(argv[2], suffix);
 		if (!script_path) {
-- 
cgit v1.2.3


From 3f10940e4fb69d312602078f2c5234206797ca31 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@relay.de.ibm.com>
Date: Sat, 10 Apr 2010 16:46:21 +0200
Subject: x86/microcode: Use nonseekable_open()

No need to seek on this file, so prevent it outright so we can
avoid using default_llseek - removes one more BKL usage.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
[drop useless llseek = no_llseek and smp_lock.h inclusion]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnd Bergmann <arnd@relay.de.ibm.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Dmitry Adamushko <dmitry.adamushko@gmail.com>
LKML-Reference: <1270910781-8786-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index cceb5bc3c3c..2cd8c544e41 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -201,9 +201,9 @@ static int do_microcode_update(const void __user *buf, size_t size)
 	return error;
 }
 
-static int microcode_open(struct inode *unused1, struct file *unused2)
+static int microcode_open(struct inode *inode, struct file *file)
 {
-	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+	return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
 }
 
 static ssize_t microcode_write(struct file *file, const char __user *buf,
-- 
cgit v1.2.3


From 76e1d9047e4edefb8ada20aa90d5762306082bd6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 5 Apr 2010 15:35:57 +0200
Subject: perf: Store active software events in a hashlist

Each time a software event triggers, we need to walk through
the entire list of events from the current cpu and task contexts
to retrieve a running perf event that matches.
We also need to check a matching perf event is actually counting.

This walk is wasteful and makes the event fast path scaling
down with a growing number of events running on the same
contexts.

To solve this, we store the running perf events in a hashlist to
get an immediate access to them against their type:event_id when
they trigger.

v2: - Fix SWEVENT_HLIST_SIZE definition (and re-learn some basic
      maths along the way)
    - Only allocate hlist for online cpus, but keep track of the
      refcount on offline possible cpus too, so that we allocate it
      if needed when it becomes online.
    - Drop the kref use as it's not adapted to our tricks anymore.

v3: - Fix bad refcount check (address instead of value). Thanks to
      Eric Dumazet who spotted this.
    - While exiting cpu, move the hlist release out of the IPI path
      to lock the hlist mutex sanely.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  12 +++
 kernel/perf_event.c        | 246 +++++++++++++++++++++++++++++++++------------
 2 files changed, 195 insertions(+), 63 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6e96cc8225d..bf896d0b2e9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -589,6 +589,14 @@ enum perf_group_flag {
 	PERF_GROUP_SOFTWARE = 0x1,
 };
 
+#define SWEVENT_HLIST_BITS	8
+#define SWEVENT_HLIST_SIZE	(1 << SWEVENT_HLIST_BITS)
+
+struct swevent_hlist {
+	struct hlist_head	heads[SWEVENT_HLIST_SIZE];
+	struct rcu_head		rcu_head;
+};
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -597,6 +605,7 @@ struct perf_event {
 	struct list_head		group_entry;
 	struct list_head		event_entry;
 	struct list_head		sibling_list;
+	struct hlist_node		hlist_entry;
 	int				nr_siblings;
 	int				group_flags;
 	struct perf_event		*group_leader;
@@ -744,6 +753,9 @@ struct perf_cpu_context {
 	int				active_oncpu;
 	int				max_pertask;
 	int				exclusive;
+	struct swevent_hlist		*swevent_hlist;
+	struct mutex			hlist_mutex;
+	int				hlist_refcount;
 
 	/*
 	 * Recursion avoidance:
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index fcf42dcd608..9efdfe5b8d3 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -16,6 +16,7 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 #include <linux/sysfs.h>
 #include <linux/dcache.h>
 #include <linux/percpu.h>
@@ -3966,36 +3967,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
 	perf_swevent_overflow(event, 0, nmi, data, regs);
 }
 
-static int perf_swevent_is_counting(struct perf_event *event)
-{
-	/*
-	 * The event is active, we're good!
-	 */
-	if (event->state == PERF_EVENT_STATE_ACTIVE)
-		return 1;
-
-	/*
-	 * The event is off/error, not counting.
-	 */
-	if (event->state != PERF_EVENT_STATE_INACTIVE)
-		return 0;
-
-	/*
-	 * The event is inactive, if the context is active
-	 * we're part of a group that didn't make it on the 'pmu',
-	 * not counting.
-	 */
-	if (event->ctx->is_active)
-		return 0;
-
-	/*
-	 * We're inactive and the context is too, this means the
-	 * task is scheduled out, we're counting events that happen
-	 * to us, like migration events.
-	 */
-	return 1;
-}
-
 static int perf_tp_event_match(struct perf_event *event,
 				struct perf_sample_data *data);
 
@@ -4019,12 +3990,6 @@ static int perf_swevent_match(struct perf_event *event,
 				struct perf_sample_data *data,
 				struct pt_regs *regs)
 {
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
-		return 0;
-
-	if (!perf_swevent_is_counting(event))
-		return 0;
-
 	if (event->attr.type != type)
 		return 0;
 
@@ -4041,18 +4006,53 @@ static int perf_swevent_match(struct perf_event *event,
 	return 1;
 }
 
-static void perf_swevent_ctx_event(struct perf_event_context *ctx,
-				     enum perf_type_id type,
-				     u32 event_id, u64 nr, int nmi,
-				     struct perf_sample_data *data,
-				     struct pt_regs *regs)
+static inline u64 swevent_hash(u64 type, u32 event_id)
+{
+	u64 val = event_id | (type << 32);
+
+	return hash_64(val, SWEVENT_HLIST_BITS);
+}
+
+static struct hlist_head *
+find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
+{
+	u64 hash;
+	struct swevent_hlist *hlist;
+
+	hash = swevent_hash(type, event_id);
+
+	hlist = rcu_dereference(ctx->swevent_hlist);
+	if (!hlist)
+		return NULL;
+
+	return &hlist->heads[hash];
+}
+
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+				    u64 nr, int nmi,
+				    struct perf_sample_data *data,
+				    struct pt_regs *regs)
 {
+	struct perf_cpu_context *cpuctx;
 	struct perf_event *event;
+	struct hlist_node *node;
+	struct hlist_head *head;
 
-	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+	cpuctx = &__get_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+
+	head = find_swevent_head(cpuctx, type, event_id);
+
+	if (!head)
+		goto end;
+
+	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
 		if (perf_swevent_match(event, type, event_id, data, regs))
 			perf_swevent_add(event, nr, nmi, data, regs);
 	}
+end:
+	rcu_read_unlock();
 }
 
 int perf_swevent_get_recursion_context(void)
@@ -4090,27 +4090,6 @@ void perf_swevent_put_recursion_context(int rctx)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
-static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
-				    u64 nr, int nmi,
-				    struct perf_sample_data *data,
-				    struct pt_regs *regs)
-{
-	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
-
-	cpuctx = &__get_cpu_var(perf_cpu_context);
-	rcu_read_lock();
-	perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
-				 nr, nmi, data, regs);
-	/*
-	 * doesn't really matter which of the child contexts the
-	 * events ends up in.
-	 */
-	ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
-	rcu_read_unlock();
-}
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 			    struct pt_regs *regs, u64 addr)
@@ -4136,16 +4115,28 @@ static void perf_swevent_read(struct perf_event *event)
 static int perf_swevent_enable(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
+	struct perf_cpu_context *cpuctx;
+	struct hlist_head *head;
+
+	cpuctx = &__get_cpu_var(perf_cpu_context);
 
 	if (hwc->sample_period) {
 		hwc->last_period = hwc->sample_period;
 		perf_swevent_set_period(event);
 	}
+
+	head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
+	if (WARN_ON_ONCE(!head))
+		return -EINVAL;
+
+	hlist_add_head_rcu(&event->hlist_entry, head);
+
 	return 0;
 }
 
 static void perf_swevent_disable(struct perf_event *event)
 {
+	hlist_del_rcu(&event->hlist_entry);
 }
 
 static const struct pmu perf_ops_generic = {
@@ -4359,13 +4350,115 @@ static int perf_tp_event_match(struct perf_event *event,
 	return 0;
 }
 
+static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
+{
+	struct swevent_hlist *hlist;
+
+	hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
+	kfree(hlist);
+}
+
+static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
+{
+	struct swevent_hlist *hlist;
+
+	if (!cpuctx->swevent_hlist)
+		return;
+
+	hlist = cpuctx->swevent_hlist;
+	rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
+	call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
+}
+
+static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+
+	mutex_lock(&cpuctx->hlist_mutex);
+
+	if (!--cpuctx->hlist_refcount)
+		swevent_hlist_release(cpuctx);
+
+	mutex_unlock(&cpuctx->hlist_mutex);
+}
+
+static void swevent_hlist_put(struct perf_event *event)
+{
+	int cpu;
+
+	if (event->cpu != -1) {
+		swevent_hlist_put_cpu(event, event->cpu);
+		return;
+	}
+
+	for_each_possible_cpu(cpu)
+		swevent_hlist_put_cpu(event, cpu);
+}
+
+static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	int err = 0;
+
+	mutex_lock(&cpuctx->hlist_mutex);
+
+	if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
+		struct swevent_hlist *hlist;
+
+		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+		if (!hlist) {
+			err = -ENOMEM;
+			goto exit;
+		}
+		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+	}
+	cpuctx->hlist_refcount++;
+ exit:
+	mutex_unlock(&cpuctx->hlist_mutex);
+
+	return err;
+}
+
+static int swevent_hlist_get(struct perf_event *event)
+{
+	int err;
+	int cpu, failed_cpu;
+
+	if (event->cpu != -1)
+		return swevent_hlist_get_cpu(event, event->cpu);
+
+	get_online_cpus();
+	for_each_possible_cpu(cpu) {
+		err = swevent_hlist_get_cpu(event, cpu);
+		if (err) {
+			failed_cpu = cpu;
+			goto fail;
+		}
+	}
+	put_online_cpus();
+
+	return 0;
+ fail:
+	for_each_possible_cpu(cpu) {
+		if (cpu == failed_cpu)
+			break;
+		swevent_hlist_put_cpu(event, cpu);
+	}
+
+	put_online_cpus();
+	return err;
+}
+
 static void tp_perf_event_destroy(struct perf_event *event)
 {
 	perf_trace_disable(event->attr.config);
+	swevent_hlist_put(event);
 }
 
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 {
+	int err;
+
 	/*
 	 * Raw tracepoint data is a severe data leak, only allow root to
 	 * have these.
@@ -4379,6 +4472,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 		return NULL;
 
 	event->destroy = tp_perf_event_destroy;
+	err = swevent_hlist_get(event);
+	if (err) {
+		perf_trace_disable(event->attr.config);
+		return ERR_PTR(err);
+	}
 
 	return &perf_ops_generic;
 }
@@ -4479,6 +4577,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
 	WARN_ON(event->parent);
 
 	atomic_dec(&perf_swevent_enabled[event_id]);
+	swevent_hlist_put(event);
 }
 
 static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4517,6 +4616,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
 	case PERF_COUNT_SW_ALIGNMENT_FAULTS:
 	case PERF_COUNT_SW_EMULATION_FAULTS:
 		if (!event->parent) {
+			int err;
+
+			err = swevent_hlist_get(event);
+			if (err)
+				return ERR_PTR(err);
+
 			atomic_inc(&perf_swevent_enabled[event_id]);
 			event->destroy = sw_perf_event_destroy;
 		}
@@ -5389,6 +5494,7 @@ static void __init perf_event_init_all_cpus(void)
 
 	for_each_possible_cpu(cpu) {
 		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		mutex_init(&cpuctx->hlist_mutex);
 		__perf_event_init_context(&cpuctx->ctx, NULL);
 	}
 }
@@ -5402,6 +5508,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
 	spin_lock(&perf_resource_lock);
 	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
 	spin_unlock(&perf_resource_lock);
+
+	mutex_lock(&cpuctx->hlist_mutex);
+	if (cpuctx->hlist_refcount > 0) {
+		struct swevent_hlist *hlist;
+
+		hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
+		WARN_ON_ONCE(!hlist);
+		rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
+	}
+	mutex_unlock(&cpuctx->hlist_mutex);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5421,6 +5537,10 @@ static void perf_event_exit_cpu(int cpu)
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
+	mutex_lock(&cpuctx->hlist_mutex);
+	swevent_hlist_release(cpuctx);
+	mutex_unlock(&cpuctx->hlist_mutex);
+
 	mutex_lock(&ctx->mutex);
 	smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
 	mutex_unlock(&ctx->mutex);
-- 
cgit v1.2.3


From df8290bf7ea6b3051e2f315579a6e829309ec1ed Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 9 Apr 2010 00:28:14 +0200
Subject: perf: Make clock software events consistent with general exclusion
 rules

The cpu/task clock events implement their own version of exclusion
on top of exclude_user and exclude_kernel.

The result is that when the event triggered in the kernel but we
have exclude_kernel set, we try to rewind using task_pt_regs.
There are two side effects of this:

- we call task_pt_regs even on kernel threads, which doesn't give
  us the desired result.
- if the event occured in the kernel, we shouldn't rewind to the
  user context. We want to actually ignore the event.

get_irq_regs() will always give us the right interrupted context, so
use its result and submit it to perf_exclude_context() that knows
when an event must be ignored.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9efdfe5b8d3..095101d685b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4164,15 +4164,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 	regs = get_irq_regs();
-	/*
-	 * In case we exclude kernel IPs or are somehow not in interrupt
-	 * context, provide the next best thing, the user IP.
-	 */
-	if ((event->attr.exclude_kernel || !regs) &&
-			!event->attr.exclude_user)
-		regs = task_pt_regs(current);
 
-	if (regs) {
+	if (regs && !perf_exclude_event(event, regs)) {
 		if (!(event->attr.exclude_idle && current->pid == 0))
 			if (perf_event_overflow(event, 0, &data, regs))
 				ret = HRTIMER_NORESTART;
-- 
cgit v1.2.3


From b8f7fb13d2d7ff14818fd1d3edd8b834d38b0217 Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 14 Apr 2010 11:35:46 -0500
Subject: x86, UV: Improve BAU performance and error recovery

- increase performance of the interrupt handler

- release timed-out software acknowledge resources

- recover from continuous-busy status due to a hardware issue

- add a 'throttle' to keep a uvhub from sending more than a
  specified number of broadcasts concurrently (work around the hardware issue)

- provide a 'nobau' boot command line option

- rename 'pnode' and 'node' to 'uvhub' (the 'node' terminology
  is ambiguous)

- add some new statistics about the scope of broadcasts, retries, the
  hardware issue and the 'throttle'

- split off new function uv_bau_retry_msg() from
  uv_bau_process_message() per community coding style feedback.

- simplify the argument list to uv_bau_process_message(), per
  community coding style feedback.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: linux-mm@kvack.org
Cc: Jack Steiner <steiner@sgi.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Mike Travis <travis@sgi.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <E1O25Z4-0004Ur-PB@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h |  247 +++++---
 arch/x86/kernel/tlb_uv.c         | 1270 +++++++++++++++++++++++++++-----------
 2 files changed, 1075 insertions(+), 442 deletions(-)

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index b414d2b401f..aa558ac0306 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -27,13 +27,14 @@
  * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
  *
  * We will use 31 sets, one for sending BAU messages from each of the 32
- * cpu's on the node.
+ * cpu's on the uvhub.
  *
  * TLB shootdown will use the first of the 8 descriptors of each set.
  * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
  */
 
 #define UV_ITEMS_PER_DESCRIPTOR		8
+#define MAX_BAU_CONCURRENT		3
 #define UV_CPUS_PER_ACT_STATUS		32
 #define UV_ACT_STATUS_MASK		0x3
 #define UV_ACT_STATUS_SIZE		2
@@ -45,6 +46,9 @@
 #define UV_PAYLOADQ_PNODE_SHIFT		49
 #define UV_PTC_BASENAME			"sgi_uv/ptc_statistics"
 #define uv_physnodeaddr(x)		((__pa((unsigned long)(x)) & uv_mmask))
+#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
 
 /*
  * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
@@ -55,15 +59,29 @@
 #define DESC_STATUS_SOURCE_TIMEOUT	3
 
 /*
- * source side thresholds at which message retries print a warning
+ * source side threshholds at which message retries print a warning
  */
 #define SOURCE_TIMEOUT_LIMIT		20
 #define DESTINATION_TIMEOUT_LIMIT	20
 
+/*
+ * misc. delays, in microseconds
+ */
+#define THROTTLE_DELAY			10
+#define TIMEOUT_DELAY			10
+#define BIOS_TO				1000
+/* BIOS is assumed to set the destination timeout to 1003520 nanoseconds */
+
+/*
+ * threshholds at which to use IPI to free resources
+ */
+#define PLUGSB4RESET 100
+#define TIMEOUTSB4RESET 100
+
 /*
  * number of entries in the destination side payload queue
  */
-#define DEST_Q_SIZE			17
+#define DEST_Q_SIZE			20
 /*
  * number of destination side software ack resources
  */
@@ -72,9 +90,10 @@
 /*
  * completion statuses for sending a TLB flush message
  */
-#define	FLUSH_RETRY			1
-#define	FLUSH_GIVEUP			2
-#define	FLUSH_COMPLETE			3
+#define FLUSH_RETRY_PLUGGED		1
+#define FLUSH_RETRY_TIMEOUT		2
+#define FLUSH_GIVEUP			3
+#define FLUSH_COMPLETE			4
 
 /*
  * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
@@ -86,14 +105,14 @@
  * 'base_dest_nodeid' field of the header corresponds to the
  * destination nodeID associated with that specified bit.
  */
-struct bau_target_nodemask {
-	unsigned long bits[BITS_TO_LONGS(256)];
+struct bau_target_uvhubmask {
+	unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
 };
 
 /*
- * mask of cpu's on a node
+ * mask of cpu's on a uvhub
  * (during initialization we need to check that unsigned long has
- *  enough bits for max. cpu's per node)
+ *  enough bits for max. cpu's per uvhub)
  */
 struct bau_local_cpumask {
 	unsigned long bits;
@@ -135,8 +154,8 @@ struct bau_msg_payload {
 struct bau_msg_header {
 	unsigned int dest_subnodeid:6;	/* must be 0x10, for the LB */
 	/* bits 5:0 */
-	unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */
-	/* bits 20:6 */			  /* first bit in node_map */
+	unsigned int base_dest_nodeid:15; /* nasid (pnode<<1) of */
+	/* bits 20:6 */			  /* first bit in uvhub map */
 	unsigned int command:8;	/* message type */
 	/* bits 28:21 */
 				/* 0x38: SN3net EndPoint Message */
@@ -146,26 +165,38 @@ struct bau_msg_header {
 	unsigned int rsvd_2:9;	/* must be zero */
 	/* bits 40:32 */
 				/* Suppl_A is 56-41 */
-	unsigned int payload_2a:8;/* becomes byte 16 of msg */
-	/* bits 48:41 */	/* not currently using */
-	unsigned int payload_2b:8;/* becomes byte 17 of msg */
-	/* bits 56:49 */	/* not currently using */
+	unsigned int sequence:16;/* message sequence number */
+	/* bits 56:41 */	/* becomes bytes 16-17 of msg */
 				/* Address field (96:57) is never used as an
 				   address (these are address bits 42:3) */
+
 	unsigned int rsvd_3:1;	/* must be zero */
 	/* bit 57 */
 				/* address bits 27:4 are payload */
-				/* these 24 bits become bytes 12-14 of msg */
+	/* these next 24  (58-81) bits become bytes 12-14 of msg */
+
+	/* bits 65:58 land in byte 12 */
 	unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */
 	/* bit 58 */
-
-	unsigned int payload_1a:5;/* not currently used */
-	/* bits 63:59 */
-	unsigned int payload_1b:8;/* not currently used */
-	/* bits 71:64 */
-	unsigned int payload_1c:8;/* not currently used */
-	/* bits 79:72 */
-	unsigned int payload_1d:2;/* not currently used */
+	unsigned int msg_type:3; /* software type of the message*/
+	/* bits 61:59 */
+	unsigned int canceled:1; /* message canceled, resource to be freed*/
+	/* bit 62 */
+	unsigned int payload_1a:1;/* not currently used */
+	/* bit 63 */
+	unsigned int payload_1b:2;/* not currently used */
+	/* bits 65:64 */
+
+	/* bits 73:66 land in byte 13 */
+	unsigned int payload_1ca:6;/* not currently used */
+	/* bits 71:66 */
+	unsigned int payload_1c:2;/* not currently used */
+	/* bits 73:72 */
+
+	/* bits 81:74 land in byte 14 */
+	unsigned int payload_1d:6;/* not currently used */
+	/* bits 79:74 */
+	unsigned int payload_1e:2;/* not currently used */
 	/* bits 81:80 */
 
 	unsigned int rsvd_4:7;	/* must be zero */
@@ -178,7 +209,7 @@ struct bau_msg_header {
 	/* bits 95:90 */
 	unsigned int rsvd_6:5;	/* must be zero */
 	/* bits 100:96 */
-	unsigned int int_both:1;/* if 1, interrupt both sockets on the blade */
+	unsigned int int_both:1;/* if 1, interrupt both sockets on the uvhub */
 	/* bit 101*/
 	unsigned int fairness:3;/* usually zero */
 	/* bits 104:102 */
@@ -191,13 +222,18 @@ struct bau_msg_header {
 	/* bits 127:107 */
 };
 
+/* see msg_type: */
+#define MSG_NOOP 0
+#define MSG_REGULAR 1
+#define MSG_RETRY 2
+
 /*
  * The activation descriptor:
  * The format of the message to send, plus all accompanying control
  * Should be 64 bytes
  */
 struct bau_desc {
-	struct bau_target_nodemask distribution;
+	struct bau_target_uvhubmask distribution;
 	/*
 	 * message template, consisting of header and payload:
 	 */
@@ -237,19 +273,25 @@ struct bau_payload_queue_entry {
 	unsigned short acknowledge_count; /* filled in by destination */
 	/* 16 bits, bytes 10-11 */
 
-	unsigned short replied_to:1;	/* sent as 0 by the source */
-	/* 1 bit */
-	unsigned short unused1:7;       /* not currently using */
-	/* 7 bits: byte 12) */
+	/* these next 3 bytes come from bits 58-81 of the message header */
+	unsigned short replied_to:1;    /* sent as 0 by the source */
+	unsigned short msg_type:3;      /* software message type */
+	unsigned short canceled:1;      /* sent as 0 by the source */
+	unsigned short unused1:3;       /* not currently using */
+	/* byte 12 */
 
-	unsigned char unused2[2];	/* not currently using */
-	/* bytes 13-14 */
+	unsigned char unused2a;		/* not currently using */
+	/* byte 13 */
+	unsigned char unused2;		/* not currently using */
+	/* byte 14 */
 
 	unsigned char sw_ack_vector;	/* filled in by the hardware */
 	/* byte 15 (bits 127:120) */
 
-	unsigned char unused4[3];	/* not currently using bytes 17-19 */
-	/* bytes 17-19 */
+	unsigned short sequence;	/* message sequence number */
+	/* bytes 16-17 */
+	unsigned char unused4[2];	/* not currently using bytes 18-19 */
+	/* bytes 18-19 */
 
 	int number_of_cpus;		/* filled in at destination */
 	/* 32 bits, bytes 20-23 (aligned) */
@@ -259,63 +301,93 @@ struct bau_payload_queue_entry {
 };
 
 /*
- * one for every slot in the destination payload queue
- */
-struct bau_msg_status {
-	struct bau_local_cpumask seen_by;	/* map of cpu's */
-};
-
-/*
- * one for every slot in the destination software ack resources
- */
-struct bau_sw_ack_status {
-	struct bau_payload_queue_entry *msg;	/* associated message */
-	int watcher;				/* cpu monitoring, or -1 */
-};
-
-/*
- * one on every node and per-cpu; to locate the software tables
+ * one per-cpu; to locate the software tables
  */
 struct bau_control {
 	struct bau_desc *descriptor_base;
-	struct bau_payload_queue_entry *bau_msg_head;
 	struct bau_payload_queue_entry *va_queue_first;
 	struct bau_payload_queue_entry *va_queue_last;
-	struct bau_msg_status *msg_statuses;
-	int *watching; /* pointer to array */
+	struct bau_payload_queue_entry *bau_msg_head;
+	struct bau_control *uvhub_master;
+	struct bau_control *socket_master;
+	unsigned long timeout_interval;
+	atomic_t active_descriptor_count;
+	int max_concurrent;
+	int max_concurrent_constant;
+	int retry_message_scans;
+	int plugged_tries;
+	int timeout_tries;
+	int ipi_attempts;
+	int conseccompletes;
+	short cpu;
+	short uvhub_cpu;
+	short uvhub;
+	short cpus_in_socket;
+	short cpus_in_uvhub;
+	unsigned short message_number;
+	unsigned short uvhub_quiesce;
+	short socket_acknowledge_count[DEST_Q_SIZE];
+	cycles_t send_message;
+	spinlock_t masks_lock;
+	spinlock_t uvhub_lock;
+	spinlock_t queue_lock;
 };
 
 /*
  * This structure is allocated per_cpu for UV TLB shootdown statistics.
  */
 struct ptc_stats {
-	unsigned long ptc_i;	/* number of IPI-style flushes */
-	unsigned long requestor;	/* number of nodes this cpu sent to */
-	unsigned long requestee;	/* times cpu was remotely requested */
-	unsigned long alltlb;	/* times all tlb's on this cpu were flushed */
-	unsigned long onetlb;	/* times just one tlb on this cpu was flushed */
-	unsigned long s_retry;	/* retries on source side timeouts */
-	unsigned long d_retry;	/* retries on destination side timeouts */
-	unsigned long sflush;	/* cycles spent in uv_flush_tlb_others */
-	unsigned long dflush;	/* cycles spent on destination side */
-	unsigned long retriesok; /* successes on retries */
-	unsigned long nomsg;	/* interrupts with no message */
-	unsigned long multmsg;	/* interrupts with multiple messages */
-	unsigned long ntargeted;/* nodes targeted */
+	/* sender statistics */
+	unsigned long s_giveup; /* number of fall backs to IPI-style flushes */
+	unsigned long s_requestor; /* number of shootdown requests */
+	unsigned long s_stimeout; /* source side timeouts */
+	unsigned long s_dtimeout; /* destination side timeouts */
+	unsigned long s_time; /* time spent in sending side */
+	unsigned long s_retriesok; /* successful retries */
+	unsigned long s_ntargcpu; /* number of cpus targeted */
+	unsigned long s_ntarguvhub; /* number of uvhubs targeted */
+	unsigned long s_ntarguvhub16; /* number of times >= 16 target hubs */
+	unsigned long s_ntarguvhub8; /* number of times >= 8 target hubs */
+	unsigned long s_ntarguvhub4; /* number of times >= 4 target hubs */
+	unsigned long s_ntarguvhub2; /* number of times >= 2 target hubs */
+	unsigned long s_ntarguvhub1; /* number of times == 1 target hub */
+	unsigned long s_resets_plug; /* ipi-style resets from plug state */
+	unsigned long s_resets_timeout; /* ipi-style resets from timeouts */
+	unsigned long s_busy; /* status stayed busy past s/w timer */
+	unsigned long s_throttles; /* waits in throttle */
+	unsigned long s_retry_messages; /* retry broadcasts */
+	/* destination statistics */
+	unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */
+	unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
+	unsigned long d_multmsg; /* interrupts with multiple messages */
+	unsigned long d_nomsg; /* interrupts with no message */
+	unsigned long d_time; /* time spent on destination side */
+	unsigned long d_requestee; /* number of messages processed */
+	unsigned long d_retries; /* number of retry messages processed */
+	unsigned long d_canceled; /* number of messages canceled by retries */
+	unsigned long d_nocanceled; /* retries that found nothing to cancel */
+	unsigned long d_resets; /* number of ipi-style requests processed */
+	unsigned long d_rcanceled; /* number of messages canceled by resets */
 };
 
-static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp)
+static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp)
 {
-	return constant_test_bit(node, &dstp->bits[0]);
+	return constant_test_bit(uvhub, &dstp->bits[0]);
 }
-static inline void bau_node_set(int node, struct bau_target_nodemask *dstp)
+static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp)
 {
-	__set_bit(node, &dstp->bits[0]);
+	__set_bit(uvhub, &dstp->bits[0]);
 }
-static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits)
+static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp,
+				    int nbits)
 {
 	bitmap_zero(&dstp->bits[0], nbits);
 }
+static inline int bau_uvhub_weight(struct bau_target_uvhubmask *dstp)
+{
+	return bitmap_weight((unsigned long *)&dstp->bits[0],
+				UV_DISTRIBUTION_SIZE);
+}
 
 static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
 {
@@ -328,4 +400,35 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
 extern void uv_bau_message_intr1(void);
 extern void uv_bau_timeout_intr1(void);
 
+struct atomic_short {
+	short counter;
+};
+
+/**
+ * atomic_read_short - read a short atomic variable
+ * @v: pointer of type atomic_short
+ *
+ * Atomically reads the value of @v.
+ */
+static inline int atomic_read_short(const struct atomic_short *v)
+{
+	return v->counter;
+}
+
+/**
+ * atomic_add_short_return - add and return a short int
+ * @i: short value to add
+ * @v: pointer of type atomic_short
+ *
+ * Atomically adds @i to @v and returns @i + @v
+ */
+static inline int atomic_add_short_return(short i, struct atomic_short *v)
+{
+	short __i = i;
+	asm volatile(LOCK_PREFIX "xaddw %0, %1"
+			: "+r" (i), "+m" (v->counter)
+			: : "memory");
+	return i + __i;
+}
+
 #endif /* _ASM_X86_UV_UV_BAU_H */
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ef68ba48564..414f7c4fe76 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -1,7 +1,7 @@
 /*
  *	SGI UltraViolet TLB flush routines.
  *
- *	(c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
+ *	(c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
  *
  *	This code is released under the GNU General Public License version 2 or
  *	later.
@@ -19,44 +19,67 @@
 #include <asm/idle.h>
 #include <asm/tsc.h>
 #include <asm/irq_vectors.h>
+#include <asm/timer.h>
+
+struct msg_desc {
+	struct bau_payload_queue_entry *msg;
+	int msg_slot;
+	int sw_ack_slot;
+	struct bau_payload_queue_entry *va_queue_first;
+	struct bau_payload_queue_entry *va_queue_last;
+};
 
 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD	0x000000000bUL
 
-static struct bau_control	**uv_bau_table_bases __read_mostly;
-static int			uv_bau_retry_limit __read_mostly;
+static int uv_bau_max_concurrent __read_mostly;
 
-/* base pnode in this partition */
-static int			uv_partition_base_pnode __read_mostly;
+static int nobau;
+static int __init setup_nobau(char *arg)
+{
+	nobau = 1;
+	return 0;
+}
+early_param("nobau", setup_nobau);
 
-static unsigned long		uv_mmask __read_mostly;
+/* base pnode in this partition */
+static int uv_partition_base_pnode __read_mostly;
+/* position of pnode (which is nasid>>1): */
+static int uv_nshift __read_mostly;
+static unsigned long uv_mmask __read_mostly;
 
 static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
 static DEFINE_PER_CPU(struct bau_control, bau_control);
+static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
+
+struct reset_args {
+	int sender;
+};
 
 /*
- * Determine the first node on a blade.
+ * Determine the first node on a uvhub. 'Nodes' are used for kernel
+ * memory allocation.
  */
-static int __init blade_to_first_node(int blade)
+static int __init uvhub_to_first_node(int uvhub)
 {
 	int node, b;
 
 	for_each_online_node(node) {
 		b = uv_node_to_blade_id(node);
-		if (blade == b)
+		if (uvhub == b)
 			return node;
 	}
-	return -1; /* shouldn't happen */
+	return -1;
 }
 
 /*
- * Determine the apicid of the first cpu on a blade.
+ * Determine the apicid of the first cpu on a uvhub.
  */
-static int __init blade_to_first_apicid(int blade)
+static int __init uvhub_to_first_apicid(int uvhub)
 {
 	int cpu;
 
 	for_each_present_cpu(cpu)
-		if (blade == uv_cpu_to_blade_id(cpu))
+		if (uvhub == uv_cpu_to_blade_id(cpu))
 			return per_cpu(x86_cpu_to_apicid, cpu);
 	return -1;
 }
@@ -69,195 +92,459 @@ static int __init blade_to_first_apicid(int blade)
  * clear of the Timeout bit (as well) will free the resource. No reply will
  * be sent (the hardware will only do one reply per message).
  */
-static void uv_reply_to_message(int resource,
-				struct bau_payload_queue_entry *msg,
-				struct bau_msg_status *msp)
+static inline void uv_reply_to_message(struct msg_desc *mdp,
+				       struct bau_control *bcp)
 {
 	unsigned long dw;
+	struct bau_payload_queue_entry *msg;
 
-	dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
+	msg = mdp->msg;
+	if (!msg->canceled) {
+		dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
+						msg->sw_ack_vector;
+		uv_write_local_mmr(
+				UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
+	}
 	msg->replied_to = 1;
 	msg->sw_ack_vector = 0;
-	if (msp)
-		msp->seen_by.bits = 0;
-	uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
 }
 
 /*
- * Do all the things a cpu should do for a TLB shootdown message.
- * Other cpu's may come here at the same time for this message.
+ * Process the receipt of a RETRY message
  */
-static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
-				   int msg_slot, int sw_ack_slot)
+static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
+					    struct bau_control *bcp)
 {
-	unsigned long this_cpu_mask;
-	struct bau_msg_status *msp;
-	int cpu;
+	int i;
+	int cancel_count = 0;
+	int slot2;
+	unsigned long msg_res;
+	unsigned long mmr = 0;
+	struct bau_payload_queue_entry *msg;
+	struct bau_payload_queue_entry *msg2;
+	struct ptc_stats *stat;
 
-	msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
-	cpu = uv_blade_processor_id();
-	msg->number_of_cpus =
-		uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
-	this_cpu_mask = 1UL << cpu;
-	if (msp->seen_by.bits & this_cpu_mask)
-		return;
-	atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
+	msg = mdp->msg;
+	stat = &per_cpu(ptcstats, bcp->cpu);
+	stat->d_retries++;
+	/*
+	 * cancel any message from msg+1 to the retry itself
+	 */
+	for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
+		if (msg2 > mdp->va_queue_last)
+			msg2 = mdp->va_queue_first;
+		if (msg2 == msg)
+			break;
+
+		/* same conditions for cancellation as uv_do_reset */
+		if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
+		    (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
+			msg->sw_ack_vector) == 0) &&
+		    (msg2->sending_cpu == msg->sending_cpu) &&
+		    (msg2->msg_type != MSG_NOOP)) {
+			slot2 = msg2 - mdp->va_queue_first;
+			mmr = uv_read_local_mmr
+				(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+			msg_res = ((msg2->sw_ack_vector << 8) |
+				   msg2->sw_ack_vector);
+			/*
+			 * This is a message retry; clear the resources held
+			 * by the previous message only if they timed out.
+			 * If it has not timed out we have an unexpected
+			 * situation to report.
+			 */
+			if (mmr & (msg_res << 8)) {
+				/*
+				 * is the resource timed out?
+				 * make everyone ignore the cancelled message.
+				 */
+				msg2->canceled = 1;
+				stat->d_canceled++;
+				cancel_count++;
+				uv_write_local_mmr(
+				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+					(msg_res << 8) | msg_res);
+			} else
+				printk(KERN_INFO "note bau retry: no effect\n");
+		}
+	}
+	if (!cancel_count)
+		stat->d_nocanceled++;
+}
 
-	if (msg->replied_to == 1)
-		return;
+/*
+ * Do all the things a cpu should do for a TLB shootdown message.
+ * Other cpu's may come here at the same time for this message.
+ */
+static void uv_bau_process_message(struct msg_desc *mdp,
+				   struct bau_control *bcp)
+{
+	int msg_ack_count;
+	short socket_ack_count = 0;
+	struct ptc_stats *stat;
+	struct bau_payload_queue_entry *msg;
+	struct bau_control *smaster = bcp->socket_master;
 
+	/*
+	 * This must be a normal message, or retry of a normal message
+	 */
+	msg = mdp->msg;
+	stat = &per_cpu(ptcstats, bcp->cpu);
 	if (msg->address == TLB_FLUSH_ALL) {
 		local_flush_tlb();
-		__get_cpu_var(ptcstats).alltlb++;
+		stat->d_alltlb++;
 	} else {
 		__flush_tlb_one(msg->address);
-		__get_cpu_var(ptcstats).onetlb++;
+		stat->d_onetlb++;
 	}
+	stat->d_requestee++;
+
+	/*
+	 * One cpu on each uvhub has the additional job on a RETRY
+	 * of releasing the resource held by the message that is
+	 * being retried.  That message is identified by sending
+	 * cpu number.
+	 */
+	if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
+		uv_bau_process_retry_msg(mdp, bcp);
 
-	__get_cpu_var(ptcstats).requestee++;
+	/*
+	 * This is a sw_ack message, so we have to reply to it.
+	 * Count each responding cpu on the socket. This avoids
+	 * pinging the count's cache line back and forth between
+	 * the sockets.
+	 */
+	socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
+			&smaster->socket_acknowledge_count[mdp->msg_slot]);
+	if (socket_ack_count == bcp->cpus_in_socket) {
+		/*
+		 * Both sockets dump their completed count total into
+		 * the message's count.
+		 */
+		smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
+		msg_ack_count = atomic_add_short_return(socket_ack_count,
+				(struct atomic_short *)&msg->acknowledge_count);
+
+		if (msg_ack_count == bcp->cpus_in_uvhub) {
+			/*
+			 * All cpus in uvhub saw it; reply
+			 */
+			uv_reply_to_message(mdp, bcp);
+		}
+	}
 
-	atomic_inc_short(&msg->acknowledge_count);
-	if (msg->number_of_cpus == msg->acknowledge_count)
-		uv_reply_to_message(sw_ack_slot, msg, msp);
+	return;
 }
 
 /*
- * Examine the payload queue on one distribution node to see
- * which messages have not been seen, and which cpu(s) have not seen them.
+ * Determine the first cpu on a uvhub.
+ */
+static int uvhub_to_first_cpu(int uvhub)
+{
+	int cpu;
+	for_each_present_cpu(cpu)
+		if (uvhub == uv_cpu_to_blade_id(cpu))
+			return cpu;
+	return -1;
+}
+
+/*
+ * Last resort when we get a large number of destination timeouts is
+ * to clear resources held by a given cpu.
+ * Do this with IPI so that all messages in the BAU message queue
+ * can be identified by their nonzero sw_ack_vector field.
  *
- * Returns the number of cpu's that have not responded.
+ * This is entered for a single cpu on the uvhub.
+ * The sender want's this uvhub to free a specific message's
+ * sw_ack resources.
  */
-static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
+static void
+uv_do_reset(void *ptr)
 {
-	struct bau_payload_queue_entry *msg;
-	struct bau_msg_status *msp;
-	int count = 0;
 	int i;
-	int j;
+	int slot;
+	int count = 0;
+	unsigned long mmr;
+	unsigned long msg_res;
+	struct bau_control *bcp;
+	struct reset_args *rap;
+	struct bau_payload_queue_entry *msg;
+	struct ptc_stats *stat;
 
-	for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
-	     msg++, i++) {
-		if ((msg->sending_cpu == sender) && (!msg->replied_to)) {
-			msp = bau_tablesp->msg_statuses + i;
-			printk(KERN_DEBUG
-			       "blade %d: address:%#lx %d of %d, not cpu(s): ",
-			       i, msg->address, msg->acknowledge_count,
-			       msg->number_of_cpus);
-			for (j = 0; j < msg->number_of_cpus; j++) {
-				if (!((1L << j) & msp->seen_by.bits)) {
-					count++;
-					printk("%d ", j);
-				}
+	bcp = &per_cpu(bau_control, smp_processor_id());
+	rap = (struct reset_args *)ptr;
+	stat = &per_cpu(ptcstats, bcp->cpu);
+	stat->d_resets++;
+
+	/*
+	 * We're looking for the given sender, and
+	 * will free its sw_ack resource.
+	 * If all cpu's finally responded after the timeout, its
+	 * message 'replied_to' was set.
+	 */
+	for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
+		/* uv_do_reset: same conditions for cancellation as
+		   uv_bau_process_retry_msg() */
+		if ((msg->replied_to == 0) &&
+		    (msg->canceled == 0) &&
+		    (msg->sending_cpu == rap->sender) &&
+		    (msg->sw_ack_vector) &&
+		    (msg->msg_type != MSG_NOOP)) {
+			/*
+			 * make everyone else ignore this message
+			 */
+			msg->canceled = 1;
+			slot = msg - bcp->va_queue_first;
+			count++;
+			/*
+			 * only reset the resource if it is still pending
+			 */
+			mmr = uv_read_local_mmr
+					(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+			msg_res = ((msg->sw_ack_vector << 8) |
+						   msg->sw_ack_vector);
+			if (mmr & msg_res) {
+				stat->d_rcanceled++;
+				uv_write_local_mmr(
+				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+							msg_res);
 			}
-			printk("\n");
 		}
 	}
-	return count;
+	return;
 }
 
 /*
- * Examine the payload queue on all the distribution nodes to see
- * which messages have not been seen, and which cpu(s) have not seen them.
- *
- * Returns the number of cpu's that have not responded.
+ * Use IPI to get all target uvhubs to release resources held by
+ * a given sending cpu number.
  */
-static int uv_examine_destinations(struct bau_target_nodemask *distribution)
+static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
+			      int sender)
 {
-	int sender;
-	int i;
-	int count = 0;
+	int uvhub;
+	int cpu;
+	cpumask_t mask;
+	struct reset_args reset_args;
 
-	sender = smp_processor_id();
-	for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) {
-		if (!bau_node_isset(i, distribution))
+	reset_args.sender = sender;
+
+	cpus_clear(mask);
+	/* find a single cpu for each uvhub in this distribution mask */
+	for (uvhub = 0;
+		    uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
+		    uvhub++) {
+		if (!bau_uvhub_isset(uvhub, distribution))
 			continue;
-		count += uv_examine_destination(uv_bau_table_bases[i], sender);
+		/* find a cpu for this uvhub */
+		cpu = uvhub_to_first_cpu(uvhub);
+		cpu_set(cpu, mask);
 	}
-	return count;
+	/* IPI all cpus; Preemption is already disabled */
+	smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
+	return;
+}
+
+static inline unsigned long
+cycles_2_us(unsigned long long cyc)
+{
+	unsigned long long ns;
+	unsigned long us;
+	ns =  (cyc * per_cpu(cyc2ns, smp_processor_id()))
+						>> CYC2NS_SCALE_FACTOR;
+	us = ns / 1000;
+	return us;
 }
 
 /*
- * wait for completion of a broadcast message
- *
- * return COMPLETE, RETRY or GIVEUP
+ * wait for all cpus on this hub to finish their sends and go quiet
+ * leaves uvhub_quiesce set so that no new broadcasts are started by
+ * bau_flush_send_and_wait()
+ */
+static inline void
+quiesce_local_uvhub(struct bau_control *hmaster)
+{
+	atomic_add_short_return(1, (struct atomic_short *)
+		 &hmaster->uvhub_quiesce);
+}
+
+/*
+ * mark this quiet-requestor as done
+ */
+static inline void
+end_uvhub_quiesce(struct bau_control *hmaster)
+{
+	atomic_add_short_return(-1, (struct atomic_short *)
+		&hmaster->uvhub_quiesce);
+}
+
+/*
+ * Wait for completion of a broadcast software ack message
+ * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
  */
 static int uv_wait_completion(struct bau_desc *bau_desc,
-			      unsigned long mmr_offset, int right_shift)
+	unsigned long mmr_offset, int right_shift, int this_cpu,
+	struct bau_control *bcp, struct bau_control *smaster, long try)
 {
-	int exams = 0;
-	long destination_timeouts = 0;
-	long source_timeouts = 0;
+	int relaxes = 0;
 	unsigned long descriptor_status;
+	unsigned long mmr;
+	unsigned long mask;
+	cycles_t ttime;
+	cycles_t timeout_time;
+	struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
+	struct bau_control *hmaster;
 
+	hmaster = bcp->uvhub_master;
+	timeout_time = get_cycles() + bcp->timeout_interval;
+
+	/* spin on the status MMR, waiting for it to go idle */
 	while ((descriptor_status = (((unsigned long)
 		uv_read_local_mmr(mmr_offset) >>
 			right_shift) & UV_ACT_STATUS_MASK)) !=
 			DESC_STATUS_IDLE) {
-		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
-			source_timeouts++;
-			if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
-				source_timeouts = 0;
-			__get_cpu_var(ptcstats).s_retry++;
-			return FLUSH_RETRY;
-		}
 		/*
-		 * spin here looking for progress at the destinations
+		 * Our software ack messages may be blocked because there are
+		 * no swack resources available.  As long as none of them
+		 * has timed out hardware will NACK our message and its
+		 * state will stay IDLE.
 		 */
-		if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
-			destination_timeouts++;
-			if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
-				/*
-				 * returns number of cpus not responding
-				 */
-				if (uv_examine_destinations
-				    (&bau_desc->distribution) == 0) {
-					__get_cpu_var(ptcstats).d_retry++;
-					return FLUSH_RETRY;
-				}
-				exams++;
-				if (exams >= uv_bau_retry_limit) {
-					printk(KERN_DEBUG
-					       "uv_flush_tlb_others");
-					printk("giving up on cpu %d\n",
-					       smp_processor_id());
+		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
+			stat->s_stimeout++;
+			return FLUSH_GIVEUP;
+		} else if (descriptor_status ==
+					DESC_STATUS_DESTINATION_TIMEOUT) {
+			stat->s_dtimeout++;
+			ttime = get_cycles();
+
+			/*
+			 * Our retries may be blocked by all destination
+			 * swack resources being consumed, and a timeout
+			 * pending.  In that case hardware returns the
+			 * ERROR that looks like a destination timeout.
+			 */
+			if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
+				bcp->conseccompletes = 0;
+				return FLUSH_RETRY_PLUGGED;
+			}
+
+			bcp->conseccompletes = 0;
+			return FLUSH_RETRY_TIMEOUT;
+		} else {
+			/*
+			 * descriptor_status is still BUSY
+			 */
+			cpu_relax();
+			relaxes++;
+			if (relaxes >= 10000) {
+				relaxes = 0;
+				if (get_cycles() > timeout_time) {
+					quiesce_local_uvhub(hmaster);
+
+					/* single-thread the register change */
+					spin_lock(&hmaster->masks_lock);
+					mmr = uv_read_local_mmr(mmr_offset);
+					mask = 0UL;
+					mask |= (3UL < right_shift);
+					mask = ~mask;
+					mmr &= mask;
+					uv_write_local_mmr(mmr_offset, mmr);
+					spin_unlock(&hmaster->masks_lock);
+					end_uvhub_quiesce(hmaster);
+					stat->s_busy++;
 					return FLUSH_GIVEUP;
 				}
-				/*
-				 * delays can hang the simulator
-				   udelay(1000);
-				 */
-				destination_timeouts = 0;
 			}
 		}
-		cpu_relax();
 	}
+	bcp->conseccompletes++;
 	return FLUSH_COMPLETE;
 }
 
+static inline cycles_t
+sec_2_cycles(unsigned long sec)
+{
+	unsigned long ns;
+	cycles_t cyc;
+
+	ns = sec * 1000000000;
+	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+	return cyc;
+}
+
+/*
+ * conditionally add 1 to *v, unless *v is >= u
+ * return 0 if we cannot add 1 to *v because it is >= u
+ * return 1 if we can add 1 to *v because it is < u
+ * the add is atomic
+ *
+ * This is close to atomic_add_unless(), but this allows the 'u' value
+ * to be lowered below the current 'v'.  atomic_add_unless can only stop
+ * on equal.
+ */
+static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
+{
+	spin_lock(lock);
+	if (atomic_read(v) >= u) {
+		spin_unlock(lock);
+		return 0;
+	}
+	atomic_inc(v);
+	spin_unlock(lock);
+	return 1;
+}
+
 /**
  * uv_flush_send_and_wait
  *
- * Send a broadcast and wait for a broadcast message to complete.
+ * Send a broadcast and wait for it to complete.
  *
- * The flush_mask contains the cpus the broadcast was sent to.
+ * The flush_mask contains the cpus the broadcast is to be sent to, plus
+ * cpus that are on the local uvhub.
  *
- * Returns NULL if all remote flushing was done. The mask is zeroed.
+ * Returns NULL if all flushing represented in the mask was done. The mask
+ * is zeroed.
  * Returns @flush_mask if some remote flushing remains to be done. The
- * mask will have some bits still set.
+ * mask will have some bits still set, representing any cpus on the local
+ * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
  */
-const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
-					     struct bau_desc *bau_desc,
-					     struct cpumask *flush_mask)
+const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
+					     struct cpumask *flush_mask,
+					     struct bau_control *bcp)
 {
-	int completion_status = 0;
 	int right_shift;
-	int tries = 0;
-	int pnode;
+	int uvhub;
 	int bit;
+	int completion_status = 0;
+	int seq_number = 0;
+	long try = 0;
+	int cpu = bcp->uvhub_cpu;
+	int this_cpu = bcp->cpu;
+	int this_uvhub = bcp->uvhub;
 	unsigned long mmr_offset;
 	unsigned long index;
 	cycles_t time1;
 	cycles_t time2;
+	struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
+	struct bau_control *smaster = bcp->socket_master;
+	struct bau_control *hmaster = bcp->uvhub_master;
+
+	/*
+	 * Spin here while there are hmaster->max_concurrent or more active
+	 * descriptors. This is the per-uvhub 'throttle'.
+	 */
+	if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+			&hmaster->active_descriptor_count,
+			hmaster->max_concurrent)) {
+		stat->s_throttles++;
+		do {
+			cpu_relax();
+		} while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+			&hmaster->active_descriptor_count,
+			hmaster->max_concurrent));
+	}
+
+	while (hmaster->uvhub_quiesce)
+		cpu_relax();
 
 	if (cpu < UV_CPUS_PER_ACT_STATUS) {
 		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@ -269,24 +556,108 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
 	}
 	time1 = get_cycles();
 	do {
-		tries++;
+		/*
+		 * Every message from any given cpu gets a unique message
+		 * sequence number. But retries use that same number.
+		 * Our message may have timed out at the destination because
+		 * all sw-ack resources are in use and there is a timeout
+		 * pending there.  In that case, our last send never got
+		 * placed into the queue and we need to persist until it
+		 * does.
+		 *
+		 * Make any retry a type MSG_RETRY so that the destination will
+		 * free any resource held by a previous message from this cpu.
+		 */
+		if (try == 0) {
+			/* use message type set by the caller the first time */
+			seq_number = bcp->message_number++;
+		} else {
+			/* use RETRY type on all the rest; same sequence */
+			bau_desc->header.msg_type = MSG_RETRY;
+			stat->s_retry_messages++;
+		}
+		bau_desc->header.sequence = seq_number;
 		index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
-			cpu;
+			bcp->uvhub_cpu;
+		bcp->send_message = get_cycles();
+
 		uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+
+		try++;
 		completion_status = uv_wait_completion(bau_desc, mmr_offset,
-					right_shift);
-	} while (completion_status == FLUSH_RETRY);
+			right_shift, this_cpu, bcp, smaster, try);
+
+		if (completion_status == FLUSH_RETRY_PLUGGED) {
+			/*
+			 * Our retries may be blocked by all destination swack
+			 * resources being consumed, and a timeout pending. In
+			 * that case hardware immediately returns the ERROR
+			 * that looks like a destination timeout.
+			 */
+			udelay(TIMEOUT_DELAY);
+			bcp->plugged_tries++;
+			if (bcp->plugged_tries >= PLUGSB4RESET) {
+				bcp->plugged_tries = 0;
+				quiesce_local_uvhub(hmaster);
+				spin_lock(&hmaster->queue_lock);
+				uv_reset_with_ipi(&bau_desc->distribution,
+							this_cpu);
+				spin_unlock(&hmaster->queue_lock);
+				end_uvhub_quiesce(hmaster);
+				bcp->ipi_attempts++;
+				stat->s_resets_plug++;
+			}
+		} else if (completion_status == FLUSH_RETRY_TIMEOUT) {
+			hmaster->max_concurrent = 1;
+			bcp->timeout_tries++;
+			udelay(TIMEOUT_DELAY);
+			if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
+				bcp->timeout_tries = 0;
+				quiesce_local_uvhub(hmaster);
+				spin_lock(&hmaster->queue_lock);
+				uv_reset_with_ipi(&bau_desc->distribution,
+								this_cpu);
+				spin_unlock(&hmaster->queue_lock);
+				end_uvhub_quiesce(hmaster);
+				bcp->ipi_attempts++;
+				stat->s_resets_timeout++;
+			}
+		}
+		if (bcp->ipi_attempts >= 3) {
+			bcp->ipi_attempts = 0;
+			completion_status = FLUSH_GIVEUP;
+			break;
+		}
+		cpu_relax();
+	} while ((completion_status == FLUSH_RETRY_PLUGGED) ||
+		 (completion_status == FLUSH_RETRY_TIMEOUT));
 	time2 = get_cycles();
-	__get_cpu_var(ptcstats).sflush += (time2 - time1);
-	if (tries > 1)
-		__get_cpu_var(ptcstats).retriesok++;
 
-	if (completion_status == FLUSH_GIVEUP) {
+	if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
+	    && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
+			hmaster->max_concurrent++;
+
+	/*
+	 * hold any cpu not timing out here; no other cpu currently held by
+	 * the 'throttle' should enter the activation code
+	 */
+	while (hmaster->uvhub_quiesce)
+		cpu_relax();
+	atomic_dec(&hmaster->active_descriptor_count);
+
+	/* guard against cycles wrap */
+	if (time2 > time1)
+		stat->s_time += (time2 - time1);
+	else
+		stat->s_requestor--; /* don't count this one */
+	if (completion_status == FLUSH_COMPLETE && try > 1)
+		stat->s_retriesok++;
+	else if (completion_status == FLUSH_GIVEUP) {
 		/*
 		 * Cause the caller to do an IPI-style TLB shootdown on
-		 * the cpu's, all of which are still in the mask.
+		 * the target cpu's, all of which are still in the mask.
 		 */
-		__get_cpu_var(ptcstats).ptc_i++;
+		stat->s_giveup++;
 		return flush_mask;
 	}
 
@@ -295,18 +666,17 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
 	 * use the IPI method of shootdown on them.
 	 */
 	for_each_cpu(bit, flush_mask) {
-		pnode = uv_cpu_to_pnode(bit);
-		if (pnode == this_pnode)
+		uvhub = uv_cpu_to_blade_id(bit);
+		if (uvhub == this_uvhub)
 			continue;
 		cpumask_clear_cpu(bit, flush_mask);
 	}
 	if (!cpumask_empty(flush_mask))
 		return flush_mask;
+
 	return NULL;
 }
 
-static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
-
 /**
  * uv_flush_tlb_others - globally purge translation cache of a virtual
  * address or all TLB's
@@ -323,8 +693,8 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
  * The caller has derived the cpumask from the mm_struct.  This function
  * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
  *
- * The cpumask is converted into a nodemask of the nodes containing
- * the cpus.
+ * The cpumask is converted into a uvhubmask of the uvhubs containing
+ * those cpus.
  *
  * Note that this function should be called with preemption disabled.
  *
@@ -336,52 +706,82 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 					  struct mm_struct *mm,
 					  unsigned long va, unsigned int cpu)
 {
-	struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask);
-	int i;
-	int bit;
-	int pnode;
-	int uv_cpu;
-	int this_pnode;
+	int remotes;
+	int tcpu;
+	int uvhub;
 	int locals = 0;
 	struct bau_desc *bau_desc;
+	struct cpumask *flush_mask;
+	struct ptc_stats *stat;
+	struct bau_control *bcp;
 
-	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+	if (nobau)
+		return cpumask;
 
-	uv_cpu = uv_blade_processor_id();
-	this_pnode = uv_hub_info->pnode;
-	bau_desc = __get_cpu_var(bau_control).descriptor_base;
-	bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
+	bcp = &per_cpu(bau_control, cpu);
+	/*
+	 * Each sending cpu has a per-cpu mask which it fills from the caller's
+	 * cpu mask.  Only remote cpus are converted to uvhubs and copied.
+	 */
+	flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
+	/*
+	 * copy cpumask to flush_mask, removing current cpu
+	 * (current cpu should already have been flushed by the caller and
+	 *  should never be returned if we return flush_mask)
+	 */
+	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+	if (cpu_isset(cpu, *cpumask))
+		locals++;  /* current cpu was targeted */
 
-	bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+	bau_desc = bcp->descriptor_base;
+	bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
 
-	i = 0;
-	for_each_cpu(bit, flush_mask) {
-		pnode = uv_cpu_to_pnode(bit);
-		BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1));
-		if (pnode == this_pnode) {
+	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+	remotes = 0;
+	for_each_cpu(tcpu, flush_mask) {
+		uvhub = uv_cpu_to_blade_id(tcpu);
+		if (uvhub == bcp->uvhub) {
 			locals++;
 			continue;
 		}
-		bau_node_set(pnode - uv_partition_base_pnode,
-				&bau_desc->distribution);
-		i++;
+		bau_uvhub_set(uvhub, &bau_desc->distribution);
+		remotes++;
 	}
-	if (i == 0) {
+	if (remotes == 0) {
 		/*
-		 * no off_node flushing; return status for local node
+		 * No off_hub flushing; return status for local hub.
+		 * Return the caller's mask if all were local (the current
+		 * cpu may be in that mask).
 		 */
 		if (locals)
-			return flush_mask;
+			return cpumask;
 		else
 			return NULL;
 	}
-	__get_cpu_var(ptcstats).requestor++;
-	__get_cpu_var(ptcstats).ntargeted += i;
+	stat = &per_cpu(ptcstats, cpu);
+	stat->s_requestor++;
+	stat->s_ntargcpu += remotes;
+	remotes = bau_uvhub_weight(&bau_desc->distribution);
+	stat->s_ntarguvhub += remotes;
+	if (remotes >= 16)
+		stat->s_ntarguvhub16++;
+	else if (remotes >= 8)
+		stat->s_ntarguvhub8++;
+	else if (remotes >= 4)
+		stat->s_ntarguvhub4++;
+	else if (remotes >= 2)
+		stat->s_ntarguvhub2++;
+	else
+		stat->s_ntarguvhub1++;
 
 	bau_desc->payload.address = va;
 	bau_desc->payload.sending_cpu = cpu;
 
-	return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask);
+	/*
+	 * uv_flush_send_and_wait returns null if all cpu's were messaged, or
+	 * the adjusted flush_mask if any cpu's were not messaged.
+	 */
+	return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
 }
 
 /*
@@ -390,87 +790,70 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
  *
  * We received a broadcast assist message.
  *
- * Interrupts may have been disabled; this interrupt could represent
+ * Interrupts are disabled; this interrupt could represent
  * the receipt of several messages.
  *
- * All cores/threads on this node get this interrupt.
- * The last one to see it does the s/w ack.
+ * All cores/threads on this hub get this interrupt.
+ * The last one to see it does the software ack.
  * (the resource will not be freed until noninterruptable cpus see this
- *  interrupt; hardware will timeout the s/w ack and reply ERROR)
+ *  interrupt; hardware may timeout the s/w ack and reply ERROR)
  */
 void uv_bau_message_interrupt(struct pt_regs *regs)
 {
-	struct bau_payload_queue_entry *va_queue_first;
-	struct bau_payload_queue_entry *va_queue_last;
-	struct bau_payload_queue_entry *msg;
-	struct pt_regs *old_regs = set_irq_regs(regs);
-	cycles_t time1;
-	cycles_t time2;
-	int msg_slot;
-	int sw_ack_slot;
-	int fw;
 	int count = 0;
-	unsigned long local_pnode;
-
-	ack_APIC_irq();
-	exit_idle();
-	irq_enter();
-
-	time1 = get_cycles();
-
-	local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
-
-	va_queue_first = __get_cpu_var(bau_control).va_queue_first;
-	va_queue_last = __get_cpu_var(bau_control).va_queue_last;
-
-	msg = __get_cpu_var(bau_control).bau_msg_head;
+	cycles_t time_start;
+	struct bau_payload_queue_entry *msg;
+	struct bau_control *bcp;
+	struct ptc_stats *stat;
+	struct msg_desc msgdesc;
+
+	time_start = get_cycles();
+	bcp = &per_cpu(bau_control, smp_processor_id());
+	stat = &per_cpu(ptcstats, smp_processor_id());
+	msgdesc.va_queue_first = bcp->va_queue_first;
+	msgdesc.va_queue_last = bcp->va_queue_last;
+	msg = bcp->bau_msg_head;
 	while (msg->sw_ack_vector) {
 		count++;
-		fw = msg->sw_ack_vector;
-		msg_slot = msg - va_queue_first;
-		sw_ack_slot = ffs(fw) - 1;
-
-		uv_bau_process_message(msg, msg_slot, sw_ack_slot);
-
+		msgdesc.msg_slot = msg - msgdesc.va_queue_first;
+		msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
+		msgdesc.msg = msg;
+		uv_bau_process_message(&msgdesc, bcp);
 		msg++;
-		if (msg > va_queue_last)
-			msg = va_queue_first;
-		__get_cpu_var(bau_control).bau_msg_head = msg;
+		if (msg > msgdesc.va_queue_last)
+			msg = msgdesc.va_queue_first;
+		bcp->bau_msg_head = msg;
 	}
+	stat->d_time += (get_cycles() - time_start);
 	if (!count)
-		__get_cpu_var(ptcstats).nomsg++;
+		stat->d_nomsg++;
 	else if (count > 1)
-		__get_cpu_var(ptcstats).multmsg++;
-
-	time2 = get_cycles();
-	__get_cpu_var(ptcstats).dflush += (time2 - time1);
-
-	irq_exit();
-	set_irq_regs(old_regs);
+		stat->d_multmsg++;
+	ack_APIC_irq();
 }
 
 /*
  * uv_enable_timeouts
  *
- * Each target blade (i.e. blades that have cpu's) needs to have
+ * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have
  * shootdown message timeouts enabled.  The timeout does not cause
  * an interrupt, but causes an error message to be returned to
  * the sender.
  */
 static void uv_enable_timeouts(void)
 {
-	int blade;
-	int nblades;
+	int uvhub;
+	int nuvhubs;
 	int pnode;
 	unsigned long mmr_image;
 
-	nblades = uv_num_possible_blades();
+	nuvhubs = uv_num_possible_blades();
 
-	for (blade = 0; blade < nblades; blade++) {
-		if (!uv_blade_nr_possible_cpus(blade))
+	for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+		if (!uv_blade_nr_possible_cpus(uvhub))
 			continue;
 
-		pnode = uv_blade_to_pnode(blade);
+		pnode = uv_blade_to_pnode(uvhub);
 		mmr_image =
 		    uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
 		/*
@@ -523,9 +906,20 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
 {
 }
 
+static inline unsigned long long
+millisec_2_cycles(unsigned long millisec)
+{
+	unsigned long ns;
+	unsigned long long cyc;
+
+	ns = millisec * 1000;
+	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+	return cyc;
+}
+
 /*
- * Display the statistics thru /proc
- * data points to the cpu number
+ * Display the statistics thru /proc.
+ * 'data' points to the cpu number
  */
 static int uv_ptc_seq_show(struct seq_file *file, void *data)
 {
@@ -536,78 +930,155 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
 
 	if (!cpu) {
 		seq_printf(file,
-		"# cpu requestor requestee one all sretry dretry ptc_i ");
+			"# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
 		seq_printf(file,
-		"sw_ack sflush dflush sok dnomsg dmult starget\n");
+			"numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
+		seq_printf(file,
+			"retries rok resetp resett giveup sto bz throt ");
+		seq_printf(file,
+			"sw_ack recv rtime all ");
+		seq_printf(file,
+			"one mult none retry canc nocan reset rcan\n");
 	}
 	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
 		stat = &per_cpu(ptcstats, cpu);
-		seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ",
-			   cpu, stat->requestor,
-			   stat->requestee, stat->onetlb, stat->alltlb,
-			   stat->s_retry, stat->d_retry, stat->ptc_i);
-		seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
+		/* source side statistics */
+		seq_printf(file,
+			"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
+			   cpu, stat->s_requestor, cycles_2_us(stat->s_time),
+			   stat->s_ntarguvhub, stat->s_ntarguvhub16,
+			   stat->s_ntarguvhub8, stat->s_ntarguvhub4,
+			   stat->s_ntarguvhub2, stat->s_ntarguvhub1,
+			   stat->s_ntargcpu, stat->s_dtimeout);
+		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
+			   stat->s_retry_messages, stat->s_retriesok,
+			   stat->s_resets_plug, stat->s_resets_timeout,
+			   stat->s_giveup, stat->s_stimeout,
+			   stat->s_busy, stat->s_throttles);
+		/* destination side statistics */
+		seq_printf(file,
+			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
 			   uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
 					UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
-			   stat->sflush, stat->dflush,
-			   stat->retriesok, stat->nomsg,
-			   stat->multmsg, stat->ntargeted);
+			   stat->d_requestee, cycles_2_us(stat->d_time),
+			   stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
+			   stat->d_nomsg, stat->d_retries, stat->d_canceled,
+			   stat->d_nocanceled, stat->d_resets,
+			   stat->d_rcanceled);
 	}
 
 	return 0;
 }
 
 /*
+ * -1: resetf the statistics
  *  0: display meaning of the statistics
- * >0: retry limit
+ * >0: maximum concurrent active descriptors per uvhub (throttle)
  */
 static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
 				 size_t count, loff_t *data)
 {
-	long newmode;
+	int cpu;
+	long input_arg;
 	char optstr[64];
+	struct ptc_stats *stat;
+	struct bau_control *bcp;
 
 	if (count == 0 || count > sizeof(optstr))
 		return -EINVAL;
 	if (copy_from_user(optstr, user, count))
 		return -EFAULT;
 	optstr[count - 1] = '\0';
-	if (strict_strtoul(optstr, 10, &newmode) < 0) {
+	if (strict_strtol(optstr, 10, &input_arg) < 0) {
 		printk(KERN_DEBUG "%s is invalid\n", optstr);
 		return -EINVAL;
 	}
 
-	if (newmode == 0) {
+	if (input_arg == 0) {
 		printk(KERN_DEBUG "# cpu:      cpu number\n");
+		printk(KERN_DEBUG "Sender statistics:\n");
+		printk(KERN_DEBUG
+		"sent:     number of shootdown messages sent\n");
+		printk(KERN_DEBUG
+		"stime:    time spent sending messages\n");
+		printk(KERN_DEBUG
+		"numuvhubs: number of hubs targeted with shootdown\n");
+		printk(KERN_DEBUG
+		"numuvhubs16: number times 16 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs8: number times 8 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs4: number times 4 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs2: number times 2 or more hubs targeted\n");
+		printk(KERN_DEBUG
+		"numuvhubs1: number times 1 hub targeted\n");
+		printk(KERN_DEBUG
+		"numcpus:  number of cpus targeted with shootdown\n");
+		printk(KERN_DEBUG
+		"dto:      number of destination timeouts\n");
+		printk(KERN_DEBUG
+		"retries:  destination timeout retries sent\n");
+		printk(KERN_DEBUG
+		"rok:   :  destination timeouts successfully retried\n");
+		printk(KERN_DEBUG
+		"resetp:   ipi-style resource resets for plugs\n");
+		printk(KERN_DEBUG
+		"resett:   ipi-style resource resets for timeouts\n");
+		printk(KERN_DEBUG
+		"giveup:   fall-backs to ipi-style shootdowns\n");
+		printk(KERN_DEBUG
+		"sto:      number of source timeouts\n");
+		printk(KERN_DEBUG
+		"bz:       number of stay-busy's\n");
+		printk(KERN_DEBUG
+		"throt:    number times spun in throttle\n");
+		printk(KERN_DEBUG "Destination side statistics:\n");
 		printk(KERN_DEBUG
-		"requestor:  times this cpu was the flush requestor\n");
+		"sw_ack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
 		printk(KERN_DEBUG
-		"requestee:  times this cpu was requested to flush its TLBs\n");
+		"recv:     shootdown messages received\n");
 		printk(KERN_DEBUG
-		"one:        times requested to flush a single address\n");
+		"rtime:    time spent processing messages\n");
 		printk(KERN_DEBUG
-		"all:        times requested to flush all TLB's\n");
+		"all:      shootdown all-tlb messages\n");
 		printk(KERN_DEBUG
-		"sretry:     number of retries of source-side timeouts\n");
+		"one:      shootdown one-tlb messages\n");
 		printk(KERN_DEBUG
-		"dretry:     number of retries of destination-side timeouts\n");
+		"mult:     interrupts that found multiple messages\n");
 		printk(KERN_DEBUG
-		"ptc_i:      times UV fell through to IPI-style flushes\n");
+		"none:     interrupts that found no messages\n");
 		printk(KERN_DEBUG
-		"sw_ack:     image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
+		"retry:    number of retry messages processed\n");
 		printk(KERN_DEBUG
-		"sflush_us:  cycles spent in uv_flush_tlb_others()\n");
+		"canc:     number messages canceled by retries\n");
 		printk(KERN_DEBUG
-		"dflush_us:  cycles spent in handling flush requests\n");
-		printk(KERN_DEBUG "sok:        successes on retry\n");
-		printk(KERN_DEBUG "dnomsg:     interrupts with no message\n");
+		"nocan:    number retries that found nothing to cancel\n");
 		printk(KERN_DEBUG
-		"dmult:      interrupts with multiple messages\n");
-		printk(KERN_DEBUG "starget:    nodes targeted\n");
+		"reset:    number of ipi-style reset requests processed\n");
+		printk(KERN_DEBUG
+		"rcan:     number messages canceled by reset requests\n");
+	} else if (input_arg == -1) {
+		for_each_present_cpu(cpu) {
+			stat = &per_cpu(ptcstats, cpu);
+			memset(stat, 0, sizeof(struct ptc_stats));
+		}
 	} else {
-		uv_bau_retry_limit = newmode;
-		printk(KERN_DEBUG "timeout retry limit:%d\n",
-		       uv_bau_retry_limit);
+		uv_bau_max_concurrent = input_arg;
+		bcp = &per_cpu(bau_control, smp_processor_id());
+		if (uv_bau_max_concurrent < 1 ||
+		    uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
+			printk(KERN_DEBUG
+				"Error: BAU max concurrent %d; %d is invalid\n",
+				bcp->max_concurrent, uv_bau_max_concurrent);
+			return -EINVAL;
+		}
+		printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
+		       uv_bau_max_concurrent);
+		for_each_present_cpu(cpu) {
+			bcp = &per_cpu(bau_control, cpu);
+			bcp->max_concurrent = uv_bau_max_concurrent;
+		}
 	}
 
 	return count;
@@ -650,80 +1121,31 @@ static int __init uv_ptc_init(void)
 	return 0;
 }
 
-/*
- * begin the initialization of the per-blade control structures
- */
-static struct bau_control * __init uv_table_bases_init(int blade, int node)
-{
-	int i;
-	struct bau_msg_status *msp;
-	struct bau_control *bau_tabp;
-
-	bau_tabp =
-	    kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
-	BUG_ON(!bau_tabp);
-
-	bau_tabp->msg_statuses =
-	    kmalloc_node(sizeof(struct bau_msg_status) *
-			 DEST_Q_SIZE, GFP_KERNEL, node);
-	BUG_ON(!bau_tabp->msg_statuses);
-
-	for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
-		bau_cpubits_clear(&msp->seen_by, (int)
-				  uv_blade_nr_possible_cpus(blade));
-
-	uv_bau_table_bases[blade] = bau_tabp;
-
-	return bau_tabp;
-}
-
-/*
- * finish the initialization of the per-blade control structures
- */
-static void __init
-uv_table_bases_finish(int blade,
-		      struct bau_control *bau_tablesp,
-		      struct bau_desc *adp)
-{
-	struct bau_control *bcp;
-	int cpu;
-
-	for_each_present_cpu(cpu) {
-		if (blade != uv_cpu_to_blade_id(cpu))
-			continue;
-
-		bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
-		bcp->bau_msg_head	= bau_tablesp->va_queue_first;
-		bcp->va_queue_first	= bau_tablesp->va_queue_first;
-		bcp->va_queue_last	= bau_tablesp->va_queue_last;
-		bcp->msg_statuses	= bau_tablesp->msg_statuses;
-		bcp->descriptor_base	= adp;
-	}
-}
-
 /*
  * initialize the sending side's sending buffers
  */
-static struct bau_desc * __init
+static void
 uv_activation_descriptor_init(int node, int pnode)
 {
 	int i;
+	int cpu;
 	unsigned long pa;
 	unsigned long m;
 	unsigned long n;
-	struct bau_desc *adp;
-	struct bau_desc *ad2;
+	struct bau_desc *bau_desc;
+	struct bau_desc *bd2;
+	struct bau_control *bcp;
 
 	/*
 	 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
-	 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
+	 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
 	 */
-	adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
+	bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
 		UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
-	BUG_ON(!adp);
+	BUG_ON(!bau_desc);
 
-	pa = uv_gpa(adp); /* need the real nasid*/
-	n = uv_gpa_to_pnode(pa);
+	pa = uv_gpa(bau_desc); /* need the real nasid*/
+	n = pa >> uv_nshift;
 	m = pa & uv_mmask;
 
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -732,96 +1154,188 @@ uv_activation_descriptor_init(int node, int pnode)
 	/*
 	 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
 	 * cpu even though we only use the first one; one descriptor can
-	 * describe a broadcast to 256 nodes.
+	 * describe a broadcast to 256 uv hubs.
 	 */
-	for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
-		i++, ad2++) {
-		memset(ad2, 0, sizeof(struct bau_desc));
-		ad2->header.sw_ack_flag = 1;
+	for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
+		i++, bd2++) {
+		memset(bd2, 0, sizeof(struct bau_desc));
+		bd2->header.sw_ack_flag = 1;
 		/*
-		 * base_dest_nodeid is the first node in the partition, so
-		 * the bit map will indicate partition-relative node numbers.
-		 * note that base_dest_nodeid is actually a nasid.
+		 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
+		 * in the partition. The bit map will indicate uvhub numbers,
+		 * which are 0-N in a partition. Pnodes are unique system-wide.
 		 */
-		ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
-		ad2->header.dest_subnodeid = 0x10; /* the LB */
-		ad2->header.command = UV_NET_ENDPOINT_INTD;
-		ad2->header.int_both = 1;
+		bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
+		bd2->header.dest_subnodeid = 0x10; /* the LB */
+		bd2->header.command = UV_NET_ENDPOINT_INTD;
+		bd2->header.int_both = 1;
 		/*
 		 * all others need to be set to zero:
 		 *   fairness chaining multilevel count replied_to
 		 */
 	}
-	return adp;
+	for_each_present_cpu(cpu) {
+		if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
+			continue;
+		bcp = &per_cpu(bau_control, cpu);
+		bcp->descriptor_base = bau_desc;
+	}
 }
 
 /*
  * initialize the destination side's receiving buffers
+ * entered for each uvhub in the partition
+ * - node is first node (kernel memory notion) on the uvhub
+ * - pnode is the uvhub's physical identifier
  */
-static struct bau_payload_queue_entry * __init
-uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
+static void
+uv_payload_queue_init(int node, int pnode)
 {
-	struct bau_payload_queue_entry *pqp;
-	unsigned long pa;
 	int pn;
+	int cpu;
 	char *cp;
+	unsigned long pa;
+	struct bau_payload_queue_entry *pqp;
+	struct bau_payload_queue_entry *pqp_malloc;
+	struct bau_control *bcp;
 
 	pqp = (struct bau_payload_queue_entry *) kmalloc_node(
 		(DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
 		GFP_KERNEL, node);
 	BUG_ON(!pqp);
+	pqp_malloc = pqp;
 
 	cp = (char *)pqp + 31;
 	pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
-	bau_tablesp->va_queue_first = pqp;
+
+	for_each_present_cpu(cpu) {
+		if (pnode != uv_cpu_to_pnode(cpu))
+			continue;
+		/* for every cpu on this pnode: */
+		bcp = &per_cpu(bau_control, cpu);
+		bcp->va_queue_first = pqp;
+		bcp->bau_msg_head = pqp;
+		bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
+	}
 	/*
 	 * need the pnode of where the memory was really allocated
 	 */
 	pa = uv_gpa(pqp);
-	pn = uv_gpa_to_pnode(pa);
+	pn = pa >> uv_nshift;
 	uv_write_global_mmr64(pnode,
 			      UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
 			      ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
 			      uv_physnodeaddr(pqp));
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
 			      uv_physnodeaddr(pqp));
-	bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
 	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
 			      (unsigned long)
-			      uv_physnodeaddr(bau_tablesp->va_queue_last));
+			      uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
+	/* in effect, all msg_type's are set to MSG_NOOP */
 	memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
-
-	return pqp;
 }
 
 /*
- * Initialization of each UV blade's structures
+ * Initialization of each UV hub's structures
  */
-static int __init uv_init_blade(int blade)
+static void __init uv_init_uvhub(int uvhub, int vector)
 {
 	int node;
 	int pnode;
-	unsigned long pa;
 	unsigned long apicid;
-	struct bau_desc *adp;
-	struct bau_payload_queue_entry *pqp;
-	struct bau_control *bau_tablesp;
-
-	node = blade_to_first_node(blade);
-	bau_tablesp = uv_table_bases_init(blade, node);
-	pnode = uv_blade_to_pnode(blade);
-	adp = uv_activation_descriptor_init(node, pnode);
-	pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
-	uv_table_bases_finish(blade, bau_tablesp, adp);
+
+	node = uvhub_to_first_node(uvhub);
+	pnode = uv_blade_to_pnode(uvhub);
+	uv_activation_descriptor_init(node, pnode);
+	uv_payload_queue_init(node, pnode);
 	/*
 	 * the below initialization can't be in firmware because the
 	 * messaging IRQ will be determined by the OS
 	 */
-	apicid = blade_to_first_apicid(blade);
-	pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
+	apicid = uvhub_to_first_apicid(uvhub);
 	uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
-				      ((apicid << 32) | UV_BAU_MESSAGE));
-	return 0;
+				      ((apicid << 32) | vector));
+}
+
+/*
+ * initialize the bau_control structure for each cpu
+ */
+static void uv_init_per_cpu(int nuvhubs)
+{
+	int i, j, k;
+	int cpu;
+	int pnode;
+	int uvhub;
+	short socket = 0;
+	struct bau_control *bcp;
+	struct uvhub_desc *bdp;
+	struct socket_desc *sdp;
+	struct bau_control *hmaster = NULL;
+	struct bau_control *smaster = NULL;
+	struct socket_desc {
+		short num_cpus;
+		short cpu_number[16];
+	};
+	struct uvhub_desc {
+		short num_sockets;
+		short num_cpus;
+		short uvhub;
+		short pnode;
+		struct socket_desc socket[2];
+	};
+	struct uvhub_desc *uvhub_descs;
+
+	uvhub_descs = (struct uvhub_desc *)
+		kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
+	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+	for_each_present_cpu(cpu) {
+		bcp = &per_cpu(bau_control, cpu);
+		memset(bcp, 0, sizeof(struct bau_control));
+		spin_lock_init(&bcp->masks_lock);
+		bcp->max_concurrent = uv_bau_max_concurrent;
+		pnode = uv_cpu_hub_info(cpu)->pnode;
+		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+		bdp = &uvhub_descs[uvhub];
+		bdp->num_cpus++;
+		bdp->uvhub = uvhub;
+		bdp->pnode = pnode;
+		/* time interval to catch a hardware stay-busy bug */
+		bcp->timeout_interval = millisec_2_cycles(3);
+		/* kludge: assume uv_hub.h is constant */
+		socket = (cpu_physical_id(cpu)>>5)&1;
+		if (socket >= bdp->num_sockets)
+			bdp->num_sockets = socket+1;
+		sdp = &bdp->socket[socket];
+		sdp->cpu_number[sdp->num_cpus] = cpu;
+		sdp->num_cpus++;
+	}
+	socket = 0;
+	for_each_possible_blade(uvhub) {
+		bdp = &uvhub_descs[uvhub];
+		for (i = 0; i < bdp->num_sockets; i++) {
+			sdp = &bdp->socket[i];
+			for (j = 0; j < sdp->num_cpus; j++) {
+				cpu = sdp->cpu_number[j];
+				bcp = &per_cpu(bau_control, cpu);
+				bcp->cpu = cpu;
+				if (j == 0) {
+					smaster = bcp;
+					if (i == 0)
+						hmaster = bcp;
+				}
+				bcp->cpus_in_uvhub = bdp->num_cpus;
+				bcp->cpus_in_socket = sdp->num_cpus;
+				bcp->socket_master = smaster;
+				bcp->uvhub_master = hmaster;
+				for (k = 0; k < DEST_Q_SIZE; k++)
+					bcp->socket_acknowledge_count[k] = 0;
+				bcp->uvhub_cpu =
+				  uv_cpu_hub_info(cpu)->blade_processor_id;
+			}
+			socket++;
+		}
+	}
+	kfree(uvhub_descs);
 }
 
 /*
@@ -829,38 +1343,54 @@ static int __init uv_init_blade(int blade)
  */
 static int __init uv_bau_init(void)
 {
-	int blade;
-	int nblades;
+	int uvhub;
+	int pnode;
+	int nuvhubs;
 	int cur_cpu;
+	int vector;
+	unsigned long mmr;
 
 	if (!is_uv_system())
 		return 0;
 
+	if (nobau)
+		return 0;
+
 	for_each_possible_cpu(cur_cpu)
 		zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
 				       GFP_KERNEL, cpu_to_node(cur_cpu));
 
-	uv_bau_retry_limit = 1;
+	uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
+	uv_nshift = uv_hub_info->m_val;
 	uv_mmask = (1UL << uv_hub_info->m_val) - 1;
-	nblades = uv_num_possible_blades();
+	nuvhubs = uv_num_possible_blades();
 
-	uv_bau_table_bases = (struct bau_control **)
-	    kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
-	BUG_ON(!uv_bau_table_bases);
+	uv_init_per_cpu(nuvhubs);
 
 	uv_partition_base_pnode = 0x7fffffff;
-	for (blade = 0; blade < nblades; blade++)
-		if (uv_blade_nr_possible_cpus(blade) &&
-			(uv_blade_to_pnode(blade) < uv_partition_base_pnode))
-			uv_partition_base_pnode = uv_blade_to_pnode(blade);
-	for (blade = 0; blade < nblades; blade++)
-		if (uv_blade_nr_possible_cpus(blade))
-			uv_init_blade(blade);
-
-	alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
+	for (uvhub = 0; uvhub < nuvhubs; uvhub++)
+		if (uv_blade_nr_possible_cpus(uvhub) &&
+			(uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
+			uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
+
+	vector = UV_BAU_MESSAGE;
+	for_each_possible_blade(uvhub)
+		if (uv_blade_nr_possible_cpus(uvhub))
+			uv_init_uvhub(uvhub, vector);
+
 	uv_enable_timeouts();
+	alloc_intr_gate(vector, uv_bau_message_intr1);
+
+	for_each_possible_blade(uvhub) {
+		pnode = uv_blade_to_pnode(uvhub);
+		/* INIT the bau */
+		uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
+				      ((unsigned long)1 << 63));
+		mmr = 1; /* should be 1 to broadcast to both sockets */
+		uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr);
+	}
 
 	return 0;
 }
-__initcall(uv_bau_init);
-__initcall(uv_ptc_init);
+core_initcall(uv_bau_init);
+core_initcall(uv_ptc_init);
-- 
cgit v1.2.3


From fcd1498405c2c88ac632e7c3c3fce3213d9196db Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 14 Apr 2010 19:11:29 +0200
Subject: perf tools: Fix accidentally preprocessed snprintf callback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

struct sort_entry has a callback named snprintf that turns an
entry into a string result.
But there are glibc versions that implement snprintf through a
macro. The following expression is then going to get the snprintf
call preprocessed:

        ent->snprintf(...)

to finally end up in a build error:

        util/hist.c: Dans la fonction «hist_entry__snprintf» :
        util/hist.c:539: erreur: «struct sort_entry» has no member named «__builtin___snprintf_chk»

To fix this, prepend struct sort_entry callbacks with an "se_"
prefix.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/hist.c | 28 ++++++++++++++--------------
 tools/perf/util/sort.c | 42 +++++++++++++++++++++---------------------
 tools/perf/util/sort.h | 12 ++++++------
 3 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 18cf8b32160..9c2b8743cef 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -68,7 +68,7 @@ hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
 	int64_t cmp = 0;
 
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		cmp = se->cmp(left, right);
+		cmp = se->se_cmp(left, right);
 		if (cmp)
 			break;
 	}
@@ -85,7 +85,7 @@ hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		int64_t (*f)(struct hist_entry *, struct hist_entry *);
 
-		f = se->collapse ?: se->cmp;
+		f = se->se_collapse ?: se->se_cmp;
 
 		cmp = f(left, right);
 		if (cmp)
@@ -536,8 +536,8 @@ int hist_entry__snprintf(struct hist_entry *self,
 			continue;
 
 		ret += snprintf(s + ret, size - ret, "%s", sep ?: "  ");
-		ret += se->snprintf(self, s + ret, size - ret,
-				    se->width ? *se->width : 0);
+		ret += se->se_snprintf(self, s + ret, size - ret,
+				       se->se_width ? *se->se_width : 0);
 	}
 
 	return ret;
@@ -564,7 +564,7 @@ static size_t hist_entry__fprintf_callchain(struct hist_entry *self, FILE *fp,
 	if (sort__first_dimension == SORT_COMM) {
 		struct sort_entry *se = list_first_entry(&hist_entry__sort_list,
 							 typeof(*se), list);
-		left_margin = se->width ? *se->width : 0;
+		left_margin = se->se_width ? *se->se_width : 0;
 		left_margin -= thread__comm_len(self->thread);
 	}
 
@@ -615,22 +615,22 @@ size_t perf_session__fprintf_hists(struct rb_root *hists,
 		if (se->elide)
 			continue;
 		if (sep) {
-			fprintf(fp, "%c%s", *sep, se->header);
+			fprintf(fp, "%c%s", *sep, se->se_header);
 			continue;
 		}
-		width = strlen(se->header);
-		if (se->width) {
+		width = strlen(se->se_header);
+		if (se->se_width) {
 			if (symbol_conf.col_width_list_str) {
 				if (col_width) {
-					*se->width = atoi(col_width);
+					*se->se_width = atoi(col_width);
 					col_width = strchr(col_width, ',');
 					if (col_width)
 						++col_width;
 				}
 			}
-			width = *se->width = max(*se->width, width);
+			width = *se->se_width = max(*se->se_width, width);
 		}
-		fprintf(fp, "  %*s", width, se->header);
+		fprintf(fp, "  %*s", width, se->se_header);
 	}
 	fprintf(fp, "\n");
 
@@ -652,10 +652,10 @@ size_t perf_session__fprintf_hists(struct rb_root *hists,
 			continue;
 
 		fprintf(fp, "  ");
-		if (se->width)
-			width = *se->width;
+		if (se->se_width)
+			width = *se->se_width;
 		else
-			width = strlen(se->header);
+			width = strlen(se->se_header);
 		for (i = 0; i < width; i++)
 			fprintf(fp, ".");
 	}
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 9d24d4b2c8f..da30b305fba 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -30,38 +30,38 @@ static int hist_entry__parent_snprintf(struct hist_entry *self, char *bf,
 				       size_t size, unsigned int width);
 
 struct sort_entry sort_thread = {
-	.header = "Command:  Pid",
-	.cmp	= sort__thread_cmp,
-	.snprintf = hist_entry__thread_snprintf,
-	.width	= &threads__col_width,
+	.se_header	= "Command:  Pid",
+	.se_cmp		= sort__thread_cmp,
+	.se_snprintf	= hist_entry__thread_snprintf,
+	.se_width	= &threads__col_width,
 };
 
 struct sort_entry sort_comm = {
-	.header		= "Command",
-	.cmp		= sort__comm_cmp,
-	.collapse	= sort__comm_collapse,
-	.snprintf	= hist_entry__comm_snprintf,
-	.width		= &comms__col_width,
+	.se_header	= "Command",
+	.se_cmp		= sort__comm_cmp,
+	.se_collapse	= sort__comm_collapse,
+	.se_snprintf	= hist_entry__comm_snprintf,
+	.se_width	= &comms__col_width,
 };
 
 struct sort_entry sort_dso = {
-	.header = "Shared Object",
-	.cmp	= sort__dso_cmp,
-	.snprintf = hist_entry__dso_snprintf,
-	.width	= &dsos__col_width,
+	.se_header	= "Shared Object",
+	.se_cmp		= sort__dso_cmp,
+	.se_snprintf	= hist_entry__dso_snprintf,
+	.se_width	= &dsos__col_width,
 };
 
 struct sort_entry sort_sym = {
-	.header = "Symbol",
-	.cmp	= sort__sym_cmp,
-	.snprintf = hist_entry__sym_snprintf,
+	.se_header	= "Symbol",
+	.se_cmp		= sort__sym_cmp,
+	.se_snprintf	= hist_entry__sym_snprintf,
 };
 
 struct sort_entry sort_parent = {
-	.header = "Parent symbol",
-	.cmp	= sort__parent_cmp,
-	.snprintf = hist_entry__parent_snprintf,
-	.width	= &parent_symbol__col_width,
+	.se_header	= "Parent symbol",
+	.se_cmp		= sort__parent_cmp,
+	.se_snprintf	= hist_entry__parent_snprintf,
+	.se_width	= &parent_symbol__col_width,
 };
 
 struct sort_dimension {
@@ -255,7 +255,7 @@ int sort_dimension__add(const char *tok)
 		if (strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
 
-		if (sd->entry->collapse)
+		if (sd->entry->se_collapse)
 			sort__need_collapse = 1;
 
 		if (sd->entry == &sort_parent) {
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 6d7b4be7060..1d857aa2c01 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -78,13 +78,13 @@ enum sort_type {
 struct sort_entry {
 	struct list_head list;
 
-	const char *header;
+	const char *se_header;
 
-	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
-	int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
-	int	(*snprintf)(struct hist_entry *self, char *bf, size_t size,
-			    unsigned int width);
-	unsigned int *width;
+	int64_t (*se_cmp)(struct hist_entry *, struct hist_entry *);
+	int64_t (*se_collapse)(struct hist_entry *, struct hist_entry *);
+	int	(*se_snprintf)(struct hist_entry *self, char *bf, size_t size,
+			       unsigned int width);
+	unsigned int *se_width;
 	bool	elide;
 };
 
-- 
cgit v1.2.3


From 48481938b02471d505296d7557ed296eb093d496 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 12 Apr 2010 13:16:53 -0400
Subject: perf probe: Support argument name

Set given names to event arguments. The syntax is same as kprobe-tracer,
you can add 'NAME=' right before each argument.

e.g.
  ./perf probe vfs_read foo=file

 then, 'foo' is set to the argument name as below.

  ./perf probe -l
  probe:vfs_read       (on vfs_read@linux-2.6-tip/fs/read_write.c with foo)

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100412171653.3790.74624.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-probe.txt | 10 +++++++++-
 tools/perf/builtin-probe.c              |  4 ++--
 tools/perf/util/probe-event.c           | 34 +++++++++++++++++++++++----------
 tools/perf/util/probe-event.h           |  1 +
 tools/perf/util/probe-finder.c          | 27 +++++++++++++++-----------
 5 files changed, 52 insertions(+), 24 deletions(-)

diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index bb671b34677..e36ed4dd386 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -79,7 +79,15 @@ Probe points are defined by following syntax.
 'EVENT' specifies the name of new event, if omitted, it will be set the name of the probed function. Currently, event group name is set as 'probe'.
 'FUNC' specifies a probed function name, and it may have one of the following options; '+OFFS' is the offset from function entry address in bytes, ':RLN' is the relative-line number from function entry line, and '%return' means that it probes function return. And ';PTN' means lazy matching pattern (see LAZY MATCHING). Note that ';PTN' must be the end of the probe point definition.  In addition, '@SRC' specifies a source file which has that function.
 It is also possible to specify a probe point by the source line number or lazy matching by using 'SRC:ALN' or 'SRC;PTN' syntax, where 'SRC' is the source file path, ':ALN' is the line number and ';PTN' is the lazy matching pattern.
-'ARG' specifies the arguments of this probe point. You can use the name of local variable, or kprobe-tracer argument format (e.g. $retval, %ax, etc).
+'ARG' specifies the arguments of this probe point, (see PROBE ARGUMENT).
+
+PROBE ARGUMENT
+--------------
+Each probe argument follows below syntax.
+
+ [NAME=]LOCALVAR|$retval|%REG|@SYMBOL
+
+'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), or kprobe-tracer argument format (e.g. $retval, %ax, etc).
 
 LINE SYNTAX
 -----------
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index bfc47fff9c5..daf4668d2de 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -142,9 +142,9 @@ static const struct option options[] = {
 	OPT_CALLBACK('a', "add", NULL,
 #ifdef DWARF_SUPPORT
 		"[EVENT=]FUNC[@SRC][+OFF|%return|:RL|;PT]|SRC:AL|SRC;PT"
-		" [ARG ...]",
+		" [[NAME=]ARG ...]",
 #else
-		"[EVENT=]FUNC[+OFF|%return] [ARG ...]",
+		"[EVENT=]FUNC[+OFF|%return] [[NAME=]ARG ...]",
 #endif
 		"probe point definition, where\n"
 		"\t\tGROUP:\tGroup name (optional)\n"
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 3fc0be741b8..ab6f53deaba 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -437,22 +437,28 @@ static void parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 /* Parse perf-probe event argument */
 static void parse_perf_probe_arg(const char *str, struct perf_probe_arg *arg)
 {
-	const char *tmp;
+	char *tmp;
 	struct perf_probe_arg_field **fieldp;
 
 	pr_debug("parsing arg: %s into ", str);
 
+	tmp = strchr(str, '=');
+	if (tmp) {
+		arg->name = xstrndup(str, tmp - str);
+		str = tmp + 1;
+	}
+
 	tmp = strpbrk(str, "-.");
 	if (!is_c_varname(str) || !tmp) {
 		/* A variable, register, symbol or special value */
-		arg->name = xstrdup(str);
-		pr_debug("%s\n", arg->name);
+		arg->var = xstrdup(str);
+		pr_debug("%s\n", arg->var);
 		return;
 	}
 
 	/* Structure fields */
-	arg->name = xstrndup(str, tmp - str);
-	pr_debug("%s, ", arg->name);
+	arg->var = xstrndup(str, tmp - str);
+	pr_debug("%s, ", arg->var);
 	fieldp = &arg->field;
 
 	do {
@@ -497,7 +503,7 @@ void parse_perf_probe_command(const char *cmd, struct perf_probe_event *pev)
 	pev->args = xzalloc(sizeof(struct perf_probe_arg) * pev->nargs);
 	for (i = 0; i < pev->nargs; i++) {
 		parse_perf_probe_arg(argv[i + 1], &pev->args[i]);
-		if (is_c_varname(pev->args[i].name) && pev->point.retprobe)
+		if (is_c_varname(pev->args[i].var) && pev->point.retprobe)
 			semantic_error("You can't specify local variable for"
 				       " kretprobe");
 	}
@@ -514,7 +520,7 @@ bool perf_probe_event_need_dwarf(struct perf_probe_event *pev)
 		return true;
 
 	for (i = 0; i < pev->nargs; i++)
-		if (is_c_varname(pev->args[i].name))
+		if (is_c_varname(pev->args[i].var))
 			return true;
 
 	return false;
@@ -575,7 +581,10 @@ int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf, size_t len)
 	int ret;
 	char *tmp = buf;
 
-	ret = e_snprintf(tmp, len, "%s", pa->name);
+	if (pa->name && pa->var)
+		ret = e_snprintf(tmp, len, "%s=%s", pa->name, pa->var);
+	else
+		ret = e_snprintf(tmp, len, "%s", pa->name ? pa->name : pa->var);
 	if (ret <= 0)
 		goto error;
 	tmp += ret;
@@ -803,6 +812,8 @@ void clear_perf_probe_event(struct perf_probe_event *pev)
 	for (i = 0; i < pev->nargs; i++) {
 		if (pev->args[i].name)
 			free(pev->args[i].name);
+		if (pev->args[i].var)
+			free(pev->args[i].var);
 		field = pev->args[i].field;
 		while (field) {
 			next = field->next;
@@ -1117,8 +1128,11 @@ static int convert_to_kprobe_trace_events(struct perf_probe_event *pev,
 	if (tev->nargs) {
 		tev->args = xzalloc(sizeof(struct kprobe_trace_arg)
 				    * tev->nargs);
-		for (i = 0; i < tev->nargs; i++)
-			tev->args[i].value = xstrdup(pev->args[i].name);
+		for (i = 0; i < tev->nargs; i++) {
+			if (pev->args[i].name)
+				tev->args[i].name = xstrdup(pev->args[i].name);
+			tev->args[i].value = xstrdup(pev->args[i].var);
+		}
 	}
 
 	/* Currently just checking function name from symbol map */
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 9d99fc24c4f..10411f59632 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -55,6 +55,7 @@ struct perf_probe_arg_field {
 /* Perf probe probing argument */
 struct perf_probe_arg {
 	char				*name;	/* Argument name */
+	char				*var;	/* Variable name */
 	struct perf_probe_arg_field	*field;	/* Structure fields */
 };
 
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index a8513772df0..105e95c95ee 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -484,35 +484,40 @@ static void convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
 	convert_location(expr, pf);
 
 	if (pf->pvar->field)
-		convert_variable_fields(vr_die, pf->pvar->name,
+		convert_variable_fields(vr_die, pf->pvar->var,
 					pf->pvar->field, &pf->tvar->ref);
 	/* *expr will be cached in libdw. Don't free it. */
 	return ;
 error:
 	/* TODO: Support const_value */
 	die("Failed to find the location of %s at this address.\n"
-	    " Perhaps, it has been optimized out.", pf->pvar->name);
+	    " Perhaps, it has been optimized out.", pf->pvar->var);
 }
 
 /* Find a variable in a subprogram die */
 static void find_variable(Dwarf_Die *sp_die, struct probe_finder *pf)
 {
 	Dwarf_Die vr_die;
-	char buf[128];
+	char buf[32];
 
-	/* TODO: Support struct members and arrays */
-	if (!is_c_varname(pf->pvar->name)) {
+	/* TODO: Support arrays */
+	if (pf->pvar->name)
+		pf->tvar->name = xstrdup(pf->pvar->name);
+	else {
+		synthesize_perf_probe_arg(pf->pvar, buf, 32);
+		pf->tvar->name = xstrdup(buf);
+	}
+
+	if (!is_c_varname(pf->pvar->var)) {
 		/* Copy raw parameters */
-		pf->tvar->value = xstrdup(pf->pvar->name);
+		pf->tvar->value = xstrdup(pf->pvar->var);
 	} else {
-		synthesize_perf_probe_arg(pf->pvar, buf, 128);
-		pf->tvar->name = xstrdup(buf);
 		pr_debug("Searching '%s' variable in context.\n",
-			 pf->pvar->name);
+			 pf->pvar->var);
 		/* Search child die for local variables and parameters. */
-		if (!die_find_variable(sp_die, pf->pvar->name, &vr_die))
+		if (!die_find_variable(sp_die, pf->pvar->var, &vr_die))
 			die("Failed to find '%s' in this function.",
-			    pf->pvar->name);
+			    pf->pvar->var);
 		convert_variable(&vr_die, pf);
 	}
 }
-- 
cgit v1.2.3


From df0faf4be02996135bc3a06b4f34360449c78084 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 12 Apr 2010 13:17:00 -0400
Subject: perf probe: Use the last field name as the argument name

Set the last field name to the argument name when the argument
is refering a data-structure member.

e.g.
 ./perf probe --add 'vfs_read file->f_mode'
 Add new event:
   probe:vfs_read       (on vfs_read with f_mode=file->f_mode)

 This probe records file->f_mode, but the argument name becomes "f_mode".

This enables perf-trace command to parse trace event format correctly.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100412171700.3790.72961.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-probe.txt | 2 +-
 tools/perf/util/probe-event.c           | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index e36ed4dd386..441324f2615 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -87,7 +87,7 @@ Each probe argument follows below syntax.
 
  [NAME=]LOCALVAR|$retval|%REG|@SYMBOL
 
-'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), or kprobe-tracer argument format (e.g. $retval, %ax, etc).
+'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), or kprobe-tracer argument format (e.g. $retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. field2 for 'var->field1.field2'.)
 
 LINE SYNTAX
 -----------
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index ab6f53deaba..19de8b77973 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -481,6 +481,10 @@ static void parse_perf_probe_arg(const char *str, struct perf_probe_arg *arg)
 	} while (tmp);
 	(*fieldp)->name = xstrdup(str);
 	pr_debug("%s(%d)\n", (*fieldp)->name, (*fieldp)->ref);
+
+	/* If no name is specified, set the last field name */
+	if (!arg->name)
+		arg->name = xstrdup((*fieldp)->name);
 }
 
 /* Parse perf-probe event command */
-- 
cgit v1.2.3


From 93ccae7a2227466a0d071fe52c51319f2f34c365 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 12 Apr 2010 13:17:08 -0400
Subject: tracing/kprobes: Support basic types on dynamic events

Support basic types of integer (u8, u16, u32, u64, s8, s16, s32, s64) in
kprobe tracer. With this patch, users can specify above basic types on
each arguments after ':'. If omitted, the argument type is set as
unsigned long (u32 or u64, arch-dependent).

 e.g.
  echo 'p account_system_time+0 hardirq_offset=%si:s32' > kprobe_events

  adds a probe recording hardirq_offset in signed-32bits value on the
  entry of account_system_time.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100412171708.3790.18599.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 Documentation/trace/kprobetrace.txt |   4 +-
 kernel/trace/trace.h                |  16 +-
 kernel/trace/trace_kprobe.c         | 535 ++++++++++++++++++++++--------------
 3 files changed, 334 insertions(+), 221 deletions(-)

diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index a9100b28eb8..ec94748ae65 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -40,7 +40,9 @@ Synopsis of kprobe_events
   $stack	: Fetch stack address.
   $retval	: Fetch return value.(*)
   +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
-  NAME=FETCHARG: Set NAME as the argument name of FETCHARG.
+  NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
+  FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
+		  (u8/u16/u32/u64/s8/s16/s32/s64) are supported.
 
   (*) only for return probe.
   (**) this is useful for fetching a field of data structures.
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index bec2c973ff0..3ebdb6bd236 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -102,29 +102,17 @@ struct syscall_trace_exit {
 	long			ret;
 };
 
-struct kprobe_trace_entry {
+struct kprobe_trace_entry_head {
 	struct trace_entry	ent;
 	unsigned long		ip;
-	int			nargs;
-	unsigned long		args[];
 };
 
-#define SIZEOF_KPROBE_TRACE_ENTRY(n)			\
-	(offsetof(struct kprobe_trace_entry, args) +	\
-	(sizeof(unsigned long) * (n)))
-
-struct kretprobe_trace_entry {
+struct kretprobe_trace_entry_head {
 	struct trace_entry	ent;
 	unsigned long		func;
 	unsigned long		ret_ip;
-	int			nargs;
-	unsigned long		args[];
 };
 
-#define SIZEOF_KRETPROBE_TRACE_ENTRY(n)			\
-	(offsetof(struct kretprobe_trace_entry, args) +	\
-	(sizeof(unsigned long) * (n)))
-
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1251e367bae..a7514326052 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -29,6 +29,8 @@
 #include <linux/ctype.h>
 #include <linux/ptrace.h>
 #include <linux/perf_event.h>
+#include <linux/stringify.h>
+#include <asm/bitsperlong.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -40,7 +42,6 @@
 
 /* Reserved field names */
 #define FIELD_STRING_IP "__probe_ip"
-#define FIELD_STRING_NARGS "__probe_nargs"
 #define FIELD_STRING_RETIP "__probe_ret_ip"
 #define FIELD_STRING_FUNC "__probe_func"
 
@@ -52,56 +53,102 @@ const char *reserved_field_names[] = {
 	"common_tgid",
 	"common_lock_depth",
 	FIELD_STRING_IP,
-	FIELD_STRING_NARGS,
 	FIELD_STRING_RETIP,
 	FIELD_STRING_FUNC,
 };
 
-struct fetch_func {
-	unsigned long (*func)(struct pt_regs *, void *);
+/* Printing function type */
+typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *);
+#define PRINT_TYPE_FUNC_NAME(type)	print_type_##type
+#define PRINT_TYPE_FMT_NAME(type)	print_type_format_##type
+
+/* Printing  in basic type function template */
+#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast)			\
+static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s,	\
+						const char *name, void *data)\
+{									\
+	return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
+}									\
+static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
+
+DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
+
+/* Data fetch function type */
+typedef	void (*fetch_func_t)(struct pt_regs *, void *, void *);
+
+struct fetch_param {
+	fetch_func_t	fn;
 	void *data;
 };
 
-static __kprobes unsigned long call_fetch(struct fetch_func *f,
-					  struct pt_regs *regs)
+static __kprobes void call_fetch(struct fetch_param *fprm,
+				 struct pt_regs *regs, void *dest)
 {
-	return f->func(regs, f->data);
+	return fprm->fn(regs, fprm->data, dest);
 }
 
-/* fetch handlers */
-static __kprobes unsigned long fetch_register(struct pt_regs *regs,
-					      void *offset)
-{
-	return regs_get_register(regs, (unsigned int)((unsigned long)offset));
+#define FETCH_FUNC_NAME(kind, type)	fetch_##kind##_##type
+/*
+ * Define macro for basic types - we don't need to define s* types, because
+ * we have to care only about bitwidth at recording time.
+ */
+#define DEFINE_BASIC_FETCH_FUNCS(kind)  \
+DEFINE_FETCH_##kind(u8)			\
+DEFINE_FETCH_##kind(u16)		\
+DEFINE_FETCH_##kind(u32)		\
+DEFINE_FETCH_##kind(u64)
+
+#define CHECK_BASIC_FETCH_FUNCS(kind, fn)	\
+	((FETCH_FUNC_NAME(kind, u8) == fn) ||	\
+	 (FETCH_FUNC_NAME(kind, u16) == fn) ||	\
+	 (FETCH_FUNC_NAME(kind, u32) == fn) ||	\
+	 (FETCH_FUNC_NAME(kind, u64) == fn))
+
+/* Data fetch function templates */
+#define DEFINE_FETCH_reg(type)						\
+static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs,	\
+					  void *offset, void *dest)	\
+{									\
+	*(type *)dest = (type)regs_get_register(regs,			\
+				(unsigned int)((unsigned long)offset));	\
 }
-
-static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
-					   void *num)
-{
-	return regs_get_kernel_stack_nth(regs,
-					 (unsigned int)((unsigned long)num));
+DEFINE_BASIC_FETCH_FUNCS(reg)
+
+#define DEFINE_FETCH_stack(type)					\
+static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
+					  void *offset, void *dest)	\
+{									\
+	*(type *)dest = (type)regs_get_kernel_stack_nth(regs,		\
+				(unsigned int)((unsigned long)offset));	\
 }
+DEFINE_BASIC_FETCH_FUNCS(stack)
 
-static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
-{
-	unsigned long retval;
-
-	if (probe_kernel_address(addr, retval))
-		return 0;
-	return retval;
+#define DEFINE_FETCH_retval(type)					\
+static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
+					  void *dummy, void *dest)	\
+{									\
+	*(type *)dest = (type)regs_return_value(regs);			\
 }
-
-static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
-					      void *dummy)
-{
-	return regs_return_value(regs);
-}
-
-static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
-						   void *dummy)
-{
-	return kernel_stack_pointer(regs);
+DEFINE_BASIC_FETCH_FUNCS(retval)
+
+#define DEFINE_FETCH_memory(type)					\
+static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
+					  void *addr, void *dest)	\
+{									\
+	type retval;							\
+	if (probe_kernel_address(addr, retval))				\
+		*(type *)dest = 0;					\
+	else								\
+		*(type *)dest = retval;					\
 }
+DEFINE_BASIC_FETCH_FUNCS(memory)
 
 /* Memory fetching by symbol */
 struct symbol_cache {
@@ -145,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
 	return sc;
 }
 
-static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
-{
-	struct symbol_cache *sc = data;
-
-	if (sc->addr)
-		return fetch_memory(regs, (void *)sc->addr);
-	else
-		return 0;
+#define DEFINE_FETCH_symbol(type)					\
+static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
+					  void *data, void *dest)	\
+{									\
+	struct symbol_cache *sc = data;					\
+	if (sc->addr)							\
+		fetch_memory_##type(regs, (void *)sc->addr, dest);	\
+	else								\
+		*(type *)dest = 0;					\
 }
+DEFINE_BASIC_FETCH_FUNCS(symbol)
 
-/* Special indirect memory access interface */
-struct indirect_fetch_data {
-	struct fetch_func orig;
+/* Dereference memory access function */
+struct deref_fetch_param {
+	struct fetch_param orig;
 	long offset;
 };
 
-static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
-{
-	struct indirect_fetch_data *ind = data;
-	unsigned long addr;
-
-	addr = call_fetch(&ind->orig, regs);
-	if (addr) {
-		addr += ind->offset;
-		return fetch_memory(regs, (void *)addr);
-	} else
-		return 0;
+#define DEFINE_FETCH_deref(type)					\
+static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
+					    void *data, void *dest)	\
+{									\
+	struct deref_fetch_param *dprm = data;				\
+	unsigned long addr;						\
+	call_fetch(&dprm->orig, regs, &addr);				\
+	if (addr) {							\
+		addr += dprm->offset;					\
+		fetch_memory_##type(regs, (void *)addr, dest);		\
+	} else								\
+		*(type *)dest = 0;					\
 }
+DEFINE_BASIC_FETCH_FUNCS(deref)
 
-static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
+static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
 {
-	if (data->orig.func == fetch_indirect)
-		free_indirect_fetch_data(data->orig.data);
-	else if (data->orig.func == fetch_symbol)
+	if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn))
+		free_deref_fetch_param(data->orig.data);
+	else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn))
 		free_symbol_cache(data->orig.data);
 	kfree(data);
 }
 
+/* Default (unsigned long) fetch type */
+#define __DEFAULT_FETCH_TYPE(t) u##t
+#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
+#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
+#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
+
+#define ASSIGN_FETCH_FUNC(kind, type)	\
+	.kind = FETCH_FUNC_NAME(kind, type)
+
+#define ASSIGN_FETCH_TYPE(ptype, ftype, sign)	\
+	{.name = #ptype,			\
+	 .size = sizeof(ftype),			\
+	 .is_signed = sign,			\
+	 .print = PRINT_TYPE_FUNC_NAME(ptype),	\
+	 .fmt = PRINT_TYPE_FMT_NAME(ptype),	\
+ASSIGN_FETCH_FUNC(reg, ftype),			\
+ASSIGN_FETCH_FUNC(stack, ftype),		\
+ASSIGN_FETCH_FUNC(retval, ftype),		\
+ASSIGN_FETCH_FUNC(memory, ftype),		\
+ASSIGN_FETCH_FUNC(symbol, ftype),		\
+ASSIGN_FETCH_FUNC(deref, ftype),		\
+	}
+
+/* Fetch type information table */
+static const struct fetch_type {
+	const char	*name;		/* Name of type */
+	size_t		size;		/* Byte size of type */
+	int		is_signed;	/* Signed flag */
+	print_type_func_t	print;	/* Print functions */
+	const char	*fmt;		/* Fromat string */
+	/* Fetch functions */
+	fetch_func_t	reg;
+	fetch_func_t	stack;
+	fetch_func_t	retval;
+	fetch_func_t	memory;
+	fetch_func_t	symbol;
+	fetch_func_t	deref;
+} fetch_type_table[] = {
+	ASSIGN_FETCH_TYPE(u8,  u8,  0),
+	ASSIGN_FETCH_TYPE(u16, u16, 0),
+	ASSIGN_FETCH_TYPE(u32, u32, 0),
+	ASSIGN_FETCH_TYPE(u64, u64, 0),
+	ASSIGN_FETCH_TYPE(s8,  u8,  1),
+	ASSIGN_FETCH_TYPE(s16, u16, 1),
+	ASSIGN_FETCH_TYPE(s32, u32, 1),
+	ASSIGN_FETCH_TYPE(s64, u64, 1),
+};
+
+static const struct fetch_type *find_fetch_type(const char *type)
+{
+	int i;
+
+	if (!type)
+		type = DEFAULT_FETCH_TYPE_STR;
+
+	for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
+		if (strcmp(type, fetch_type_table[i].name) == 0)
+			return &fetch_type_table[i];
+	return NULL;
+}
+
+/* Special function : only accept unsigned long */
+static __kprobes void fetch_stack_address(struct pt_regs *regs,
+					  void *dummy, void *dest)
+{
+	*(unsigned long *)dest = kernel_stack_pointer(regs);
+}
+
 /**
  * Kprobe event core functions
  */
 
 struct probe_arg {
-	struct fetch_func	fetch;
-	const char		*name;
+	struct fetch_param	fetch;
+	unsigned int		offset;	/* Offset from argument entry */
+	const char		*name;	/* Name of this argument */
+	const char		*comm;	/* Command of this argument */
+	const struct fetch_type	*type;	/* Type of this argument */
 };
 
 /* Flags for trace_probe */
@@ -204,6 +326,7 @@ struct trace_probe {
 	const char		*symbol;	/* symbol name */
 	struct ftrace_event_call	call;
 	struct trace_event		event;
+	ssize_t			size;		/* trace entry size */
 	unsigned int		nr_args;
 	struct probe_arg	args[];
 };
@@ -212,6 +335,7 @@ struct trace_probe {
 	(offsetof(struct trace_probe, args) +	\
 	(sizeof(struct probe_arg) * (n)))
 
+
 static __kprobes int probe_is_return(struct trace_probe *tp)
 {
 	return tp->rp.handler != NULL;
@@ -222,49 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp)
 	return tp->symbol ? tp->symbol : "unknown";
 }
 
-static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
-{
-	int ret = -EINVAL;
-
-	if (ff->func == fetch_register) {
-		const char *name;
-		name = regs_query_register_name((unsigned int)((long)ff->data));
-		ret = snprintf(buf, n, "%%%s", name);
-	} else if (ff->func == fetch_stack)
-		ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
-	else if (ff->func == fetch_memory)
-		ret = snprintf(buf, n, "@0x%p", ff->data);
-	else if (ff->func == fetch_symbol) {
-		struct symbol_cache *sc = ff->data;
-		if (sc->offset)
-			ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
-					sc->offset);
-		else
-			ret = snprintf(buf, n, "@%s", sc->symbol);
-	} else if (ff->func == fetch_retvalue)
-		ret = snprintf(buf, n, "$retval");
-	else if (ff->func == fetch_stack_address)
-		ret = snprintf(buf, n, "$stack");
-	else if (ff->func == fetch_indirect) {
-		struct indirect_fetch_data *id = ff->data;
-		size_t l = 0;
-		ret = snprintf(buf, n, "%+ld(", id->offset);
-		if (ret >= n)
-			goto end;
-		l += ret;
-		ret = probe_arg_string(buf + l, n - l, &id->orig);
-		if (ret < 0)
-			goto end;
-		l += ret;
-		ret = snprintf(buf + l, n - l, ")");
-		ret += l;
-	}
-end:
-	if (ret >= n)
-		return -ENOSPC;
-	return ret;
-}
-
 static int register_probe_event(struct trace_probe *tp);
 static void unregister_probe_event(struct trace_probe *tp);
 
@@ -347,11 +428,12 @@ error:
 
 static void free_probe_arg(struct probe_arg *arg)
 {
-	if (arg->fetch.func == fetch_symbol)
+	if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn))
+		free_deref_fetch_param(arg->fetch.data);
+	else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn))
 		free_symbol_cache(arg->fetch.data);
-	else if (arg->fetch.func == fetch_indirect)
-		free_indirect_fetch_data(arg->fetch.data);
 	kfree(arg->name);
+	kfree(arg->comm);
 }
 
 static void free_trace_probe(struct trace_probe *tp)
@@ -457,28 +539,30 @@ static int split_symbol_offset(char *symbol, unsigned long *offset)
 #define PARAM_MAX_ARGS 16
 #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
 
-static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
+static int parse_probe_vars(char *arg, const struct fetch_type *t,
+			    struct fetch_param *f, int is_return)
 {
 	int ret = 0;
 	unsigned long param;
 
 	if (strcmp(arg, "retval") == 0) {
-		if (is_return) {
-			ff->func = fetch_retvalue;
-			ff->data = NULL;
-		} else
+		if (is_return)
+			f->fn = t->retval;
+		else
 			ret = -EINVAL;
 	} else if (strncmp(arg, "stack", 5) == 0) {
 		if (arg[5] == '\0') {
-			ff->func = fetch_stack_address;
-			ff->data = NULL;
+			if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
+				f->fn = fetch_stack_address;
+			else
+				ret = -EINVAL;
 		} else if (isdigit(arg[5])) {
 			ret = strict_strtoul(arg + 5, 10, &param);
 			if (ret || param > PARAM_MAX_STACK)
 				ret = -EINVAL;
 			else {
-				ff->func = fetch_stack;
-				ff->data = (void *)param;
+				f->fn = t->stack;
+				f->data = (void *)param;
 			}
 		} else
 			ret = -EINVAL;
@@ -488,7 +572,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
 }
 
 /* Recursive argument parser */
-static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+static int __parse_probe_arg(char *arg, const struct fetch_type *t,
+			     struct fetch_param *f, int is_return)
 {
 	int ret = 0;
 	unsigned long param;
@@ -497,13 +582,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
 
 	switch (arg[0]) {
 	case '$':
-		ret = parse_probe_vars(arg + 1, ff, is_return);
+		ret = parse_probe_vars(arg + 1, t, f, is_return);
 		break;
 	case '%':	/* named register */
 		ret = regs_query_register_offset(arg + 1);
 		if (ret >= 0) {
-			ff->func = fetch_register;
-			ff->data = (void *)(unsigned long)ret;
+			f->fn = t->reg;
+			f->data = (void *)(unsigned long)ret;
 			ret = 0;
 		}
 		break;
@@ -512,26 +597,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
 			ret = strict_strtoul(arg + 1, 0, &param);
 			if (ret)
 				break;
-			ff->func = fetch_memory;
-			ff->data = (void *)param;
+			f->fn = t->memory;
+			f->data = (void *)param;
 		} else {
 			ret = split_symbol_offset(arg + 1, &offset);
 			if (ret)
 				break;
-			ff->data = alloc_symbol_cache(arg + 1, offset);
-			if (ff->data)
-				ff->func = fetch_symbol;
-			else
-				ret = -EINVAL;
+			f->data = alloc_symbol_cache(arg + 1, offset);
+			if (f->data)
+				f->fn = t->symbol;
 		}
 		break;
-	case '+':	/* indirect memory */
+	case '+':	/* deref memory */
 	case '-':
 		tmp = strchr(arg, '(');
-		if (!tmp) {
-			ret = -EINVAL;
+		if (!tmp)
 			break;
-		}
 		*tmp = '\0';
 		ret = strict_strtol(arg + 1, 0, &offset);
 		if (ret)
@@ -541,38 +622,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
 		arg = tmp + 1;
 		tmp = strrchr(arg, ')');
 		if (tmp) {
-			struct indirect_fetch_data *id;
+			struct deref_fetch_param *dprm;
+			const struct fetch_type *t2 = find_fetch_type(NULL);
 			*tmp = '\0';
-			id = kzalloc(sizeof(struct indirect_fetch_data),
-				     GFP_KERNEL);
-			if (!id)
+			dprm = kzalloc(sizeof(struct deref_fetch_param),
+				       GFP_KERNEL);
+			if (!dprm)
 				return -ENOMEM;
-			id->offset = offset;
-			ret = __parse_probe_arg(arg, &id->orig, is_return);
+			dprm->offset = offset;
+			ret = __parse_probe_arg(arg, t2, &dprm->orig,
+						is_return);
 			if (ret)
-				kfree(id);
+				kfree(dprm);
 			else {
-				ff->func = fetch_indirect;
-				ff->data = (void *)id;
+				f->fn = t->deref;
+				f->data = (void *)dprm;
 			}
-		} else
-			ret = -EINVAL;
+		}
 		break;
-	default:
-		/* TODO: support custom handler */
-		ret = -EINVAL;
 	}
+	if (!ret && !f->fn)
+		ret = -EINVAL;
 	return ret;
 }
 
 /* String length checking wrapper */
-static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
+static int parse_probe_arg(char *arg, struct trace_probe *tp,
+			   struct probe_arg *parg, int is_return)
 {
+	const char *t;
+
 	if (strlen(arg) > MAX_ARGSTR_LEN) {
 		pr_info("Argument is too long.: %s\n",  arg);
 		return -ENOSPC;
 	}
-	return __parse_probe_arg(arg, ff, is_return);
+	parg->comm = kstrdup(arg, GFP_KERNEL);
+	if (!parg->comm) {
+		pr_info("Failed to allocate memory for command '%s'.\n", arg);
+		return -ENOMEM;
+	}
+	t = strchr(parg->comm, ':');
+	if (t) {
+		arg[t - parg->comm] = '\0';
+		t++;
+	}
+	parg->type = find_fetch_type(t);
+	if (!parg->type) {
+		pr_info("Unsupported type: %s\n", t);
+		return -EINVAL;
+	}
+	parg->offset = tp->size;
+	tp->size += parg->type->size;
+	return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
 }
 
 /* Return 1 if name is reserved or already used by another argument */
@@ -602,15 +703,18 @@ static int create_trace_probe(int argc, char **argv)
 	 *  @ADDR	: fetch memory at ADDR (ADDR should be in kernel)
 	 *  @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
 	 *  %REG	: fetch register REG
-	 * Indirect memory fetch:
+	 * Dereferencing memory fetch:
 	 *  +|-offs(ARG) : fetch memory at ARG +|- offs address.
 	 * Alias name of args:
 	 *  NAME=FETCHARG : set NAME as alias of FETCHARG.
+	 * Type of args:
+	 *  FETCHARG:TYPE : use TYPE instead of unsigned long.
 	 */
 	struct trace_probe *tp;
 	int i, ret = 0;
 	int is_return = 0, is_delete = 0;
-	char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
+	char *symbol = NULL, *event = NULL, *group = NULL;
+	char *arg, *tmp;
 	unsigned long offset = 0;
 	void *addr = NULL;
 	char buf[MAX_EVENT_NAME_LEN];
@@ -723,13 +827,6 @@ static int create_trace_probe(int argc, char **argv)
 		else
 			arg = argv[i];
 
-		if (conflict_field_name(argv[i], tp->args, i)) {
-			pr_info("Argument%d name '%s' conflicts with "
-				"another field.\n", i, argv[i]);
-			ret = -EINVAL;
-			goto error;
-		}
-
 		tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
 		if (!tp->args[i].name) {
 			pr_info("Failed to allocate argument%d name '%s'.\n",
@@ -737,9 +834,19 @@ static int create_trace_probe(int argc, char **argv)
 			ret = -ENOMEM;
 			goto error;
 		}
+		tmp = strchr(tp->args[i].name, ':');
+		if (tmp)
+			*tmp = '_';	/* convert : to _ */
+
+		if (conflict_field_name(tp->args[i].name, tp->args, i)) {
+			pr_info("Argument%d name '%s' conflicts with "
+				"another field.\n", i, argv[i]);
+			ret = -EINVAL;
+			goto error;
+		}
 
 		/* Parse fetch argument */
-		ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
+		ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
 		if (ret) {
 			pr_info("Parse error at argument%d. (%d)\n", i, ret);
 			kfree(tp->args[i].name);
@@ -794,8 +901,7 @@ static void probes_seq_stop(struct seq_file *m, void *v)
 static int probes_seq_show(struct seq_file *m, void *v)
 {
 	struct trace_probe *tp = v;
-	int i, ret;
-	char buf[MAX_ARGSTR_LEN + 1];
+	int i;
 
 	seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
 	seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
@@ -807,15 +913,10 @@ static int probes_seq_show(struct seq_file *m, void *v)
 	else
 		seq_printf(m, " %s", probe_symbol(tp));
 
-	for (i = 0; i < tp->nr_args; i++) {
-		ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
-		if (ret < 0) {
-			pr_warning("Argument%d decoding error(%d).\n", i, ret);
-			return ret;
-		}
-		seq_printf(m, " %s=%s", tp->args[i].name, buf);
-	}
+	for (i = 0; i < tp->nr_args; i++)
+		seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
 	seq_printf(m, "\n");
+
 	return 0;
 }
 
@@ -945,9 +1046,10 @@ static const struct file_operations kprobe_profile_ops = {
 static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 {
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
-	struct kprobe_trace_entry *entry;
+	struct kprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
+	u8 *data;
 	int size, i, pc;
 	unsigned long irq_flags;
 	struct ftrace_event_call *call = &tp->call;
@@ -957,7 +1059,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
-	size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+	size = sizeof(*entry) + tp->size;
 
 	event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
 						  irq_flags, pc);
@@ -965,10 +1067,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
 		return;
 
 	entry = ring_buffer_event_data(event);
-	entry->nargs = tp->nr_args;
 	entry->ip = (unsigned long)kp->addr;
+	data = (u8 *)&entry[1];
 	for (i = 0; i < tp->nr_args; i++)
-		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+		call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
 
 	if (!filter_current_check_discard(buffer, call, entry, event))
 		trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -979,9 +1081,10 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
 					  struct pt_regs *regs)
 {
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
-	struct kretprobe_trace_entry *entry;
+	struct kretprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
+	u8 *data;
 	int size, i, pc;
 	unsigned long irq_flags;
 	struct ftrace_event_call *call = &tp->call;
@@ -989,7 +1092,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
-	size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+	size = sizeof(*entry) + tp->size;
 
 	event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
 						  irq_flags, pc);
@@ -997,11 +1100,11 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
 		return;
 
 	entry = ring_buffer_event_data(event);
-	entry->nargs = tp->nr_args;
 	entry->func = (unsigned long)tp->rp.kp.addr;
 	entry->ret_ip = (unsigned long)ri->ret_addr;
+	data = (u8 *)&entry[1];
 	for (i = 0; i < tp->nr_args; i++)
-		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+		call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
 
 	if (!filter_current_check_discard(buffer, call, entry, event))
 		trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1011,13 +1114,14 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
 enum print_line_t
 print_kprobe_event(struct trace_iterator *iter, int flags)
 {
-	struct kprobe_trace_entry *field;
+	struct kprobe_trace_entry_head *field;
 	struct trace_seq *s = &iter->seq;
 	struct trace_event *event;
 	struct trace_probe *tp;
+	u8 *data;
 	int i;
 
-	field = (struct kprobe_trace_entry *)iter->ent;
+	field = (struct kprobe_trace_entry_head *)iter->ent;
 	event = ftrace_find_event(field->ent.type);
 	tp = container_of(event, struct trace_probe, event);
 
@@ -1030,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
 	if (!trace_seq_puts(s, ")"))
 		goto partial;
 
-	for (i = 0; i < field->nargs; i++)
-		if (!trace_seq_printf(s, " %s=%lx",
-				      tp->args[i].name, field->args[i]))
+	data = (u8 *)&field[1];
+	for (i = 0; i < tp->nr_args; i++)
+		if (!tp->args[i].type->print(s, tp->args[i].name,
+					     data + tp->args[i].offset))
 			goto partial;
 
 	if (!trace_seq_puts(s, "\n"))
@@ -1046,13 +1151,14 @@ partial:
 enum print_line_t
 print_kretprobe_event(struct trace_iterator *iter, int flags)
 {
-	struct kretprobe_trace_entry *field;
+	struct kretprobe_trace_entry_head *field;
 	struct trace_seq *s = &iter->seq;
 	struct trace_event *event;
 	struct trace_probe *tp;
+	u8 *data;
 	int i;
 
-	field = (struct kretprobe_trace_entry *)iter->ent;
+	field = (struct kretprobe_trace_entry_head *)iter->ent;
 	event = ftrace_find_event(field->ent.type);
 	tp = container_of(event, struct trace_probe, event);
 
@@ -1071,9 +1177,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
 	if (!trace_seq_puts(s, ")"))
 		goto partial;
 
-	for (i = 0; i < field->nargs; i++)
-		if (!trace_seq_printf(s, " %s=%lx",
-				      tp->args[i].name, field->args[i]))
+	data = (u8 *)&field[1];
+	for (i = 0; i < tp->nr_args; i++)
+		if (!tp->args[i].type->print(s, tp->args[i].name,
+					     data + tp->args[i].offset))
 			goto partial;
 
 	if (!trace_seq_puts(s, "\n"))
@@ -1129,29 +1236,43 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call)
 static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
 {
 	int ret, i;
-	struct kprobe_trace_entry field;
+	struct kprobe_trace_entry_head field;
 	struct trace_probe *tp = (struct trace_probe *)event_call->data;
 
 	DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
-	DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
 	/* Set argument names as fields */
-	for (i = 0; i < tp->nr_args; i++)
-		DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
+	for (i = 0; i < tp->nr_args; i++) {
+		ret = trace_define_field(event_call, tp->args[i].type->name,
+					 tp->args[i].name,
+					 sizeof(field) + tp->args[i].offset,
+					 tp->args[i].type->size,
+					 tp->args[i].type->is_signed,
+					 FILTER_OTHER);
+		if (ret)
+			return ret;
+	}
 	return 0;
 }
 
 static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
 {
 	int ret, i;
-	struct kretprobe_trace_entry field;
+	struct kretprobe_trace_entry_head field;
 	struct trace_probe *tp = (struct trace_probe *)event_call->data;
 
 	DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
 	DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
-	DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
 	/* Set argument names as fields */
-	for (i = 0; i < tp->nr_args; i++)
-		DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
+	for (i = 0; i < tp->nr_args; i++) {
+		ret = trace_define_field(event_call, tp->args[i].type->name,
+					 tp->args[i].name,
+					 sizeof(field) + tp->args[i].offset,
+					 tp->args[i].type->size,
+					 tp->args[i].type->is_signed,
+					 FILTER_OTHER);
+		if (ret)
+			return ret;
+	}
 	return 0;
 }
 
@@ -1176,8 +1297,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
 	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
 
 	for (i = 0; i < tp->nr_args; i++) {
-		pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx",
-				tp->args[i].name);
+		pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
+				tp->args[i].name, tp->args[i].type->fmt);
 	}
 
 	pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
@@ -1219,12 +1340,13 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
 {
 	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
 	struct ftrace_event_call *call = &tp->call;
-	struct kprobe_trace_entry *entry;
+	struct kprobe_trace_entry_head *entry;
+	u8 *data;
 	int size, __size, i;
 	unsigned long irq_flags;
 	int rctx;
 
-	__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
+	__size = sizeof(*entry) + tp->size;
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1235,10 +1357,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
 	if (!entry)
 		return;
 
-	entry->nargs = tp->nr_args;
 	entry->ip = (unsigned long)kp->addr;
+	data = (u8 *)&entry[1];
 	for (i = 0; i < tp->nr_args; i++)
-		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+		call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
 
 	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
 }
@@ -1249,12 +1371,13 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
 {
 	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
 	struct ftrace_event_call *call = &tp->call;
-	struct kretprobe_trace_entry *entry;
+	struct kretprobe_trace_entry_head *entry;
+	u8 *data;
 	int size, __size, i;
 	unsigned long irq_flags;
 	int rctx;
 
-	__size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
+	__size = sizeof(*entry) + tp->size;
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1265,11 +1388,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
 	if (!entry)
 		return;
 
-	entry->nargs = tp->nr_args;
 	entry->func = (unsigned long)tp->rp.kp.addr;
 	entry->ret_ip = (unsigned long)ri->ret_addr;
+	data = (u8 *)&entry[1];
 	for (i = 0; i < tp->nr_args; i++)
-		entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
+		call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
 
 	perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
 			       irq_flags, regs);
-- 
cgit v1.2.3


From 4984912eb23113a4007940cd09c8351c0623ea5f Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 12 Apr 2010 13:17:15 -0400
Subject: perf probe: Query basic types from debuginfo

Query the basic type information (byte-size and signed-flag) from
debuginfo and pass that to kprobe-tracer. This is especially useful
for tracing the members of data structure, because each member has
different byte-size on the memory.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100412171715.3790.23730.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-event.c  |  9 +++++
 tools/perf/util/probe-event.h  |  1 +
 tools/perf/util/probe-finder.c | 78 +++++++++++++++++++++++++++++++++++++-----
 3 files changed, 80 insertions(+), 8 deletions(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 19de8b77973..05ca4a959e6 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -740,6 +740,13 @@ static int synthesize_kprobe_trace_arg(struct kprobe_trace_arg *arg,
 		buf += ret;
 		buflen -= ret;
 	}
+	/* Print argument type */
+	if (arg->type) {
+		ret = e_snprintf(buf, buflen, ":%s", arg->type);
+		if (ret <= 0)
+			return ret;
+		buf += ret;
+	}
 
 	return buf - tmp;
 }
@@ -848,6 +855,8 @@ void clear_kprobe_trace_event(struct kprobe_trace_event *tev)
 			free(tev->args[i].name);
 		if (tev->args[i].value)
 			free(tev->args[i].value);
+		if (tev->args[i].type)
+			free(tev->args[i].type);
 		ref = tev->args[i].ref;
 		while (ref) {
 			next = ref->next;
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 10411f59632..a393a3f87cd 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -23,6 +23,7 @@ struct kprobe_trace_arg_ref {
 struct kprobe_trace_arg {
 	char				*name;	/* Argument name */
 	char				*value;	/* Base value */
+	char				*type;	/* Type name */
 	struct kprobe_trace_arg_ref	*ref;	/* Referencing offset */
 };
 
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 105e95c95ee..ebeb413ac47 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -84,6 +84,9 @@ const char *x86_64_regs_table[X86_64_MAX_REGS] = {
 #define arch_regs_table x86_32_regs_table
 #endif
 
+/* Kprobe tracer basic type is up to u64 */
+#define MAX_BASIC_TYPE_BITS	64
+
 /* Return architecture dependent register string (for kprobe-tracer) */
 static const char *get_arch_regstr(unsigned int n)
 {
@@ -230,6 +233,31 @@ static Dwarf_Die *die_get_real_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem)
 	return die_mem;
 }
 
+static bool die_is_signed_type(Dwarf_Die *tp_die)
+{
+	Dwarf_Attribute attr;
+	Dwarf_Word ret;
+
+	if (dwarf_attr(tp_die, DW_AT_encoding, &attr) == NULL ||
+	    dwarf_formudata(&attr, &ret) != 0)
+		return false;
+
+	return (ret == DW_ATE_signed_char || ret == DW_ATE_signed ||
+		ret == DW_ATE_signed_fixed);
+}
+
+static int die_get_byte_size(Dwarf_Die *tp_die)
+{
+	Dwarf_Attribute attr;
+	Dwarf_Word ret;
+
+	if (dwarf_attr(tp_die, DW_AT_byte_size, &attr) == NULL ||
+	    dwarf_formudata(&attr, &ret) != 0)
+		return 0;
+
+	return (int)ret;
+}
+
 /* Return values for die_find callbacks */
 enum {
 	DIE_FIND_CB_FOUND = 0,		/* End of Search */
@@ -406,13 +434,42 @@ static void convert_location(Dwarf_Op *op, struct probe_finder *pf)
 	}
 }
 
+static void convert_variable_type(Dwarf_Die *vr_die,
+				  struct kprobe_trace_arg *targ)
+{
+	Dwarf_Die type;
+	char buf[16];
+	int ret;
+
+	if (die_get_real_type(vr_die, &type) == NULL)
+		die("Failed to get a type information of %s.",
+		    dwarf_diename(vr_die));
+
+	ret = die_get_byte_size(&type) * 8;
+	if (ret) {
+		/* Check the bitwidth */
+		if (ret > MAX_BASIC_TYPE_BITS) {
+			pr_warning("  Warning: %s exceeds max-bitwidth."
+				   " Cut down to %d bits.\n",
+				   dwarf_diename(&type), MAX_BASIC_TYPE_BITS);
+			ret = MAX_BASIC_TYPE_BITS;
+		}
+
+		ret = snprintf(buf, 16, "%c%d",
+			       die_is_signed_type(&type) ? 's' : 'u', ret);
+		if (ret < 0 || ret >= 16)
+			die("Failed to convert variable type.");
+		targ->type = xstrdup(buf);
+	}
+}
+
 static void convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 				    struct perf_probe_arg_field *field,
-				    struct kprobe_trace_arg_ref **ref_ptr)
+				    struct kprobe_trace_arg_ref **ref_ptr,
+				    Dwarf_Die *die_mem)
 {
 	struct kprobe_trace_arg_ref *ref = *ref_ptr;
 	Dwarf_Attribute attr;
-	Dwarf_Die member;
 	Dwarf_Die type;
 	Dwarf_Word offs;
 
@@ -450,26 +507,27 @@ static void convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 			die("Structure on a register is not supported yet.");
 	}
 
-	if (die_find_member(&type, field->name, &member) == NULL)
+	if (die_find_member(&type, field->name, die_mem) == NULL)
 		die("%s(tyep:%s) has no member %s.", varname,
 		    dwarf_diename(&type), field->name);
 
 	/* Get the offset of the field */
-	if (dwarf_attr(&member, DW_AT_data_member_location, &attr) == NULL ||
+	if (dwarf_attr(die_mem, DW_AT_data_member_location, &attr) == NULL ||
 	    dwarf_formudata(&attr, &offs) != 0)
 		die("Failed to get the offset of %s.", field->name);
 	ref->offset += (long)offs;
 
 	/* Converting next field */
 	if (field->next)
-		convert_variable_fields(&member, field->name, field->next,
-					&ref);
+		convert_variable_fields(die_mem, field->name, field->next,
+					&ref, die_mem);
 }
 
 /* Show a variables in kprobe event format */
 static void convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
 {
 	Dwarf_Attribute attr;
+	Dwarf_Die die_mem;
 	Dwarf_Op *expr;
 	size_t nexpr;
 	int ret;
@@ -483,9 +541,13 @@ static void convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
 
 	convert_location(expr, pf);
 
-	if (pf->pvar->field)
+	if (pf->pvar->field) {
 		convert_variable_fields(vr_die, pf->pvar->var,
-					pf->pvar->field, &pf->tvar->ref);
+					pf->pvar->field, &pf->tvar->ref,
+					&die_mem);
+		vr_die = &die_mem;
+	}
+	convert_variable_type(vr_die, pf->tvar);
 	/* *expr will be cached in libdw. Don't free it. */
 	return ;
 error:
-- 
cgit v1.2.3


From 11a1ca3554b377d2a8a318a3cbf8ce10a7a2a8e4 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 12 Apr 2010 13:17:22 -0400
Subject: perf probe: Support basic type casting

Add basic type casting for arguments to perf probe. This allows
users to specify the actual type of arguments. Of course, if
user sets invalid types, kprobe-tracer rejects that.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100412171722.3790.50372.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-probe.txt |  3 ++-
 tools/perf/util/probe-event.c           | 23 ++++++++++++++++++++++-
 tools/perf/util/probe-event.h           |  1 +
 tools/perf/util/probe-finder.c          | 10 ++++++++--
 4 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 441324f2615..63c25d30488 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -85,9 +85,10 @@ PROBE ARGUMENT
 --------------
 Each probe argument follows below syntax.
 
- [NAME=]LOCALVAR|$retval|%REG|@SYMBOL
+ [NAME=]LOCALVAR|$retval|%REG|@SYMBOL[:TYPE]
 
 'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), or kprobe-tracer argument format (e.g. $retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. field2 for 'var->field1.field2'.)
+'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically set the type based on debuginfo.
 
 LINE SYNTAX
 -----------
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 05ca4a959e6..bef280527e6 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -435,7 +435,7 @@ static void parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 }
 
 /* Parse perf-probe event argument */
-static void parse_perf_probe_arg(const char *str, struct perf_probe_arg *arg)
+static void parse_perf_probe_arg(char *str, struct perf_probe_arg *arg)
 {
 	char *tmp;
 	struct perf_probe_arg_field **fieldp;
@@ -445,9 +445,17 @@ static void parse_perf_probe_arg(const char *str, struct perf_probe_arg *arg)
 	tmp = strchr(str, '=');
 	if (tmp) {
 		arg->name = xstrndup(str, tmp - str);
+		pr_debug("name:%s ", arg->name);
 		str = tmp + 1;
 	}
 
+	tmp = strchr(str, ':');
+	if (tmp) {	/* Type setting */
+		*tmp = '\0';
+		arg->type = xstrdup(tmp + 1);
+		pr_debug("type:%s ", arg->type);
+	}
+
 	tmp = strpbrk(str, "-.");
 	if (!is_c_varname(str) || !tmp) {
 		/* A variable, register, symbol or special value */
@@ -603,6 +611,15 @@ int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf, size_t len)
 		len -= ret;
 		field = field->next;
 	}
+
+	if (pa->type) {
+		ret = e_snprintf(tmp, len, ":%s", pa->type);
+		if (ret <= 0)
+			goto error;
+		tmp += ret;
+		len -= ret;
+	}
+
 	return tmp - buf;
 error:
 	die("Failed to synthesize perf probe argument: %s", strerror(-ret));
@@ -825,6 +842,8 @@ void clear_perf_probe_event(struct perf_probe_event *pev)
 			free(pev->args[i].name);
 		if (pev->args[i].var)
 			free(pev->args[i].var);
+		if (pev->args[i].type)
+			free(pev->args[i].type);
 		field = pev->args[i].field;
 		while (field) {
 			next = field->next;
@@ -1145,6 +1164,8 @@ static int convert_to_kprobe_trace_events(struct perf_probe_event *pev,
 			if (pev->args[i].name)
 				tev->args[i].name = xstrdup(pev->args[i].name);
 			tev->args[i].value = xstrdup(pev->args[i].var);
+			if (pev->args[i].type)
+				tev->args[i].type = xstrdup(pev->args[i].type);
 		}
 	}
 
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index a393a3f87cd..ff2f26b1822 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -57,6 +57,7 @@ struct perf_probe_arg_field {
 struct perf_probe_arg {
 	char				*name;	/* Argument name */
 	char				*var;	/* Variable name */
+	char				*type;	/* Type name */
 	struct perf_probe_arg_field	*field;	/* Structure fields */
 };
 
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index ebeb413ac47..ab476736cbe 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -547,7 +547,10 @@ static void convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
 					&die_mem);
 		vr_die = &die_mem;
 	}
-	convert_variable_type(vr_die, pf->tvar);
+	if (pf->pvar->type)
+		pf->tvar->type = xstrdup(pf->pvar->type);
+	else
+		convert_variable_type(vr_die, pf->tvar);
 	/* *expr will be cached in libdw. Don't free it. */
 	return ;
 error:
@@ -560,13 +563,16 @@ error:
 static void find_variable(Dwarf_Die *sp_die, struct probe_finder *pf)
 {
 	Dwarf_Die vr_die;
-	char buf[32];
+	char buf[32], *ptr;
 
 	/* TODO: Support arrays */
 	if (pf->pvar->name)
 		pf->tvar->name = xstrdup(pf->pvar->name);
 	else {
 		synthesize_perf_probe_arg(pf->pvar, buf, 32);
+		ptr = strchr(buf, ':');	/* Change type separator to _ */
+		if (ptr)
+			*ptr = '_';
 		pf->tvar->name = xstrdup(buf);
 	}
 
-- 
cgit v1.2.3


From a34a985499895a46a4dacff727d0fbc63cdc75e8 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 12 Apr 2010 13:17:29 -0400
Subject: perf probe: Support DW_OP_call_frame_cfa in debuginfo

When building kernel without CONFIG_FRAME_POINTER, gcc uses
CFA (canonical frame address) for frame base. With this patch,
perf probe just gets CFI (call frame information) from debuginfo
and search corresponding CFA from the CFI. IOW, this allows
perf probe works correctly on the kernel without CONFIG_FRAME_POINTER.

<Before>
 ./perf probe -fn sched_slice:12 lw.weight
  Fatal: DW_OP 156 is not supported.
              (^^^ DW_OP_call_frame_cfa)

<After>
./perf probe -fn sched_slice:12 lw.weight
Add new event:
  probe:sched_slice    (on sched_slice:12 with weight=lw.weight)

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100412171728.3790.98217.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-finder.c | 14 +++++++++++---
 tools/perf/util/probe-finder.h |  1 +
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index ab476736cbe..1f4528555d4 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -398,7 +398,6 @@ static void convert_location(Dwarf_Op *op, struct probe_finder *pf)
 	const char *regs;
 	struct kprobe_trace_arg *tvar = pf->tvar;
 
-	/* TODO: support CFA */
 	/* If this is based on frame buffer, set the offset */
 	if (op->atom == DW_OP_fbreg) {
 		if (pf->fb_ops == NULL)
@@ -629,11 +628,17 @@ static void convert_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 	/* Get the frame base attribute/ops */
 	dwarf_attr(sp_die, DW_AT_frame_base, &fb_attr);
 	ret = dwarf_getlocation_addr(&fb_attr, pf->addr, &pf->fb_ops, &nops, 1);
-	if (ret <= 0 || nops == 0)
+	if (ret <= 0 || nops == 0) {
 		pf->fb_ops = NULL;
+	} else if (nops == 1 && pf->fb_ops[0].atom == DW_OP_call_frame_cfa &&
+		   pf->cfi != NULL) {
+		Dwarf_Frame *frame;
+		ret = dwarf_cfi_addrframe(pf->cfi, pf->addr, &frame);
+		DIE_IF(ret != 0);
+		dwarf_frame_cfa(frame, &pf->fb_ops, &nops);
+	}
 
 	/* Find each argument */
-	/* TODO: use dwarf_cfi_addrframe */
 	tev->nargs = pf->pev->nargs;
 	tev->args = xzalloc(sizeof(struct kprobe_trace_arg) * tev->nargs);
 	for (i = 0; i < pf->pev->nargs; i++) {
@@ -842,6 +847,9 @@ int find_kprobe_trace_events(int fd, struct perf_probe_event *pev,
 	if (!dbg)
 		return -ENOENT;
 
+	/* Get the call frame information from this dwarf */
+	pf.cfi = dwarf_getcfi(dbg);
+
 	off = 0;
 	line_list__init(&pf.lcache);
 	/* Loop on CUs (Compilation Unit) */
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index 2a271321944..310ce897229 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -42,6 +42,7 @@ struct probe_finder {
 	struct list_head	lcache;		/* Line cache for lazy match */
 
 	/* For variable searching */
+	Dwarf_CFI		*cfi;		/* Call Frame Information */
 	Dwarf_Op		*fb_ops;	/* Frame base attribute */
 	struct perf_probe_arg	*pvar;		/* Current target variable */
 	struct kprobe_trace_arg	*tvar;		/* Current result variable */
-- 
cgit v1.2.3


From b55a87ade3839c33ab94768a0b5955734073f987 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 12 Apr 2010 13:17:35 -0400
Subject: perf probe: Remove die() from probe-finder code

Remove die() and DIE_IF() code from util/probe-finder.c since
these 'sudden death' in utility functions make reusing it from
other code (especially tui/gui) difficult.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100412171735.3790.88853.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-event.c  |   4 +-
 tools/perf/util/probe-finder.c | 517 +++++++++++++++++++++++++----------------
 2 files changed, 322 insertions(+), 199 deletions(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index bef280527e6..7893b3207db 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -151,10 +151,10 @@ static int try_to_find_kprobe_trace_events(struct perf_probe_event *pev,
 
 	/* Error path */
 	if (need_dwarf) {
-		if (ntevs == -ENOENT)
+		if (ntevs == -EBADF)
 			pr_warning("No dwarf info found in the vmlinux - "
 				"please rebuild with CONFIG_DEBUG_INFO=y.\n");
-		die("Could not analyze debuginfo.");
+		die("Failed to analyze debuginfo.");
 	}
 	pr_debug("An error occurred in debuginfo analysis."
 		 " Try to use symbols.\n");
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 1f4528555d4..54daa91e901 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -196,19 +196,7 @@ static bool die_compare_name(Dwarf_Die *dw_die, const char *tname)
 {
 	const char *name;
 	name = dwarf_diename(dw_die);
-	DIE_IF(name == NULL);
-	return strcmp(tname, name);
-}
-
-/* Get entry pc(or low pc, 1st entry of ranges)  of the die */
-static Dwarf_Addr die_get_entrypc(Dwarf_Die *dw_die)
-{
-	Dwarf_Addr epc;
-	int ret;
-
-	ret = dwarf_entrypc(dw_die, &epc);
-	DIE_IF(ret == -1);
-	return epc;
+	return name ? strcmp(tname, name) : -1;
 }
 
 /* Get type die, but skip qualifiers and typedef */
@@ -390,7 +378,7 @@ static Dwarf_Die *die_find_member(Dwarf_Die *st_die, const char *name,
  */
 
 /* Show a location */
-static void convert_location(Dwarf_Op *op, struct probe_finder *pf)
+static int convert_location(Dwarf_Op *op, struct probe_finder *pf)
 {
 	unsigned int regn;
 	Dwarf_Word offs = 0;
@@ -400,8 +388,11 @@ static void convert_location(Dwarf_Op *op, struct probe_finder *pf)
 
 	/* If this is based on frame buffer, set the offset */
 	if (op->atom == DW_OP_fbreg) {
-		if (pf->fb_ops == NULL)
-			die("The attribute of frame base is not supported.\n");
+		if (pf->fb_ops == NULL) {
+			pr_warning("The attribute of frame base is not "
+				   "supported.\n");
+			return -ENOTSUP;
+		}
 		ref = true;
 		offs = op->number;
 		op = &pf->fb_ops[0];
@@ -419,50 +410,63 @@ static void convert_location(Dwarf_Op *op, struct probe_finder *pf)
 		ref = true;
 	} else if (op->atom == DW_OP_regx) {
 		regn = op->number;
-	} else
-		die("DW_OP %d is not supported.", op->atom);
+	} else {
+		pr_warning("DW_OP %x is not supported.\n", op->atom);
+		return -ENOTSUP;
+	}
 
 	regs = get_arch_regstr(regn);
-	if (!regs)
-		die("%u exceeds max register number.", regn);
+	if (!regs) {
+		pr_warning("%u exceeds max register number.\n", regn);
+		return -ERANGE;
+	}
 
 	tvar->value = xstrdup(regs);
 	if (ref) {
 		tvar->ref = xzalloc(sizeof(struct kprobe_trace_arg_ref));
 		tvar->ref->offset = (long)offs;
 	}
+	return 0;
 }
 
-static void convert_variable_type(Dwarf_Die *vr_die,
-				  struct kprobe_trace_arg *targ)
+static int convert_variable_type(Dwarf_Die *vr_die,
+				 struct kprobe_trace_arg *targ)
 {
 	Dwarf_Die type;
 	char buf[16];
 	int ret;
 
-	if (die_get_real_type(vr_die, &type) == NULL)
-		die("Failed to get a type information of %s.",
-		    dwarf_diename(vr_die));
+	if (die_get_real_type(vr_die, &type) == NULL) {
+		pr_warning("Failed to get a type information of %s.\n",
+			   dwarf_diename(vr_die));
+		return -ENOENT;
+	}
 
 	ret = die_get_byte_size(&type) * 8;
 	if (ret) {
 		/* Check the bitwidth */
 		if (ret > MAX_BASIC_TYPE_BITS) {
-			pr_warning("  Warning: %s exceeds max-bitwidth."
-				   " Cut down to %d bits.\n",
-				   dwarf_diename(&type), MAX_BASIC_TYPE_BITS);
+			pr_info("%s exceeds max-bitwidth."
+				" Cut down to %d bits.\n",
+				dwarf_diename(&type), MAX_BASIC_TYPE_BITS);
 			ret = MAX_BASIC_TYPE_BITS;
 		}
 
 		ret = snprintf(buf, 16, "%c%d",
 			       die_is_signed_type(&type) ? 's' : 'u', ret);
-		if (ret < 0 || ret >= 16)
-			die("Failed to convert variable type.");
+		if (ret < 0 || ret >= 16) {
+			if (ret >= 16)
+				ret = -E2BIG;
+			pr_warning("Failed to convert variable type: %s\n",
+				   strerror(-ret));
+			return ret;
+		}
 		targ->type = xstrdup(buf);
 	}
+	return 0;
 }
 
-static void convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
+static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 				    struct perf_probe_arg_field *field,
 				    struct kprobe_trace_arg_ref **ref_ptr,
 				    Dwarf_Die *die_mem)
@@ -473,21 +477,28 @@ static void convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 	Dwarf_Word offs;
 
 	pr_debug("converting %s in %s\n", field->name, varname);
-	if (die_get_real_type(vr_die, &type) == NULL)
-		die("Failed to get a type information of %s.", varname);
+	if (die_get_real_type(vr_die, &type) == NULL) {
+		pr_warning("Failed to get the type of %s.\n", varname);
+		return -ENOENT;
+	}
 
 	/* Check the pointer and dereference */
 	if (dwarf_tag(&type) == DW_TAG_pointer_type) {
-		if (!field->ref)
-			die("Semantic error: %s must be referred by '->'",
-			    field->name);
+		if (!field->ref) {
+			pr_err("Semantic error: %s must be referred by '->'\n",
+			       field->name);
+			return -EINVAL;
+		}
 		/* Get the type pointed by this pointer */
-		if (die_get_real_type(&type, &type) == NULL)
-			die("Failed to get a type information of %s.", varname);
-
+		if (die_get_real_type(&type, &type) == NULL) {
+			pr_warning("Failed to get the type of %s.\n", varname);
+			return -ENOENT;
+		}
 		/* Verify it is a data structure  */
-		if (dwarf_tag(&type) != DW_TAG_structure_type)
-			die("%s is not a data structure.", varname);
+		if (dwarf_tag(&type) != DW_TAG_structure_type) {
+			pr_warning("%s is not a data structure.\n", varname);
+			return -EINVAL;
+		}
 
 		ref = xzalloc(sizeof(struct kprobe_trace_arg_ref));
 		if (*ref_ptr)
@@ -496,34 +507,46 @@ static void convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 			*ref_ptr = ref;
 	} else {
 		/* Verify it is a data structure  */
-		if (dwarf_tag(&type) != DW_TAG_structure_type)
-			die("%s is not a data structure.", varname);
-
-		if (field->ref)
-			die("Semantic error: %s must be referred by '.'",
-			    field->name);
-		if (!ref)
-			die("Structure on a register is not supported yet.");
+		if (dwarf_tag(&type) != DW_TAG_structure_type) {
+			pr_warning("%s is not a data structure.\n", varname);
+			return -EINVAL;
+		}
+		if (field->ref) {
+			pr_err("Semantic error: %s must be referred by '.'\n",
+			       field->name);
+			return -EINVAL;
+		}
+		if (!ref) {
+			pr_warning("Structure on a register is not "
+				   "supported yet.\n");
+			return -ENOTSUP;
+		}
 	}
 
-	if (die_find_member(&type, field->name, die_mem) == NULL)
-		die("%s(tyep:%s) has no member %s.", varname,
-		    dwarf_diename(&type), field->name);
+	if (die_find_member(&type, field->name, die_mem) == NULL) {
+		pr_warning("%s(tyep:%s) has no member %s.\n", varname,
+			   dwarf_diename(&type), field->name);
+		return -EINVAL;
+	}
 
 	/* Get the offset of the field */
 	if (dwarf_attr(die_mem, DW_AT_data_member_location, &attr) == NULL ||
-	    dwarf_formudata(&attr, &offs) != 0)
-		die("Failed to get the offset of %s.", field->name);
+	    dwarf_formudata(&attr, &offs) != 0) {
+		pr_warning("Failed to get the offset of %s.\n", field->name);
+		return -ENOENT;
+	}
 	ref->offset += (long)offs;
 
 	/* Converting next field */
 	if (field->next)
-		convert_variable_fields(die_mem, field->name, field->next,
-					&ref, die_mem);
+		return convert_variable_fields(die_mem, field->name,
+					       field->next, &ref, die_mem);
+	else
+		return 0;
 }
 
 /* Show a variables in kprobe event format */
-static void convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
+static int convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
 {
 	Dwarf_Attribute attr;
 	Dwarf_Die die_mem;
@@ -538,28 +561,30 @@ static void convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
 	if (ret <= 0 || nexpr == 0)
 		goto error;
 
-	convert_location(expr, pf);
-
-	if (pf->pvar->field) {
-		convert_variable_fields(vr_die, pf->pvar->var,
-					pf->pvar->field, &pf->tvar->ref,
-					&die_mem);
+	ret = convert_location(expr, pf);
+	if (ret == 0 && pf->pvar->field) {
+		ret = convert_variable_fields(vr_die, pf->pvar->var,
+					      pf->pvar->field, &pf->tvar->ref,
+					      &die_mem);
 		vr_die = &die_mem;
 	}
-	if (pf->pvar->type)
-		pf->tvar->type = xstrdup(pf->pvar->type);
-	else
-		convert_variable_type(vr_die, pf->tvar);
+	if (ret == 0) {
+		if (pf->pvar->type)
+			pf->tvar->type = xstrdup(pf->pvar->type);
+		else
+			ret = convert_variable_type(vr_die, pf->tvar);
+	}
 	/* *expr will be cached in libdw. Don't free it. */
-	return ;
+	return ret;
 error:
 	/* TODO: Support const_value */
-	die("Failed to find the location of %s at this address.\n"
-	    " Perhaps, it has been optimized out.", pf->pvar->var);
+	pr_err("Failed to find the location of %s at this address.\n"
+	       " Perhaps, it has been optimized out.\n", pf->pvar->var);
+	return -ENOENT;
 }
 
 /* Find a variable in a subprogram die */
-static void find_variable(Dwarf_Die *sp_die, struct probe_finder *pf)
+static int find_variable(Dwarf_Die *sp_die, struct probe_finder *pf)
 {
 	Dwarf_Die vr_die;
 	char buf[32], *ptr;
@@ -578,19 +603,22 @@ static void find_variable(Dwarf_Die *sp_die, struct probe_finder *pf)
 	if (!is_c_varname(pf->pvar->var)) {
 		/* Copy raw parameters */
 		pf->tvar->value = xstrdup(pf->pvar->var);
-	} else {
-		pr_debug("Searching '%s' variable in context.\n",
-			 pf->pvar->var);
-		/* Search child die for local variables and parameters. */
-		if (!die_find_variable(sp_die, pf->pvar->var, &vr_die))
-			die("Failed to find '%s' in this function.",
-			    pf->pvar->var);
-		convert_variable(&vr_die, pf);
+		return 0;
 	}
+
+	pr_debug("Searching '%s' variable in context.\n",
+		 pf->pvar->var);
+	/* Search child die for local variables and parameters. */
+	if (!die_find_variable(sp_die, pf->pvar->var, &vr_die)) {
+		pr_warning("Failed to find '%s' in this function.\n",
+			   pf->pvar->var);
+		return -ENOENT;
+	}
+	return convert_variable(&vr_die, pf);
 }
 
 /* Show a probe point to output buffer */
-static void convert_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
+static int convert_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 {
 	struct kprobe_trace_event *tev;
 	Dwarf_Addr eaddr;
@@ -600,22 +628,31 @@ static void convert_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 	Dwarf_Attribute fb_attr;
 	size_t nops;
 
-	if (pf->ntevs == MAX_PROBES)
-		die("Too many( > %d) probe point found.\n", MAX_PROBES);
+	if (pf->ntevs == MAX_PROBES) {
+		pr_warning("Too many( > %d) probe point found.\n", MAX_PROBES);
+		return -ERANGE;
+	}
 	tev = &pf->tevs[pf->ntevs++];
 
 	/* If no real subprogram, find a real one */
 	if (!sp_die || dwarf_tag(sp_die) != DW_TAG_subprogram) {
 		sp_die = die_find_real_subprogram(&pf->cu_die,
 						 pf->addr, &die_mem);
-		if (!sp_die)
-			die("Probe point is not found in subprograms.");
+		if (!sp_die) {
+			pr_warning("Failed to find probe point in any "
+				   "functions.\n");
+			return -ENOENT;
+		}
 	}
 
 	/* Copy the name of probe point */
 	name = dwarf_diename(sp_die);
 	if (name) {
-		dwarf_entrypc(sp_die, &eaddr);
+		if (dwarf_entrypc(sp_die, &eaddr) != 0) {
+			pr_warning("Failed to get entry pc of %s\n",
+				   dwarf_diename(sp_die));
+			return -ENOENT;
+		}
 		tev->point.symbol = xstrdup(name);
 		tev->point.offset = (unsigned long)(pf->addr - eaddr);
 	} else
@@ -633,9 +670,12 @@ static void convert_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 	} else if (nops == 1 && pf->fb_ops[0].atom == DW_OP_call_frame_cfa &&
 		   pf->cfi != NULL) {
 		Dwarf_Frame *frame;
-		ret = dwarf_cfi_addrframe(pf->cfi, pf->addr, &frame);
-		DIE_IF(ret != 0);
-		dwarf_frame_cfa(frame, &pf->fb_ops, &nops);
+		if (dwarf_cfi_addrframe(pf->cfi, pf->addr, &frame) != 0 ||
+		    dwarf_frame_cfa(frame, &pf->fb_ops, &nops) != 0) {
+			pr_warning("Failed to get CFA on 0x%jx\n",
+				   (uintmax_t)pf->addr);
+			return -ENOENT;
+		}
 	}
 
 	/* Find each argument */
@@ -644,45 +684,53 @@ static void convert_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 	for (i = 0; i < pf->pev->nargs; i++) {
 		pf->pvar = &pf->pev->args[i];
 		pf->tvar = &tev->args[i];
-		find_variable(sp_die, pf);
+		ret = find_variable(sp_die, pf);
+		if (ret != 0)
+			return ret;
 	}
 
 	/* *pf->fb_ops will be cached in libdw. Don't free it. */
 	pf->fb_ops = NULL;
+	return 0;
 }
 
 /* Find probe point from its line number */
-static void find_probe_point_by_line(struct probe_finder *pf)
+static int find_probe_point_by_line(struct probe_finder *pf)
 {
 	Dwarf_Lines *lines;
 	Dwarf_Line *line;
 	size_t nlines, i;
 	Dwarf_Addr addr;
 	int lineno;
-	int ret;
+	int ret = 0;
 
-	ret = dwarf_getsrclines(&pf->cu_die, &lines, &nlines);
-	DIE_IF(ret != 0);
+	if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) {
+		pr_warning("No source lines found in this CU.\n");
+		return -ENOENT;
+	}
 
-	for (i = 0; i < nlines; i++) {
+	for (i = 0; i < nlines && ret == 0; i++) {
 		line = dwarf_onesrcline(lines, i);
-		dwarf_lineno(line, &lineno);
-		if (lineno != pf->lno)
+		if (dwarf_lineno(line, &lineno) != 0 ||
+		    lineno != pf->lno)
 			continue;
 
 		/* TODO: Get fileno from line, but how? */
 		if (strtailcmp(dwarf_linesrc(line, NULL, NULL), pf->fname) != 0)
 			continue;
 
-		ret = dwarf_lineaddr(line, &addr);
-		DIE_IF(ret != 0);
+		if (dwarf_lineaddr(line, &addr) != 0) {
+			pr_warning("Failed to get the address of the line.\n");
+			return -ENOENT;
+		}
 		pr_debug("Probe line found: line[%d]:%d addr:0x%jx\n",
 			 (int)i, lineno, (uintmax_t)addr);
 		pf->addr = addr;
 
-		convert_probe_point(NULL, pf);
+		ret = convert_probe_point(NULL, pf);
 		/* Continuing, because target line might be inlined. */
 	}
+	return ret;
 }
 
 /* Find lines which match lazy pattern */
@@ -690,15 +738,27 @@ static int find_lazy_match_lines(struct list_head *head,
 				 const char *fname, const char *pat)
 {
 	char *fbuf, *p1, *p2;
-	int fd, line, nlines = 0;
+	int fd, ret, line, nlines = 0;
 	struct stat st;
 
 	fd = open(fname, O_RDONLY);
-	if (fd < 0)
-		die("failed to open %s", fname);
-	DIE_IF(fstat(fd, &st) < 0);
+	if (fd < 0) {
+		pr_warning("Failed to open %s: %s\n", fname, strerror(-fd));
+		return fd;
+	}
+
+	ret = fstat(fd, &st);
+	if (ret < 0) {
+		pr_warning("Failed to get the size of %s: %s\n",
+			   fname, strerror(errno));
+		return ret;
+	}
 	fbuf = xmalloc(st.st_size + 2);
-	DIE_IF(read(fd, fbuf, st.st_size) < 0);
+	ret = read(fd, fbuf, st.st_size);
+	if (ret < 0) {
+		pr_warning("Failed to read %s: %s\n", fname, strerror(errno));
+		return ret;
+	}
 	close(fd);
 	fbuf[st.st_size] = '\n';	/* Dummy line */
 	fbuf[st.st_size + 1] = '\0';
@@ -718,7 +778,7 @@ static int find_lazy_match_lines(struct list_head *head,
 }
 
 /* Find probe points from lazy pattern  */
-static void find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
+static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
 {
 	Dwarf_Lines *lines;
 	Dwarf_Line *line;
@@ -726,31 +786,40 @@ static void find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
 	Dwarf_Addr addr;
 	Dwarf_Die die_mem;
 	int lineno;
-	int ret;
+	int ret = 0;
 
 	if (list_empty(&pf->lcache)) {
 		/* Matching lazy line pattern */
 		ret = find_lazy_match_lines(&pf->lcache, pf->fname,
 					    pf->pev->point.lazy_line);
-		if (ret <= 0)
-			die("No matched lines found in %s.", pf->fname);
+		if (ret == 0) {
+			pr_debug("No matched lines found in %s.\n", pf->fname);
+			return 0;
+		} else if (ret < 0)
+			return ret;
 	}
 
-	ret = dwarf_getsrclines(&pf->cu_die, &lines, &nlines);
-	DIE_IF(ret != 0);
-	for (i = 0; i < nlines; i++) {
+	if (dwarf_getsrclines(&pf->cu_die, &lines, &nlines) != 0) {
+		pr_warning("No source lines found in this CU.\n");
+		return -ENOENT;
+	}
+
+	for (i = 0; i < nlines && ret >= 0; i++) {
 		line = dwarf_onesrcline(lines, i);
 
-		dwarf_lineno(line, &lineno);
-		if (!line_list__has_line(&pf->lcache, lineno))
+		if (dwarf_lineno(line, &lineno) != 0 ||
+		    !line_list__has_line(&pf->lcache, lineno))
 			continue;
 
 		/* TODO: Get fileno from line, but how? */
 		if (strtailcmp(dwarf_linesrc(line, NULL, NULL), pf->fname) != 0)
 			continue;
 
-		ret = dwarf_lineaddr(line, &addr);
-		DIE_IF(ret != 0);
+		if (dwarf_lineaddr(line, &addr) != 0) {
+			pr_debug("Failed to get the address of line %d.\n",
+				 lineno);
+			continue;
+		}
 		if (sp_die) {
 			/* Address filtering 1: does sp_die include addr? */
 			if (!dwarf_haspc(sp_die, addr))
@@ -764,27 +833,42 @@ static void find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
 			 (int)i, lineno, (unsigned long long)addr);
 		pf->addr = addr;
 
-		convert_probe_point(sp_die, pf);
+		ret = convert_probe_point(sp_die, pf);
 		/* Continuing, because target line might be inlined. */
 	}
 	/* TODO: deallocate lines, but how? */
+	return ret;
 }
 
+/* Callback parameter with return value */
+struct dwarf_callback_param {
+	void *data;
+	int retval;
+};
+
 static int probe_point_inline_cb(Dwarf_Die *in_die, void *data)
 {
-	struct probe_finder *pf = (struct probe_finder *)data;
+	struct dwarf_callback_param *param = data;
+	struct probe_finder *pf = param->data;
 	struct perf_probe_point *pp = &pf->pev->point;
+	Dwarf_Addr addr;
 
 	if (pp->lazy_line)
-		find_probe_point_lazy(in_die, pf);
+		param->retval = find_probe_point_lazy(in_die, pf);
 	else {
 		/* Get probe address */
-		pf->addr = die_get_entrypc(in_die);
+		if (dwarf_entrypc(in_die, &addr) != 0) {
+			pr_warning("Failed to get entry pc of %s.\n",
+				   dwarf_diename(in_die));
+			param->retval = -ENOENT;
+			return DWARF_CB_ABORT;
+		}
+		pf->addr = addr;
 		pf->addr += pp->offset;
 		pr_debug("found inline addr: 0x%jx\n",
 			 (uintmax_t)pf->addr);
 
-		convert_probe_point(in_die, pf);
+		param->retval = convert_probe_point(in_die, pf);
 	}
 
 	return DWARF_CB_OK;
@@ -793,39 +877,53 @@ static int probe_point_inline_cb(Dwarf_Die *in_die, void *data)
 /* Search function from function name */
 static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
 {
-	struct probe_finder *pf = (struct probe_finder *)data;
+	struct dwarf_callback_param *param = data;
+	struct probe_finder *pf = param->data;
 	struct perf_probe_point *pp = &pf->pev->point;
 
 	/* Check tag and diename */
 	if (dwarf_tag(sp_die) != DW_TAG_subprogram ||
 	    die_compare_name(sp_die, pp->function) != 0)
-		return 0;
+		return DWARF_CB_OK;
 
 	pf->fname = dwarf_decl_file(sp_die);
 	if (pp->line) { /* Function relative line */
 		dwarf_decl_line(sp_die, &pf->lno);
 		pf->lno += pp->line;
-		find_probe_point_by_line(pf);
+		param->retval = find_probe_point_by_line(pf);
 	} else if (!dwarf_func_inline(sp_die)) {
 		/* Real function */
 		if (pp->lazy_line)
-			find_probe_point_lazy(sp_die, pf);
+			param->retval = find_probe_point_lazy(sp_die, pf);
 		else {
-			pf->addr = die_get_entrypc(sp_die);
+			if (dwarf_entrypc(sp_die, &pf->addr) != 0) {
+				pr_warning("Failed to get entry pc of %s.\n",
+					   dwarf_diename(sp_die));
+				param->retval = -ENOENT;
+				return DWARF_CB_ABORT;
+			}
 			pf->addr += pp->offset;
 			/* TODO: Check the address in this function */
-			convert_probe_point(sp_die, pf);
+			param->retval = convert_probe_point(sp_die, pf);
 		}
-	} else
+	} else {
+		struct dwarf_callback_param _param = {.data = (void *)pf,
+						      .retval = 0};
 		/* Inlined function: search instances */
-		dwarf_func_inline_instances(sp_die, probe_point_inline_cb, pf);
+		dwarf_func_inline_instances(sp_die, probe_point_inline_cb,
+					    &_param);
+		param->retval = _param.retval;
+	}
 
-	return 1; /* Exit; no same symbol in this CU. */
+	return DWARF_CB_ABORT; /* Exit; no same symbol in this CU. */
 }
 
-static void find_probe_point_by_func(struct probe_finder *pf)
+static int find_probe_point_by_func(struct probe_finder *pf)
 {
-	dwarf_getfuncs(&pf->cu_die, probe_point_search_cb, pf, 0);
+	struct dwarf_callback_param _param = {.data = (void *)pf,
+					      .retval = 0};
+	dwarf_getfuncs(&pf->cu_die, probe_point_search_cb, &_param, 0);
+	return _param.retval;
 }
 
 /* Find kprobe_trace_events specified by perf_probe_event from debuginfo */
@@ -838,14 +936,18 @@ int find_kprobe_trace_events(int fd, struct perf_probe_event *pev,
 	size_t cuhl;
 	Dwarf_Die *diep;
 	Dwarf *dbg;
+	int ret = 0;
 
 	pf.tevs = xzalloc(sizeof(struct kprobe_trace_event) * MAX_PROBES);
 	*tevs = pf.tevs;
 	pf.ntevs = 0;
 
 	dbg = dwarf_begin(fd, DWARF_C_READ);
-	if (!dbg)
-		return -ENOENT;
+	if (!dbg) {
+		pr_warning("No dwarf info found in the vmlinux - "
+			"please rebuild with CONFIG_DEBUG_INFO=y.\n");
+		return -EBADF;
+	}
 
 	/* Get the call frame information from this dwarf */
 	pf.cfi = dwarf_getcfi(dbg);
@@ -853,7 +955,8 @@ int find_kprobe_trace_events(int fd, struct perf_probe_event *pev,
 	off = 0;
 	line_list__init(&pf.lcache);
 	/* Loop on CUs (Compilation Unit) */
-	while (!dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL)) {
+	while (!dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL) &&
+	       ret >= 0) {
 		/* Get the DIE(Debugging Information Entry) of this CU */
 		diep = dwarf_offdie(dbg, off + cuhl, &pf.cu_die);
 		if (!diep)
@@ -867,12 +970,12 @@ int find_kprobe_trace_events(int fd, struct perf_probe_event *pev,
 
 		if (!pp->file || pf.fname) {
 			if (pp->function)
-				find_probe_point_by_func(&pf);
+				ret = find_probe_point_by_func(&pf);
 			else if (pp->lazy_line)
-				find_probe_point_lazy(NULL, &pf);
+				ret = find_probe_point_lazy(NULL, &pf);
 			else {
 				pf.lno = pp->line;
-				find_probe_point_by_line(&pf);
+				ret = find_probe_point_by_line(&pf);
 			}
 		}
 		off = noff;
@@ -880,7 +983,7 @@ int find_kprobe_trace_events(int fd, struct perf_probe_event *pev,
 	line_list__free(&pf.lcache);
 	dwarf_end(dbg);
 
-	return pf.ntevs;
+	return (ret < 0) ? ret : pf.ntevs;
 }
 
 /* Reverse search */
@@ -893,10 +996,11 @@ int find_perf_probe_point(int fd, unsigned long addr,
 	Dwarf_Addr laddr, eaddr;
 	const char *tmp;
 	int lineno, ret = 0;
+	bool found = false;
 
 	dbg = dwarf_begin(fd, DWARF_C_READ);
 	if (!dbg)
-		return -ENOENT;
+		return -EBADF;
 
 	/* Find cu die */
 	if (!dwarf_addrdie(dbg, (Dwarf_Addr)addr, &cudie)) {
@@ -907,82 +1011,87 @@ int find_perf_probe_point(int fd, unsigned long addr,
 	/* Find a corresponding line */
 	line = dwarf_getsrc_die(&cudie, (Dwarf_Addr)addr);
 	if (line) {
-		dwarf_lineaddr(line, &laddr);
-		if ((Dwarf_Addr)addr == laddr) {
-			dwarf_lineno(line, &lineno);
-			ppt->line = lineno;
-
+		if (dwarf_lineaddr(line, &laddr) == 0 &&
+		    (Dwarf_Addr)addr == laddr &&
+		    dwarf_lineno(line, &lineno) == 0) {
 			tmp = dwarf_linesrc(line, NULL, NULL);
-			DIE_IF(!tmp);
-			ppt->file = xstrdup(tmp);
-			ret = 1;
+			if (tmp) {
+				ppt->line = lineno;
+				ppt->file = xstrdup(tmp);
+				found = true;
+			}
 		}
 	}
 
 	/* Find a corresponding function */
 	if (die_find_real_subprogram(&cudie, (Dwarf_Addr)addr, &spdie)) {
 		tmp = dwarf_diename(&spdie);
-		if (!tmp)
+		if (!tmp || dwarf_entrypc(&spdie, &eaddr) != 0)
 			goto end;
 
-		dwarf_entrypc(&spdie, &eaddr);
-		if (!lineno) {
-			/* We don't have a line number, let's use offset */
-			ppt->function = xstrdup(tmp);
-			ppt->offset = addr - (unsigned long)eaddr;
-			ret = 1;
-			goto end;
-		}
-		if (die_find_inlinefunc(&spdie, (Dwarf_Addr)addr, &indie)) {
-			/* addr in an inline function */
-			tmp = dwarf_diename(&indie);
-			if (!tmp)
-				goto end;
-			dwarf_decl_line(&indie, &lineno);
-		} else {
-			if (eaddr == addr)	/* No offset: function entry */
-				lineno = ppt->line;
-			else
-				dwarf_decl_line(&spdie, &lineno);
+		if (ppt->line) {
+			if (die_find_inlinefunc(&spdie, (Dwarf_Addr)addr,
+						&indie)) {
+				/* addr in an inline function */
+				tmp = dwarf_diename(&indie);
+				if (!tmp)
+					goto end;
+				ret = dwarf_decl_line(&indie, &lineno);
+			} else {
+				if (eaddr == addr) {	/* Function entry */
+					lineno = ppt->line;
+					ret = 0;
+				} else
+					ret = dwarf_decl_line(&spdie, &lineno);
+			}
+			if (ret == 0) {
+				/* Make a relative line number */
+				ppt->line -= lineno;
+				goto found;
+			}
 		}
+		/* We don't have a line number, let's use offset */
+		ppt->offset = addr - (unsigned long)eaddr;
+found:
 		ppt->function = xstrdup(tmp);
-		ppt->line -= lineno;	/* Make a relative line number */
+		found = true;
 	}
 
 end:
 	dwarf_end(dbg);
+	if (ret >= 0)
+		ret = found ? 1 : 0;
 	return ret;
 }
 
 
 /* Find line range from its line number */
-static void find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
+static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
 {
 	Dwarf_Lines *lines;
 	Dwarf_Line *line;
 	size_t nlines, i;
 	Dwarf_Addr addr;
 	int lineno;
-	int ret;
 	const char *src;
 	Dwarf_Die die_mem;
 
 	line_list__init(&lf->lr->line_list);
-	ret = dwarf_getsrclines(&lf->cu_die, &lines, &nlines);
-	DIE_IF(ret != 0);
+	if (dwarf_getsrclines(&lf->cu_die, &lines, &nlines) != 0) {
+		pr_warning("No source lines found in this CU.\n");
+		return -ENOENT;
+	}
 
 	for (i = 0; i < nlines; i++) {
 		line = dwarf_onesrcline(lines, i);
-		ret = dwarf_lineno(line, &lineno);
-		DIE_IF(ret != 0);
-		if (lf->lno_s > lineno || lf->lno_e < lineno)
+		if (dwarf_lineno(line, &lineno) != 0 ||
+		    (lf->lno_s > lineno || lf->lno_e < lineno))
 			continue;
 
 		if (sp_die) {
 			/* Address filtering 1: does sp_die include addr? */
-			ret = dwarf_lineaddr(line, &addr);
-			DIE_IF(ret != 0);
-			if (!dwarf_haspc(sp_die, addr))
+			if (dwarf_lineaddr(line, &addr) != 0 ||
+			    !dwarf_haspc(sp_die, addr))
 				continue;
 
 			/* Address filtering 2: No child include addr? */
@@ -1007,18 +1116,22 @@ static void find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
 		free(lf->lr->path);
 		lf->lr->path = NULL;
 	}
+	return lf->found;
 }
 
 static int line_range_inline_cb(Dwarf_Die *in_die, void *data)
 {
-	find_line_range_by_line(in_die, (struct line_finder *)data);
+	struct dwarf_callback_param *param = data;
+
+	param->retval = find_line_range_by_line(in_die, param->data);
 	return DWARF_CB_ABORT;	/* No need to find other instances */
 }
 
 /* Search function from function name */
 static int line_range_search_cb(Dwarf_Die *sp_die, void *data)
 {
-	struct line_finder *lf = (struct line_finder *)data;
+	struct dwarf_callback_param *param = data;
+	struct line_finder *lf = param->data;
 	struct line_range *lr = lf->lr;
 
 	if (dwarf_tag(sp_die) == DW_TAG_subprogram &&
@@ -1033,38 +1146,47 @@ static int line_range_search_cb(Dwarf_Die *sp_die, void *data)
 			lf->lno_e = lr->offset + lr->end;
 		lr->start = lf->lno_s;
 		lr->end = lf->lno_e;
-		if (dwarf_func_inline(sp_die))
+		if (dwarf_func_inline(sp_die)) {
+			struct dwarf_callback_param _param;
+			_param.data = (void *)lf;
+			_param.retval = 0;
 			dwarf_func_inline_instances(sp_die,
-						    line_range_inline_cb, lf);
-		else
-			find_line_range_by_line(sp_die, lf);
-		return 1;
+						    line_range_inline_cb,
+						    &_param);
+			param->retval = _param.retval;
+		} else
+			param->retval = find_line_range_by_line(sp_die, lf);
+		return DWARF_CB_ABORT;
 	}
-	return 0;
+	return DWARF_CB_OK;
 }
 
-static void find_line_range_by_func(struct line_finder *lf)
+static int find_line_range_by_func(struct line_finder *lf)
 {
-	dwarf_getfuncs(&lf->cu_die, line_range_search_cb, lf, 0);
+	struct dwarf_callback_param param = {.data = (void *)lf, .retval = 0};
+	dwarf_getfuncs(&lf->cu_die, line_range_search_cb, &param, 0);
+	return param.retval;
 }
 
 int find_line_range(int fd, struct line_range *lr)
 {
 	struct line_finder lf = {.lr = lr, .found = 0};
-	int ret;
+	int ret = 0;
 	Dwarf_Off off = 0, noff;
 	size_t cuhl;
 	Dwarf_Die *diep;
 	Dwarf *dbg;
 
 	dbg = dwarf_begin(fd, DWARF_C_READ);
-	if (!dbg)
-		return -ENOENT;
+	if (!dbg) {
+		pr_warning("No dwarf info found in the vmlinux - "
+			"please rebuild with CONFIG_DEBUG_INFO=y.\n");
+		return -EBADF;
+	}
 
 	/* Loop on CUs (Compilation Unit) */
-	while (!lf.found) {
-		ret = dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL);
-		if (ret != 0)
+	while (!lf.found && ret >= 0) {
+		if (dwarf_nextcu(dbg, off, &noff, &cuhl, NULL, NULL, NULL) != 0)
 			break;
 
 		/* Get the DIE(Debugging Information Entry) of this CU */
@@ -1080,20 +1202,21 @@ int find_line_range(int fd, struct line_range *lr)
 
 		if (!lr->file || lf.fname) {
 			if (lr->function)
-				find_line_range_by_func(&lf);
+				ret = find_line_range_by_func(&lf);
 			else {
 				lf.lno_s = lr->start;
 				if (!lr->end)
 					lf.lno_e = INT_MAX;
 				else
 					lf.lno_e = lr->end;
-				find_line_range_by_line(NULL, &lf);
+				ret = find_line_range_by_line(NULL, &lf);
 			}
 		}
 		off = noff;
 	}
 	pr_debug("path: %lx\n", (unsigned long)lr->path);
 	dwarf_end(dbg);
-	return lf.found;
+
+	return (ret < 0) ? ret : lf.found;
 }
 
-- 
cgit v1.2.3


From 146a143948ed9e8b248c0ec59937f3e9e1bbc7e5 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 12 Apr 2010 13:17:42 -0400
Subject: perf probe: Remove die() from probe-event code

Remove die() and DIE_IF() code from util/probe-event.c since
these 'sudden death' in utility functions make reusing it from
other code (especially tui/gui) difficult.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100412171742.3790.33650.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-probe.c    |  70 +++--
 tools/perf/util/probe-event.c | 615 ++++++++++++++++++++++++++++--------------
 tools/perf/util/probe-event.h |  24 +-
 3 files changed, 480 insertions(+), 229 deletions(-)

diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index daf4668d2de..64bc11a183b 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -59,23 +59,25 @@ static struct {
 
 
 /* Parse an event definition. Note that any error must die. */
-static void parse_probe_event(const char *str)
+static int parse_probe_event(const char *str)
 {
 	struct perf_probe_event *pev = &params.events[params.nevents];
+	int ret;
 
 	pr_debug("probe-definition(%d): %s\n", params.nevents, str);
 	if (++params.nevents == MAX_PROBES)
 		die("Too many probes (> %d) are specified.", MAX_PROBES);
 
 	/* Parse a perf-probe command into event */
-	parse_perf_probe_command(str, pev);
-
+	ret = parse_perf_probe_command(str, pev);
 	pr_debug("%d arguments\n", pev->nargs);
+
+	return ret;
 }
 
-static void parse_probe_event_argv(int argc, const char **argv)
+static int parse_probe_event_argv(int argc, const char **argv)
 {
-	int i, len;
+	int i, len, ret;
 	char *buf;
 
 	/* Bind up rest arguments */
@@ -86,16 +88,18 @@ static void parse_probe_event_argv(int argc, const char **argv)
 	len = 0;
 	for (i = 0; i < argc; i++)
 		len += sprintf(&buf[len], "%s ", argv[i]);
-	parse_probe_event(buf);
+	ret = parse_probe_event(buf);
 	free(buf);
+	return ret;
 }
 
 static int opt_add_probe_event(const struct option *opt __used,
 			      const char *str, int unset __used)
 {
 	if (str)
-		parse_probe_event(str);
-	return 0;
+		return parse_probe_event(str);
+	else
+		return 0;
 }
 
 static int opt_del_probe_event(const struct option *opt __used,
@@ -113,11 +117,14 @@ static int opt_del_probe_event(const struct option *opt __used,
 static int opt_show_lines(const struct option *opt __used,
 			  const char *str, int unset __used)
 {
+	int ret = 0;
+
 	if (str)
-		parse_line_range_desc(str, &params.line_range);
+		ret = parse_line_range_desc(str, &params.line_range);
 	INIT_LIST_HEAD(&params.line_range.line_list);
 	params.show_lines = true;
-	return 0;
+
+	return ret;
 }
 #endif
 
@@ -178,6 +185,8 @@ static const struct option options[] = {
 
 int cmd_probe(int argc, const char **argv, const char *prefix __used)
 {
+	int ret;
+
 	argc = parse_options(argc, argv, options, probe_usage,
 			     PARSE_OPT_STOP_AT_NON_OPTION);
 	if (argc > 0) {
@@ -185,7 +194,11 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 			pr_warning("  Error: '-' is not supported.\n");
 			usage_with_options(probe_usage, options);
 		}
-		parse_probe_event_argv(argc, argv);
+		ret = parse_probe_event_argv(argc, argv);
+		if (ret < 0) {
+			pr_err("  Error: Parse Error.  (%d)\n", ret);
+			return ret;
+		}
 	}
 
 	if ((!params.nevents && !params.dellist && !params.list_events &&
@@ -197,16 +210,18 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 
 	if (params.list_events) {
 		if (params.nevents != 0 || params.dellist) {
-			pr_warning("  Error: Don't use --list with"
-				   " --add/--del.\n");
+			pr_err("  Error: Don't use --list with --add/--del.\n");
 			usage_with_options(probe_usage, options);
 		}
 		if (params.show_lines) {
-			pr_warning("  Error: Don't use --list with --line.\n");
+			pr_err("  Error: Don't use --list with --line.\n");
 			usage_with_options(probe_usage, options);
 		}
-		show_perf_probe_events();
-		return 0;
+		ret = show_perf_probe_events();
+		if (ret < 0)
+			pr_err("  Error: Failed to show event list. (%d)\n",
+			       ret);
+		return ret;
 	}
 
 #ifdef DWARF_SUPPORT
@@ -217,19 +232,30 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 			usage_with_options(probe_usage, options);
 		}
 
-		show_line_range(&params.line_range);
-		return 0;
+		ret = show_line_range(&params.line_range);
+		if (ret < 0)
+			pr_err("  Error: Failed to show lines. (%d)\n", ret);
+		return ret;
 	}
 #endif
 
 	if (params.dellist) {
-		del_perf_probe_events(params.dellist);
+		ret = del_perf_probe_events(params.dellist);
 		strlist__delete(params.dellist);
-		if (params.nevents == 0)
-			return 0;
+		if (ret < 0) {
+			pr_err("  Error: Failed to delete events. (%d)\n", ret);
+			return ret;
+		}
 	}
 
-	add_perf_probe_events(params.events, params.nevents, params.force_add);
+	if (params.nevents) {
+		ret = add_perf_probe_events(params.events, params.nevents,
+					    params.force_add);
+		if (ret < 0) {
+			pr_err("  Error: Failed to add events. (%d)\n", ret);
+			return ret;
+		}
+	}
 	return 0;
 }
 
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 7893b3207db..bd68f7b33b2 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -53,7 +53,7 @@
 
 bool probe_event_dry_run;	/* Dry run flag */
 
-#define semantic_error(msg ...) die("Semantic error :" msg)
+#define semantic_error(msg ...) pr_err("Semantic error :" msg)
 
 /* If there is no space to write, returns -E2BIG. */
 static int e_snprintf(char *str, size_t size, const char *format, ...)
@@ -76,19 +76,30 @@ static struct map_groups kmap_groups;
 static struct map *kmaps[MAP__NR_TYPES];
 
 /* Initialize symbol maps and path of vmlinux */
-static void init_vmlinux(void)
+static int init_vmlinux(void)
 {
+	int ret;
+
 	symbol_conf.sort_by_name = true;
 	if (symbol_conf.vmlinux_name == NULL)
 		symbol_conf.try_vmlinux_path = true;
 	else
 		pr_debug("Use vmlinux: %s\n", symbol_conf.vmlinux_name);
-	if (symbol__init() < 0)
-		die("Failed to init symbol map.");
+	ret = symbol__init();
+	if (ret < 0) {
+		pr_debug("Failed to init symbol map.\n");
+		goto out;
+	}
 
 	map_groups__init(&kmap_groups);
-	if (map_groups__create_kernel_maps(&kmap_groups, kmaps) < 0)
-		die("Failed to create kernel maps.");
+	ret = map_groups__create_kernel_maps(&kmap_groups, kmaps);
+	if (ret < 0)
+		pr_debug("Failed to create kernel maps.\n");
+
+out:
+	if (ret < 0)
+		pr_warning("Failed to init vmlinux path.\n");
+	return ret;
 }
 
 #ifdef DWARF_SUPPORT
@@ -102,24 +113,32 @@ static int open_vmlinux(void)
 	return open(kmaps[MAP__FUNCTION]->dso->long_name, O_RDONLY);
 }
 
-static void convert_to_perf_probe_point(struct kprobe_trace_point *tp,
-					struct perf_probe_point *pp)
+/* Convert trace point to probe point with debuginfo */
+static int convert_to_perf_probe_point(struct kprobe_trace_point *tp,
+				       struct perf_probe_point *pp)
 {
 	struct symbol *sym;
-	int fd, ret = 0;
+	int fd, ret = -ENOENT;
 
 	sym = map__find_symbol_by_name(kmaps[MAP__FUNCTION],
 				       tp->symbol, NULL);
 	if (sym) {
 		fd = open_vmlinux();
-		ret = find_perf_probe_point(fd, sym->start + tp->offset, pp);
-		close(fd);
+		if (fd >= 0) {
+			ret = find_perf_probe_point(fd,
+						 sym->start + tp->offset, pp);
+			close(fd);
+		}
 	}
 	if (ret <= 0) {
+		pr_debug("Failed to find corresponding probes from "
+			 "debuginfo. Use kprobe event information.\n");
 		pp->function = xstrdup(tp->symbol);
 		pp->offset = tp->offset;
 	}
 	pp->retprobe = tp->retprobe;
+
+	return 0;
 }
 
 /* Try to find perf_probe_event with debuginfo */
@@ -131,9 +150,10 @@ static int try_to_find_kprobe_trace_events(struct perf_probe_event *pev,
 
 	fd = open_vmlinux();
 	if (fd < 0) {
-		if (need_dwarf)
-			die("Could not open debuginfo file.");
-
+		if (need_dwarf) {
+			pr_warning("Failed to open debuginfo file.\n");
+			return fd;
+		}
 		pr_debug("Could not open vmlinux. Try to use symbols.\n");
 		return 0;
 	}
@@ -142,30 +162,32 @@ static int try_to_find_kprobe_trace_events(struct perf_probe_event *pev,
 	ntevs = find_kprobe_trace_events(fd, pev, tevs);
 	close(fd);
 
-	if (ntevs > 0)	/* Succeeded to find trace events */
+	if (ntevs > 0) {	/* Succeeded to find trace events */
+		pr_debug("find %d kprobe_trace_events.\n", ntevs);
 		return ntevs;
+	}
 
-	if (ntevs == 0)	/* No error but failed to find probe point. */
-		die("Probe point '%s' not found. - probe not added.",
-		    synthesize_perf_probe_point(&pev->point));
-
-	/* Error path */
+	if (ntevs == 0)	{	/* No error but failed to find probe point. */
+		pr_warning("Probe point '%s' not found.\n",
+			   synthesize_perf_probe_point(&pev->point));
+		return -ENOENT;
+	}
+	/* Error path : ntevs < 0 */
 	if (need_dwarf) {
 		if (ntevs == -EBADF)
 			pr_warning("No dwarf info found in the vmlinux - "
 				"please rebuild with CONFIG_DEBUG_INFO=y.\n");
-		die("Failed to analyze debuginfo.");
+		return ntevs;
 	}
 	pr_debug("An error occurred in debuginfo analysis."
 		 " Try to use symbols.\n");
 	return 0;
-
 }
 
 #define LINEBUF_SIZE 256
 #define NR_ADDITIONAL_LINES 2
 
-static void show_one_line(FILE *fp, unsigned int l, bool skip, bool show_num)
+static int show_one_line(FILE *fp, unsigned int l, bool skip, bool show_num)
 {
 	char buf[LINEBUF_SIZE];
 	const char *color = PERF_COLOR_BLUE;
@@ -190,19 +212,22 @@ static void show_one_line(FILE *fp, unsigned int l, bool skip, bool show_num)
 				color_fprintf(stdout, color, "%s", buf);
 		}
 	}
-	return;
+
+	return 0;
 error:
 	if (feof(fp))
-		die("Source file is shorter than expected.");
+		pr_warning("Source file is shorter than expected.\n");
 	else
-		die("File read error: %s", strerror(errno));
+		pr_warning("File read error: %s\n", strerror(errno));
+
+	return -1;
 }
 
 /*
  * Show line-range always requires debuginfo to find source file and
  * line number.
  */
-void show_line_range(struct line_range *lr)
+int show_line_range(struct line_range *lr)
 {
 	unsigned int l = 1;
 	struct line_node *ln;
@@ -210,14 +235,25 @@ void show_line_range(struct line_range *lr)
 	int fd, ret;
 
 	/* Search a line range */
-	init_vmlinux();
+	ret = init_vmlinux();
+	if (ret < 0)
+		return ret;
+
 	fd = open_vmlinux();
-	if (fd < 0)
-		die("Could not open debuginfo file.");
+	if (fd < 0) {
+		pr_warning("Failed to open debuginfo file.\n");
+		return fd;
+	}
+
 	ret = find_line_range(fd, lr);
-	if (ret <= 0)
-		die("Source line is not found.\n");
 	close(fd);
+	if (ret == 0) {
+		pr_warning("Specified source line is not found.\n");
+		return -ENOENT;
+	} else if (ret < 0) {
+		pr_warning("Debuginfo analysis failed. (%d)\n", ret);
+		return ret;
+	}
 
 	setup_pager();
 
@@ -228,52 +264,68 @@ void show_line_range(struct line_range *lr)
 		fprintf(stdout, "<%s:%d>\n", lr->file, lr->start);
 
 	fp = fopen(lr->path, "r");
-	if (fp == NULL)
-		die("Failed to open %s: %s", lr->path, strerror(errno));
+	if (fp == NULL) {
+		pr_warning("Failed to open %s: %s\n", lr->path,
+			   strerror(errno));
+		return -errno;
+	}
 	/* Skip to starting line number */
-	while (l < lr->start)
-		show_one_line(fp, l++, true, false);
+	while (l < lr->start && ret >= 0)
+		ret = show_one_line(fp, l++, true, false);
+	if (ret < 0)
+		goto end;
 
 	list_for_each_entry(ln, &lr->line_list, list) {
-		while (ln->line > l)
-			show_one_line(fp, (l++) - lr->offset, false, false);
-		show_one_line(fp, (l++) - lr->offset, false, true);
+		while (ln->line > l && ret >= 0)
+			ret = show_one_line(fp, (l++) - lr->offset,
+					    false, false);
+		if (ret >= 0)
+			ret = show_one_line(fp, (l++) - lr->offset,
+					    false, true);
+		if (ret < 0)
+			goto end;
 	}
 
 	if (lr->end == INT_MAX)
 		lr->end = l + NR_ADDITIONAL_LINES;
-	while (l < lr->end && !feof(fp))
-		show_one_line(fp, (l++) - lr->offset, false, false);
-
+	while (l < lr->end && !feof(fp) && ret >= 0)
+		ret = show_one_line(fp, (l++) - lr->offset, false, false);
+end:
 	fclose(fp);
+	return ret;
 }
 
 #else	/* !DWARF_SUPPORT */
 
-static void convert_to_perf_probe_point(struct kprobe_trace_point *tp,
+static int convert_to_perf_probe_point(struct kprobe_trace_point *tp,
 					struct perf_probe_point *pp)
 {
 	pp->function = xstrdup(tp->symbol);
 	pp->offset = tp->offset;
 	pp->retprobe = tp->retprobe;
+
+	return 0;
 }
 
 static int try_to_find_kprobe_trace_events(struct perf_probe_event *pev,
 				struct kprobe_trace_event **tevs __unused)
 {
-	if (perf_probe_event_need_dwarf(pev))
-		die("Debuginfo-analysis is not supported");
+	if (perf_probe_event_need_dwarf(pev)) {
+		pr_warning("Debuginfo-analysis is not supported.\n");
+		return -ENOSYS;
+	}
 	return 0;
 }
 
-void show_line_range(struct line_range *lr __unused)
+int show_line_range(struct line_range *lr __unused)
 {
-	die("Debuginfo-analysis is not supported");
+	pr_warning("Debuginfo-analysis is not supported.\n");
+	return -ENOSYS;
 }
 
 #endif
 
-void parse_line_range_desc(const char *arg, struct line_range *lr)
+int parse_line_range_desc(const char *arg, struct line_range *lr)
 {
 	const char *ptr;
 	char *tmp;
@@ -293,12 +345,16 @@ void parse_line_range_desc(const char *arg, struct line_range *lr)
 		else
 			lr->end = 0;
 		pr_debug("Line range is %u to %u\n", lr->start, lr->end);
-		if (lr->end && lr->start > lr->end)
+		if (lr->end && lr->start > lr->end) {
 			semantic_error("Start line must be smaller"
-				       " than end line.");
-		if (*tmp != '\0')
-			semantic_error("Tailing with invalid character '%d'.",
+				       " than end line.\n");
+			return -EINVAL;
+		}
+		if (*tmp != '\0') {
+			semantic_error("Tailing with invalid character '%d'.\n",
 				       *tmp);
+			return -EINVAL;
+		}
 		tmp = xstrndup(arg, (ptr - arg));
 	} else
 		tmp = xstrdup(arg);
@@ -307,6 +363,8 @@ void parse_line_range_desc(const char *arg, struct line_range *lr)
 		lr->file = tmp;
 	else
 		lr->function = tmp;
+
+	return 0;
 }
 
 /* Check the name is good for event/group */
@@ -322,7 +380,7 @@ static bool check_event_name(const char *name)
 }
 
 /* Parse probepoint definition. */
-static void parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
+static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 {
 	struct perf_probe_point *pp = &pev->point;
 	char *ptr, *tmp;
@@ -339,12 +397,15 @@ static void parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 	if (ptr && *ptr == '=') {	/* Event name */
 		*ptr = '\0';
 		tmp = ptr + 1;
-		ptr = strchr(arg, ':');
-		if (ptr)	/* Group name is not supported yet. */
-			semantic_error("Group name is not supported yet.");
-		if (!check_event_name(arg))
+		if (strchr(arg, ':')) {
+			semantic_error("Group name is not supported yet.\n");
+			return -ENOTSUP;
+		}
+		if (!check_event_name(arg)) {
 			semantic_error("%s is bad for event name -it must "
-				       "follow C symbol-naming rule.", arg);
+				       "follow C symbol-naming rule.\n", arg);
+			return -EINVAL;
+		}
 		pev->event = xstrdup(arg);
 		pev->group = NULL;
 		arg = tmp;
@@ -378,64 +439,89 @@ static void parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 		switch (c) {
 		case ':':	/* Line number */
 			pp->line = strtoul(arg, &tmp, 0);
-			if (*tmp != '\0')
+			if (*tmp != '\0') {
 				semantic_error("There is non-digit char"
-					       " in line number.");
+					       " in line number.\n");
+				return -EINVAL;
+			}
 			break;
 		case '+':	/* Byte offset from a symbol */
 			pp->offset = strtoul(arg, &tmp, 0);
-			if (*tmp != '\0')
+			if (*tmp != '\0') {
 				semantic_error("There is non-digit character"
-						" in offset.");
+						" in offset.\n");
+				return -EINVAL;
+			}
 			break;
 		case '@':	/* File name */
-			if (pp->file)
-				semantic_error("SRC@SRC is not allowed.");
+			if (pp->file) {
+				semantic_error("SRC@SRC is not allowed.\n");
+				return -EINVAL;
+			}
 			pp->file = xstrdup(arg);
 			break;
 		case '%':	/* Probe places */
 			if (strcmp(arg, "return") == 0) {
 				pp->retprobe = 1;
-			} else	/* Others not supported yet */
-				semantic_error("%%%s is not supported.", arg);
+			} else {	/* Others not supported yet */
+				semantic_error("%%%s is not supported.\n", arg);
+				return -ENOTSUP;
+			}
 			break;
-		default:
-			DIE_IF("Program has a bug.");
+		default:	/* Buggy case */
+			pr_err("This program has a bug at %s:%d.\n",
+				__FILE__, __LINE__);
+			return -ENOTSUP;
 			break;
 		}
 	}
 
 	/* Exclusion check */
-	if (pp->lazy_line && pp->line)
+	if (pp->lazy_line && pp->line) {
 		semantic_error("Lazy pattern can't be used with line number.");
+		return -EINVAL;
+	}
 
-	if (pp->lazy_line && pp->offset)
+	if (pp->lazy_line && pp->offset) {
 		semantic_error("Lazy pattern can't be used with offset.");
+		return -EINVAL;
+	}
 
-	if (pp->line && pp->offset)
+	if (pp->line && pp->offset) {
 		semantic_error("Offset can't be used with line number.");
+		return -EINVAL;
+	}
 
-	if (!pp->line && !pp->lazy_line && pp->file && !pp->function)
+	if (!pp->line && !pp->lazy_line && pp->file && !pp->function) {
 		semantic_error("File always requires line number or "
 			       "lazy pattern.");
+		return -EINVAL;
+	}
 
-	if (pp->offset && !pp->function)
+	if (pp->offset && !pp->function) {
 		semantic_error("Offset requires an entry function.");
+		return -EINVAL;
+	}
 
-	if (pp->retprobe && !pp->function)
+	if (pp->retprobe && !pp->function) {
 		semantic_error("Return probe requires an entry function.");
+		return -EINVAL;
+	}
 
-	if ((pp->offset || pp->line || pp->lazy_line) && pp->retprobe)
+	if ((pp->offset || pp->line || pp->lazy_line) && pp->retprobe) {
 		semantic_error("Offset/Line/Lazy pattern can't be used with "
 			       "return probe.");
+		return -EINVAL;
+	}
 
 	pr_debug("symbol:%s file:%s line:%d offset:%lu return:%d lazy:%s\n",
 		 pp->function, pp->file, pp->line, pp->offset, pp->retprobe,
 		 pp->lazy_line);
+	return 0;
 }
 
 /* Parse perf-probe event argument */
-static void parse_perf_probe_arg(char *str, struct perf_probe_arg *arg)
+static int parse_perf_probe_arg(char *str, struct perf_probe_arg *arg)
 {
 	char *tmp;
 	struct perf_probe_arg_field **fieldp;
@@ -461,7 +547,7 @@ static void parse_perf_probe_arg(char *str, struct perf_probe_arg *arg)
 		/* A variable, register, symbol or special value */
 		arg->var = xstrdup(str);
 		pr_debug("%s\n", arg->var);
-		return;
+		return 0;
 	}
 
 	/* Structure fields */
@@ -477,8 +563,10 @@ static void parse_perf_probe_arg(char *str, struct perf_probe_arg *arg)
 		} else if (tmp[1] == '>') {
 			str = tmp + 2;
 			(*fieldp)->ref = true;
-		} else
-			semantic_error("Argument parse error: %s", str);
+		} else {
+			semantic_error("Argument parse error: %s\n", str);
+			return -EINVAL;
+		}
 
 		tmp = strpbrk(str, "-.");
 		if (tmp) {
@@ -493,34 +581,47 @@ static void parse_perf_probe_arg(char *str, struct perf_probe_arg *arg)
 	/* If no name is specified, set the last field name */
 	if (!arg->name)
 		arg->name = xstrdup((*fieldp)->name);
+
+	return 0;
 }
 
 /* Parse perf-probe event command */
-void parse_perf_probe_command(const char *cmd, struct perf_probe_event *pev)
+int parse_perf_probe_command(const char *cmd, struct perf_probe_event *pev)
 {
 	char **argv;
-	int argc, i;
+	int argc, i, ret = 0;
 
 	argv = argv_split(cmd, &argc);
-	if (!argv)
-		die("argv_split failed.");
-	if (argc > MAX_PROBE_ARGS + 1)
-		semantic_error("Too many arguments");
-
+	if (!argv) {
+		pr_debug("Failed to split arguments.\n");
+		return -ENOMEM;
+	}
+	if (argc - 1 > MAX_PROBE_ARGS) {
+		semantic_error("Too many probe arguments (%d).\n", argc - 1);
+		ret = -ERANGE;
+		goto out;
+	}
 	/* Parse probe point */
-	parse_perf_probe_point(argv[0], pev);
+	ret = parse_perf_probe_point(argv[0], pev);
+	if (ret < 0)
+		goto out;
 
 	/* Copy arguments and ensure return probe has no C argument */
 	pev->nargs = argc - 1;
 	pev->args = xzalloc(sizeof(struct perf_probe_arg) * pev->nargs);
-	for (i = 0; i < pev->nargs; i++) {
-		parse_perf_probe_arg(argv[i + 1], &pev->args[i]);
-		if (is_c_varname(pev->args[i].var) && pev->point.retprobe)
+	for (i = 0; i < pev->nargs && ret >= 0; i++) {
+		ret = parse_perf_probe_arg(argv[i + 1], &pev->args[i]);
+		if (ret >= 0 &&
+		    is_c_varname(pev->args[i].var) && pev->point.retprobe) {
 			semantic_error("You can't specify local variable for"
-				       " kretprobe");
+				       " kretprobe.\n");
+			ret = -EINVAL;
+		}
 	}
-
+out:
 	argv_free(argv);
+
+	return ret;
 }
 
 /* Return true if this perf_probe_event requires debuginfo */
@@ -539,7 +640,7 @@ bool perf_probe_event_need_dwarf(struct perf_probe_event *pev)
 }
 
 /* Parse kprobe_events event into struct probe_point */
-void parse_kprobe_trace_command(const char *cmd, struct kprobe_trace_event *tev)
+int parse_kprobe_trace_command(const char *cmd, struct kprobe_trace_event *tev)
 {
 	struct kprobe_trace_point *tp = &tev->point;
 	char pr;
@@ -549,17 +650,25 @@ void parse_kprobe_trace_command(const char *cmd, struct kprobe_trace_event *tev)
 
 	pr_debug("Parsing kprobe_events: %s\n", cmd);
 	argv = argv_split(cmd, &argc);
-	if (!argv)
-		die("argv_split failed.");
-	if (argc < 2)
-		semantic_error("Too less arguments.");
+	if (!argv) {
+		pr_debug("Failed to split arguments.\n");
+		return -ENOMEM;
+	}
+	if (argc < 2) {
+		semantic_error("Too few probe arguments.\n");
+		ret = -ERANGE;
+		goto out;
+	}
 
 	/* Scan event and group name. */
 	ret = sscanf(argv[0], "%c:%a[^/ \t]/%a[^ \t]",
 		     &pr, (float *)(void *)&tev->group,
 		     (float *)(void *)&tev->event);
-	if (ret != 3)
-		semantic_error("Failed to parse event name: %s", argv[0]);
+	if (ret != 3) {
+		semantic_error("Failed to parse event name: %s\n", argv[0]);
+		ret = -EINVAL;
+		goto out;
+	}
 	pr_debug("Group:%s Event:%s probe:%c\n", tev->group, tev->event, pr);
 
 	tp->retprobe = (pr == 'r');
@@ -582,8 +691,10 @@ void parse_kprobe_trace_command(const char *cmd, struct kprobe_trace_event *tev)
 		/* TODO: parse regs and offset */
 		tev->args[i].value = xstrdup(p);
 	}
-
+	ret = 0;
+out:
 	argv_free(argv);
+	return ret;
 }
 
 /* Compose only probe arg */
@@ -622,7 +733,9 @@ int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf, size_t len)
 
 	return tmp - buf;
 error:
-	die("Failed to synthesize perf probe argument: %s", strerror(-ret));
+	pr_debug("Failed to synthesize perf probe argument: %s",
+		 strerror(-ret));
+	return ret;
 }
 
 /* Compose only probe point (not argument) */
@@ -666,7 +779,10 @@ static char *synthesize_perf_probe_point(struct perf_probe_point *pp)
 
 	return buf;
 error:
-	die("Failed to synthesize perf probe point: %s", strerror(-ret));
+	pr_debug("Failed to synthesize perf probe point: %s",
+		 strerror(-ret));
+	free(buf);
+	return NULL;
 }
 
 #if 0
@@ -796,29 +912,37 @@ error:
 	return NULL;
 }
 
-void convert_to_perf_probe_event(struct kprobe_trace_event *tev,
-				 struct perf_probe_event *pev)
+int convert_to_perf_probe_event(struct kprobe_trace_event *tev,
+				struct perf_probe_event *pev)
 {
 	char buf[64];
-	int i;
+	int i, ret;
 
 	/* Convert event/group name */
 	pev->event = xstrdup(tev->event);
 	pev->group = xstrdup(tev->group);
 
 	/* Convert trace_point to probe_point */
-	convert_to_perf_probe_point(&tev->point, &pev->point);
+	ret = convert_to_perf_probe_point(&tev->point, &pev->point);
+	if (ret < 0)
+		return ret;
 
 	/* Convert trace_arg to probe_arg */
 	pev->nargs = tev->nargs;
 	pev->args = xzalloc(sizeof(struct perf_probe_arg) * pev->nargs);
-	for (i = 0; i < tev->nargs; i++)
+	for (i = 0; i < tev->nargs && ret >= 0; i++)
 		if (tev->args[i].name)
 			pev->args[i].name = xstrdup(tev->args[i].name);
 		else {
-			synthesize_kprobe_trace_arg(&tev->args[i], buf, 64);
+			ret = synthesize_kprobe_trace_arg(&tev->args[i],
+							  buf, 64);
 			pev->args[i].name = xstrdup(buf);
 		}
+
+	if (ret < 0)
+		clear_perf_probe_event(pev);
+
+	return ret;
 }
 
 void clear_perf_probe_event(struct perf_probe_event *pev)
@@ -894,21 +1018,20 @@ static int open_kprobe_events(bool readwrite)
 	int ret;
 
 	ret = e_snprintf(buf, PATH_MAX, "%s/../kprobe_events", debugfs_path);
-	if (ret < 0)
-		die("Failed to make kprobe_events path.");
-
-	if (readwrite && !probe_event_dry_run)
-		ret = open(buf, O_RDWR, O_APPEND);
-	else
-		ret = open(buf, O_RDONLY, 0);
+	if (ret >= 0) {
+		if (readwrite && !probe_event_dry_run)
+			ret = open(buf, O_RDWR, O_APPEND);
+		else
+			ret = open(buf, O_RDONLY, 0);
+	}
 
 	if (ret < 0) {
 		if (errno == ENOENT)
-			die("kprobe_events file does not exist -"
-			    " please rebuild with CONFIG_KPROBE_EVENT.");
+			pr_warning("kprobe_events file does not exist - please"
+				 " rebuild kernel with CONFIG_KPROBE_EVENT.\n");
 		else
-			die("Could not open kprobe_events file: %s",
-			    strerror(errno));
+			pr_warning("Failed to open kprobe_events file: %s\n",
+				   strerror(errno));
 	}
 	return ret;
 }
@@ -934,8 +1057,11 @@ static struct strlist *get_kprobe_trace_command_rawlist(int fd)
 		if (p[idx] == '\n')
 			p[idx] = '\0';
 		ret = strlist__add(sl, buf);
-		if (ret < 0)
-			die("strlist__add failed: %s", strerror(-ret));
+		if (ret < 0) {
+			pr_debug("strlist__add failed: %s\n", strerror(-ret));
+			strlist__delete(sl);
+			return NULL;
+		}
 	}
 	fclose(fp);
 
@@ -943,7 +1069,7 @@ static struct strlist *get_kprobe_trace_command_rawlist(int fd)
 }
 
 /* Show an event */
-static void show_perf_probe_event(struct perf_probe_event *pev)
+static int show_perf_probe_event(struct perf_probe_event *pev)
 {
 	int i, ret;
 	char buf[128];
@@ -951,52 +1077,71 @@ static void show_perf_probe_event(struct perf_probe_event *pev)
 
 	/* Synthesize only event probe point */
 	place = synthesize_perf_probe_point(&pev->point);
+	if (!place)
+		return -EINVAL;
 
 	ret = e_snprintf(buf, 128, "%s:%s", pev->group, pev->event);
 	if (ret < 0)
-		die("Failed to copy event: %s", strerror(-ret));
+		return ret;
+
 	printf("  %-20s (on %s", buf, place);
 
 	if (pev->nargs > 0) {
 		printf(" with");
 		for (i = 0; i < pev->nargs; i++) {
-			synthesize_perf_probe_arg(&pev->args[i], buf, 128);
+			ret = synthesize_perf_probe_arg(&pev->args[i],
+							buf, 128);
+			if (ret < 0)
+				break;
 			printf(" %s", buf);
 		}
 	}
 	printf(")\n");
 	free(place);
+	return ret;
 }
 
 /* List up current perf-probe events */
-void show_perf_probe_events(void)
+int show_perf_probe_events(void)
 {
-	int fd;
+	int fd, ret;
 	struct kprobe_trace_event tev;
 	struct perf_probe_event pev;
 	struct strlist *rawlist;
 	struct str_node *ent;
 
 	setup_pager();
-	init_vmlinux();
+	ret = init_vmlinux();
+	if (ret < 0)
+		return ret;
 
 	memset(&tev, 0, sizeof(tev));
 	memset(&pev, 0, sizeof(pev));
 
 	fd = open_kprobe_events(false);
+	if (fd < 0)
+		return fd;
+
 	rawlist = get_kprobe_trace_command_rawlist(fd);
 	close(fd);
+	if (!rawlist)
+		return -ENOENT;
 
 	strlist__for_each(ent, rawlist) {
-		parse_kprobe_trace_command(ent->s, &tev);
-		convert_to_perf_probe_event(&tev, &pev);
-		/* Show an event */
-		show_perf_probe_event(&pev);
+		ret = parse_kprobe_trace_command(ent->s, &tev);
+		if (ret >= 0) {
+			ret = convert_to_perf_probe_event(&tev, &pev);
+			if (ret >= 0)
+				ret = show_perf_probe_event(&pev);
+		}
 		clear_perf_probe_event(&pev);
 		clear_kprobe_trace_event(&tev);
+		if (ret < 0)
+			break;
 	}
-
 	strlist__delete(rawlist);
+
+	return ret;
 }
 
 /* Get current perf-probe event names */
@@ -1006,88 +1151,118 @@ static struct strlist *get_kprobe_trace_event_names(int fd, bool include_group)
 	struct strlist *sl, *rawlist;
 	struct str_node *ent;
 	struct kprobe_trace_event tev;
+	int ret = 0;
 
 	memset(&tev, 0, sizeof(tev));
 
 	rawlist = get_kprobe_trace_command_rawlist(fd);
 	sl = strlist__new(true, NULL);
 	strlist__for_each(ent, rawlist) {
-		parse_kprobe_trace_command(ent->s, &tev);
+		ret = parse_kprobe_trace_command(ent->s, &tev);
+		if (ret < 0)
+			break;
 		if (include_group) {
-			if (e_snprintf(buf, 128, "%s:%s", tev.group,
-				       tev.event) < 0)
-				die("Failed to copy group:event name.");
-			strlist__add(sl, buf);
+			ret = e_snprintf(buf, 128, "%s:%s", tev.group,
+					tev.event);
+			if (ret >= 0)
+				ret = strlist__add(sl, buf);
 		} else
-			strlist__add(sl, tev.event);
+			ret = strlist__add(sl, tev.event);
 		clear_kprobe_trace_event(&tev);
+		if (ret < 0)
+			break;
 	}
-
 	strlist__delete(rawlist);
 
+	if (ret < 0) {
+		strlist__delete(sl);
+		return NULL;
+	}
 	return sl;
 }
 
-static void write_kprobe_trace_event(int fd, struct kprobe_trace_event *tev)
+static int write_kprobe_trace_event(int fd, struct kprobe_trace_event *tev)
 {
 	int ret;
 	char *buf = synthesize_kprobe_trace_command(tev);
 
+	if (!buf) {
+		pr_debug("Failed to synthesize kprobe trace event.\n");
+		return -EINVAL;
+	}
+
 	pr_debug("Writing event: %s\n", buf);
 	if (!probe_event_dry_run) {
 		ret = write(fd, buf, strlen(buf));
 		if (ret <= 0)
-			die("Failed to write event: %s", strerror(errno));
+			pr_warning("Failed to write event: %s\n",
+				   strerror(errno));
 	}
 	free(buf);
+	return ret;
 }
 
-static void get_new_event_name(char *buf, size_t len, const char *base,
-			       struct strlist *namelist, bool allow_suffix)
+static int get_new_event_name(char *buf, size_t len, const char *base,
+			      struct strlist *namelist, bool allow_suffix)
 {
 	int i, ret;
 
 	/* Try no suffix */
 	ret = e_snprintf(buf, len, "%s", base);
-	if (ret < 0)
-		die("snprintf() failed: %s", strerror(-ret));
+	if (ret < 0) {
+		pr_debug("snprintf() failed: %s\n", strerror(-ret));
+		return ret;
+	}
 	if (!strlist__has_entry(namelist, buf))
-		return;
+		return 0;
 
 	if (!allow_suffix) {
 		pr_warning("Error: event \"%s\" already exists. "
 			   "(Use -f to force duplicates.)\n", base);
-		die("Can't add new event.");
+		return -EEXIST;
 	}
 
 	/* Try to add suffix */
 	for (i = 1; i < MAX_EVENT_INDEX; i++) {
 		ret = e_snprintf(buf, len, "%s_%d", base, i);
-		if (ret < 0)
-			die("snprintf() failed: %s", strerror(-ret));
+		if (ret < 0) {
+			pr_debug("snprintf() failed: %s\n", strerror(-ret));
+			return ret;
+		}
 		if (!strlist__has_entry(namelist, buf))
 			break;
 	}
-	if (i == MAX_EVENT_INDEX)
-		die("Too many events are on the same function.");
+	if (i == MAX_EVENT_INDEX) {
+		pr_warning("Too many events are on the same function.\n");
+		ret = -ERANGE;
+	}
+
+	return ret;
 }
 
-static void __add_kprobe_trace_events(struct perf_probe_event *pev,
-				      struct kprobe_trace_event *tevs,
-				      int ntevs, bool allow_suffix)
+static int __add_kprobe_trace_events(struct perf_probe_event *pev,
+				     struct kprobe_trace_event *tevs,
+				     int ntevs, bool allow_suffix)
 {
-	int i, fd;
+	int i, fd, ret;
 	struct kprobe_trace_event *tev = NULL;
 	char buf[64];
 	const char *event, *group;
 	struct strlist *namelist;
 
 	fd = open_kprobe_events(true);
+	if (fd < 0)
+		return fd;
 	/* Get current event names */
 	namelist = get_kprobe_trace_event_names(fd, false);
+	if (!namelist) {
+		pr_debug("Failed to get current event list.\n");
+		return -EIO;
+	}
 
+	ret = 0;
 	printf("Add new event%s\n", (ntevs > 1) ? "s:" : ":");
-	for (i = 0; i < ntevs; i++) {
+	for (i = 0; i < ntevs && ret >= 0; i++) {
 		tev = &tevs[i];
 		if (pev->event)
 			event = pev->event;
@@ -1102,12 +1277,17 @@ static void __add_kprobe_trace_events(struct perf_probe_event *pev,
 			group = PERFPROBE_GROUP;
 
 		/* Get an unused new event name */
-		get_new_event_name(buf, 64, event, namelist, allow_suffix);
+		ret = get_new_event_name(buf, 64, event,
+					 namelist, allow_suffix);
+		if (ret < 0)
+			break;
 		event = buf;
 
 		tev->event = xstrdup(event);
 		tev->group = xstrdup(group);
-		write_kprobe_trace_event(fd, tev);
+		ret = write_kprobe_trace_event(fd, tev);
+		if (ret < 0)
+			break;
 		/* Add added event name to namelist */
 		strlist__add(namelist, event);
 
@@ -1129,12 +1309,17 @@ static void __add_kprobe_trace_events(struct perf_probe_event *pev,
 		 */
 		allow_suffix = true;
 	}
-	/* Show how to use the event. */
-	printf("\nYou can now use it on all perf tools, such as:\n\n");
-	printf("\tperf record -e %s:%s -a sleep 1\n\n", tev->group, tev->event);
+
+	if (ret >= 0) {
+		/* Show how to use the event. */
+		printf("\nYou can now use it on all perf tools, such as:\n\n");
+		printf("\tperf record -e %s:%s -aR sleep 1\n\n", tev->group,
+			 tev->event);
+	}
 
 	strlist__delete(namelist);
 	close(fd);
+	return ret;
 }
 
 static int convert_to_kprobe_trace_events(struct perf_probe_event *pev,
@@ -1146,7 +1331,7 @@ static int convert_to_kprobe_trace_events(struct perf_probe_event *pev,
 
 	/* Convert perf_probe_event with debuginfo */
 	ntevs = try_to_find_kprobe_trace_events(pev, tevs);
-	if (ntevs > 0)
+	if (ntevs != 0)
 		return ntevs;
 
 	/* Allocate trace event buffer */
@@ -1172,10 +1357,11 @@ static int convert_to_kprobe_trace_events(struct perf_probe_event *pev,
 	/* Currently just checking function name from symbol map */
 	sym = map__find_symbol_by_name(kmaps[MAP__FUNCTION],
 				       tev->point.symbol, NULL);
-	if (!sym)
-		die("Kernel symbol \'%s\' not found - probe not added.",
-		    tev->point.symbol);
-
+	if (!sym) {
+		pr_warning("Kernel symbol \'%s\' not found.\n",
+			   tev->point.symbol);
+		return -ENOENT;
+	}
 	return ntevs;
 }
 
@@ -1185,93 +1371,128 @@ struct __event_package {
 	int				ntevs;
 };
 
-void add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
-			   bool force_add)
+int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
+			  bool force_add)
 {
-	int i;
+	int i, j, ret;
 	struct __event_package *pkgs;
 
 	pkgs = xzalloc(sizeof(struct __event_package) * npevs);
 
 	/* Init vmlinux path */
-	init_vmlinux();
+	ret = init_vmlinux();
+	if (ret < 0)
+		return ret;
 
 	/* Loop 1: convert all events */
 	for (i = 0; i < npevs; i++) {
 		pkgs[i].pev = &pevs[i];
 		/* Convert with or without debuginfo */
-		pkgs[i].ntevs = convert_to_kprobe_trace_events(pkgs[i].pev,
-							       &pkgs[i].tevs);
+		ret  = convert_to_kprobe_trace_events(pkgs[i].pev,
+						      &pkgs[i].tevs);
+		if (ret < 0)
+			goto end;
+		pkgs[i].ntevs = ret;
 	}
 
 	/* Loop 2: add all events */
+	for (i = 0; i < npevs && ret >= 0; i++)
+		ret = __add_kprobe_trace_events(pkgs[i].pev, pkgs[i].tevs,
+						pkgs[i].ntevs, force_add);
+end:
+	/* Loop 3: cleanup trace events  */
 	for (i = 0; i < npevs; i++)
-		__add_kprobe_trace_events(pkgs[i].pev, pkgs[i].tevs,
-					  pkgs[i].ntevs, force_add);
-	/* TODO: cleanup all trace events? */
+		for (j = 0; j < pkgs[i].ntevs; j++)
+			clear_kprobe_trace_event(&pkgs[i].tevs[j]);
+
+	return ret;
 }
 
-static void __del_trace_kprobe_event(int fd, struct str_node *ent)
+static int __del_trace_kprobe_event(int fd, struct str_node *ent)
 {
 	char *p;
 	char buf[128];
 	int ret;
 
 	/* Convert from perf-probe event to trace-kprobe event */
-	if (e_snprintf(buf, 128, "-:%s", ent->s) < 0)
-		die("Failed to copy event.");
+	ret = e_snprintf(buf, 128, "-:%s", ent->s);
+	if (ret < 0)
+		goto error;
+
 	p = strchr(buf + 2, ':');
-	if (!p)
-		die("Internal error: %s should have ':' but not.", ent->s);
+	if (!p) {
+		pr_debug("Internal error: %s should have ':' but not.\n",
+			 ent->s);
+		ret = -ENOTSUP;
+		goto error;
+	}
 	*p = '/';
 
 	pr_debug("Writing event: %s\n", buf);
 	ret = write(fd, buf, strlen(buf));
-	if (ret <= 0)
-		die("Failed to write event: %s", strerror(errno));
+	if (ret < 0)
+		goto error;
+
 	printf("Remove event: %s\n", ent->s);
+	return 0;
+error:
+	pr_warning("Failed to delete event: %s\n", strerror(-ret));
+	return ret;
 }
 
-static void del_trace_kprobe_event(int fd, const char *group,
-				   const char *event, struct strlist *namelist)
+static int del_trace_kprobe_event(int fd, const char *group,
+				  const char *event, struct strlist *namelist)
 {
 	char buf[128];
 	struct str_node *ent, *n;
-	int found = 0;
+	int found = 0, ret = 0;
 
-	if (e_snprintf(buf, 128, "%s:%s", group, event) < 0)
-		die("Failed to copy event.");
+	ret = e_snprintf(buf, 128, "%s:%s", group, event);
+	if (ret < 0) {
+		pr_err("Failed to copy event.");
+		return ret;
+	}
 
 	if (strpbrk(buf, "*?")) { /* Glob-exp */
 		strlist__for_each_safe(ent, n, namelist)
 			if (strglobmatch(ent->s, buf)) {
 				found++;
-				__del_trace_kprobe_event(fd, ent);
+				ret = __del_trace_kprobe_event(fd, ent);
+				if (ret < 0)
+					break;
 				strlist__remove(namelist, ent);
 			}
 	} else {
 		ent = strlist__find(namelist, buf);
 		if (ent) {
 			found++;
-			__del_trace_kprobe_event(fd, ent);
-			strlist__remove(namelist, ent);
+			ret = __del_trace_kprobe_event(fd, ent);
+			if (ret >= 0)
+				strlist__remove(namelist, ent);
 		}
 	}
-	if (found == 0)
-		pr_info("Info: event \"%s\" does not exist, could not remove it.\n", buf);
+	if (found == 0 && ret >= 0)
+		pr_info("Info: Event \"%s\" does not exist.\n", buf);
+
+	return ret;
 }
 
-void del_perf_probe_events(struct strlist *dellist)
+int del_perf_probe_events(struct strlist *dellist)
 {
-	int fd;
+	int fd, ret = 0;
 	const char *group, *event;
 	char *p, *str;
 	struct str_node *ent;
 	struct strlist *namelist;
 
 	fd = open_kprobe_events(true);
+	if (fd < 0)
+		return fd;
+
 	/* Get current event names */
 	namelist = get_kprobe_trace_event_names(fd, true);
+	if (namelist == NULL)
+		return -EINVAL;
 
 	strlist__for_each(ent, dellist) {
 		str = xstrdup(ent->s);
@@ -1286,10 +1507,14 @@ void del_perf_probe_events(struct strlist *dellist)
 			event = str;
 		}
 		pr_debug("Group: %s, Event: %s\n", group, event);
-		del_trace_kprobe_event(fd, group, event, namelist);
+		ret = del_trace_kprobe_event(fd, group, event, namelist);
 		free(str);
+		if (ret < 0)
+			break;
 	}
 	strlist__delete(namelist);
 	close(fd);
+
+	return ret;
 }
 
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index ff2f26b1822..ab549290170 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -89,10 +89,10 @@ struct line_range {
 };
 
 /* Command string to events */
-extern void parse_perf_probe_command(const char *cmd,
-				     struct perf_probe_event *pev);
-extern void parse_kprobe_trace_command(const char *cmd,
-				       struct kprobe_trace_event *tev);
+extern int parse_perf_probe_command(const char *cmd,
+				    struct perf_probe_event *pev);
+extern int parse_kprobe_trace_command(const char *cmd,
+				      struct kprobe_trace_event *tev);
 
 /* Events to command string */
 extern char *synthesize_perf_probe_command(struct perf_probe_event *pev);
@@ -104,22 +104,22 @@ extern int synthesize_perf_probe_arg(struct perf_probe_arg *pa, char *buf,
 extern bool perf_probe_event_need_dwarf(struct perf_probe_event *pev);
 
 /* Convert from kprobe_trace_event to perf_probe_event */
-extern void convert_to_perf_probe_event(struct kprobe_trace_event *tev,
-					struct perf_probe_event *pev);
+extern int convert_to_perf_probe_event(struct kprobe_trace_event *tev,
+				       struct perf_probe_event *pev);
 
 /* Release event contents */
 extern void clear_perf_probe_event(struct perf_probe_event *pev);
 extern void clear_kprobe_trace_event(struct kprobe_trace_event *tev);
 
 /* Command string to line-range */
-extern void parse_line_range_desc(const char *cmd, struct line_range *lr);
+extern int parse_line_range_desc(const char *cmd, struct line_range *lr);
 
 
-extern void add_perf_probe_events(struct perf_probe_event *pevs, int ntevs,
-				  bool force_add);
-extern void del_perf_probe_events(struct strlist *dellist);
-extern void show_perf_probe_events(void);
-extern void show_line_range(struct line_range *lr);
+extern int add_perf_probe_events(struct perf_probe_event *pevs, int ntevs,
+				 bool force_add);
+extern int del_perf_probe_events(struct strlist *dellist);
+extern int show_perf_probe_events(void);
+extern int show_line_range(struct line_range *lr);
 
 
 /* Maximum index number of event-name postfix */
-- 
cgit v1.2.3


From e334016f1d7250a6523b3a44ccecfe23af6e2f57 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 12 Apr 2010 13:17:49 -0400
Subject: perf probe: Remove xzalloc() from util/probe-{event, finder}.c

Remove all xzalloc() calls from util/probe-{event,finder}.c since
it may cause 'sudden death' in utility functions and it makes
reusing it from other code difficult.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100412171749.3790.33303.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-event.c  | 69 +++++++++++++++++++++++++++++++-----------
 tools/perf/util/probe-finder.c | 25 ++++++++++-----
 2 files changed, 69 insertions(+), 25 deletions(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index bd68f7b33b2..aacbf730b47 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -556,7 +556,9 @@ static int parse_perf_probe_arg(char *str, struct perf_probe_arg *arg)
 	fieldp = &arg->field;
 
 	do {
-		*fieldp = xzalloc(sizeof(struct perf_probe_arg_field));
+		*fieldp = zalloc(sizeof(struct perf_probe_arg_field));
+		if (*fieldp == NULL)
+			return -ENOMEM;
 		if (*tmp == '.') {
 			str = tmp + 1;
 			(*fieldp)->ref = false;
@@ -608,7 +610,11 @@ int parse_perf_probe_command(const char *cmd, struct perf_probe_event *pev)
 
 	/* Copy arguments and ensure return probe has no C argument */
 	pev->nargs = argc - 1;
-	pev->args = xzalloc(sizeof(struct perf_probe_arg) * pev->nargs);
+	pev->args = zalloc(sizeof(struct perf_probe_arg) * pev->nargs);
+	if (pev->args == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	for (i = 0; i < pev->nargs && ret >= 0; i++) {
 		ret = parse_perf_probe_arg(argv[i + 1], &pev->args[i]);
 		if (ret >= 0 &&
@@ -680,7 +686,11 @@ int parse_kprobe_trace_command(const char *cmd, struct kprobe_trace_event *tev)
 		tp->offset = 0;
 
 	tev->nargs = argc - 2;
-	tev->args = xzalloc(sizeof(struct kprobe_trace_arg) * tev->nargs);
+	tev->args = zalloc(sizeof(struct kprobe_trace_arg) * tev->nargs);
+	if (tev->args == NULL) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	for (i = 0; i < tev->nargs; i++) {
 		p = strchr(argv[i + 2], '=');
 		if (p)	/* We don't need which register is assigned. */
@@ -745,7 +755,11 @@ static char *synthesize_perf_probe_point(struct perf_probe_point *pp)
 	char offs[32] = "", line[32] = "", file[32] = "";
 	int ret, len;
 
-	buf = xzalloc(MAX_CMDLEN);
+	buf = zalloc(MAX_CMDLEN);
+	if (buf == NULL) {
+		ret = -ENOMEM;
+		goto error;
+	}
 	if (pp->offset) {
 		ret = e_snprintf(offs, 32, "+%lu", pp->offset);
 		if (ret <= 0)
@@ -781,7 +795,8 @@ static char *synthesize_perf_probe_point(struct perf_probe_point *pp)
 error:
 	pr_debug("Failed to synthesize perf probe point: %s",
 		 strerror(-ret));
-	free(buf);
+	if (buf)
+		free(buf);
 	return NULL;
 }
 
@@ -890,7 +905,10 @@ char *synthesize_kprobe_trace_command(struct kprobe_trace_event *tev)
 	char *buf;
 	int i, len, ret;
 
-	buf = xzalloc(MAX_CMDLEN);
+	buf = zalloc(MAX_CMDLEN);
+	if (buf == NULL)
+		return NULL;
+
 	len = e_snprintf(buf, MAX_CMDLEN, "%c:%s/%s %s+%lu",
 			 tp->retprobe ? 'r' : 'p',
 			 tev->group, tev->event,
@@ -929,7 +947,9 @@ int convert_to_perf_probe_event(struct kprobe_trace_event *tev,
 
 	/* Convert trace_arg to probe_arg */
 	pev->nargs = tev->nargs;
-	pev->args = xzalloc(sizeof(struct perf_probe_arg) * pev->nargs);
+	pev->args = zalloc(sizeof(struct perf_probe_arg) * pev->nargs);
+	if (pev->args == NULL)
+		return -ENOMEM;
 	for (i = 0; i < tev->nargs && ret >= 0; i++)
 		if (tev->args[i].name)
 			pev->args[i].name = xstrdup(tev->args[i].name);
@@ -1326,25 +1346,31 @@ static int convert_to_kprobe_trace_events(struct perf_probe_event *pev,
 					  struct kprobe_trace_event **tevs)
 {
 	struct symbol *sym;
-	int ntevs = 0, i;
+	int ret = 0, i;
 	struct kprobe_trace_event *tev;
 
 	/* Convert perf_probe_event with debuginfo */
-	ntevs = try_to_find_kprobe_trace_events(pev, tevs);
-	if (ntevs != 0)
-		return ntevs;
+	ret = try_to_find_kprobe_trace_events(pev, tevs);
+	if (ret != 0)
+		return ret;
 
 	/* Allocate trace event buffer */
-	ntevs = 1;
-	tev = *tevs = xzalloc(sizeof(struct kprobe_trace_event));
+	tev = *tevs = zalloc(sizeof(struct kprobe_trace_event));
+	if (tev == NULL)
+		return -ENOMEM;
 
 	/* Copy parameters */
 	tev->point.symbol = xstrdup(pev->point.function);
 	tev->point.offset = pev->point.offset;
 	tev->nargs = pev->nargs;
 	if (tev->nargs) {
-		tev->args = xzalloc(sizeof(struct kprobe_trace_arg)
-				    * tev->nargs);
+		tev->args = zalloc(sizeof(struct kprobe_trace_arg)
+				   * tev->nargs);
+		if (tev->args == NULL) {
+			free(tev);
+			*tevs = NULL;
+			return -ENOMEM;
+		}
 		for (i = 0; i < tev->nargs; i++) {
 			if (pev->args[i].name)
 				tev->args[i].name = xstrdup(pev->args[i].name);
@@ -1360,9 +1386,14 @@ static int convert_to_kprobe_trace_events(struct perf_probe_event *pev,
 	if (!sym) {
 		pr_warning("Kernel symbol \'%s\' not found.\n",
 			   tev->point.symbol);
+		clear_kprobe_trace_event(tev);
+		free(tev);
+		*tevs = NULL;
 		return -ENOENT;
-	}
-	return ntevs;
+	} else
+		ret = 1;
+
+	return ret;
 }
 
 struct __event_package {
@@ -1377,7 +1408,9 @@ int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
 	int i, j, ret;
 	struct __event_package *pkgs;
 
-	pkgs = xzalloc(sizeof(struct __event_package) * npevs);
+	pkgs = zalloc(sizeof(struct __event_package) * npevs);
+	if (pkgs == NULL)
+		return -ENOMEM;
 
 	/* Init vmlinux path */
 	ret = init_vmlinux();
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 54daa91e901..ce1ac827f3d 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -111,7 +111,7 @@ static int strtailcmp(const char *s1, const char *s2)
 /* Line number list operations */
 
 /* Add a line to line number list */
-static void line_list__add_line(struct list_head *head, unsigned int line)
+static int line_list__add_line(struct list_head *head, unsigned int line)
 {
 	struct line_node *ln;
 	struct list_head *p;
@@ -122,16 +122,19 @@ static void line_list__add_line(struct list_head *head, unsigned int line)
 			p = &ln->list;
 			goto found;
 		} else if (ln->line == line)	/* Already exist */
-			return ;
+			return 1;
 	}
 	/* List is empty, or the smallest entry */
 	p = head;
 found:
 	pr_debug("line list: add a line %u\n", line);
-	ln = xzalloc(sizeof(struct line_node));
+	ln = zalloc(sizeof(struct line_node));
+	if (ln == NULL)
+		return -ENOMEM;
 	ln->line = line;
 	INIT_LIST_HEAD(&ln->list);
 	list_add(&ln->list, p);
+	return 0;
 }
 
 /* Check if the line in line number list */
@@ -423,7 +426,9 @@ static int convert_location(Dwarf_Op *op, struct probe_finder *pf)
 
 	tvar->value = xstrdup(regs);
 	if (ref) {
-		tvar->ref = xzalloc(sizeof(struct kprobe_trace_arg_ref));
+		tvar->ref = zalloc(sizeof(struct kprobe_trace_arg_ref));
+		if (tvar->ref == NULL)
+			return -ENOMEM;
 		tvar->ref->offset = (long)offs;
 	}
 	return 0;
@@ -500,7 +505,9 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 			return -EINVAL;
 		}
 
-		ref = xzalloc(sizeof(struct kprobe_trace_arg_ref));
+		ref = zalloc(sizeof(struct kprobe_trace_arg_ref));
+		if (ref == NULL)
+			return -ENOMEM;
 		if (*ref_ptr)
 			(*ref_ptr)->next = ref;
 		else
@@ -680,7 +687,9 @@ static int convert_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 
 	/* Find each argument */
 	tev->nargs = pf->pev->nargs;
-	tev->args = xzalloc(sizeof(struct kprobe_trace_arg) * tev->nargs);
+	tev->args = zalloc(sizeof(struct kprobe_trace_arg) * tev->nargs);
+	if (tev->args == NULL)
+		return -ENOMEM;
 	for (i = 0; i < pf->pev->nargs; i++) {
 		pf->pvar = &pf->pev->args[i];
 		pf->tvar = &tev->args[i];
@@ -938,7 +947,9 @@ int find_kprobe_trace_events(int fd, struct perf_probe_event *pev,
 	Dwarf *dbg;
 	int ret = 0;
 
-	pf.tevs = xzalloc(sizeof(struct kprobe_trace_event) * MAX_PROBES);
+	pf.tevs = zalloc(sizeof(struct kprobe_trace_event) * MAX_PROBES);
+	if (pf.tevs == NULL)
+		return -ENOMEM;
 	*tevs = pf.tevs;
 	pf.ntevs = 0;
 
-- 
cgit v1.2.3


From 02b95dadc8a1d2c302513e5fa24c492380d26e93 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 12 Apr 2010 13:17:56 -0400
Subject: perf probe: Remove xstrdup()/xstrndup() from util/probe-{event,
 finder}.c

Remove all xstr*dup() calls from util/probe-{event,finder}.c since
it may cause 'sudden death' in utility functions and it makes
reusing it from other code difficult.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100412171756.3790.89607.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-event.c  | 159 +++++++++++++++++++++++++++++------------
 tools/perf/util/probe-finder.c |  58 +++++++++++----
 2 files changed, 156 insertions(+), 61 deletions(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index aacbf730b47..ca108b2cd15 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -133,7 +133,9 @@ static int convert_to_perf_probe_point(struct kprobe_trace_point *tp,
 	if (ret <= 0) {
 		pr_debug("Failed to find corresponding probes from "
 			 "debuginfo. Use kprobe event information.\n");
-		pp->function = xstrdup(tp->symbol);
+		pp->function = strdup(tp->symbol);
+		if (pp->function == NULL)
+			return -ENOMEM;
 		pp->offset = tp->offset;
 	}
 	pp->retprobe = tp->retprobe;
@@ -300,7 +302,9 @@ end:
 static int convert_to_perf_probe_point(struct kprobe_trace_point *tp,
 					struct perf_probe_point *pp)
 {
-	pp->function = xstrdup(tp->symbol);
+	pp->function = strdup(tp->symbol);
+	if (pp->function == NULL)
+		return -ENOMEM;
 	pp->offset = tp->offset;
 	pp->retprobe = tp->retprobe;
 
@@ -355,9 +359,12 @@ int parse_line_range_desc(const char *arg, struct line_range *lr)
 				       *tmp);
 			return -EINVAL;
 		}
-		tmp = xstrndup(arg, (ptr - arg));
+		tmp = strndup(arg, (ptr - arg));
 	} else
-		tmp = xstrdup(arg);
+		tmp = strdup(arg);
+
+	if (tmp == NULL)
+		return -ENOMEM;
 
 	if (strchr(tmp, '.'))
 		lr->file = tmp;
@@ -406,7 +413,9 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 				       "follow C symbol-naming rule.\n", arg);
 			return -EINVAL;
 		}
-		pev->event = xstrdup(arg);
+		pev->event = strdup(arg);
+		if (pev->event == NULL)
+			return -ENOMEM;
 		pev->group = NULL;
 		arg = tmp;
 	}
@@ -417,18 +426,24 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 		*ptr++ = '\0';
 	}
 
+	tmp = strdup(arg);
+	if (tmp == NULL)
+		return -ENOMEM;
+
 	/* Check arg is function or file and copy it */
-	if (strchr(arg, '.'))	/* File */
-		pp->file = xstrdup(arg);
+	if (strchr(tmp, '.'))	/* File */
+		pp->file = tmp;
 	else			/* Function */
-		pp->function = xstrdup(arg);
+		pp->function = tmp;
 
 	/* Parse other options */
 	while (ptr) {
 		arg = ptr;
 		c = nc;
 		if (c == ';') {	/* Lazy pattern must be the last part */
-			pp->lazy_line = xstrdup(arg);
+			pp->lazy_line = strdup(arg);
+			if (pp->lazy_line == NULL)
+				return -ENOMEM;
 			break;
 		}
 		ptr = strpbrk(arg, ";:+@%");
@@ -458,7 +473,9 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
 				semantic_error("SRC@SRC is not allowed.\n");
 				return -EINVAL;
 			}
-			pp->file = xstrdup(arg);
+			pp->file = strdup(arg);
+			if (pp->file == NULL)
+				return -ENOMEM;
 			break;
 		case '%':	/* Probe places */
 			if (strcmp(arg, "return") == 0) {
@@ -530,7 +547,9 @@ static int parse_perf_probe_arg(char *str, struct perf_probe_arg *arg)
 
 	tmp = strchr(str, '=');
 	if (tmp) {
-		arg->name = xstrndup(str, tmp - str);
+		arg->name = strndup(str, tmp - str);
+		if (arg->name == NULL)
+			return -ENOMEM;
 		pr_debug("name:%s ", arg->name);
 		str = tmp + 1;
 	}
@@ -538,20 +557,26 @@ static int parse_perf_probe_arg(char *str, struct perf_probe_arg *arg)
 	tmp = strchr(str, ':');
 	if (tmp) {	/* Type setting */
 		*tmp = '\0';
-		arg->type = xstrdup(tmp + 1);
+		arg->type = strdup(tmp + 1);
+		if (arg->type == NULL)
+			return -ENOMEM;
 		pr_debug("type:%s ", arg->type);
 	}
 
 	tmp = strpbrk(str, "-.");
 	if (!is_c_varname(str) || !tmp) {
 		/* A variable, register, symbol or special value */
-		arg->var = xstrdup(str);
+		arg->var = strdup(str);
+		if (arg->var == NULL)
+			return -ENOMEM;
 		pr_debug("%s\n", arg->var);
 		return 0;
 	}
 
 	/* Structure fields */
-	arg->var = xstrndup(str, tmp - str);
+	arg->var = strndup(str, tmp - str);
+	if (arg->var == NULL)
+		return -ENOMEM;
 	pr_debug("%s, ", arg->var);
 	fieldp = &arg->field;
 
@@ -572,18 +597,24 @@ static int parse_perf_probe_arg(char *str, struct perf_probe_arg *arg)
 
 		tmp = strpbrk(str, "-.");
 		if (tmp) {
-			(*fieldp)->name = xstrndup(str, tmp - str);
+			(*fieldp)->name = strndup(str, tmp - str);
+			if ((*fieldp)->name == NULL)
+				return -ENOMEM;
 			pr_debug("%s(%d), ", (*fieldp)->name, (*fieldp)->ref);
 			fieldp = &(*fieldp)->next;
 		}
 	} while (tmp);
-	(*fieldp)->name = xstrdup(str);
+	(*fieldp)->name = strdup(str);
+	if ((*fieldp)->name == NULL)
+		return -ENOMEM;
 	pr_debug("%s(%d)\n", (*fieldp)->name, (*fieldp)->ref);
 
 	/* If no name is specified, set the last field name */
-	if (!arg->name)
-		arg->name = xstrdup((*fieldp)->name);
-
+	if (!arg->name) {
+		arg->name = strdup((*fieldp)->name);
+		if (arg->name == NULL)
+			return -ENOMEM;
+	}
 	return 0;
 }
 
@@ -697,9 +728,13 @@ int parse_kprobe_trace_command(const char *cmd, struct kprobe_trace_event *tev)
 			*p++ = '\0';
 		else
 			p = argv[i + 2];
-		tev->args[i].name = xstrdup(argv[i + 2]);
+		tev->args[i].name = strdup(argv[i + 2]);
 		/* TODO: parse regs and offset */
-		tev->args[i].value = xstrdup(p);
+		tev->args[i].value = strdup(p);
+		if (tev->args[i].name == NULL || tev->args[i].value == NULL) {
+			ret = -ENOMEM;
+			goto out;
+		}
 	}
 	ret = 0;
 out:
@@ -933,12 +968,14 @@ error:
 int convert_to_perf_probe_event(struct kprobe_trace_event *tev,
 				struct perf_probe_event *pev)
 {
-	char buf[64];
+	char buf[64] = "";
 	int i, ret;
 
 	/* Convert event/group name */
-	pev->event = xstrdup(tev->event);
-	pev->group = xstrdup(tev->group);
+	pev->event = strdup(tev->event);
+	pev->group = strdup(tev->group);
+	if (pev->event == NULL || pev->group == NULL)
+		return -ENOMEM;
 
 	/* Convert trace_point to probe_point */
 	ret = convert_to_perf_probe_point(&tev->point, &pev->point);
@@ -950,14 +987,17 @@ int convert_to_perf_probe_event(struct kprobe_trace_event *tev,
 	pev->args = zalloc(sizeof(struct perf_probe_arg) * pev->nargs);
 	if (pev->args == NULL)
 		return -ENOMEM;
-	for (i = 0; i < tev->nargs && ret >= 0; i++)
+	for (i = 0; i < tev->nargs && ret >= 0; i++) {
 		if (tev->args[i].name)
-			pev->args[i].name = xstrdup(tev->args[i].name);
+			pev->args[i].name = strdup(tev->args[i].name);
 		else {
 			ret = synthesize_kprobe_trace_arg(&tev->args[i],
 							  buf, 64);
-			pev->args[i].name = xstrdup(buf);
+			pev->args[i].name = strdup(buf);
 		}
+		if (pev->args[i].name == NULL && ret >= 0)
+			ret = -ENOMEM;
+	}
 
 	if (ret < 0)
 		clear_perf_probe_event(pev);
@@ -1282,7 +1322,7 @@ static int __add_kprobe_trace_events(struct perf_probe_event *pev,
 
 	ret = 0;
 	printf("Add new event%s\n", (ntevs > 1) ? "s:" : ":");
-	for (i = 0; i < ntevs && ret >= 0; i++) {
+	for (i = 0; i < ntevs; i++) {
 		tev = &tevs[i];
 		if (pev->event)
 			event = pev->event;
@@ -1303,8 +1343,12 @@ static int __add_kprobe_trace_events(struct perf_probe_event *pev,
 			break;
 		event = buf;
 
-		tev->event = xstrdup(event);
-		tev->group = xstrdup(group);
+		tev->event = strdup(event);
+		tev->group = strdup(group);
+		if (tev->event == NULL || tev->group == NULL) {
+			ret = -ENOMEM;
+			break;
+		}
 		ret = write_kprobe_trace_event(fd, tev);
 		if (ret < 0)
 			break;
@@ -1360,23 +1404,40 @@ static int convert_to_kprobe_trace_events(struct perf_probe_event *pev,
 		return -ENOMEM;
 
 	/* Copy parameters */
-	tev->point.symbol = xstrdup(pev->point.function);
+	tev->point.symbol = strdup(pev->point.function);
+	if (tev->point.symbol == NULL) {
+		ret = -ENOMEM;
+		goto error;
+	}
 	tev->point.offset = pev->point.offset;
 	tev->nargs = pev->nargs;
 	if (tev->nargs) {
 		tev->args = zalloc(sizeof(struct kprobe_trace_arg)
 				   * tev->nargs);
 		if (tev->args == NULL) {
-			free(tev);
-			*tevs = NULL;
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto error;
 		}
 		for (i = 0; i < tev->nargs; i++) {
-			if (pev->args[i].name)
-				tev->args[i].name = xstrdup(pev->args[i].name);
-			tev->args[i].value = xstrdup(pev->args[i].var);
-			if (pev->args[i].type)
-				tev->args[i].type = xstrdup(pev->args[i].type);
+			if (pev->args[i].name) {
+				tev->args[i].name = strdup(pev->args[i].name);
+				if (tev->args[i].name == NULL) {
+					ret = -ENOMEM;
+					goto error;
+				}
+			}
+			tev->args[i].value = strdup(pev->args[i].var);
+			if (tev->args[i].value == NULL) {
+				ret = -ENOMEM;
+				goto error;
+			}
+			if (pev->args[i].type) {
+				tev->args[i].type = strdup(pev->args[i].type);
+				if (tev->args[i].type == NULL) {
+					ret = -ENOMEM;
+					goto error;
+				}
+			}
 		}
 	}
 
@@ -1386,13 +1447,15 @@ static int convert_to_kprobe_trace_events(struct perf_probe_event *pev,
 	if (!sym) {
 		pr_warning("Kernel symbol \'%s\' not found.\n",
 			   tev->point.symbol);
-		clear_kprobe_trace_event(tev);
-		free(tev);
-		*tevs = NULL;
-		return -ENOENT;
-	} else
-		ret = 1;
+		ret = -ENOENT;
+		goto error;
+	}
 
+	return 1;
+error:
+	clear_kprobe_trace_event(tev);
+	free(tev);
+	*tevs = NULL;
 	return ret;
 }
 
@@ -1528,7 +1591,11 @@ int del_perf_probe_events(struct strlist *dellist)
 		return -EINVAL;
 
 	strlist__for_each(ent, dellist) {
-		str = xstrdup(ent->s);
+		str = strdup(ent->s);
+		if (str == NULL) {
+			ret = -ENOMEM;
+			break;
+		}
 		pr_debug("Parsing: %s\n", str);
 		p = strchr(str, ':');
 		if (p) {
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index ce1ac827f3d..e443e69a4d2 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -424,7 +424,10 @@ static int convert_location(Dwarf_Op *op, struct probe_finder *pf)
 		return -ERANGE;
 	}
 
-	tvar->value = xstrdup(regs);
+	tvar->value = strdup(regs);
+	if (tvar->value == NULL)
+		return -ENOMEM;
+
 	if (ref) {
 		tvar->ref = zalloc(sizeof(struct kprobe_trace_arg_ref));
 		if (tvar->ref == NULL)
@@ -466,7 +469,9 @@ static int convert_variable_type(Dwarf_Die *vr_die,
 				   strerror(-ret));
 			return ret;
 		}
-		targ->type = xstrdup(buf);
+		targ->type = strdup(buf);
+		if (targ->type == NULL)
+			return -ENOMEM;
 	}
 	return 0;
 }
@@ -576,9 +581,11 @@ static int convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
 		vr_die = &die_mem;
 	}
 	if (ret == 0) {
-		if (pf->pvar->type)
-			pf->tvar->type = xstrdup(pf->pvar->type);
-		else
+		if (pf->pvar->type) {
+			pf->tvar->type = strdup(pf->pvar->type);
+			if (pf->tvar->type == NULL)
+				ret = -ENOMEM;
+		} else
 			ret = convert_variable_type(vr_die, pf->tvar);
 	}
 	/* *expr will be cached in libdw. Don't free it. */
@@ -595,22 +602,30 @@ static int find_variable(Dwarf_Die *sp_die, struct probe_finder *pf)
 {
 	Dwarf_Die vr_die;
 	char buf[32], *ptr;
+	int ret;
 
 	/* TODO: Support arrays */
 	if (pf->pvar->name)
-		pf->tvar->name = xstrdup(pf->pvar->name);
+		pf->tvar->name = strdup(pf->pvar->name);
 	else {
-		synthesize_perf_probe_arg(pf->pvar, buf, 32);
+		ret = synthesize_perf_probe_arg(pf->pvar, buf, 32);
+		if (ret < 0)
+			return ret;
 		ptr = strchr(buf, ':');	/* Change type separator to _ */
 		if (ptr)
 			*ptr = '_';
-		pf->tvar->name = xstrdup(buf);
+		pf->tvar->name = strdup(buf);
 	}
+	if (pf->tvar->name == NULL)
+		return -ENOMEM;
 
 	if (!is_c_varname(pf->pvar->var)) {
 		/* Copy raw parameters */
-		pf->tvar->value = xstrdup(pf->pvar->var);
-		return 0;
+		pf->tvar->value = strdup(pf->pvar->var);
+		if (pf->tvar->value == NULL)
+			return -ENOMEM;
+		else
+			return 0;
 	}
 
 	pr_debug("Searching '%s' variable in context.\n",
@@ -660,7 +675,9 @@ static int convert_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 				   dwarf_diename(sp_die));
 			return -ENOENT;
 		}
-		tev->point.symbol = xstrdup(name);
+		tev->point.symbol = strdup(name);
+		if (tev->point.symbol == NULL)
+			return -ENOMEM;
 		tev->point.offset = (unsigned long)(pf->addr - eaddr);
 	} else
 		/* This function has no name. */
@@ -1028,7 +1045,11 @@ int find_perf_probe_point(int fd, unsigned long addr,
 			tmp = dwarf_linesrc(line, NULL, NULL);
 			if (tmp) {
 				ppt->line = lineno;
-				ppt->file = xstrdup(tmp);
+				ppt->file = strdup(tmp);
+				if (ppt->file == NULL) {
+					ret = -ENOMEM;
+					goto end;
+				}
 				found = true;
 			}
 		}
@@ -1064,7 +1085,11 @@ int find_perf_probe_point(int fd, unsigned long addr,
 		/* We don't have a line number, let's use offset */
 		ppt->offset = addr - (unsigned long)eaddr;
 found:
-		ppt->function = xstrdup(tmp);
+		ppt->function = strdup(tmp);
+		if (ppt->function == NULL) {
+			ret = -ENOMEM;
+			goto end;
+		}
 		found = true;
 	}
 
@@ -1116,8 +1141,11 @@ static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
 			continue;
 
 		/* Copy real path */
-		if (!lf->lr->path)
-			lf->lr->path = xstrdup(src);
+		if (!lf->lr->path) {
+			lf->lr->path = strdup(src);
+			if (lf->lr->path == NULL)
+				return -ENOMEM;
+		}
 		line_list__add_line(&lf->lr->line_list, (unsigned int)lineno);
 	}
 	/* Update status */
-- 
cgit v1.2.3


From 7ca5989dd065cbc48a958666c273794686ea7525 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 14 Apr 2010 18:39:28 -0400
Subject: perf probe: Fix to use correct debugfs path finder

Instead of using debugfs_path, use debugfs_find_mountpoint()
to find actual debugfs path.

LKML-Reference: <20100414223928.14630.38326.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-probe.c    |  4 ----
 tools/perf/util/probe-event.c | 12 ++++++++++--
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index 64bc11a183b..c1e54035e8c 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -40,7 +40,6 @@
 #include "util/debug.h"
 #include "util/debugfs.h"
 #include "util/parse-options.h"
-#include "util/parse-events.h"	/* For debugfs_path */
 #include "util/probe-finder.h"
 #include "util/probe-event.h"
 
@@ -205,9 +204,6 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 	     !params.show_lines))
 		usage_with_options(probe_usage, options);
 
-	if (debugfs_valid_mountpoint(debugfs_path) < 0)
-		die("Failed to find debugfs path.");
-
 	if (params.list_events) {
 		if (params.nevents != 0 || params.dellist) {
 			pr_err("  Error: Don't use --list with --add/--del.\n");
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index ca108b2cd15..1c4a20a284c 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -42,8 +42,8 @@
 #include "color.h"
 #include "symbol.h"
 #include "thread.h"
+#include "debugfs.h"
 #include "trace-event.h"	/* For __unused */
-#include "parse-events.h"	/* For debugfs_path */
 #include "probe-event.h"
 #include "probe-finder.h"
 
@@ -1075,10 +1075,18 @@ void clear_kprobe_trace_event(struct kprobe_trace_event *tev)
 static int open_kprobe_events(bool readwrite)
 {
 	char buf[PATH_MAX];
+	const char *__debugfs;
 	int ret;
 
-	ret = e_snprintf(buf, PATH_MAX, "%s/../kprobe_events", debugfs_path);
+	__debugfs = debugfs_find_mountpoint();
+	if (__debugfs == NULL) {
+		pr_warning("Debugfs is not mounted.\n");
+		return -ENOENT;
+	}
+
+	ret = e_snprintf(buf, PATH_MAX, "%stracing/kprobe_events", __debugfs);
 	if (ret >= 0) {
+		pr_debug("Opening %s write=%d\n", buf, readwrite);
 		if (readwrite && !probe_event_dry_run)
 			ret = open(buf, O_RDWR, O_APPEND);
 		else
-- 
cgit v1.2.3


From dd259c5db26ccda46409dbf6efc79d5a2b259e38 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 14 Apr 2010 18:39:35 -0400
Subject: perf probe: Fix mis-estimation for shortening filename

Fix mis-estimation size for making a short filename.
Since the buffer size is 32 bytes and there are '@' prefix and
'\0' termination, maximum shorten filename length should be
30. This means, before searching '/', it should be 31 bytes.

LKML-Reference: <20100414223935.14630.11954.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-event.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 1c4a20a284c..6d438391bae 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -806,12 +806,12 @@ static char *synthesize_perf_probe_point(struct perf_probe_point *pp)
 			goto error;
 	}
 	if (pp->file) {
-		len = strlen(pp->file) - 32;
+		len = strlen(pp->file) - 31;
 		if (len < 0)
 			len = 0;
 		tmp = strchr(pp->file + len, '/');
 		if (!tmp)
-			tmp = pp->file + len - 1;
+			tmp = pp->file + len;
 		ret = e_snprintf(file, 32, "@%s", tmp + 1);
 		if (ret <= 0)
 			goto error;
-- 
cgit v1.2.3


From d3b63d7ae04879a817bac5c0bf09749f73629d32 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 14 Apr 2010 18:39:42 -0400
Subject: perf probe: Fix a bug that --line range can be overflow

Since line_finder.lno_s/e are signed int but line_range.start/end
are unsigned int, it is possible to be overflow when converting
line_range->start/end to line_finder->lno_s/e.
This changes line_range.start/end and line_list.line to signed int
and adds overflow checks when setting line_finder.lno_s/e.

LKML-Reference: <20100414223942.14630.72730.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-event.c  | 23 ++++++++++++-----------
 tools/perf/util/probe-event.h  |  6 +++---
 tools/perf/util/probe-finder.c | 19 +++++++++----------
 3 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 6d438391bae..954ca210e4b 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -189,7 +189,7 @@ static int try_to_find_kprobe_trace_events(struct perf_probe_event *pev,
 #define LINEBUF_SIZE 256
 #define NR_ADDITIONAL_LINES 2
 
-static int show_one_line(FILE *fp, unsigned int l, bool skip, bool show_num)
+static int show_one_line(FILE *fp, int l, bool skip, bool show_num)
 {
 	char buf[LINEBUF_SIZE];
 	const char *color = PERF_COLOR_BLUE;
@@ -198,7 +198,7 @@ static int show_one_line(FILE *fp, unsigned int l, bool skip, bool show_num)
 		goto error;
 	if (!skip) {
 		if (show_num)
-			fprintf(stdout, "%7u  %s", l, buf);
+			fprintf(stdout, "%7d  %s", l, buf);
 		else
 			color_fprintf(stdout, color, "         %s", buf);
 	}
@@ -231,7 +231,7 @@ error:
  */
 int show_line_range(struct line_range *lr)
 {
-	unsigned int l = 1;
+	int l = 1;
 	struct line_node *ln;
 	FILE *fp;
 	int fd, ret;
@@ -340,16 +340,15 @@ int parse_line_range_desc(const char *arg, struct line_range *lr)
 	 */
 	ptr = strchr(arg, ':');
 	if (ptr) {
-		lr->start = (unsigned int)strtoul(ptr + 1, &tmp, 0);
+		lr->start = (int)strtoul(ptr + 1, &tmp, 0);
 		if (*tmp == '+')
-			lr->end = lr->start + (unsigned int)strtoul(tmp + 1,
-								    &tmp, 0);
+			lr->end = lr->start + (int)strtoul(tmp + 1, &tmp, 0);
 		else if (*tmp == '-')
-			lr->end = (unsigned int)strtoul(tmp + 1, &tmp, 0);
+			lr->end = (int)strtoul(tmp + 1, &tmp, 0);
 		else
-			lr->end = 0;
-		pr_debug("Line range is %u to %u\n", lr->start, lr->end);
-		if (lr->end && lr->start > lr->end) {
+			lr->end = INT_MAX;
+		pr_debug("Line range is %d to %d\n", lr->start, lr->end);
+		if (lr->start > lr->end) {
 			semantic_error("Start line must be smaller"
 				       " than end line.\n");
 			return -EINVAL;
@@ -360,8 +359,10 @@ int parse_line_range_desc(const char *arg, struct line_range *lr)
 			return -EINVAL;
 		}
 		tmp = strndup(arg, (ptr - arg));
-	} else
+	} else {
 		tmp = strdup(arg);
+		lr->end = INT_MAX;
+	}
 
 	if (tmp == NULL)
 		return -ENOMEM;
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index ab549290170..e7ff0d02c0d 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -74,15 +74,15 @@ struct perf_probe_event {
 /* Line number container */
 struct line_node {
 	struct list_head	list;
-	unsigned int		line;
+	int			line;
 };
 
 /* Line range */
 struct line_range {
 	char			*file;		/* File name */
 	char			*function;	/* Function name */
-	unsigned int		start;		/* Start line number */
-	unsigned int		end;		/* End line number */
+	int			start;		/* Start line number */
+	int			end;		/* End line number */
 	int			offset;		/* Start line offset */
 	char			*path;		/* Real path name */
 	struct list_head	line_list;	/* Visible lines */
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index e443e69a4d2..b4c93659929 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -111,7 +111,7 @@ static int strtailcmp(const char *s1, const char *s2)
 /* Line number list operations */
 
 /* Add a line to line number list */
-static int line_list__add_line(struct list_head *head, unsigned int line)
+static int line_list__add_line(struct list_head *head, int line)
 {
 	struct line_node *ln;
 	struct list_head *p;
@@ -138,7 +138,7 @@ found:
 }
 
 /* Check if the line in line number list */
-static int line_list__has_line(struct list_head *head, unsigned int line)
+static int line_list__has_line(struct list_head *head, int line)
 {
 	struct line_node *ln;
 
@@ -1146,7 +1146,7 @@ static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
 			if (lf->lr->path == NULL)
 				return -ENOMEM;
 		}
-		line_list__add_line(&lf->lr->line_list, (unsigned int)lineno);
+		line_list__add_line(&lf->lr->line_list, lineno);
 	}
 	/* Update status */
 	if (!list_empty(&lf->lr->line_list))
@@ -1179,10 +1179,12 @@ static int line_range_search_cb(Dwarf_Die *sp_die, void *data)
 		dwarf_decl_line(sp_die, &lr->offset);
 		pr_debug("fname: %s, lineno:%d\n", lf->fname, lr->offset);
 		lf->lno_s = lr->offset + lr->start;
-		if (!lr->end)
+		if (lf->lno_s < 0)	/* Overflow */
+			lf->lno_s = INT_MAX;
+		lf->lno_e = lr->offset + lr->end;
+		if (lf->lno_e < 0)	/* Overflow */
 			lf->lno_e = INT_MAX;
-		else
-			lf->lno_e = lr->offset + lr->end;
+		pr_debug("New line range: %d to %d\n", lf->lno_s, lf->lno_e);
 		lr->start = lf->lno_s;
 		lr->end = lf->lno_e;
 		if (dwarf_func_inline(sp_die)) {
@@ -1244,10 +1246,7 @@ int find_line_range(int fd, struct line_range *lr)
 				ret = find_line_range_by_func(&lf);
 			else {
 				lf.lno_s = lr->start;
-				if (!lr->end)
-					lf.lno_e = INT_MAX;
-				else
-					lf.lno_e = lr->end;
+				lf.lno_e = lr->end;
 				ret = find_line_range_by_line(NULL, &lf);
 			}
 		}
-- 
cgit v1.2.3


From dda4ab34fe1905d3d590572b776dd92aa0866558 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 14 Apr 2010 18:39:50 -0400
Subject: perf probe: Fix line range to show end line

Line range should reject the range if the number of lines is 0
(e.g. "sched.c:1024+0"), and it should show the lines include
the end of line number (e.g. "sched.c:1024-2048" should show
2048th line).

LKML-Reference: <20100414223950.14630.42263.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-event.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 954ca210e4b..5bf8ab03446 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -290,7 +290,7 @@ int show_line_range(struct line_range *lr)
 
 	if (lr->end == INT_MAX)
 		lr->end = l + NR_ADDITIONAL_LINES;
-	while (l < lr->end && !feof(fp) && ret >= 0)
+	while (l <= lr->end && !feof(fp) && ret >= 0)
 		ret = show_one_line(fp, (l++) - lr->offset, false, false);
 end:
 	fclose(fp);
@@ -341,9 +341,15 @@ int parse_line_range_desc(const char *arg, struct line_range *lr)
 	ptr = strchr(arg, ':');
 	if (ptr) {
 		lr->start = (int)strtoul(ptr + 1, &tmp, 0);
-		if (*tmp == '+')
+		if (*tmp == '+') {
 			lr->end = lr->start + (int)strtoul(tmp + 1, &tmp, 0);
-		else if (*tmp == '-')
+			lr->end--;	/*
+					 * Adjust the number of lines here.
+					 * If the number of lines == 1, the
+					 * the end of line should be equal to
+					 * the start of line.
+					 */
+		} else if (*tmp == '-')
 			lr->end = (int)strtoul(tmp + 1, &tmp, 0);
 		else
 			lr->end = INT_MAX;
-- 
cgit v1.2.3


From de1439d8a521d22c3219fc007a570fcf944ac789 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 14 Apr 2010 17:44:00 -0300
Subject: perf probe: Support DW_OP_plus_uconst in DW_AT_data_member_location
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DW_OP_plus_uconst can be used for DW_AT_data_member_location.
This patch adds DW_OP_plus_uconst support when getting
structure member offset.

Commiter note:

Fixed up the size_t format specifier in one case:

cc1: warnings being treated as errors
util/probe-finder.c: In function ‘die_get_data_member_location’:
util/probe-finder.c:270: error: format ‘%d’ expects type ‘int’, but argument 4 has type ‘size_t’
make: *** [/home/acme/git/build/perf/util/probe-finder.o] Error 1

LKML-Reference: <20100414223958.14630.5230.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-finder.c | 37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index b4c93659929..03b469197a0 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -249,6 +249,33 @@ static int die_get_byte_size(Dwarf_Die *tp_die)
 	return (int)ret;
 }
 
+/* Get data_member_location offset */
+static int die_get_data_member_location(Dwarf_Die *mb_die, Dwarf_Word *offs)
+{
+	Dwarf_Attribute attr;
+	Dwarf_Op *expr;
+	size_t nexpr;
+	int ret;
+
+	if (dwarf_attr(mb_die, DW_AT_data_member_location, &attr) == NULL)
+		return -ENOENT;
+
+	if (dwarf_formudata(&attr, offs) != 0) {
+		/* DW_AT_data_member_location should be DW_OP_plus_uconst */
+		ret = dwarf_getlocation(&attr, &expr, &nexpr);
+		if (ret < 0 || nexpr == 0)
+			return -ENOENT;
+
+		if (expr[0].atom != DW_OP_plus_uconst || nexpr != 1) {
+			pr_debug("Unable to get offset:Unexpected OP %x (%zd)\n",
+				 expr[0].atom, nexpr);
+			return -ENOTSUP;
+		}
+		*offs = (Dwarf_Word)expr[0].number;
+	}
+	return 0;
+}
+
 /* Return values for die_find callbacks */
 enum {
 	DIE_FIND_CB_FOUND = 0,		/* End of Search */
@@ -482,9 +509,9 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 				    Dwarf_Die *die_mem)
 {
 	struct kprobe_trace_arg_ref *ref = *ref_ptr;
-	Dwarf_Attribute attr;
 	Dwarf_Die type;
 	Dwarf_Word offs;
+	int ret;
 
 	pr_debug("converting %s in %s\n", field->name, varname);
 	if (die_get_real_type(vr_die, &type) == NULL) {
@@ -542,17 +569,17 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 	}
 
 	/* Get the offset of the field */
-	if (dwarf_attr(die_mem, DW_AT_data_member_location, &attr) == NULL ||
-	    dwarf_formudata(&attr, &offs) != 0) {
+	ret = die_get_data_member_location(die_mem, &offs);
+	if (ret < 0) {
 		pr_warning("Failed to get the offset of %s.\n", field->name);
-		return -ENOENT;
+		return ret;
 	}
 	ref->offset += (long)offs;
 
 	/* Converting next field */
 	if (field->next)
 		return convert_variable_fields(die_mem, field->name,
-					       field->next, &ref, die_mem);
+					field->next, &ref, die_mem);
 	else
 		return 0;
 }
-- 
cgit v1.2.3


From f6c903f5856ffa75ae19dcee4dbb5093e320d45c Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 14 Apr 2010 18:40:07 -0400
Subject: perf probe: Show function entry line as probe-able

Function entry line should be shown as probe-able line,
because each function has declared line attribute.

LKML-Reference: <20100414224007.14630.96915.stgit@localhost6.localdomain6>
Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-finder.c | 74 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 63 insertions(+), 11 deletions(-)

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 03b469197a0..3e7977560be 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -1127,6 +1127,45 @@ end:
 	return ret;
 }
 
+/* Add a line and store the src path */
+static int line_range_add_line(const char *src, unsigned int lineno,
+			       struct line_range *lr)
+{
+	/* Copy real path */
+	if (!lr->path) {
+		lr->path = strdup(src);
+		if (lr->path == NULL)
+			return -ENOMEM;
+	}
+	return line_list__add_line(&lr->line_list, lineno);
+}
+
+/* Search function declaration lines */
+static int line_range_funcdecl_cb(Dwarf_Die *sp_die, void *data)
+{
+	struct dwarf_callback_param *param = data;
+	struct line_finder *lf = param->data;
+	const char *src;
+	int lineno;
+
+	src = dwarf_decl_file(sp_die);
+	if (src && strtailcmp(src, lf->fname) != 0)
+		return DWARF_CB_OK;
+
+	if (dwarf_decl_line(sp_die, &lineno) != 0 ||
+	    (lf->lno_s > lineno || lf->lno_e < lineno))
+		return DWARF_CB_OK;
+
+	param->retval = line_range_add_line(src, lineno, lf->lr);
+	return DWARF_CB_OK;
+}
+
+static int find_line_range_func_decl_lines(struct line_finder *lf)
+{
+	struct dwarf_callback_param param = {.data = (void *)lf, .retval = 0};
+	dwarf_getfuncs(&lf->cu_die, line_range_funcdecl_cb, &param, 0);
+	return param.retval;
+}
 
 /* Find line range from its line number */
 static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
@@ -1135,7 +1174,7 @@ static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
 	Dwarf_Line *line;
 	size_t nlines, i;
 	Dwarf_Addr addr;
-	int lineno;
+	int lineno, ret = 0;
 	const char *src;
 	Dwarf_Die die_mem;
 
@@ -1145,6 +1184,7 @@ static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
 		return -ENOENT;
 	}
 
+	/* Search probable lines on lines list */
 	for (i = 0; i < nlines; i++) {
 		line = dwarf_onesrcline(lines, i);
 		if (dwarf_lineno(line, &lineno) != 0 ||
@@ -1167,22 +1207,34 @@ static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
 		if (strtailcmp(src, lf->fname) != 0)
 			continue;
 
-		/* Copy real path */
-		if (!lf->lr->path) {
-			lf->lr->path = strdup(src);
-			if (lf->lr->path == NULL)
-				return -ENOMEM;
-		}
-		line_list__add_line(&lf->lr->line_list, lineno);
+		ret = line_range_add_line(src, lineno, lf->lr);
+		if (ret < 0)
+			return ret;
 	}
+
+	/*
+	 * Dwarf lines doesn't include function declarations. We have to
+	 * check functions list or given function.
+	 */
+	if (sp_die) {
+		src = dwarf_decl_file(sp_die);
+		if (src && dwarf_decl_line(sp_die, &lineno) == 0 &&
+		    (lf->lno_s <= lineno && lf->lno_e >= lineno))
+			ret = line_range_add_line(src, lineno, lf->lr);
+	} else
+		ret = find_line_range_func_decl_lines(lf);
+
 	/* Update status */
-	if (!list_empty(&lf->lr->line_list))
-		lf->found = 1;
+	if (ret >= 0)
+		if (!list_empty(&lf->lr->line_list))
+			ret = lf->found = 1;
+		else
+			ret = 0;	/* Lines are not found */
 	else {
 		free(lf->lr->path);
 		lf->lr->path = NULL;
 	}
-	return lf->found;
+	return ret;
 }
 
 static int line_range_inline_cb(Dwarf_Die *in_die, void *data)
-- 
cgit v1.2.3


From 95476b64ab11d528de2557366ec584977c215b9e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 14 Apr 2010 23:42:18 +0200
Subject: perf: Fix hlist related build error

hlist helpers need to be available for all software events, not
only trace events.

Pull them out outside the ifdef CONFIG_EVENT_TRACING section.

Fixes:
	kernel/perf_event.c:4573: error: implicit declaration of function 'swevent_hlist_put'
	kernel/perf_event.c:4614: error: implicit declaration of function 'swevent_hlist_get'
	kernel/perf_event.c:5534: error: implicit declaration of function 'swevent_hlist_release

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1271281338-23491-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 60 ++++++++++++++++++++++++++---------------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 095101d685b..07b7a435bf0 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4313,36 +4313,6 @@ static const struct pmu perf_ops_task_clock = {
 	.read		= task_clock_perf_event_read,
 };
 
-#ifdef CONFIG_EVENT_TRACING
-
-void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-		   int entry_size, struct pt_regs *regs)
-{
-	struct perf_sample_data data;
-	struct perf_raw_record raw = {
-		.size = entry_size,
-		.data = record,
-	};
-
-	perf_sample_data_init(&data, addr);
-	data.raw = &raw;
-
-	/* Trace events already protected against recursion */
-	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-			 &data, regs);
-}
-EXPORT_SYMBOL_GPL(perf_tp_event);
-
-static int perf_tp_event_match(struct perf_event *event,
-				struct perf_sample_data *data)
-{
-	void *record = data->raw->data;
-
-	if (likely(!event->filter) || filter_match_preds(event->filter, record))
-		return 1;
-	return 0;
-}
-
 static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
 {
 	struct swevent_hlist *hlist;
@@ -4442,6 +4412,36 @@ static int swevent_hlist_get(struct perf_event *event)
 	return err;
 }
 
+#ifdef CONFIG_EVENT_TRACING
+
+void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
+		   int entry_size, struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct perf_raw_record raw = {
+		.size = entry_size,
+		.data = record,
+	};
+
+	perf_sample_data_init(&data, addr);
+	data.raw = &raw;
+
+	/* Trace events already protected against recursion */
+	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
+			 &data, regs);
+}
+EXPORT_SYMBOL_GPL(perf_tp_event);
+
+static int perf_tp_event_match(struct perf_event *event,
+				struct perf_sample_data *data)
+{
+	void *record = data->raw->data;
+
+	if (likely(!event->filter) || filter_match_preds(event->filter, record))
+		return 1;
+	return 0;
+}
+
 static void tp_perf_event_destroy(struct perf_event *event)
 {
 	perf_trace_disable(event->attr.config);
-- 
cgit v1.2.3


From a1e2f60e3efc812bf66a2be0d8530ee175003f6d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 14 Apr 2010 23:58:03 +0200
Subject: perf: Fix dynamic field detection

Checking if a tracing field is an array with a dynamic length
requires to check the field type and seek the "__data_loc"
string that prepends the actual type, as can be found in a trace
event format file:

	field:__data_loc char[] name;	offset:16;	size:4;	signed:1;

But we actually use strcmp() to check if the field type fully
matches "__data_loc", which may fail as we trip over the rest of
the type.

To fix this, use strncmp to only check if it starts with
"__data_loc".

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1271282283-23721-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/trace-event-parse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 17d6d66ed76..d6ef414075a 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -761,7 +761,7 @@ static int field_is_string(struct format_field *field)
 
 static int field_is_dynamic(struct format_field *field)
 {
-	if (!strcmp(field->type, "__data_loc"))
+	if (!strncmp(field->type, "__data_loc", 10))
 		return 1;
 
 	return 0;
-- 
cgit v1.2.3


From 7865e817e9b4b378ac57ab7f16183100b95466ce Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 14 Apr 2010 19:42:07 +0200
Subject: perf: Make -f the default for perf record

Force the overwriting mode by default if append mode is not explicit.
Adding -f every time one uses perf on a daily basis quickly becomes a
burden.

Keep the -f among the options though to avoid breaking some random
users scripts.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 tools/perf/Documentation/perf-record.txt |  2 +-
 tools/perf/builtin-record.c              | 40 ++++++++++++++++++++------------
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index fc46c0b40f6..b29bd2db6a4 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -58,7 +58,7 @@ OPTIONS
 
 -f::
 --force::
-	Overwrite existing data file.
+	Overwrite existing data file. (deprecated)
 
 -c::
 --count=::
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 9a951368723..dcda8993082 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -26,6 +26,11 @@
 #include <unistd.h>
 #include <sched.h>
 
+enum write_mode_t {
+	WRITE_FORCE,
+	WRITE_APPEND
+};
+
 static int			*fd[MAX_NR_CPUS][MAX_COUNTERS];
 
 static long			default_interval		=      0;
@@ -47,8 +52,7 @@ static pid_t			*all_tids			=      NULL;
 static int			thread_num			=      0;
 static pid_t			child_pid			=     -1;
 static bool			inherit				=   true;
-static bool			force				=  false;
-static bool			append_file			=  false;
+static enum write_mode_t	write_mode			= WRITE_FORCE;
 static bool			call_graph			=  false;
 static bool			inherit_stat			=  false;
 static bool			no_samples			=  false;
@@ -450,26 +454,19 @@ static int __cmd_record(int argc, const char **argv)
 	}
 
 	if (!stat(output_name, &st) && st.st_size) {
-		if (!force) {
-			if (!append_file) {
-				pr_err("Error, output file %s exists, use -A "
-				       "to append or -f to overwrite.\n",
-				       output_name);
-				exit(-1);
-			}
-		} else {
+		if (write_mode == WRITE_FORCE) {
 			char oldname[PATH_MAX];
 			snprintf(oldname, sizeof(oldname), "%s.old",
 				 output_name);
 			unlink(oldname);
 			rename(output_name, oldname);
 		}
-	} else {
-		append_file = false;
+	} else if (write_mode == WRITE_APPEND) {
+		write_mode = WRITE_FORCE;
 	}
 
 	flags = O_CREAT|O_RDWR;
-	if (append_file)
+	if (write_mode == WRITE_APPEND)
 		file_new = 0;
 	else
 		flags |= O_TRUNC;
@@ -480,7 +477,8 @@ static int __cmd_record(int argc, const char **argv)
 		exit(-1);
 	}
 
-	session = perf_session__new(output_name, O_WRONLY, force);
+	session = perf_session__new(output_name, O_WRONLY,
+				    write_mode == WRITE_FORCE);
 	if (session == NULL) {
 		pr_err("Not enough memory for reading perf file header\n");
 		return -1;
@@ -667,6 +665,8 @@ static const char * const record_usage[] = {
 	NULL
 };
 
+static bool force, append_file;
+
 static const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
 		     "event selector. use 'perf list' to list available events",
@@ -688,7 +688,7 @@ static const struct option options[] = {
 	OPT_INTEGER('C', "profile_cpu", &profile_cpu,
 			    "CPU to profile on"),
 	OPT_BOOLEAN('f', "force", &force,
-			"overwrite existing data file"),
+			"overwrite existing data file (deprecated)"),
 	OPT_LONG('c', "count", &default_interval,
 		    "event period to sample"),
 	OPT_STRING('o', "output", &output_name, "file",
@@ -725,6 +725,16 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 		!system_wide && profile_cpu == -1)
 		usage_with_options(record_usage, options);
 
+	if (force && append_file) {
+		fprintf(stderr, "Can't overwrite and append at the same time."
+				" You need to choose between -f and -A");
+		usage_with_options(record_usage, options);
+	} else if (append_file) {
+		write_mode = WRITE_APPEND;
+	} else {
+		write_mode = WRITE_FORCE;
+	}
+
 	symbol__init();
 
 	if (!nr_counters) {
-- 
cgit v1.2.3


From bdef3b02ceeb97f5f67fcfa6dff13c4e70b34fb7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 14 Apr 2010 20:05:17 +0200
Subject: perf: Always record tracepoints raw samples from perf record

Trace events are mostly used for tracing rather than simple
counting. Don't bother anymore with adding -R when using them,
just record raw samples of trace events every time.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 tools/perf/Documentation/perf-record.txt |  2 +-
 tools/perf/util/parse-events.c           | 14 ++++----------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index b29bd2db6a4..020d871c793 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -101,7 +101,7 @@ OPTIONS
 
 -R::
 --raw-samples::
-Collect raw sample records from all opened counters (typically for tracepoint counters).
+Collect raw sample records from all opened counters (default for tracepoint counters).
 
 SEE ALSO
 --------
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 435781e0c20..880070c02fd 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -410,7 +410,6 @@ static enum event_result
 parse_single_tracepoint_event(char *sys_name,
 			      const char *evt_name,
 			      unsigned int evt_length,
-			      char *flags,
 			      struct perf_event_attr *attr,
 			      const char **strp)
 {
@@ -419,13 +418,9 @@ parse_single_tracepoint_event(char *sys_name,
 	u64 id;
 	int fd;
 
-	if (flags) {
-		if (!strncmp(flags, "record", strlen(flags))) {
-			attr->sample_type |= PERF_SAMPLE_RAW;
-			attr->sample_type |= PERF_SAMPLE_TIME;
-			attr->sample_type |= PERF_SAMPLE_CPU;
-		}
-	}
+	attr->sample_type |= PERF_SAMPLE_RAW;
+	attr->sample_type |= PERF_SAMPLE_TIME;
+	attr->sample_type |= PERF_SAMPLE_CPU;
 
 	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path,
 		 sys_name, evt_name);
@@ -533,8 +528,7 @@ static enum event_result parse_tracepoint_event(const char **strp,
 						       flags);
 	} else
 		return parse_single_tracepoint_event(sys_name, evt_name,
-						     evt_length, flags,
-						     attr, strp);
+						     evt_length, attr, strp);
 }
 
 static enum event_result
-- 
cgit v1.2.3


From f92128193094c288bc315db1694fafeaeb7ee1d0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 14 Apr 2010 22:09:02 +0200
Subject: perf: Make the trace events sample period default to 1

Trace events are mostly used for tracing and then require not to
be lost when possible. As opposite to hardware events that really
require to trigger after a given sample period, trace events mostly
need to trigger everytime.

It is a frustrating experience to trace with perf and realize we
lost a lot of events because we forgot the "-c 1" option.

Then default sample_period to 1 for trace events but let the user
override it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 tools/perf/builtin-record.c    | 36 ++++++++++++++++++++++--------------
 tools/perf/util/parse-events.c |  2 ++
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index dcda8993082..ca2affc9233 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -33,11 +33,13 @@ enum write_mode_t {
 
 static int			*fd[MAX_NR_CPUS][MAX_COUNTERS];
 
+static unsigned int		user_interval 			= UINT_MAX;
 static long			default_interval		=      0;
 
 static int			nr_cpus				=      0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			=    128;
+static unsigned int		user_freq 			= UINT_MAX;
 static int			freq				=   1000;
 static int			output;
 static const char		*output_name			= "perf.data";
@@ -255,10 +257,19 @@ static void create_counter(int counter, int cpu)
 	if (nr_counters > 1)
 		attr->sample_type |= PERF_SAMPLE_ID;
 
-	if (freq) {
-		attr->sample_type	|= PERF_SAMPLE_PERIOD;
-		attr->freq		= 1;
-		attr->sample_freq	= freq;
+	/*
+	 * We default some events to a 1 default interval. But keep
+	 * it a weak assumption overridable by the user.
+	 */
+	if (!attr->sample_period || (user_freq != UINT_MAX &&
+				     user_interval != UINT_MAX)) {
+		if (freq) {
+			attr->sample_type	|= PERF_SAMPLE_PERIOD;
+			attr->freq		= 1;
+			attr->sample_freq	= freq;
+		} else {
+			attr->sample_period = default_interval;
+		}
 	}
 
 	if (no_samples)
@@ -689,13 +700,13 @@ static const struct option options[] = {
 			    "CPU to profile on"),
 	OPT_BOOLEAN('f', "force", &force,
 			"overwrite existing data file (deprecated)"),
-	OPT_LONG('c', "count", &default_interval,
+	OPT_LONG('c', "count", &user_interval,
 		    "event period to sample"),
 	OPT_STRING('o', "output", &output_name, "file",
 		    "output file name"),
 	OPT_BOOLEAN('i', "inherit", &inherit,
 		    "child tasks inherit counters"),
-	OPT_INTEGER('F', "freq", &freq,
+	OPT_INTEGER('F', "freq", &user_freq,
 		    "profile at this frequency"),
 	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
 		    "number of mmap data pages"),
@@ -716,7 +727,6 @@ static const struct option options[] = {
 
 int cmd_record(int argc, const char **argv, const char *prefix __used)
 {
-	int counter;
 	int i,j;
 
 	argc = parse_options(argc, argv, options, record_usage,
@@ -774,6 +784,11 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	if (!event_array)
 		return -ENOMEM;
 
+	if (user_interval != UINT_MAX)
+		default_interval = user_interval;
+	if (user_freq != UINT_MAX)
+		freq = user_freq;
+
 	/*
 	 * User specified count overrides default frequency.
 	 */
@@ -786,12 +801,5 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 		exit(EXIT_FAILURE);
 	}
 
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (attrs[counter].sample_period)
-			continue;
-
-		attrs[counter].sample_period = default_interval;
-	}
-
 	return __cmd_record(argc, argv);
 }
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 880070c02fd..3b4ec679756 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -422,6 +422,8 @@ parse_single_tracepoint_event(char *sys_name,
 	attr->sample_type |= PERF_SAMPLE_TIME;
 	attr->sample_type |= PERF_SAMPLE_CPU;
 
+	attr->sample_period = 1;
+
 	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path,
 		 sys_name, evt_name);
 
-- 
cgit v1.2.3


From 09a40af5240de02d848247ab82440ad75b31ab11 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 15 Apr 2010 07:29:59 +0200
Subject: sched: Fix UP update_avg() build warning

update_avg() is only used for SMP builds, move it to the nearest
SMP block.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1271309399.14779.17.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index ab562ae4007..de0da71daf7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1872,12 +1872,6 @@ static void set_load_weight(struct task_struct *p)
 	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
 }
 
-static void update_avg(u64 *avg, u64 sample)
-{
-	s64 diff = sample - *avg;
-	*avg += diff >> 3;
-}
-
 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
@@ -2332,6 +2326,12 @@ int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_
 
 	return cpu;
 }
+
+static void update_avg(u64 *avg, u64 sample)
+{
+	s64 diff = sample - *avg;
+	*avg += diff >> 3;
+}
 #endif
 
 /***
-- 
cgit v1.2.3


From a289cc7c70da784a2d370b91885cab4f966dcb0f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Fri, 16 Apr 2010 17:51:42 -0700
Subject: x86, UV: uv_irq.c: Fix all sparse warnings

Fix all sparse warnings in building uv_irq.c.

 arch/x86/kernel/uv_irq.c:46:17: warning: symbol 'uv_irq_chip' was not declared. Should it be static?
 arch/x86/kernel/uv_irq.c:143:50: error: no identifier for function argument
 arch/x86/kernel/uv_irq.c:162:13: error: typename in expression
 arch/x86/kernel/uv_irq.c:162:13: error: undefined identifier 'restrict'
 arch/x86/kernel/uv_irq.c:250:44: error: no identifier for function argument
 arch/x86/kernel/uv_irq.c:260:17: error: typename in expression
 arch/x86/kernel/uv_irq.c:260:17: error: undefined identifier 'restrict'
 arch/x86/kernel/uv_irq.c:233:50: warning: incorrect type in argument 3 (different signedness)
 arch/x86/kernel/uv_irq.c:233:50:    expected int *pnode
 arch/x86/kernel/uv_irq.c:233:50:    got unsigned int *<noident>
 arch/x86/include/asm/uv/uv_hub.h:318:44: warning: incorrect type in argument 2 (different address spaces)
 arch/x86/include/asm/uv/uv_hub.h:318:44:    expected void volatile [noderef] <asn:2>*addr
 arch/x86/include/asm/uv/uv_hub.h:318:44:    got unsigned long *

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Cc: Mike Travis <travis@sgi.com>
Cc: Cliff Wickman <cpw@sgi.com>
Cc: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20100416175142.f4b59683.randy.dunlap@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_hub.h |  2 +-
 arch/x86/kernel/uv_irq.c         | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 14cc74ba5d2..bf6b88ef8ee 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -307,7 +307,7 @@ static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset
  * Access Global MMR space using the MMR space located at the top of physical
  * memory.
  */
-static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset)
+static inline volatile void __iomem *uv_global_mmr64_address(int pnode, unsigned long offset)
 {
 	return __va(UV_GLOBAL_MMR64_BASE |
 		    UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index ece73d8e324..1a9f55a3348 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -43,7 +43,7 @@ static void uv_ack_apic(unsigned int irq)
 	ack_APIC_irq();
 }
 
-struct irq_chip uv_irq_chip = {
+static struct irq_chip uv_irq_chip = {
 	.name		= "UV-CORE",
 	.startup	= uv_noop_ret,
 	.shutdown	= uv_noop,
@@ -140,7 +140,7 @@ int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
  */
 static int
 arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
-		       unsigned long mmr_offset, int restrict)
+		       unsigned long mmr_offset, int limit)
 {
 	const struct cpumask *eligible_cpu = cpumask_of(cpu);
 	struct irq_desc *desc = irq_to_desc(irq);
@@ -159,7 +159,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 	if (err != 0)
 		return err;
 
-	if (restrict == UV_AFFINITY_CPU)
+	if (limit == UV_AFFINITY_CPU)
 		desc->status |= IRQ_NO_BALANCING;
 	else
 		desc->status |= IRQ_MOVE_PCNTXT;
@@ -213,7 +213,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
 	unsigned long mmr_value;
 	struct uv_IO_APIC_route_entry *entry;
 	unsigned long mmr_offset;
-	unsigned mmr_pnode;
+	int mmr_pnode;
 
 	if (set_desc_affinity(desc, mask, &dest))
 		return -1;
@@ -247,7 +247,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
  * interrupt is raised.
  */
 int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
-		 unsigned long mmr_offset, int restrict)
+		 unsigned long mmr_offset, int limit)
 {
 	int irq, ret;
 
@@ -257,7 +257,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
 		return -EBUSY;
 
 	ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
-		restrict);
+		limit);
 	if (ret == irq)
 		uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
 	else
-- 
cgit v1.2.3


From 39447b386c846bbf1c56f6403c5282837486200f Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Mon, 19 Apr 2010 13:32:41 +0800
Subject: perf: Enhance perf to allow for guest statistic collection from host

Below patch introduces perf_guest_info_callbacks and related
register/unregister functions. Add more PERF_RECORD_MISC_XXX bits
meaning guest kernel and guest user space.

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/perf_event.h | 15 ++++-----------
 arch/x86/kernel/cpu/perf_event.c  | 31 +++++++++++++++++++++++++++++++
 include/linux/perf_event.h        | 21 ++++++++++++++++++++-
 kernel/perf_event.c               | 23 ++++++++++++++++++++++-
 4 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index f6d43dbfd8e..254883d0c7e 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -135,17 +135,10 @@ extern void perf_events_lapic_init(void);
  */
 #define PERF_EFLAGS_EXACT	(1UL << 3)
 
-#define perf_misc_flags(regs)				\
-({	int misc = 0;					\
-	if (user_mode(regs))				\
-		misc |= PERF_RECORD_MISC_USER;		\
-	else						\
-		misc |= PERF_RECORD_MISC_KERNEL;	\
-	if (regs->flags & PERF_EFLAGS_EXACT)		\
-		misc |= PERF_RECORD_MISC_EXACT;		\
-	misc; })
-
-#define perf_instruction_pointer(regs)	((regs)->ip)
+struct pt_regs;
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs)	perf_misc_flags(regs)
 
 #else
 static inline void init_hw_perf_events(void)		{ }
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 626154a9f53..2ea78abf69d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1720,6 +1720,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 {
 	struct perf_callchain_entry *entry;
 
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+		/* TODO: We don't support guest os callchain now */
+		return NULL;
+	}
+
 	if (in_nmi())
 		entry = &__get_cpu_var(pmc_nmi_entry);
 	else
@@ -1743,3 +1748,29 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 	regs->cs = __KERNEL_CS;
 	local_save_flags(regs->flags);
 }
+
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+	unsigned long ip;
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
+		ip = perf_guest_cbs->get_guest_ip();
+	else
+		ip = instruction_pointer(regs);
+	return ip;
+}
+
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+	int misc = 0;
+	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+		misc |= perf_guest_cbs->is_user_mode() ?
+			PERF_RECORD_MISC_GUEST_USER :
+			PERF_RECORD_MISC_GUEST_KERNEL;
+	} else
+		misc |= user_mode(regs) ? PERF_RECORD_MISC_USER :
+			PERF_RECORD_MISC_KERNEL;
+	if (regs->flags & PERF_EFLAGS_EXACT)
+		misc |= PERF_RECORD_MISC_EXACT;
+
+	return misc;
+}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index bf896d0b2e9..24de5f181a4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -288,11 +288,13 @@ struct perf_event_mmap_page {
 	__u64	data_tail;		/* user-space written tail */
 };
 
-#define PERF_RECORD_MISC_CPUMODE_MASK		(3 << 0)
+#define PERF_RECORD_MISC_CPUMODE_MASK		(7 << 0)
 #define PERF_RECORD_MISC_CPUMODE_UNKNOWN	(0 << 0)
 #define PERF_RECORD_MISC_KERNEL			(1 << 0)
 #define PERF_RECORD_MISC_USER			(2 << 0)
 #define PERF_RECORD_MISC_HYPERVISOR		(3 << 0)
+#define PERF_RECORD_MISC_GUEST_KERNEL		(4 << 0)
+#define PERF_RECORD_MISC_GUEST_USER		(5 << 0)
 
 #define PERF_RECORD_MISC_EXACT			(1 << 14)
 /*
@@ -446,6 +448,12 @@ enum perf_callchain_context {
 # include <asm/perf_event.h>
 #endif
 
+struct perf_guest_info_callbacks {
+	int (*is_in_guest) (void);
+	int (*is_user_mode) (void);
+	unsigned long (*get_guest_ip) (void);
+};
+
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 #include <asm/hw_breakpoint.h>
 #endif
@@ -932,6 +940,12 @@ static inline void perf_event_mmap(struct vm_area_struct *vma)
 		__perf_event_mmap(vma);
 }
 
+extern struct perf_guest_info_callbacks *perf_guest_cbs;
+extern int perf_register_guest_info_callbacks(
+		struct perf_guest_info_callbacks *);
+extern int perf_unregister_guest_info_callbacks(
+		struct perf_guest_info_callbacks *);
+
 extern void perf_event_comm(struct task_struct *tsk);
 extern void perf_event_fork(struct task_struct *tsk);
 
@@ -1001,6 +1015,11 @@ perf_sw_event(u32 event_id, u64 nr, int nmi,
 static inline void
 perf_bp_event(struct perf_event *event, void *data)			{ }
 
+static inline int perf_register_guest_info_callbacks
+(struct perf_guest_info_callbacks *) {return 0; }
+static inline int perf_unregister_guest_info_callbacks
+(struct perf_guest_info_callbacks *) {return 0; }
+
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_comm(struct task_struct *tsk)		{ }
 static inline void perf_event_fork(struct task_struct *tsk)		{ }
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 07b7a435bf0..9dbe8cdaf14 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2797,6 +2797,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 }
 
 
+/*
+ * We assume there is only KVM supporting the callbacks.
+ * Later on, we might change it to a list if there is
+ * another virtualization implementation supporting the callbacks.
+ */
+struct perf_guest_info_callbacks *perf_guest_cbs;
+
+int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+	perf_guest_cbs = cbs;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
+
+int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+{
+	perf_guest_cbs = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+
 /*
  * Output
  */
@@ -3749,7 +3770,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
 		.event_id  = {
 			.header = {
 				.type = PERF_RECORD_MMAP,
-				.misc = 0,
+				.misc = PERF_RECORD_MISC_USER,
 				/* .size */
 			},
 			/* .pid */
-- 
cgit v1.2.3


From ff9d07a0e7ce756a183e7c2e483aec452ee6b574 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Mon, 19 Apr 2010 13:32:45 +0800
Subject: KVM: Implement perf callbacks for guest sampling

Below patch implements the perf_guest_info_callbacks on kvm.

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c |  5 ++++-
 arch/x86/kvm/x86.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.h |  3 +++
 3 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 686492ed307..82be6dac3d2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3654,8 +3654,11 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 
 	/* We need to handle NMIs before interrupts are enabled */
 	if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
-	    (exit_intr_info & INTR_INFO_VALID_MASK))
+	    (exit_intr_info & INTR_INFO_VALID_MASK)) {
+		kvm_before_handle_nmi(&vmx->vcpu);
 		asm("int $2");
+		kvm_after_handle_nmi(&vmx->vcpu);
+	}
 
 	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 24cd0ee896e..c3a33b2bb16 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -40,6 +40,7 @@
 #include <linux/user-return-notifier.h>
 #include <linux/srcu.h>
 #include <linux/slab.h>
+#include <linux/perf_event.h>
 #include <trace/events/kvm.h>
 #undef TRACE_INCLUDE_FILE
 #define CREATE_TRACE_POINTS
@@ -3765,6 +3766,47 @@ static void kvm_timer_init(void)
 	}
 }
 
+static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
+
+static int kvm_is_in_guest(void)
+{
+	return percpu_read(current_vcpu) != NULL;
+}
+
+static int kvm_is_user_mode(void)
+{
+	int user_mode = 3;
+	if (percpu_read(current_vcpu))
+		user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
+	return user_mode != 0;
+}
+
+static unsigned long kvm_get_guest_ip(void)
+{
+	unsigned long ip = 0;
+	if (percpu_read(current_vcpu))
+		ip = kvm_rip_read(percpu_read(current_vcpu));
+	return ip;
+}
+
+static struct perf_guest_info_callbacks kvm_guest_cbs = {
+	.is_in_guest		= kvm_is_in_guest,
+	.is_user_mode		= kvm_is_user_mode,
+	.get_guest_ip		= kvm_get_guest_ip,
+};
+
+void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
+{
+	percpu_write(current_vcpu, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
+
+void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
+{
+	percpu_write(current_vcpu, NULL);
+}
+EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
@@ -3801,6 +3843,8 @@ int kvm_arch_init(void *opaque)
 
 	kvm_timer_init();
 
+	perf_register_guest_info_callbacks(&kvm_guest_cbs);
+
 	return 0;
 
 out:
@@ -3809,6 +3853,8 @@ out:
 
 void kvm_arch_exit(void)
 {
+	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+
 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
 		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
 					    CPUFREQ_TRANSITION_NOTIFIER);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2d101639bd8..b7a404722d2 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,4 +65,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
 }
 
+void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
+void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+
 #endif
-- 
cgit v1.2.3


From a1645ce12adb6c9cc9e19d7695466204e3f017fe Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Mon, 19 Apr 2010 13:32:50 +0800
Subject: perf: 'perf kvm' tool for monitoring guest performance from host

Here is the patch of userspace perf tool.

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 tools/perf/Documentation/perf-kvm.txt |  67 ++++++
 tools/perf/Makefile                   |   1 +
 tools/perf/builtin-annotate.c         |   2 +-
 tools/perf/builtin-buildid-list.c     |   2 +-
 tools/perf/builtin-diff.c             |   6 +-
 tools/perf/builtin-kmem.c             |  11 +-
 tools/perf/builtin-kvm.c              | 144 +++++++++++++
 tools/perf/builtin-record.c           |  63 +++++-
 tools/perf/builtin-report.c           |   6 +-
 tools/perf/builtin-top.c              |  75 +++++--
 tools/perf/builtin.h                  |   1 +
 tools/perf/command-list.txt           |   1 +
 tools/perf/perf.c                     |   1 +
 tools/perf/perf.h                     |   2 +
 tools/perf/util/build-id.c            |   2 +-
 tools/perf/util/event.c               | 280 ++++++++++++++++++-------
 tools/perf/util/event.h               |  10 +-
 tools/perf/util/header.c              | 213 +++++++++++++++----
 tools/perf/util/header.h              |   1 +
 tools/perf/util/hist.c                |  72 ++++++-
 tools/perf/util/hist.h                |   3 +
 tools/perf/util/map.c                 | 139 ++++++++++++-
 tools/perf/util/map.h                 |  75 +++++--
 tools/perf/util/probe-event.c         |   7 +-
 tools/perf/util/session.c             |  77 +++----
 tools/perf/util/session.h             |  28 +--
 tools/perf/util/sort.h                |   5 +
 tools/perf/util/symbol.c              | 382 ++++++++++++++++++++++++++++------
 tools/perf/util/symbol.h              |  43 ++--
 tools/perf/util/thread.h              |   4 +-
 30 files changed, 1407 insertions(+), 316 deletions(-)
 create mode 100644 tools/perf/Documentation/perf-kvm.txt
 create mode 100644 tools/perf/builtin-kvm.c

diff --git a/tools/perf/Documentation/perf-kvm.txt b/tools/perf/Documentation/perf-kvm.txt
new file mode 100644
index 00000000000..93400a0f17f
--- /dev/null
+++ b/tools/perf/Documentation/perf-kvm.txt
@@ -0,0 +1,67 @@
+perf-kvm(1)
+==============
+
+NAME
+----
+perf-kvm - Tool to trace/measure kvm guest os
+
+SYNOPSIS
+--------
+[verse]
+'perf kvm' [--host] [--guest] [--guestmount=<path>
+	[--guestkallsyms=<path> --guestmodules=<path> | --guestvmlinux=<path>]]
+	{top|record|report|diff|buildid-list}
+'perf kvm' [--host] [--guest] [--guestkallsyms=<path> --guestmodules=<path>
+	| --guestvmlinux=<path>] {top|record|report|diff|buildid-list}
+
+DESCRIPTION
+-----------
+There are a couple of variants of perf kvm:
+
+  'perf kvm [options] top <command>' to generates and displays
+  a performance counter profile of guest os in realtime
+  of an arbitrary workload.
+
+  'perf kvm record <command>' to record the performance couinter profile
+  of an arbitrary workload and save it into a perf data file. If both
+  --host and --guest are input, the perf data file name is perf.data.kvm.
+  If there is  no --host but --guest, the file name is perf.data.guest.
+  If there is no --guest but --host, the file name is perf.data.host.
+
+  'perf kvm report' to display the performance counter profile information
+  recorded via perf kvm record.
+
+  'perf kvm diff' to displays the performance difference amongst two perf.data
+  files captured via perf record.
+
+  'perf kvm buildid-list' to  display the buildids found in a perf data file,
+  so that other tools can be used to fetch packages with matching symbol tables
+  for use by perf report.
+
+OPTIONS
+-------
+--host=::
+        Collect host side perforamnce profile.
+--guest=::
+        Collect guest side perforamnce profile.
+--guestmount=<path>::
+	Guest os root file system mount directory. Users mounts guest os
+        root directories under <path> by a specific filesystem access method,
+	typically, sshfs. For example, start 2 guest os. The one's pid is 8888
+	and the other's is 9999.
+        #mkdir ~/guestmount; cd ~/guestmount
+        #sshfs -o allow_other,direct_io -p 5551 localhost:/ 8888/
+        #sshfs -o allow_other,direct_io -p 5552 localhost:/ 9999/
+        #perf kvm --host --guest --guestmount=~/guestmount top
+--guestkallsyms=<path>::
+        Guest os /proc/kallsyms file copy. 'perf' kvm' reads it to get guest
+	kernel symbols. Users copy it out from guest os.
+--guestmodules=<path>::
+	Guest os /proc/modules file copy. 'perf' kvm' reads it to get guest
+	kernel module information. Users copy it out from guest os.
+--guestvmlinux=<path>::
+	Guest os kernel vmlinux.
+
+SEE ALSO
+--------
+linkperf:perf-top[1] perf-record[1] perf-report[1] perf-diff[1] perf-buildid-list[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 57b3569716d..3cb3449a964 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -472,6 +472,7 @@ BUILTIN_OBJS += $(OUTPUT)builtin-trace.o
 BUILTIN_OBJS += $(OUTPUT)builtin-probe.o
 BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o
 BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
+BUILTIN_OBJS += $(OUTPUT)builtin-kvm.o
 
 PERFLIBS = $(LIB_FILE)
 
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 06eaebe10d0..f924b4332be 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -571,7 +571,7 @@ static int __cmd_annotate(void)
 		perf_session__fprintf(session, stdout);
 
 	if (verbose > 2)
-		dsos__fprintf(stdout);
+		dsos__fprintf(&session->kerninfo_root, stdout);
 
 	perf_session__collapse_resort(&session->hists);
 	perf_session__output_resort(&session->hists, session->event_total[0]);
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index af2ad8b92f7..623afe3fdcb 100644
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -46,7 +46,7 @@ static int __cmd_buildid_list(void)
 	if (with_hits)
 		perf_session__process_events(session, &build_id__mark_dso_hit_ops);
 
-	dsos__fprintf_buildid(stdout, with_hits);
+	dsos__fprintf_buildid(&session->kerninfo_root, stdout, with_hits);
 
 	perf_session__delete(session);
 	return err;
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 3a1d94d75dc..207e860591e 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -33,7 +33,7 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 		return -ENOMEM;
 
 	if (hit)
-		he->count += count;
+		__perf_session__add_count(he, al, count);
 
 	return 0;
 }
@@ -225,6 +225,10 @@ int cmd_diff(int argc, const char **argv, const char *prefix __used)
 			input_new = argv[1];
 		} else
 			input_new = argv[0];
+	} else if (symbol_conf.default_guest_vmlinux_name ||
+		   symbol_conf.default_guest_kallsyms) {
+		input_old = "perf.data.host";
+		input_new = "perf.data.guest";
 	}
 
 	symbol_conf.exclude_other = false;
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 513aa8a55db..db474bbf332 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -351,6 +351,7 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 			   int n_lines, int is_caller)
 {
 	struct rb_node *next;
+	struct kernel_info *kerninfo;
 
 	printf("%.102s\n", graph_dotted_line);
 	printf(" %-34s |",  is_caller ? "Callsite": "Alloc Ptr");
@@ -359,10 +360,16 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 
 	next = rb_first(root);
 
+	kerninfo = kerninfo__findhost(&session->kerninfo_root);
+	if (!kerninfo) {
+		pr_err("__print_result: couldn't find kernel information\n");
+		return;
+	}
 	while (next && n_lines--) {
 		struct alloc_stat *data = rb_entry(next, struct alloc_stat,
 						   node);
 		struct symbol *sym = NULL;
+		struct map_groups *kmaps = &kerninfo->kmaps;
 		struct map *map;
 		char buf[BUFSIZ];
 		u64 addr;
@@ -370,8 +377,8 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 		if (is_caller) {
 			addr = data->call_site;
 			if (!raw_ip)
-				sym = map_groups__find_function(&session->kmaps,
-								addr, &map, NULL);
+				sym = map_groups__find_function(kmaps, addr,
+								&map, NULL);
 		} else
 			addr = data->ptr;
 
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
new file mode 100644
index 00000000000..a4c7cae4502
--- /dev/null
+++ b/tools/perf/builtin-kvm.c
@@ -0,0 +1,144 @@
+#include "builtin.h"
+#include "perf.h"
+
+#include "util/util.h"
+#include "util/cache.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/header.h"
+#include "util/session.h"
+
+#include "util/parse-options.h"
+#include "util/trace-event.h"
+
+#include "util/debug.h"
+
+#include <sys/prctl.h>
+
+#include <semaphore.h>
+#include <pthread.h>
+#include <math.h>
+
+static char			*file_name;
+static char			name_buffer[256];
+
+int				perf_host = 1;
+int				perf_guest;
+
+static const char * const kvm_usage[] = {
+	"perf kvm [<options>] {top|record|report|diff|buildid-list}",
+	NULL
+};
+
+static const struct option kvm_options[] = {
+	OPT_STRING('i', "input", &file_name, "file",
+		   "Input file name"),
+	OPT_STRING('o', "output", &file_name, "file",
+		   "Output file name"),
+	OPT_BOOLEAN(0, "guest", &perf_guest,
+		    "Collect guest os data"),
+	OPT_BOOLEAN(0, "host", &perf_host,
+		    "Collect guest os data"),
+	OPT_STRING(0, "guestmount", &symbol_conf.guestmount, "directory",
+		   "guest mount directory under which every guest os"
+		   " instance has a subdir"),
+	OPT_STRING(0, "guestvmlinux", &symbol_conf.default_guest_vmlinux_name,
+		   "file", "file saving guest os vmlinux"),
+	OPT_STRING(0, "guestkallsyms", &symbol_conf.default_guest_kallsyms,
+		   "file", "file saving guest os /proc/kallsyms"),
+	OPT_STRING(0, "guestmodules", &symbol_conf.default_guest_modules,
+		   "file", "file saving guest os /proc/modules"),
+	OPT_END()
+};
+
+static int __cmd_record(int argc, const char **argv)
+{
+	int rec_argc, i = 0, j;
+	const char **rec_argv;
+
+	rec_argc = argc + 2;
+	rec_argv = calloc(rec_argc + 1, sizeof(char *));
+	rec_argv[i++] = strdup("record");
+	rec_argv[i++] = strdup("-o");
+	rec_argv[i++] = strdup(file_name);
+	for (j = 1; j < argc; j++, i++)
+		rec_argv[i] = argv[j];
+
+	BUG_ON(i != rec_argc);
+
+	return cmd_record(i, rec_argv, NULL);
+}
+
+static int __cmd_report(int argc, const char **argv)
+{
+	int rec_argc, i = 0, j;
+	const char **rec_argv;
+
+	rec_argc = argc + 2;
+	rec_argv = calloc(rec_argc + 1, sizeof(char *));
+	rec_argv[i++] = strdup("report");
+	rec_argv[i++] = strdup("-i");
+	rec_argv[i++] = strdup(file_name);
+	for (j = 1; j < argc; j++, i++)
+		rec_argv[i] = argv[j];
+
+	BUG_ON(i != rec_argc);
+
+	return cmd_report(i, rec_argv, NULL);
+}
+
+static int __cmd_buildid_list(int argc, const char **argv)
+{
+	int rec_argc, i = 0, j;
+	const char **rec_argv;
+
+	rec_argc = argc + 2;
+	rec_argv = calloc(rec_argc + 1, sizeof(char *));
+	rec_argv[i++] = strdup("buildid-list");
+	rec_argv[i++] = strdup("-i");
+	rec_argv[i++] = strdup(file_name);
+	for (j = 1; j < argc; j++, i++)
+		rec_argv[i] = argv[j];
+
+	BUG_ON(i != rec_argc);
+
+	return cmd_buildid_list(i, rec_argv, NULL);
+}
+
+int cmd_kvm(int argc, const char **argv, const char *prefix __used)
+{
+	perf_host = perf_guest = 0;
+
+	argc = parse_options(argc, argv, kvm_options, kvm_usage,
+			PARSE_OPT_STOP_AT_NON_OPTION);
+	if (!argc)
+		usage_with_options(kvm_usage, kvm_options);
+
+	if (!perf_host)
+		perf_guest = 1;
+
+	if (!file_name) {
+		if (perf_host && !perf_guest)
+			sprintf(name_buffer, "perf.data.host");
+		else if (!perf_host && perf_guest)
+			sprintf(name_buffer, "perf.data.guest");
+		else
+			sprintf(name_buffer, "perf.data.kvm");
+		file_name = name_buffer;
+	}
+
+	if (!strncmp(argv[0], "rec", 3))
+		return __cmd_record(argc, argv);
+	else if (!strncmp(argv[0], "rep", 3))
+		return __cmd_report(argc, argv);
+	else if (!strncmp(argv[0], "diff", 4))
+		return cmd_diff(argc, argv, NULL);
+	else if (!strncmp(argv[0], "top", 3))
+		return cmd_top(argc, argv, NULL);
+	else if (!strncmp(argv[0], "buildid-list", 12))
+		return __cmd_buildid_list(argc, argv);
+	else
+		usage_with_options(kvm_usage, kvm_options);
+
+	return 0;
+}
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index a1b99eeac3c..27f992aca8b 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -456,6 +456,52 @@ static void atexit_header(void)
 	}
 }
 
+static void event__synthesize_guest_os(struct kernel_info *kerninfo,
+		void *data __attribute__((unused)))
+{
+	int err;
+	char *guest_kallsyms;
+	char path[PATH_MAX];
+
+	if (is_host_kernel(kerninfo))
+		return;
+
+	/*
+	 *As for guest kernel when processing subcommand record&report,
+	 *we arrange module mmap prior to guest kernel mmap and trigger
+	 *a preload dso because default guest module symbols are loaded
+	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
+	 *method is used to avoid symbol missing when the first addr is
+	 *in module instead of in guest kernel.
+	 */
+	err = event__synthesize_modules(process_synthesized_event,
+			session,
+			kerninfo);
+	if (err < 0)
+		pr_err("Couldn't record guest kernel [%d]'s reference"
+			" relocation symbol.\n", kerninfo->pid);
+
+	if (is_default_guest(kerninfo))
+		guest_kallsyms = (char *) symbol_conf.default_guest_kallsyms;
+	else {
+		sprintf(path, "%s/proc/kallsyms", kerninfo->root_dir);
+		guest_kallsyms = path;
+	}
+
+	/*
+	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
+	 * have no _text sometimes.
+	 */
+	err = event__synthesize_kernel_mmap(process_synthesized_event,
+			session, kerninfo, "_text");
+	if (err < 0)
+		err = event__synthesize_kernel_mmap(process_synthesized_event,
+				session, kerninfo, "_stext");
+	if (err < 0)
+		pr_err("Couldn't record guest kernel [%d]'s reference"
+			" relocation symbol.\n", kerninfo->pid);
+}
+
 static int __cmd_record(int argc, const char **argv)
 {
 	int i, counter;
@@ -467,6 +513,7 @@ static int __cmd_record(int argc, const char **argv)
 	int child_ready_pipe[2], go_pipe[2];
 	const bool forks = argc > 0;
 	char buf;
+	struct kernel_info *kerninfo;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
 
@@ -635,21 +682,31 @@ static int __cmd_record(int argc, const char **argv)
 		advance_output(err);
 	}
 
+	kerninfo = kerninfo__findhost(&session->kerninfo_root);
+	if (!kerninfo) {
+		pr_err("Couldn't find native kernel information.\n");
+		return -1;
+	}
+
 	err = event__synthesize_kernel_mmap(process_synthesized_event,
-					    session, "_text");
+			session, kerninfo, "_text");
 	if (err < 0)
 		err = event__synthesize_kernel_mmap(process_synthesized_event,
-						    session, "_stext");
+				session, kerninfo, "_stext");
 	if (err < 0) {
 		pr_err("Couldn't record kernel reference relocation symbol.\n");
 		return err;
 	}
 
-	err = event__synthesize_modules(process_synthesized_event, session);
+	err = event__synthesize_modules(process_synthesized_event,
+				session, kerninfo);
 	if (err < 0) {
 		pr_err("Couldn't record kernel reference relocation symbol.\n");
 		return err;
 	}
+	if (perf_guest)
+		kerninfo__process_allkernels(&session->kerninfo_root,
+			event__synthesize_guest_os, session);
 
 	if (!system_wide && profile_cpu == -1)
 		event__synthesize_thread(target_tid, process_synthesized_event,
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 7da5fb36526..816edae7c5b 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -108,7 +108,7 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 		return -ENOMEM;
 
 	if (hit)
-		he->count += data->period;
+		__perf_session__add_count(he, al,  data->period);
 
 	if (symbol_conf.use_callchain) {
 		if (!hit)
@@ -313,7 +313,7 @@ static int __cmd_report(void)
 		perf_session__fprintf(session, stdout);
 
 	if (verbose > 2)
-		dsos__fprintf(stdout);
+		dsos__fprintf(&session->kerninfo_root, stdout);
 
 	next = rb_first(&session->stats_by_id);
 	while (next) {
@@ -450,6 +450,8 @@ static const struct option options[] = {
 		   "sort by key(s): pid, comm, dso, symbol, parent"),
 	OPT_BOOLEAN('P', "full-paths", &symbol_conf.full_paths,
 		    "Don't shorten the pathnames taking into account the cwd"),
+	OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization,
+		    "Show sample percentage for different cpu modes"),
 	OPT_STRING('p', "parent", &parent_pattern, "regex",
 		   "regex filter to identify parent, see: '--sort parent'"),
 	OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other,
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 40f24dd46ef..dfd7ea7dabd 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -420,8 +420,9 @@ static double sym_weight(const struct sym_entry *sym)
 }
 
 static long			samples;
-static long			userspace_samples;
+static long			kernel_samples, us_samples;
 static long			exact_samples;
+static long			guest_us_samples, guest_kernel_samples;
 static const char		CONSOLE_CLEAR[] = "[H[2J";
 
 static void __list_insert_active_sym(struct sym_entry *syme)
@@ -461,7 +462,10 @@ static void print_sym_table(void)
 	int printed = 0, j;
 	int counter, snap = !display_weighted ? sym_counter : 0;
 	float samples_per_sec = samples/delay_secs;
-	float ksamples_per_sec = (samples-userspace_samples)/delay_secs;
+	float ksamples_per_sec = kernel_samples/delay_secs;
+	float us_samples_per_sec = (us_samples)/delay_secs;
+	float guest_kernel_samples_per_sec = (guest_kernel_samples)/delay_secs;
+	float guest_us_samples_per_sec = (guest_us_samples)/delay_secs;
 	float esamples_percent = (100.0*exact_samples)/samples;
 	float sum_ksamples = 0.0;
 	struct sym_entry *syme, *n;
@@ -470,7 +474,8 @@ static void print_sym_table(void)
 	int sym_width = 0, dso_width = 0, dso_short_width = 0;
 	const int win_width = winsize.ws_col - 1;
 
-	samples = userspace_samples = exact_samples = 0;
+	samples = us_samples = kernel_samples = exact_samples = 0;
+	guest_kernel_samples = guest_us_samples = 0;
 
 	/* Sort the active symbols */
 	pthread_mutex_lock(&active_symbols_lock);
@@ -501,10 +506,30 @@ static void print_sym_table(void)
 	puts(CONSOLE_CLEAR);
 
 	printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
-	printf( "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%%  exact: %4.1f%% [",
-		samples_per_sec,
-		100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)),
-		esamples_percent);
+	if (!perf_guest) {
+		printf("   PerfTop:%8.0f irqs/sec  kernel:%4.1f%%"
+			"  exact: %4.1f%% [",
+			samples_per_sec,
+			100.0 - (100.0 * ((samples_per_sec - ksamples_per_sec) /
+					 samples_per_sec)),
+			esamples_percent);
+	} else {
+		printf("   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% us:%4.1f%%"
+			" guest kernel:%4.1f%% guest us:%4.1f%%"
+			" exact: %4.1f%% [",
+			samples_per_sec,
+			100.0 - (100.0 * ((samples_per_sec-ksamples_per_sec) /
+					  samples_per_sec)),
+			100.0 - (100.0 * ((samples_per_sec-us_samples_per_sec) /
+					  samples_per_sec)),
+			100.0 - (100.0 * ((samples_per_sec -
+						guest_kernel_samples_per_sec) /
+					  samples_per_sec)),
+			100.0 - (100.0 * ((samples_per_sec -
+					   guest_us_samples_per_sec) /
+					  samples_per_sec)),
+			esamples_percent);
+	}
 
 	if (nr_counters == 1 || !display_weighted) {
 		printf("%Ld", (u64)attrs[0].sample_period);
@@ -597,7 +622,6 @@ static void print_sym_table(void)
 
 		syme = rb_entry(nd, struct sym_entry, rb_node);
 		sym = sym_entry__symbol(syme);
-
 		if (++printed > print_entries || (int)syme->snap_count < count_filter)
 			continue;
 
@@ -761,7 +785,7 @@ static int key_mapped(int c)
 	return 0;
 }
 
-static void handle_keypress(int c)
+static void handle_keypress(struct perf_session *session, int c)
 {
 	if (!key_mapped(c)) {
 		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
@@ -830,7 +854,7 @@ static void handle_keypress(int c)
 		case 'Q':
 			printf("exiting.\n");
 			if (dump_symtab)
-				dsos__fprintf(stderr);
+				dsos__fprintf(&session->kerninfo_root, stderr);
 			exit(0);
 		case 's':
 			prompt_symbol(&sym_filter_entry, "Enter details symbol");
@@ -866,6 +890,7 @@ static void *display_thread(void *arg __used)
 	struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
 	struct termios tc, save;
 	int delay_msecs, c;
+	struct perf_session *session = (struct perf_session *) arg;
 
 	tcgetattr(0, &save);
 	tc = save;
@@ -886,7 +911,7 @@ repeat:
 	c = getc(stdin);
 	tcsetattr(0, TCSAFLUSH, &save);
 
-	handle_keypress(c);
+	handle_keypress(session, c);
 	goto repeat;
 
 	return NULL;
@@ -957,24 +982,46 @@ static void event__process_sample(const event_t *self,
 	u64 ip = self->ip.ip;
 	struct sym_entry *syme;
 	struct addr_location al;
+	struct kernel_info *kerninfo;
 	u8 origin = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
 
 	++samples;
 
 	switch (origin) {
 	case PERF_RECORD_MISC_USER:
-		++userspace_samples;
+		++us_samples;
 		if (hide_user_symbols)
 			return;
+		kerninfo = kerninfo__findhost(&session->kerninfo_root);
 		break;
 	case PERF_RECORD_MISC_KERNEL:
+		++kernel_samples;
 		if (hide_kernel_symbols)
 			return;
+		kerninfo = kerninfo__findhost(&session->kerninfo_root);
 		break;
+	case PERF_RECORD_MISC_GUEST_KERNEL:
+		++guest_kernel_samples;
+		kerninfo = kerninfo__find(&session->kerninfo_root,
+					  self->ip.pid);
+		break;
+	case PERF_RECORD_MISC_GUEST_USER:
+		++guest_us_samples;
+		/*
+		 * TODO: we don't process guest user from host side
+		 * except simple counting.
+		 */
+		return;
 	default:
 		return;
 	}
 
+	if (!kerninfo && perf_guest) {
+		pr_err("Can't find guest [%d]'s kernel information\n",
+			self->ip.pid);
+		return;
+	}
+
 	if (self->header.misc & PERF_RECORD_MISC_EXACT)
 		exact_samples++;
 
@@ -994,7 +1041,7 @@ static void event__process_sample(const event_t *self,
 		 * --hide-kernel-symbols, even if the user specifies an
 		 * invalid --vmlinux ;-)
 		 */
-		if (al.map == session->vmlinux_maps[MAP__FUNCTION] &&
+		if (al.map == kerninfo->vmlinux_maps[MAP__FUNCTION] &&
 		    RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) {
 			pr_err("The %s file can't be used\n",
 			       symbol_conf.vmlinux_name);
@@ -1261,7 +1308,7 @@ static int __cmd_top(void)
 
 	perf_session__mmap_read(session);
 
-	if (pthread_create(&thread, NULL, display_thread, NULL)) {
+	if (pthread_create(&thread, NULL, display_thread, session)) {
 		printf("Could not create display thread.\n");
 		exit(-1);
 	}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 10fe49e7048..ab28bca92e5 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -32,5 +32,6 @@ extern int cmd_version(int argc, const char **argv, const char *prefix);
 extern int cmd_probe(int argc, const char **argv, const char *prefix);
 extern int cmd_kmem(int argc, const char **argv, const char *prefix);
 extern int cmd_lock(int argc, const char **argv, const char *prefix);
+extern int cmd_kvm(int argc, const char **argv, const char *prefix);
 
 #endif
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index db6ee94d4a8..2a1162d413a 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -19,3 +19,4 @@ perf-trace			mainporcelain common
 perf-probe			mainporcelain common
 perf-kmem			mainporcelain common
 perf-lock			mainporcelain common
+perf-kvm			mainporcelain common
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index d4be55b6cd3..985cdb4bd00 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -307,6 +307,7 @@ static void handle_internal_command(int argc, const char **argv)
 		{ "probe",	cmd_probe,	0 },
 		{ "kmem",	cmd_kmem,	0 },
 		{ "lock",	cmd_lock,	0 },
+		{ "kvm",	cmd_kvm,	0 },
 	};
 	unsigned int i;
 	static const char ext[] = STRIP_EXTENSION;
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index ec212748d65..02821febb70 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -131,4 +131,6 @@ struct ip_callchain {
 	u64 ips[0];
 };
 
+extern int perf_host, perf_guest;
+
 #endif
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index 04904b35ba8..0f60a390680 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -24,7 +24,7 @@ static int build_id__mark_dso_hit(event_t *event, struct perf_session *session)
 	}
 
 	thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION,
-			      event->ip.ip, &al);
+			      event->ip.pid, event->ip.ip, &al);
 
 	if (al.map != NULL)
 		al.map->dso->hit = 1;
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 571fb25f7eb..e3fa8d3d11b 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -112,7 +112,11 @@ static int event__synthesize_mmap_events(pid_t pid, pid_t tgid,
 		event_t ev = {
 			.header = {
 				.type = PERF_RECORD_MMAP,
-				.misc = 0, /* Just like the kernel, see kernel/perf_event.c __perf_event_mmap */
+				/*
+				 * Just like the kernel, see __perf_event_mmap
+				 * in kernel/perf_event.c
+				 */
+				.misc = PERF_RECORD_MISC_USER,
 			 },
 		};
 		int n;
@@ -167,11 +171,23 @@ static int event__synthesize_mmap_events(pid_t pid, pid_t tgid,
 }
 
 int event__synthesize_modules(event__handler_t process,
-			      struct perf_session *session)
+			      struct perf_session *session,
+			      struct kernel_info *kerninfo)
 {
 	struct rb_node *nd;
+	struct map_groups *kmaps = &kerninfo->kmaps;
+	u16 misc;
 
-	for (nd = rb_first(&session->kmaps.maps[MAP__FUNCTION]);
+	/*
+	 * kernel uses 0 for user space maps, see kernel/perf_event.c
+	 * __perf_event_mmap
+	 */
+	if (is_host_kernel(kerninfo))
+		misc = PERF_RECORD_MISC_KERNEL;
+	else
+		misc = PERF_RECORD_MISC_GUEST_KERNEL;
+
+	for (nd = rb_first(&kmaps->maps[MAP__FUNCTION]);
 	     nd; nd = rb_next(nd)) {
 		event_t ev;
 		size_t size;
@@ -182,12 +198,13 @@ int event__synthesize_modules(event__handler_t process,
 
 		size = ALIGN(pos->dso->long_name_len + 1, sizeof(u64));
 		memset(&ev, 0, sizeof(ev));
-		ev.mmap.header.misc = 1; /* kernel uses 0 for user space maps, see kernel/perf_event.c __perf_event_mmap */
+		ev.mmap.header.misc = misc;
 		ev.mmap.header.type = PERF_RECORD_MMAP;
 		ev.mmap.header.size = (sizeof(ev.mmap) -
 				        (sizeof(ev.mmap.filename) - size));
 		ev.mmap.start = pos->start;
 		ev.mmap.len   = pos->end - pos->start;
+		ev.mmap.pid   = kerninfo->pid;
 
 		memcpy(ev.mmap.filename, pos->dso->long_name,
 		       pos->dso->long_name_len + 1);
@@ -250,13 +267,18 @@ static int find_symbol_cb(void *arg, const char *name, char type, u64 start)
 
 int event__synthesize_kernel_mmap(event__handler_t process,
 				  struct perf_session *session,
+				  struct kernel_info *kerninfo,
 				  const char *symbol_name)
 {
 	size_t size;
+	const char *filename, *mmap_name;
+	char path[PATH_MAX];
+	char name_buff[PATH_MAX];
+	struct map *map;
+
 	event_t ev = {
 		.header = {
 			.type = PERF_RECORD_MMAP,
-			.misc = 1, /* kernel uses 0 for user space maps, see kernel/perf_event.c __perf_event_mmap */
 		},
 	};
 	/*
@@ -266,16 +288,37 @@ int event__synthesize_kernel_mmap(event__handler_t process,
 	 */
 	struct process_symbol_args args = { .name = symbol_name, };
 
-	if (kallsyms__parse("/proc/kallsyms", &args, find_symbol_cb) <= 0)
+	mmap_name = kern_mmap_name(kerninfo, name_buff);
+	if (is_host_kernel(kerninfo)) {
+		/*
+		 * kernel uses PERF_RECORD_MISC_USER for user space maps,
+		 * see kernel/perf_event.c __perf_event_mmap
+		 */
+		ev.header.misc = PERF_RECORD_MISC_KERNEL;
+		filename = "/proc/kallsyms";
+	} else {
+		ev.header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
+		if (is_default_guest(kerninfo))
+			filename = (char *) symbol_conf.default_guest_kallsyms;
+		else {
+			sprintf(path, "%s/proc/kallsyms", kerninfo->root_dir);
+			filename = path;
+		}
+	}
+
+	if (kallsyms__parse(filename, &args, find_symbol_cb) <= 0)
 		return -ENOENT;
 
+	map = kerninfo->vmlinux_maps[MAP__FUNCTION];
 	size = snprintf(ev.mmap.filename, sizeof(ev.mmap.filename),
-			"[kernel.kallsyms.%s]", symbol_name) + 1;
+			"%s%s", mmap_name, symbol_name) + 1;
 	size = ALIGN(size, sizeof(u64));
-	ev.mmap.header.size = (sizeof(ev.mmap) - (sizeof(ev.mmap.filename) - size));
+	ev.mmap.header.size = (sizeof(ev.mmap) -
+			(sizeof(ev.mmap.filename) - size));
 	ev.mmap.pgoff = args.start;
-	ev.mmap.start = session->vmlinux_maps[MAP__FUNCTION]->start;
-	ev.mmap.len   = session->vmlinux_maps[MAP__FUNCTION]->end - ev.mmap.start ;
+	ev.mmap.start = map->start;
+	ev.mmap.len   = map->end - ev.mmap.start;
+	ev.mmap.pid   = kerninfo->pid;
 
 	return process(&ev, session);
 }
@@ -329,22 +372,50 @@ int event__process_lost(event_t *self, struct perf_session *session)
 	return 0;
 }
 
-int event__process_mmap(event_t *self, struct perf_session *session)
+static void event_set_kernel_mmap_len(struct map **maps, event_t *self)
+{
+	maps[MAP__FUNCTION]->start = self->mmap.start;
+	maps[MAP__FUNCTION]->end   = self->mmap.start + self->mmap.len;
+	/*
+	 * Be a bit paranoid here, some perf.data file came with
+	 * a zero sized synthesized MMAP event for the kernel.
+	 */
+	if (maps[MAP__FUNCTION]->end == 0)
+		maps[MAP__FUNCTION]->end = ~0UL;
+}
+
+static int event__process_kernel_mmap(event_t *self,
+			struct perf_session *session)
 {
-	struct thread *thread;
 	struct map *map;
+	char kmmap_prefix[PATH_MAX];
+	struct kernel_info *kerninfo;
+	enum dso_kernel_type kernel_type;
+	bool is_kernel_mmap;
+
+	kerninfo = kerninfo__findnew(&session->kerninfo_root, self->mmap.pid);
+	if (!kerninfo) {
+		pr_err("Can't find id %d's kerninfo\n", self->mmap.pid);
+		goto out_problem;
+	}
 
-	dump_printf(" %d/%d: [%#Lx(%#Lx) @ %#Lx]: %s\n",
-		    self->mmap.pid, self->mmap.tid, self->mmap.start,
-		    self->mmap.len, self->mmap.pgoff, self->mmap.filename);
+	kern_mmap_name(kerninfo, kmmap_prefix);
+	if (is_host_kernel(kerninfo))
+		kernel_type = DSO_TYPE_KERNEL;
+	else
+		kernel_type = DSO_TYPE_GUEST_KERNEL;
 
-	if (self->mmap.pid == 0) {
-		static const char kmmap_prefix[] = "[kernel.kallsyms.";
+	is_kernel_mmap = memcmp(self->mmap.filename,
+				kmmap_prefix,
+				strlen(kmmap_prefix)) == 0;
+	if (self->mmap.filename[0] == '/' ||
+	    (!is_kernel_mmap && self->mmap.filename[0] == '[')) {
 
-		if (self->mmap.filename[0] == '/') {
-			char short_module_name[1024];
-			char *name = strrchr(self->mmap.filename, '/'), *dot;
+		char short_module_name[1024];
+		char *name, *dot;
 
+		if (self->mmap.filename[0] == '/') {
+			name = strrchr(self->mmap.filename, '/');
 			if (name == NULL)
 				goto out_problem;
 
@@ -352,59 +423,86 @@ int event__process_mmap(event_t *self, struct perf_session *session)
 			dot = strrchr(name, '.');
 			if (dot == NULL)
 				goto out_problem;
-
 			snprintf(short_module_name, sizeof(short_module_name),
-				 "[%.*s]", (int)(dot - name), name);
+					"[%.*s]", (int)(dot - name), name);
 			strxfrchar(short_module_name, '-', '_');
-
-			map = perf_session__new_module_map(session,
-							   self->mmap.start,
-							   self->mmap.filename);
-			if (map == NULL)
-				goto out_problem;
-
-			name = strdup(short_module_name);
-			if (name == NULL)
-				goto out_problem;
-
-			map->dso->short_name = name;
-			map->end = map->start + self->mmap.len;
-		} else if (memcmp(self->mmap.filename, kmmap_prefix,
-				sizeof(kmmap_prefix) - 1) == 0) {
-			const char *symbol_name = (self->mmap.filename +
-						   sizeof(kmmap_prefix) - 1);
+		} else
+			strcpy(short_module_name, self->mmap.filename);
+
+		map = map_groups__new_module(&kerninfo->kmaps,
+				self->mmap.start,
+				self->mmap.filename,
+				kerninfo);
+		if (map == NULL)
+			goto out_problem;
+
+		name = strdup(short_module_name);
+		if (name == NULL)
+			goto out_problem;
+
+		map->dso->short_name = name;
+		map->end = map->start + self->mmap.len;
+	} else if (is_kernel_mmap) {
+		const char *symbol_name = (self->mmap.filename +
+				strlen(kmmap_prefix));
+		/*
+		 * Should be there already, from the build-id table in
+		 * the header.
+		 */
+		struct dso *kernel = __dsos__findnew(&kerninfo->dsos__kernel,
+				kmmap_prefix);
+		if (kernel == NULL)
+			goto out_problem;
+
+		kernel->kernel = kernel_type;
+		if (__map_groups__create_kernel_maps(&kerninfo->kmaps,
+					kerninfo->vmlinux_maps, kernel) < 0)
+			goto out_problem;
+
+		event_set_kernel_mmap_len(kerninfo->vmlinux_maps, self);
+		perf_session__set_kallsyms_ref_reloc_sym(kerninfo->vmlinux_maps,
+				symbol_name,
+				self->mmap.pgoff);
+		if (is_default_guest(kerninfo)) {
 			/*
-			 * Should be there already, from the build-id table in
-			 * the header.
+			 * preload dso of guest kernel and modules
 			 */
-			struct dso *kernel = __dsos__findnew(&dsos__kernel,
-							     "[kernel.kallsyms]");
-			if (kernel == NULL)
-				goto out_problem;
-
-			kernel->kernel = 1;
-			if (__perf_session__create_kernel_maps(session, kernel) < 0)
-				goto out_problem;
+			dso__load(kernel,
+				kerninfo->vmlinux_maps[MAP__FUNCTION],
+				NULL);
+		}
+	}
+	return 0;
+out_problem:
+	return -1;
+}
 
-			session->vmlinux_maps[MAP__FUNCTION]->start = self->mmap.start;
-			session->vmlinux_maps[MAP__FUNCTION]->end   = self->mmap.start + self->mmap.len;
-			/*
-			 * Be a bit paranoid here, some perf.data file came with
-			 * a zero sized synthesized MMAP event for the kernel.
-			 */
-			if (session->vmlinux_maps[MAP__FUNCTION]->end == 0)
-				session->vmlinux_maps[MAP__FUNCTION]->end = ~0UL;
+int event__process_mmap(event_t *self, struct perf_session *session)
+{
+	struct kernel_info *kerninfo;
+	struct thread *thread;
+	struct map *map;
+	u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+	int ret = 0;
 
-			perf_session__set_kallsyms_ref_reloc_sym(session, symbol_name,
-								 self->mmap.pgoff);
-		}
+	dump_printf(" %d/%d: [%#Lx(%#Lx) @ %#Lx]: %s\n",
+			self->mmap.pid, self->mmap.tid, self->mmap.start,
+			self->mmap.len, self->mmap.pgoff, self->mmap.filename);
+
+	if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL ||
+	    cpumode == PERF_RECORD_MISC_KERNEL) {
+		ret = event__process_kernel_mmap(self, session);
+		if (ret < 0)
+			goto out_problem;
 		return 0;
 	}
 
 	thread = perf_session__findnew(session, self->mmap.pid);
-	map = map__new(self->mmap.start, self->mmap.len, self->mmap.pgoff,
-		       self->mmap.pid, self->mmap.filename, MAP__FUNCTION,
-		       session->cwd, session->cwdlen);
+	kerninfo = kerninfo__findhost(&session->kerninfo_root);
+	map = map__new(&kerninfo->dsos__user, self->mmap.start,
+			self->mmap.len, self->mmap.pgoff,
+			self->mmap.pid, self->mmap.filename,
+			MAP__FUNCTION, session->cwd, session->cwdlen);
 
 	if (thread == NULL || map == NULL)
 		goto out_problem;
@@ -444,22 +542,52 @@ int event__process_task(event_t *self, struct perf_session *session)
 
 void thread__find_addr_map(struct thread *self,
 			   struct perf_session *session, u8 cpumode,
-			   enum map_type type, u64 addr,
+			   enum map_type type, pid_t pid, u64 addr,
 			   struct addr_location *al)
 {
 	struct map_groups *mg = &self->mg;
+	struct kernel_info *kerninfo = NULL;
 
 	al->thread = self;
 	al->addr = addr;
+	al->cpumode = cpumode;
+	al->filtered = false;
 
-	if (cpumode == PERF_RECORD_MISC_KERNEL) {
+	if (cpumode == PERF_RECORD_MISC_KERNEL && perf_host) {
 		al->level = 'k';
-		mg = &session->kmaps;
-	} else if (cpumode == PERF_RECORD_MISC_USER)
+		kerninfo = kerninfo__findhost(&session->kerninfo_root);
+		mg = &kerninfo->kmaps;
+	} else if (cpumode == PERF_RECORD_MISC_USER && perf_host) {
 		al->level = '.';
-	else {
-		al->level = 'H';
+		kerninfo = kerninfo__findhost(&session->kerninfo_root);
+	} else if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL && perf_guest) {
+		al->level = 'g';
+		kerninfo = kerninfo__find(&session->kerninfo_root, pid);
+		if (!kerninfo) {
+			al->map = NULL;
+			return;
+		}
+		mg = &kerninfo->kmaps;
+	} else {
+		/*
+		 * 'u' means guest os user space.
+		 * TODO: We don't support guest user space. Might support late.
+		 */
+		if (cpumode == PERF_RECORD_MISC_GUEST_USER && perf_guest)
+			al->level = 'u';
+		else
+			al->level = 'H';
 		al->map = NULL;
+
+		if ((cpumode == PERF_RECORD_MISC_GUEST_USER ||
+			cpumode == PERF_RECORD_MISC_GUEST_KERNEL) &&
+			!perf_guest)
+			al->filtered = true;
+		if ((cpumode == PERF_RECORD_MISC_USER ||
+			cpumode == PERF_RECORD_MISC_KERNEL) &&
+			!perf_host)
+			al->filtered = true;
+
 		return;
 	}
 try_again:
@@ -474,8 +602,11 @@ try_again:
 		 * "[vdso]" dso, but for now lets use the old trick of looking
 		 * in the whole kernel symbol list.
 		 */
-		if ((long long)al->addr < 0 && mg != &session->kmaps) {
-			mg = &session->kmaps;
+		if ((long long)al->addr < 0 &&
+			cpumode == PERF_RECORD_MISC_KERNEL &&
+			kerninfo &&
+			mg != &kerninfo->kmaps)  {
+			mg = &kerninfo->kmaps;
 			goto try_again;
 		}
 	} else
@@ -484,11 +615,11 @@ try_again:
 
 void thread__find_addr_location(struct thread *self,
 				struct perf_session *session, u8 cpumode,
-				enum map_type type, u64 addr,
+				enum map_type type, pid_t pid, u64 addr,
 				struct addr_location *al,
 				symbol_filter_t filter)
 {
-	thread__find_addr_map(self, session, cpumode, type, addr, al);
+	thread__find_addr_map(self, session, cpumode, type, pid, addr, al);
 	if (al->map != NULL)
 		al->sym = map__find_symbol(al->map, al->addr, filter);
 	else
@@ -524,7 +655,7 @@ int event__preprocess_sample(const event_t *self, struct perf_session *session,
 	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
 
 	thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION,
-			      self->ip.ip, al);
+			      self->ip.pid, self->ip.ip, al);
 	dump_printf(" ...... dso: %s\n",
 		    al->map ? al->map->dso->long_name :
 			al->level == 'H' ? "[hypervisor]" : "<not found>");
@@ -554,7 +685,6 @@ int event__preprocess_sample(const event_t *self, struct perf_session *session,
 	    !strlist__has_entry(symbol_conf.sym_list, al->sym->name))
 		goto out_filtered;
 
-	al->filtered = false;
 	return 0;
 
 out_filtered:
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index e5740ea140a..4af2ed5d48a 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -79,6 +79,7 @@ struct sample_data {
 
 struct build_id_event {
 	struct perf_event_header header;
+	pid_t			 pid;
 	u8			 build_id[ALIGN(BUILD_ID_SIZE, sizeof(u64))];
 	char			 filename[];
 };
@@ -154,10 +155,13 @@ int event__synthesize_thread(pid_t pid, event__handler_t process,
 void event__synthesize_threads(event__handler_t process,
 			       struct perf_session *session);
 int event__synthesize_kernel_mmap(event__handler_t process,
-				  struct perf_session *session,
-				  const char *symbol_name);
+				struct perf_session *session,
+				struct kernel_info *kerninfo,
+				const char *symbol_name);
+
 int event__synthesize_modules(event__handler_t process,
-			      struct perf_session *session);
+			      struct perf_session *session,
+			      struct kernel_info *kerninfo);
 
 int event__process_comm(event_t *self, struct perf_session *session);
 int event__process_lost(event_t *self, struct perf_session *session);
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 628173ba689..75d01676802 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -190,7 +190,8 @@ static int write_padded(int fd, const void *bf, size_t count,
 			continue;		\
 		else
 
-static int __dsos__write_buildid_table(struct list_head *head, u16 misc, int fd)
+static int __dsos__write_buildid_table(struct list_head *head, pid_t pid,
+				u16 misc, int fd)
 {
 	struct dso *pos;
 
@@ -205,6 +206,7 @@ static int __dsos__write_buildid_table(struct list_head *head, u16 misc, int fd)
 		len = ALIGN(len, NAME_ALIGN);
 		memset(&b, 0, sizeof(b));
 		memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id));
+		b.pid = pid;
 		b.header.misc = misc;
 		b.header.size = sizeof(b) + len;
 		err = do_write(fd, &b, sizeof(b));
@@ -219,13 +221,33 @@ static int __dsos__write_buildid_table(struct list_head *head, u16 misc, int fd)
 	return 0;
 }
 
-static int dsos__write_buildid_table(int fd)
+static int dsos__write_buildid_table(struct perf_header *header, int fd)
 {
-	int err = __dsos__write_buildid_table(&dsos__kernel,
-					      PERF_RECORD_MISC_KERNEL, fd);
-	if (err == 0)
-		err = __dsos__write_buildid_table(&dsos__user,
-						  PERF_RECORD_MISC_USER, fd);
+	struct perf_session *session = container_of(header,
+			struct perf_session, header);
+	struct rb_node *nd;
+	int err = 0;
+	u16 kmisc, umisc;
+
+	for (nd = rb_first(&session->kerninfo_root); nd; nd = rb_next(nd)) {
+		struct kernel_info *pos = rb_entry(nd, struct kernel_info,
+				rb_node);
+		if (is_host_kernel(pos)) {
+			kmisc = PERF_RECORD_MISC_KERNEL;
+			umisc = PERF_RECORD_MISC_USER;
+		} else {
+			kmisc = PERF_RECORD_MISC_GUEST_KERNEL;
+			umisc = PERF_RECORD_MISC_GUEST_USER;
+		}
+
+		err = __dsos__write_buildid_table(&pos->dsos__kernel, pos->pid,
+				kmisc, fd);
+		if (err == 0)
+			err = __dsos__write_buildid_table(&pos->dsos__user,
+				pos->pid, umisc, fd);
+		if (err)
+			break;
+	}
 	return err;
 }
 
@@ -342,9 +364,12 @@ static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir)
 	return err;
 }
 
-static int dsos__cache_build_ids(void)
+static int dsos__cache_build_ids(struct perf_header *self)
 {
-	int err_kernel, err_user;
+	struct perf_session *session = container_of(self,
+			struct perf_session, header);
+	struct rb_node *nd;
+	int ret = 0;
 	char debugdir[PATH_MAX];
 
 	snprintf(debugdir, sizeof(debugdir), "%s/%s", getenv("HOME"),
@@ -353,9 +378,30 @@ static int dsos__cache_build_ids(void)
 	if (mkdir(debugdir, 0755) != 0 && errno != EEXIST)
 		return -1;
 
-	err_kernel = __dsos__cache_build_ids(&dsos__kernel, debugdir);
-	err_user   = __dsos__cache_build_ids(&dsos__user, debugdir);
-	return err_kernel || err_user ? -1 : 0;
+	for (nd = rb_first(&session->kerninfo_root); nd; nd = rb_next(nd)) {
+		struct kernel_info *pos = rb_entry(nd, struct kernel_info,
+				rb_node);
+		ret |= __dsos__cache_build_ids(&pos->dsos__kernel, debugdir);
+		ret |= __dsos__cache_build_ids(&pos->dsos__user, debugdir);
+	}
+	return ret ? -1 : 0;
+}
+
+static bool dsos__read_build_ids(struct perf_header *self, bool with_hits)
+{
+	bool ret = false;
+	struct perf_session *session = container_of(self,
+			struct perf_session, header);
+	struct rb_node *nd;
+
+	for (nd = rb_first(&session->kerninfo_root); nd; nd = rb_next(nd)) {
+		struct kernel_info *pos = rb_entry(nd, struct kernel_info,
+				rb_node);
+		ret |= __dsos__read_build_ids(&pos->dsos__kernel, with_hits);
+		ret |= __dsos__read_build_ids(&pos->dsos__user, with_hits);
+	}
+
+	return ret;
 }
 
 static int perf_header__adds_write(struct perf_header *self, int fd)
@@ -366,7 +412,7 @@ static int perf_header__adds_write(struct perf_header *self, int fd)
 	u64 sec_start;
 	int idx = 0, err;
 
-	if (dsos__read_build_ids(true))
+	if (dsos__read_build_ids(self, true))
 		perf_header__set_feat(self, HEADER_BUILD_ID);
 
 	nr_sections = bitmap_weight(self->adds_features, HEADER_FEAT_BITS);
@@ -401,14 +447,14 @@ static int perf_header__adds_write(struct perf_header *self, int fd)
 
 		/* Write build-ids */
 		buildid_sec->offset = lseek(fd, 0, SEEK_CUR);
-		err = dsos__write_buildid_table(fd);
+		err = dsos__write_buildid_table(self, fd);
 		if (err < 0) {
 			pr_debug("failed to write buildid table\n");
 			goto out_free;
 		}
 		buildid_sec->size = lseek(fd, 0, SEEK_CUR) -
 					  buildid_sec->offset;
-		dsos__cache_build_ids();
+		dsos__cache_build_ids(self);
 	}
 
 	lseek(fd, sec_start, SEEK_SET);
@@ -633,6 +679,85 @@ int perf_file_header__read(struct perf_file_header *self,
 	return 0;
 }
 
+static int __event_process_build_id(struct build_id_event *bev,
+				    char *filename,
+				    struct perf_session *session)
+{
+	int err = -1;
+	struct list_head *head;
+	struct kernel_info *kerninfo;
+	u16 misc;
+	struct dso *dso;
+	enum dso_kernel_type dso_type;
+
+	kerninfo = kerninfo__findnew(&session->kerninfo_root, bev->pid);
+	if (!kerninfo)
+		goto out;
+
+	misc = bev->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+
+	switch (misc) {
+	case PERF_RECORD_MISC_KERNEL:
+		dso_type = DSO_TYPE_KERNEL;
+		head = &kerninfo->dsos__kernel;
+		break;
+	case PERF_RECORD_MISC_GUEST_KERNEL:
+		dso_type = DSO_TYPE_GUEST_KERNEL;
+		head = &kerninfo->dsos__kernel;
+		break;
+	case PERF_RECORD_MISC_USER:
+	case PERF_RECORD_MISC_GUEST_USER:
+		dso_type = DSO_TYPE_USER;
+		head = &kerninfo->dsos__user;
+		break;
+	default:
+		goto out;
+	}
+
+	dso = __dsos__findnew(head, filename);
+	if (dso != NULL) {
+		dso__set_build_id(dso, &bev->build_id);
+			if (filename[0] == '[')
+				dso->kernel = dso_type;
+		}
+
+	err = 0;
+out:
+	return err;
+}
+
+static int perf_header__read_build_ids(struct perf_header *self,
+			int input, u64 offset, u64 size)
+{
+	struct perf_session *session = container_of(self,
+			struct perf_session, header);
+	struct build_id_event bev;
+	char filename[PATH_MAX];
+	u64 limit = offset + size;
+	int err = -1;
+
+	while (offset < limit) {
+		ssize_t len;
+
+		if (read(input, &bev, sizeof(bev)) != sizeof(bev))
+			goto out;
+
+		if (self->needs_swap)
+			perf_event_header__bswap(&bev.header);
+
+		len = bev.header.size - sizeof(bev);
+		if (read(input, filename, len) != len)
+			goto out;
+
+		__event_process_build_id(&bev, filename, session);
+
+		offset += bev.header.size;
+	}
+	err = 0;
+out:
+	return err;
+}
+
 static int perf_file_section__process(struct perf_file_section *self,
 				      struct perf_header *ph,
 				      int feat, int fd)
@@ -989,6 +1114,7 @@ int event__process_tracing_data(event_t *self,
 
 int event__synthesize_build_id(struct dso *pos, u16 misc,
 			       event__handler_t process,
+			       struct kernel_info *kerninfo,
 			       struct perf_session *session)
 {
 	event_t ev;
@@ -1005,6 +1131,7 @@ int event__synthesize_build_id(struct dso *pos, u16 misc,
 	memcpy(&ev.build_id.build_id, pos->build_id, sizeof(pos->build_id));
 	ev.build_id.header.type = PERF_RECORD_HEADER_BUILD_ID;
 	ev.build_id.header.misc = misc;
+	ev.build_id.pid = kerninfo->pid;
 	ev.build_id.header.size = sizeof(ev.build_id) + len;
 	memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len);
 
@@ -1015,6 +1142,7 @@ int event__synthesize_build_id(struct dso *pos, u16 misc,
 
 static int __event_synthesize_build_ids(struct list_head *head, u16 misc,
 					event__handler_t process,
+					struct kernel_info *kerninfo,
 					struct perf_session *session)
 {
 	struct dso *pos;
@@ -1024,7 +1152,8 @@ static int __event_synthesize_build_ids(struct list_head *head, u16 misc,
 		if (!pos->hit)
 			continue;
 
-		err = event__synthesize_build_id(pos, misc, process, session);
+		err = event__synthesize_build_id(pos, misc, process,
+					kerninfo, session);
 		if (err < 0)
 			return err;
 	}
@@ -1035,44 +1164,48 @@ static int __event_synthesize_build_ids(struct list_head *head, u16 misc,
 int event__synthesize_build_ids(event__handler_t process,
 				struct perf_session *session)
 {
-	int err;
+	int err = 0;
+	u16 kmisc, umisc;
+	struct kernel_info *pos;
+	struct rb_node *nd;
 
-	if (!dsos__read_build_ids(true))
+	if (!dsos__read_build_ids(&session->header, true))
 		return 0;
 
-	err = __event_synthesize_build_ids(&dsos__kernel,
-					   PERF_RECORD_MISC_KERNEL,
-					   process, session);
-	if (err == 0)
-		err = __event_synthesize_build_ids(&dsos__user,
-						   PERF_RECORD_MISC_USER,
-						   process, session);
+	for (nd = rb_first(&session->kerninfo_root); nd; nd = rb_next(nd)) {
+		pos = rb_entry(nd, struct kernel_info, rb_node);
+		if (is_host_kernel(pos)) {
+			kmisc = PERF_RECORD_MISC_KERNEL;
+			umisc = PERF_RECORD_MISC_USER;
+		} else {
+			kmisc = PERF_RECORD_MISC_GUEST_KERNEL;
+			umisc = PERF_RECORD_MISC_GUEST_USER;
+		}
+
+		err = __event_synthesize_build_ids(&pos->dsos__kernel,
+				kmisc, process, pos, session);
+		if (err == 0)
+			err = __event_synthesize_build_ids(&pos->dsos__user,
+					umisc, process, pos, session);
+		if (err)
+			break;
+	}
 
 	if (err < 0) {
 		pr_debug("failed to synthesize build ids\n");
 		return err;
 	}
 
-	dsos__cache_build_ids();
+	dsos__cache_build_ids(&session->header);
 
 	return 0;
 }
 
 int event__process_build_id(event_t *self,
-			    struct perf_session *session __unused)
+			    struct perf_session *session)
 {
-	struct list_head *head = &dsos__user;
-	struct dso *dso;
-
-	if (self->build_id.header.misc & PERF_RECORD_MISC_KERNEL)
-		head = &dsos__kernel;
-
-	dso = __dsos__findnew(head, self->build_id.filename);
-	if (dso != NULL) {
-		dso__set_build_id(dso, &self->build_id.build_id);
-		if (head == &dsos__kernel && self->build_id.filename[0] == '[')
-			dso->kernel = 1;
-	}
-
+	__event_process_build_id(&self->build_id,
+				 self->build_id.filename,
+				 session);
 	return 0;
 }
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 4214e237565..27591545814 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -120,6 +120,7 @@ int event__process_tracing_data(event_t *self,
 
 int event__synthesize_build_id(struct dso *pos, u16 misc,
 			       event__handler_t process,
+			       struct kernel_info *kerninfo,
 			       struct perf_session *session);
 int event__synthesize_build_ids(event__handler_t process,
 				struct perf_session *session);
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 9c2b8743cef..ad6b22dde27 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -8,6 +8,30 @@ struct callchain_param	callchain_param = {
 	.min_percent = 0.5
 };
 
+void __perf_session__add_count(struct hist_entry *he,
+			struct addr_location *al,
+			u64 count)
+{
+	he->count += count;
+
+	switch (al->cpumode) {
+	case PERF_RECORD_MISC_KERNEL:
+		he->count_sys += count;
+		break;
+	case PERF_RECORD_MISC_USER:
+		he->count_us += count;
+		break;
+	case PERF_RECORD_MISC_GUEST_KERNEL:
+		he->count_guest_sys += count;
+		break;
+	case PERF_RECORD_MISC_GUEST_USER:
+		he->count_guest_us += count;
+		break;
+	default:
+		break;
+	}
+}
+
 /*
  * histogram, sorted on item, collects counts
  */
@@ -464,7 +488,7 @@ int hist_entry__snprintf(struct hist_entry *self,
 			   u64 session_total)
 {
 	struct sort_entry *se;
-	u64 count, total;
+	u64 count, total, count_sys, count_us, count_guest_sys, count_guest_us;
 	const char *sep = symbol_conf.field_sep;
 	int ret;
 
@@ -474,9 +498,17 @@ int hist_entry__snprintf(struct hist_entry *self,
 	if (pair_session) {
 		count = self->pair ? self->pair->count : 0;
 		total = pair_session->events_stats.total;
+		count_sys = self->pair ? self->pair->count_sys : 0;
+		count_us = self->pair ? self->pair->count_us : 0;
+		count_guest_sys = self->pair ? self->pair->count_guest_sys : 0;
+		count_guest_us = self->pair ? self->pair->count_guest_us : 0;
 	} else {
 		count = self->count;
 		total = session_total;
+		count_sys = self->count_sys;
+		count_us = self->count_us;
+		count_guest_sys = self->count_guest_sys;
+		count_guest_us = self->count_guest_us;
 	}
 
 	if (total) {
@@ -487,6 +519,26 @@ int hist_entry__snprintf(struct hist_entry *self,
 		else
 			ret = snprintf(s, size, sep ? "%.2f" : "   %6.2f%%",
 				       (count * 100.0) / total);
+		if (symbol_conf.show_cpu_utilization) {
+			ret += percent_color_snprintf(s + ret, size - ret,
+					sep ? "%.2f" : "   %6.2f%%",
+					(count_sys * 100.0) / total);
+			ret += percent_color_snprintf(s + ret, size - ret,
+					sep ? "%.2f" : "   %6.2f%%",
+					(count_us * 100.0) / total);
+			if (perf_guest) {
+				ret += percent_color_snprintf(s + ret,
+						size - ret,
+						sep ? "%.2f" : "   %6.2f%%",
+						(count_guest_sys * 100.0) /
+								total);
+				ret += percent_color_snprintf(s + ret,
+						size - ret,
+						sep ? "%.2f" : "   %6.2f%%",
+						(count_guest_us * 100.0) /
+								total);
+			}
+		}
 	} else
 		ret = snprintf(s, size, sep ? "%lld" : "%12lld ", count);
 
@@ -597,6 +649,24 @@ size_t perf_session__fprintf_hists(struct rb_root *hists,
 			fputs("  Samples  ", fp);
 	}
 
+	if (symbol_conf.show_cpu_utilization) {
+		if (sep) {
+			ret += fprintf(fp, "%csys", *sep);
+			ret += fprintf(fp, "%cus", *sep);
+			if (perf_guest) {
+				ret += fprintf(fp, "%cguest sys", *sep);
+				ret += fprintf(fp, "%cguest us", *sep);
+			}
+		} else {
+			ret += fprintf(fp, "  sys  ");
+			ret += fprintf(fp, "  us  ");
+			if (perf_guest) {
+				ret += fprintf(fp, "  guest sys  ");
+				ret += fprintf(fp, "  guest us  ");
+			}
+		}
+	}
+
 	if (pair) {
 		if (sep)
 			ret += fprintf(fp, "%cDelta", *sep);
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index ad17f0ad798..9df1c340ec9 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -12,6 +12,9 @@ struct addr_location;
 struct symbol;
 struct rb_root;
 
+void __perf_session__add_count(struct hist_entry *he,
+			struct addr_location *al,
+			u64 count);
 struct hist_entry *__perf_session__add_hist_entry(struct rb_root *hists,
 						  struct addr_location *al,
 						  struct symbol *parent,
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 37913b241bd..7facd016ec9 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -4,6 +4,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
+#include <unistd.h>
 #include "map.h"
 
 const char *map_type__name[MAP__NR_TYPES] = {
@@ -37,9 +38,11 @@ void map__init(struct map *self, enum map_type type,
 	self->map_ip   = map__map_ip;
 	self->unmap_ip = map__unmap_ip;
 	RB_CLEAR_NODE(&self->rb_node);
+	self->groups   = NULL;
 }
 
-struct map *map__new(u64 start, u64 len, u64 pgoff, u32 pid, char *filename,
+struct map *map__new(struct list_head *dsos__list, u64 start, u64 len,
+		     u64 pgoff, u32 pid, char *filename,
 		     enum map_type type, char *cwd, int cwdlen)
 {
 	struct map *self = malloc(sizeof(*self));
@@ -66,7 +69,7 @@ struct map *map__new(u64 start, u64 len, u64 pgoff, u32 pid, char *filename,
 			filename = newfilename;
 		}
 
-		dso = dsos__findnew(filename);
+		dso = __dsos__findnew(dsos__list, filename);
 		if (dso == NULL)
 			goto out_delete;
 
@@ -242,6 +245,7 @@ void map_groups__init(struct map_groups *self)
 		self->maps[i] = RB_ROOT;
 		INIT_LIST_HEAD(&self->removed_maps[i]);
 	}
+	 self->this_kerninfo = NULL;
 }
 
 void map_groups__flush(struct map_groups *self)
@@ -508,3 +512,134 @@ struct map *maps__find(struct rb_root *maps, u64 ip)
 
 	return NULL;
 }
+
+struct kernel_info *add_new_kernel_info(struct rb_root *kerninfo_root,
+			pid_t pid, const char *root_dir)
+{
+	struct rb_node **p = &kerninfo_root->rb_node;
+	struct rb_node *parent = NULL;
+	struct kernel_info *kerninfo, *pos;
+
+	kerninfo = malloc(sizeof(struct kernel_info));
+	if (!kerninfo)
+		return NULL;
+
+	kerninfo->pid = pid;
+	map_groups__init(&kerninfo->kmaps);
+	kerninfo->root_dir = strdup(root_dir);
+	RB_CLEAR_NODE(&kerninfo->rb_node);
+	INIT_LIST_HEAD(&kerninfo->dsos__user);
+	INIT_LIST_HEAD(&kerninfo->dsos__kernel);
+	kerninfo->kmaps.this_kerninfo = kerninfo;
+
+	while (*p != NULL) {
+		parent = *p;
+		pos = rb_entry(parent, struct kernel_info, rb_node);
+		if (pid < pos->pid)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&kerninfo->rb_node, parent, p);
+	rb_insert_color(&kerninfo->rb_node, kerninfo_root);
+
+	return kerninfo;
+}
+
+struct kernel_info *kerninfo__find(struct rb_root *kerninfo_root, pid_t pid)
+{
+	struct rb_node **p = &kerninfo_root->rb_node;
+	struct rb_node *parent = NULL;
+	struct kernel_info *kerninfo;
+	struct kernel_info *default_kerninfo = NULL;
+
+	while (*p != NULL) {
+		parent = *p;
+		kerninfo = rb_entry(parent, struct kernel_info, rb_node);
+		if (pid < kerninfo->pid)
+			p = &(*p)->rb_left;
+		else if (pid > kerninfo->pid)
+			p = &(*p)->rb_right;
+		else
+			return kerninfo;
+		if (!kerninfo->pid)
+			default_kerninfo = kerninfo;
+	}
+
+	return default_kerninfo;
+}
+
+struct kernel_info *kerninfo__findhost(struct rb_root *kerninfo_root)
+{
+	struct rb_node **p = &kerninfo_root->rb_node;
+	struct rb_node *parent = NULL;
+	struct kernel_info *kerninfo;
+	pid_t pid = HOST_KERNEL_ID;
+
+	while (*p != NULL) {
+		parent = *p;
+		kerninfo = rb_entry(parent, struct kernel_info, rb_node);
+		if (pid < kerninfo->pid)
+			p = &(*p)->rb_left;
+		else if (pid > kerninfo->pid)
+			p = &(*p)->rb_right;
+		else
+			return kerninfo;
+	}
+
+	return NULL;
+}
+
+struct kernel_info *kerninfo__findnew(struct rb_root *kerninfo_root, pid_t pid)
+{
+	char path[PATH_MAX];
+	const char *root_dir;
+	int ret;
+	struct kernel_info *kerninfo = kerninfo__find(kerninfo_root, pid);
+
+	if (!kerninfo || kerninfo->pid != pid) {
+		if (pid == HOST_KERNEL_ID || pid == DEFAULT_GUEST_KERNEL_ID)
+			root_dir = "";
+		else {
+			if (!symbol_conf.guestmount)
+				goto out;
+			sprintf(path, "%s/%d", symbol_conf.guestmount, pid);
+			ret = access(path, R_OK);
+			if (ret) {
+				pr_err("Can't access file %s\n", path);
+				goto out;
+			}
+			root_dir = path;
+		}
+		kerninfo = add_new_kernel_info(kerninfo_root, pid, root_dir);
+	}
+
+out:
+	return kerninfo;
+}
+
+void kerninfo__process_allkernels(struct rb_root *kerninfo_root,
+		process_kernel_info process,
+		void *data)
+{
+	struct rb_node *nd;
+
+	for (nd = rb_first(kerninfo_root); nd; nd = rb_next(nd)) {
+		struct kernel_info *pos = rb_entry(nd, struct kernel_info,
+							rb_node);
+		process(pos, data);
+	}
+}
+
+char *kern_mmap_name(struct kernel_info *kerninfo, char *buff)
+{
+	if (is_host_kernel(kerninfo))
+		sprintf(buff, "[%s]", "kernel.kallsyms");
+	else if (is_default_guest(kerninfo))
+		sprintf(buff, "[%s]", "guest.kernel.kallsyms");
+	else
+		sprintf(buff, "[%s.%d]", "guest.kernel.kallsyms", kerninfo->pid);
+
+	return buff;
+}
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index 2031278cc06..30d38d634e0 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -19,6 +19,7 @@ extern const char *map_type__name[MAP__NR_TYPES];
 struct dso;
 struct ref_reloc_sym;
 struct map_groups;
+struct kernel_info;
 
 struct map {
 	union {
@@ -36,6 +37,7 @@ struct map {
 	u64			(*unmap_ip)(struct map *, u64);
 
 	struct dso		*dso;
+	struct map_groups	*groups;
 };
 
 struct kmap {
@@ -43,6 +45,26 @@ struct kmap {
 	struct map_groups	*kmaps;
 };
 
+struct map_groups {
+	struct rb_root		maps[MAP__NR_TYPES];
+	struct list_head	removed_maps[MAP__NR_TYPES];
+	struct kernel_info	*this_kerninfo;
+};
+
+/* Native host kernel uses -1 as pid index in kernel_info */
+#define	HOST_KERNEL_ID			(-1)
+#define	DEFAULT_GUEST_KERNEL_ID		(0)
+
+struct kernel_info {
+	struct rb_node rb_node;
+	pid_t pid;
+	char *root_dir;
+	struct list_head dsos__user;
+	struct list_head dsos__kernel;
+	struct map_groups kmaps;
+	struct map *vmlinux_maps[MAP__NR_TYPES];
+};
+
 static inline struct kmap *map__kmap(struct map *self)
 {
 	return (struct kmap *)(self + 1);
@@ -74,7 +96,8 @@ typedef int (*symbol_filter_t)(struct map *map, struct symbol *sym);
 
 void map__init(struct map *self, enum map_type type,
 	       u64 start, u64 end, u64 pgoff, struct dso *dso);
-struct map *map__new(u64 start, u64 len, u64 pgoff, u32 pid, char *filename,
+struct map *map__new(struct list_head *dsos__list, u64 start, u64 len,
+		     u64 pgoff, u32 pid, char *filename,
 		     enum map_type type, char *cwd, int cwdlen);
 void map__delete(struct map *self);
 struct map *map__clone(struct map *self);
@@ -91,11 +114,6 @@ void map__fixup_end(struct map *self);
 
 void map__reloc_vmlinux(struct map *self);
 
-struct map_groups {
-	struct rb_root		maps[MAP__NR_TYPES];
-	struct list_head	removed_maps[MAP__NR_TYPES];
-};
-
 size_t __map_groups__fprintf_maps(struct map_groups *self,
 				  enum map_type type, int verbose, FILE *fp);
 void maps__insert(struct rb_root *maps, struct map *map);
@@ -106,9 +124,40 @@ int map_groups__clone(struct map_groups *self,
 size_t map_groups__fprintf(struct map_groups *self, int verbose, FILE *fp);
 size_t map_groups__fprintf_maps(struct map_groups *self, int verbose, FILE *fp);
 
+struct kernel_info *add_new_kernel_info(struct rb_root *kerninfo_root,
+			pid_t pid, const char *root_dir);
+struct kernel_info *kerninfo__find(struct rb_root *kerninfo_root, pid_t pid);
+struct kernel_info *kerninfo__findnew(struct rb_root *kerninfo_root, pid_t pid);
+struct kernel_info *kerninfo__findhost(struct rb_root *kerninfo_root);
+char *kern_mmap_name(struct kernel_info *kerninfo, char *buff);
+
+/*
+ * Default guest kernel is defined by parameter --guestkallsyms
+ * and --guestmodules
+ */
+static inline int is_default_guest(struct kernel_info *kerninfo)
+{
+	if (!kerninfo)
+		return 0;
+	return kerninfo->pid == DEFAULT_GUEST_KERNEL_ID;
+}
+
+static inline int is_host_kernel(struct kernel_info *kerninfo)
+{
+	if (!kerninfo)
+		return 0;
+	return kerninfo->pid == HOST_KERNEL_ID;
+}
+
+typedef void (*process_kernel_info)(struct kernel_info *kerninfo, void *data);
+void kerninfo__process_allkernels(struct rb_root *kerninfo_root,
+				  process_kernel_info process,
+				  void *data);
+
 static inline void map_groups__insert(struct map_groups *self, struct map *map)
 {
-	 maps__insert(&self->maps[map->type], map);
+	maps__insert(&self->maps[map->type], map);
+	map->groups = self;
 }
 
 static inline struct map *map_groups__find(struct map_groups *self,
@@ -148,13 +197,11 @@ int map_groups__fixup_overlappings(struct map_groups *self, struct map *map,
 
 struct map *map_groups__find_by_name(struct map_groups *self,
 				     enum map_type type, const char *name);
-int __map_groups__create_kernel_maps(struct map_groups *self,
-				     struct map *vmlinux_maps[MAP__NR_TYPES],
-				     struct dso *kernel);
-int map_groups__create_kernel_maps(struct map_groups *self,
-				   struct map *vmlinux_maps[MAP__NR_TYPES]);
-struct map *map_groups__new_module(struct map_groups *self, u64 start,
-				   const char *filename);
+struct map *map_groups__new_module(struct map_groups *self,
+				    u64 start,
+				    const char *filename,
+				    struct kernel_info *kerninfo);
+
 void map_groups__flush(struct map_groups *self);
 
 #endif /* __PERF_MAP_H */
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 5bf8ab03446..3967f8f63d0 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -78,6 +78,7 @@ static struct map *kmaps[MAP__NR_TYPES];
 /* Initialize symbol maps and path of vmlinux */
 static int init_vmlinux(void)
 {
+	struct dso *kernel;
 	int ret;
 
 	symbol_conf.sort_by_name = true;
@@ -91,8 +92,12 @@ static int init_vmlinux(void)
 		goto out;
 	}
 
+	kernel = dso__new_kernel(symbol_conf.vmlinux_name);
+	if (kernel == NULL)
+		die("Failed to create kernel dso.");
+
 	map_groups__init(&kmap_groups);
-	ret = map_groups__create_kernel_maps(&kmap_groups, kmaps);
+	ret = __map_groups__create_kernel_maps(&kmap_groups, kmaps, kernel);
 	if (ret < 0)
 		pr_debug("Failed to create kernel maps.\n");
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 0fdf3ebef1e..7d88ae5c270 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -67,6 +67,17 @@ void perf_session__update_sample_type(struct perf_session *self)
 	self->sample_type = perf_header__sample_type(&self->header);
 }
 
+int perf_session__create_kernel_maps(struct perf_session *self)
+{
+	int ret;
+	struct rb_root *root = &self->kerninfo_root;
+
+	ret = map_groups__create_kernel_maps(root, HOST_KERNEL_ID);
+	if (ret >= 0)
+		ret = map_groups__create_guest_kernel_maps(root);
+	return ret;
+}
+
 struct perf_session *perf_session__new(const char *filename, int mode, bool force)
 {
 	size_t len = filename ? strlen(filename) + 1 : 0;
@@ -86,7 +97,7 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
 	self->cwd = NULL;
 	self->cwdlen = 0;
 	self->unknown_events = 0;
-	map_groups__init(&self->kmaps);
+	self->kerninfo_root = RB_ROOT;
 
 	if (mode == O_RDONLY) {
 		if (perf_session__open(self, force) < 0)
@@ -157,8 +168,9 @@ struct map_symbol *perf_session__resolve_callchain(struct perf_session *self,
 			continue;
 		}
 
+		al.filtered = false;
 		thread__find_addr_location(thread, self, cpumode,
-					   MAP__FUNCTION, ip, &al, NULL);
+				MAP__FUNCTION, thread->pid, ip, &al, NULL);
 		if (al.sym != NULL) {
 			if (sort__has_parent && !*parent &&
 			    symbol__match_parent_regex(al.sym))
@@ -399,46 +411,6 @@ void perf_event_header__bswap(struct perf_event_header *self)
 	self->size = bswap_16(self->size);
 }
 
-int perf_header__read_build_ids(struct perf_header *self,
-				int input, u64 offset, u64 size)
-{
-	struct build_id_event bev;
-	char filename[PATH_MAX];
-	u64 limit = offset + size;
-	int err = -1;
-
-	while (offset < limit) {
-		struct dso *dso;
-		ssize_t len;
-		struct list_head *head = &dsos__user;
-
-		if (read(input, &bev, sizeof(bev)) != sizeof(bev))
-			goto out;
-
-		if (self->needs_swap)
-			perf_event_header__bswap(&bev.header);
-
-		len = bev.header.size - sizeof(bev);
-		if (read(input, filename, len) != len)
-			goto out;
-
-		if (bev.header.misc & PERF_RECORD_MISC_KERNEL)
-			head = &dsos__kernel;
-
-		dso = __dsos__findnew(head, filename);
-		if (dso != NULL) {
-			dso__set_build_id(dso, &bev.build_id);
-			if (head == &dsos__kernel && filename[0] == '[')
-				dso->kernel = 1;
-		}
-
-		offset += bev.header.size;
-	}
-	err = 0;
-out:
-	return err;
-}
-
 static struct thread *perf_session__register_idle_thread(struct perf_session *self)
 {
 	struct thread *thread = perf_session__findnew(self, 0);
@@ -690,26 +662,33 @@ bool perf_session__has_traces(struct perf_session *self, const char *msg)
 	return true;
 }
 
-int perf_session__set_kallsyms_ref_reloc_sym(struct perf_session *self,
+int perf_session__set_kallsyms_ref_reloc_sym(struct map **maps,
 					     const char *symbol_name,
 					     u64 addr)
 {
 	char *bracket;
 	enum map_type i;
+	struct ref_reloc_sym *ref;
+
+	ref = zalloc(sizeof(struct ref_reloc_sym));
+	if (ref == NULL)
+		return -ENOMEM;
 
-	self->ref_reloc_sym.name = strdup(symbol_name);
-	if (self->ref_reloc_sym.name == NULL)
+	ref->name = strdup(symbol_name);
+	if (ref->name == NULL) {
+		free(ref);
 		return -ENOMEM;
+	}
 
-	bracket = strchr(self->ref_reloc_sym.name, ']');
+	bracket = strchr(ref->name, ']');
 	if (bracket)
 		*bracket = '\0';
 
-	self->ref_reloc_sym.addr = addr;
+	ref->addr = addr;
 
 	for (i = 0; i < MAP__NR_TYPES; ++i) {
-		struct kmap *kmap = map__kmap(self->vmlinux_maps[i]);
-		kmap->ref_reloc_sym = &self->ref_reloc_sym;
+		struct kmap *kmap = map__kmap(maps[i]);
+		kmap->ref_reloc_sym = ref;
 	}
 
 	return 0;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 0ac14d42dc2..5e47c87b926 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -15,17 +15,15 @@ struct perf_session {
 	struct perf_header	header;
 	unsigned long		size;
 	unsigned long		mmap_window;
-	struct map_groups	kmaps;
 	struct rb_root		threads;
 	struct thread		*last_match;
-	struct map		*vmlinux_maps[MAP__NR_TYPES];
+	struct rb_root		kerninfo_root;
 	struct events_stats	events_stats;
 	struct rb_root		stats_by_id;
 	unsigned long		event_total[PERF_RECORD_MAX];
 	unsigned long		unknown_events;
 	struct rb_root		hists;
 	u64			sample_type;
-	struct ref_reloc_sym	ref_reloc_sym;
 	int			fd;
 	bool			fd_pipe;
 	int			cwdlen;
@@ -69,33 +67,13 @@ struct map_symbol *perf_session__resolve_callchain(struct perf_session *self,
 
 bool perf_session__has_traces(struct perf_session *self, const char *msg);
 
-int perf_header__read_build_ids(struct perf_header *self, int input,
-				u64 offset, u64 file_size);
-
-int perf_session__set_kallsyms_ref_reloc_sym(struct perf_session *self,
+int perf_session__set_kallsyms_ref_reloc_sym(struct map **maps,
 					     const char *symbol_name,
 					     u64 addr);
 
 void mem_bswap_64(void *src, int byte_size);
 
-static inline int __perf_session__create_kernel_maps(struct perf_session *self,
-						struct dso *kernel)
-{
-	return __map_groups__create_kernel_maps(&self->kmaps,
-						self->vmlinux_maps, kernel);
-}
-
-static inline int perf_session__create_kernel_maps(struct perf_session *self)
-{
-	return map_groups__create_kernel_maps(&self->kmaps, self->vmlinux_maps);
-}
-
-static inline struct map *
-	perf_session__new_module_map(struct perf_session *self,
-				     u64 start, const char *filename)
-{
-	return map_groups__new_module(&self->kmaps, start, filename);
-}
+int perf_session__create_kernel_maps(struct perf_session *self);
 
 int do_read(int fd, void *buf, size_t size);
 void perf_session__update_sample_type(struct perf_session *self);
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 1d857aa2c01..b7c54eeed9c 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -44,6 +44,11 @@ extern enum sort_type sort__first_dimension;
 struct hist_entry {
 	struct rb_node		rb_node;
 	u64			count;
+	u64			count_sys;
+	u64			count_us;
+	u64			count_guest_sys;
+	u64			count_guest_us;
+
 	/*
 	 * XXX WARNING!
 	 * thread _has_ to come after ms, see
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index f3d4151e46a..e782e7db16c 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -28,6 +28,8 @@ static void dsos__add(struct list_head *head, struct dso *dso);
 static struct map *map__new2(u64 start, struct dso *dso, enum map_type type);
 static int dso__load_kernel_sym(struct dso *self, struct map *map,
 				symbol_filter_t filter);
+static int dso__load_guest_kernel_sym(struct dso *self, struct map *map,
+			symbol_filter_t filter);
 static int vmlinux_path__nr_entries;
 static char **vmlinux_path;
 
@@ -186,6 +188,7 @@ struct dso *dso__new(const char *name)
 		self->loaded = 0;
 		self->sorted_by_name = 0;
 		self->has_build_id = 0;
+		self->kernel = DSO_TYPE_USER;
 	}
 
 	return self;
@@ -402,12 +405,9 @@ int kallsyms__parse(const char *filename, void *arg,
 		char *symbol_name;
 
 		line_len = getline(&line, &n, file);
-		if (line_len < 0)
+		if (line_len < 0 || !line)
 			break;
 
-		if (!line)
-			goto out_failure;
-
 		line[--line_len] = '\0'; /* \n */
 
 		len = hex2u64(line, &start);
@@ -459,6 +459,7 @@ static int map__process_kallsym_symbol(void *arg, const char *name,
 	 * map__split_kallsyms, when we have split the maps per module
 	 */
 	symbols__insert(root, sym);
+
 	return 0;
 }
 
@@ -483,6 +484,7 @@ static int dso__split_kallsyms(struct dso *self, struct map *map,
 			       symbol_filter_t filter)
 {
 	struct map_groups *kmaps = map__kmap(map)->kmaps;
+	struct kernel_info *kerninfo = kmaps->this_kerninfo;
 	struct map *curr_map = map;
 	struct symbol *pos;
 	int count = 0;
@@ -504,15 +506,33 @@ static int dso__split_kallsyms(struct dso *self, struct map *map,
 			*module++ = '\0';
 
 			if (strcmp(curr_map->dso->short_name, module)) {
-				curr_map = map_groups__find_by_name(kmaps, map->type, module);
+				if (curr_map != map &&
+					self->kernel == DSO_TYPE_GUEST_KERNEL &&
+					is_default_guest(kerninfo)) {
+					/*
+					 * We assume all symbols of a module are
+					 * continuous in * kallsyms, so curr_map
+					 * points to a module and all its
+					 * symbols are in its kmap. Mark it as
+					 * loaded.
+					 */
+					dso__set_loaded(curr_map->dso,
+							curr_map->type);
+				}
+
+				curr_map = map_groups__find_by_name(kmaps,
+							map->type, module);
 				if (curr_map == NULL) {
-					pr_debug("/proc/{kallsyms,modules} "
+					pr_err("%s/proc/{kallsyms,modules} "
 					         "inconsistency while looking "
-						 "for \"%s\" module!\n", module);
-					return -1;
+						 "for \"%s\" module!\n",
+						 kerninfo->root_dir, module);
+					curr_map = map;
+					goto discard_symbol;
 				}
 
-				if (curr_map->dso->loaded)
+				if (curr_map->dso->loaded &&
+					!is_default_guest(kmaps->this_kerninfo))
 					goto discard_symbol;
 			}
 			/*
@@ -525,13 +545,21 @@ static int dso__split_kallsyms(struct dso *self, struct map *map,
 			char dso_name[PATH_MAX];
 			struct dso *dso;
 
-			snprintf(dso_name, sizeof(dso_name), "[kernel].%d",
-				 kernel_range++);
+			if (self->kernel == DSO_TYPE_GUEST_KERNEL)
+				snprintf(dso_name, sizeof(dso_name),
+					"[guest.kernel].%d",
+					kernel_range++);
+			else
+				snprintf(dso_name, sizeof(dso_name),
+					"[kernel].%d",
+					kernel_range++);
 
 			dso = dso__new(dso_name);
 			if (dso == NULL)
 				return -1;
 
+			dso->kernel = self->kernel;
+
 			curr_map = map__new2(pos->start, dso, map->type);
 			if (curr_map == NULL) {
 				dso__delete(dso);
@@ -555,6 +583,12 @@ discard_symbol:		rb_erase(&pos->rb_node, root);
 		}
 	}
 
+	if (curr_map != map &&
+	    self->kernel == DSO_TYPE_GUEST_KERNEL &&
+	    is_default_guest(kmaps->this_kerninfo)) {
+		dso__set_loaded(curr_map->dso, curr_map->type);
+	}
+
 	return count;
 }
 
@@ -565,7 +599,10 @@ int dso__load_kallsyms(struct dso *self, const char *filename,
 		return -1;
 
 	symbols__fixup_end(&self->symbols[map->type]);
-	self->origin = DSO__ORIG_KERNEL;
+	if (self->kernel == DSO_TYPE_GUEST_KERNEL)
+		self->origin = DSO__ORIG_GUEST_KERNEL;
+	else
+		self->origin = DSO__ORIG_KERNEL;
 
 	return dso__split_kallsyms(self, map, filter);
 }
@@ -952,7 +989,7 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name,
 	nr_syms = shdr.sh_size / shdr.sh_entsize;
 
 	memset(&sym, 0, sizeof(sym));
-	if (!self->kernel) {
+	if (self->kernel == DSO_TYPE_USER) {
 		self->adjust_symbols = (ehdr.e_type == ET_EXEC ||
 				elf_section_by_name(elf, &ehdr, &shdr,
 						     ".gnu.prelink_undo",
@@ -984,7 +1021,7 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name,
 
 		section_name = elf_sec__name(&shdr, secstrs);
 
-		if (self->kernel || kmodule) {
+		if (self->kernel != DSO_TYPE_USER || kmodule) {
 			char dso_name[PATH_MAX];
 
 			if (strcmp(section_name,
@@ -1011,6 +1048,7 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name,
 				curr_dso = dso__new(dso_name);
 				if (curr_dso == NULL)
 					goto out_elf_end;
+				curr_dso->kernel = self->kernel;
 				curr_map = map__new2(start, curr_dso,
 						     map->type);
 				if (curr_map == NULL) {
@@ -1021,7 +1059,7 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name,
 				curr_map->unmap_ip = identity__map_ip;
 				curr_dso->origin = self->origin;
 				map_groups__insert(kmap->kmaps, curr_map);
-				dsos__add(&dsos__kernel, curr_dso);
+				dsos__add(&self->node, curr_dso);
 				dso__set_loaded(curr_dso, map->type);
 			} else
 				curr_dso = curr_map->dso;
@@ -1083,7 +1121,7 @@ static bool dso__build_id_equal(const struct dso *self, u8 *build_id)
 	return memcmp(self->build_id, build_id, sizeof(self->build_id)) == 0;
 }
 
-static bool __dsos__read_build_ids(struct list_head *head, bool with_hits)
+bool __dsos__read_build_ids(struct list_head *head, bool with_hits)
 {
 	bool have_build_id = false;
 	struct dso *pos;
@@ -1101,13 +1139,6 @@ static bool __dsos__read_build_ids(struct list_head *head, bool with_hits)
 	return have_build_id;
 }
 
-bool dsos__read_build_ids(bool with_hits)
-{
-	bool kbuildids = __dsos__read_build_ids(&dsos__kernel, with_hits),
-	     ubuildids = __dsos__read_build_ids(&dsos__user, with_hits);
-	return kbuildids || ubuildids;
-}
-
 /*
  * Align offset to 4 bytes as needed for note name and descriptor data.
  */
@@ -1242,6 +1273,8 @@ char dso__symtab_origin(const struct dso *self)
 		[DSO__ORIG_BUILDID] =  'b',
 		[DSO__ORIG_DSO] =      'd',
 		[DSO__ORIG_KMODULE] =  'K',
+		[DSO__ORIG_GUEST_KERNEL] =  'g',
+		[DSO__ORIG_GUEST_KMODULE] =  'G',
 	};
 
 	if (self == NULL || self->origin == DSO__ORIG_NOT_FOUND)
@@ -1257,11 +1290,20 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter)
 	char build_id_hex[BUILD_ID_SIZE * 2 + 1];
 	int ret = -1;
 	int fd;
+	struct kernel_info *kerninfo;
+	const char *root_dir;
 
 	dso__set_loaded(self, map->type);
 
-	if (self->kernel)
+	if (self->kernel == DSO_TYPE_KERNEL)
 		return dso__load_kernel_sym(self, map, filter);
+	else if (self->kernel == DSO_TYPE_GUEST_KERNEL)
+		return dso__load_guest_kernel_sym(self, map, filter);
+
+	if (map->groups && map->groups->this_kerninfo)
+		kerninfo = map->groups->this_kerninfo;
+	else
+		kerninfo = NULL;
 
 	name = malloc(size);
 	if (!name)
@@ -1315,6 +1357,13 @@ more:
 		case DSO__ORIG_DSO:
 			snprintf(name, size, "%s", self->long_name);
 			break;
+		case DSO__ORIG_GUEST_KMODULE:
+			if (map->groups && map->groups->this_kerninfo)
+				root_dir = map->groups->this_kerninfo->root_dir;
+			else
+				root_dir = "";
+			snprintf(name, size, "%s%s", root_dir, self->long_name);
+			break;
 
 		default:
 			goto out;
@@ -1368,7 +1417,8 @@ struct map *map_groups__find_by_name(struct map_groups *self,
 	return NULL;
 }
 
-static int dso__kernel_module_get_build_id(struct dso *self)
+static int dso__kernel_module_get_build_id(struct dso *self,
+				const char *root_dir)
 {
 	char filename[PATH_MAX];
 	/*
@@ -1378,8 +1428,8 @@ static int dso__kernel_module_get_build_id(struct dso *self)
 	const char *name = self->short_name + 1;
 
 	snprintf(filename, sizeof(filename),
-		 "/sys/module/%.*s/notes/.note.gnu.build-id",
-		 (int)strlen(name - 1), name);
+		 "%s/sys/module/%.*s/notes/.note.gnu.build-id",
+		 root_dir, (int)strlen(name) - 1, name);
 
 	if (sysfs__read_build_id(filename, self->build_id,
 				 sizeof(self->build_id)) == 0)
@@ -1388,7 +1438,8 @@ static int dso__kernel_module_get_build_id(struct dso *self)
 	return 0;
 }
 
-static int map_groups__set_modules_path_dir(struct map_groups *self, char *dir_name)
+static int map_groups__set_modules_path_dir(struct map_groups *self,
+				const char *dir_name)
 {
 	struct dirent *dent;
 	DIR *dir = opendir(dir_name);
@@ -1400,8 +1451,14 @@ static int map_groups__set_modules_path_dir(struct map_groups *self, char *dir_n
 
 	while ((dent = readdir(dir)) != NULL) {
 		char path[PATH_MAX];
+		struct stat st;
+
+		/*sshfs might return bad dent->d_type, so we have to stat*/
+		sprintf(path, "%s/%s", dir_name, dent->d_name);
+		if (stat(path, &st))
+			continue;
 
-		if (dent->d_type == DT_DIR) {
+		if (S_ISDIR(st.st_mode)) {
 			if (!strcmp(dent->d_name, ".") ||
 			    !strcmp(dent->d_name, ".."))
 				continue;
@@ -1433,7 +1490,7 @@ static int map_groups__set_modules_path_dir(struct map_groups *self, char *dir_n
 			if (long_name == NULL)
 				goto failure;
 			dso__set_long_name(map->dso, long_name);
-			dso__kernel_module_get_build_id(map->dso);
+			dso__kernel_module_get_build_id(map->dso, "");
 		}
 	}
 
@@ -1443,16 +1500,46 @@ failure:
 	return -1;
 }
 
-static int map_groups__set_modules_path(struct map_groups *self)
+static char *get_kernel_version(const char *root_dir)
 {
-	struct utsname uts;
+	char version[PATH_MAX];
+	FILE *file;
+	char *name, *tmp;
+	const char *prefix = "Linux version ";
+
+	sprintf(version, "%s/proc/version", root_dir);
+	file = fopen(version, "r");
+	if (!file)
+		return NULL;
+
+	version[0] = '\0';
+	tmp = fgets(version, sizeof(version), file);
+	fclose(file);
+
+	name = strstr(version, prefix);
+	if (!name)
+		return NULL;
+	name += strlen(prefix);
+	tmp = strchr(name, ' ');
+	if (tmp)
+		*tmp = '\0';
+
+	return strdup(name);
+}
+
+static int map_groups__set_modules_path(struct map_groups *self,
+				const char *root_dir)
+{
+	char *version;
 	char modules_path[PATH_MAX];
 
-	if (uname(&uts) < 0)
+	version = get_kernel_version(root_dir);
+	if (!version)
 		return -1;
 
-	snprintf(modules_path, sizeof(modules_path), "/lib/modules/%s/kernel",
-		 uts.release);
+	snprintf(modules_path, sizeof(modules_path), "%s/lib/modules/%s/kernel",
+		 root_dir, version);
+	free(version);
 
 	return map_groups__set_modules_path_dir(self, modules_path);
 }
@@ -1477,11 +1564,13 @@ static struct map *map__new2(u64 start, struct dso *dso, enum map_type type)
 }
 
 struct map *map_groups__new_module(struct map_groups *self, u64 start,
-				   const char *filename)
+				const char *filename,
+				struct kernel_info *kerninfo)
 {
 	struct map *map;
-	struct dso *dso = __dsos__findnew(&dsos__kernel, filename);
+	struct dso *dso;
 
+	dso = __dsos__findnew(&kerninfo->dsos__kernel, filename);
 	if (dso == NULL)
 		return NULL;
 
@@ -1489,21 +1578,37 @@ struct map *map_groups__new_module(struct map_groups *self, u64 start,
 	if (map == NULL)
 		return NULL;
 
-	dso->origin = DSO__ORIG_KMODULE;
+	if (is_host_kernel(kerninfo))
+		dso->origin = DSO__ORIG_KMODULE;
+	else
+		dso->origin = DSO__ORIG_GUEST_KMODULE;
 	map_groups__insert(self, map);
 	return map;
 }
 
-static int map_groups__create_modules(struct map_groups *self)
+static int map_groups__create_modules(struct kernel_info *kerninfo)
 {
 	char *line = NULL;
 	size_t n;
-	FILE *file = fopen("/proc/modules", "r");
+	FILE *file;
 	struct map *map;
+	const char *root_dir;
+	const char *modules;
+	char path[PATH_MAX];
+
+	if (is_default_guest(kerninfo))
+		modules = symbol_conf.default_guest_modules;
+	else {
+		sprintf(path, "%s/proc/modules", kerninfo->root_dir);
+		modules = path;
+	}
 
+	file = fopen(modules, "r");
 	if (file == NULL)
 		return -1;
 
+	root_dir = kerninfo->root_dir;
+
 	while (!feof(file)) {
 		char name[PATH_MAX];
 		u64 start;
@@ -1532,16 +1637,17 @@ static int map_groups__create_modules(struct map_groups *self)
 		*sep = '\0';
 
 		snprintf(name, sizeof(name), "[%s]", line);
-		map = map_groups__new_module(self, start, name);
+		map = map_groups__new_module(&kerninfo->kmaps,
+				start, name, kerninfo);
 		if (map == NULL)
 			goto out_delete_line;
-		dso__kernel_module_get_build_id(map->dso);
+		dso__kernel_module_get_build_id(map->dso, root_dir);
 	}
 
 	free(line);
 	fclose(file);
 
-	return map_groups__set_modules_path(self);
+	return map_groups__set_modules_path(&kerninfo->kmaps, root_dir);
 
 out_delete_line:
 	free(line);
@@ -1708,8 +1814,57 @@ out_fixup:
 	return err;
 }
 
-LIST_HEAD(dsos__user);
-LIST_HEAD(dsos__kernel);
+static int dso__load_guest_kernel_sym(struct dso *self, struct map *map,
+				symbol_filter_t filter)
+{
+	int err;
+	const char *kallsyms_filename = NULL;
+	struct kernel_info *kerninfo;
+	char path[PATH_MAX];
+
+	if (!map->groups) {
+		pr_debug("Guest kernel map hasn't the point to groups\n");
+		return -1;
+	}
+	kerninfo = map->groups->this_kerninfo;
+
+	if (is_default_guest(kerninfo)) {
+		/*
+		 * if the user specified a vmlinux filename, use it and only
+		 * it, reporting errors to the user if it cannot be used.
+		 * Or use file guest_kallsyms inputted by user on commandline
+		 */
+		if (symbol_conf.default_guest_vmlinux_name != NULL) {
+			err = dso__load_vmlinux(self, map,
+				symbol_conf.default_guest_vmlinux_name, filter);
+			goto out_try_fixup;
+		}
+
+		kallsyms_filename = symbol_conf.default_guest_kallsyms;
+		if (!kallsyms_filename)
+			return -1;
+	} else {
+		sprintf(path, "%s/proc/kallsyms", kerninfo->root_dir);
+		kallsyms_filename = path;
+	}
+
+	err = dso__load_kallsyms(self, kallsyms_filename, map, filter);
+	if (err > 0)
+		pr_debug("Using %s for symbols\n", kallsyms_filename);
+
+out_try_fixup:
+	if (err > 0) {
+		if (kallsyms_filename != NULL) {
+			kern_mmap_name(kerninfo, path);
+			dso__set_long_name(self,
+				strdup(path));
+		}
+		map__fixup_start(map);
+		map__fixup_end(map);
+	}
+
+	return err;
+}
 
 static void dsos__add(struct list_head *head, struct dso *dso)
 {
@@ -1752,10 +1907,16 @@ static void __dsos__fprintf(struct list_head *head, FILE *fp)
 	}
 }
 
-void dsos__fprintf(FILE *fp)
+void dsos__fprintf(struct rb_root *kerninfo_root, FILE *fp)
 {
-	__dsos__fprintf(&dsos__kernel, fp);
-	__dsos__fprintf(&dsos__user, fp);
+	struct rb_node *nd;
+
+	for (nd = rb_first(kerninfo_root); nd; nd = rb_next(nd)) {
+		struct kernel_info *pos = rb_entry(nd, struct kernel_info,
+				rb_node);
+		__dsos__fprintf(&pos->dsos__kernel, fp);
+		__dsos__fprintf(&pos->dsos__user, fp);
+	}
 }
 
 static size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp,
@@ -1773,10 +1934,21 @@ static size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp,
 	return ret;
 }
 
-size_t dsos__fprintf_buildid(FILE *fp, bool with_hits)
+size_t dsos__fprintf_buildid(struct rb_root *kerninfo_root,
+		FILE *fp, bool with_hits)
 {
-	return (__dsos__fprintf_buildid(&dsos__kernel, fp, with_hits) +
-		__dsos__fprintf_buildid(&dsos__user, fp, with_hits));
+	struct rb_node *nd;
+	size_t ret = 0;
+
+	for (nd = rb_first(kerninfo_root); nd; nd = rb_next(nd)) {
+		struct kernel_info *pos = rb_entry(nd, struct kernel_info,
+				rb_node);
+		ret += __dsos__fprintf_buildid(&pos->dsos__kernel,
+					fp, with_hits);
+		ret += __dsos__fprintf_buildid(&pos->dsos__user,
+					fp, with_hits);
+	}
+	return ret;
 }
 
 struct dso *dso__new_kernel(const char *name)
@@ -1785,28 +1957,59 @@ struct dso *dso__new_kernel(const char *name)
 
 	if (self != NULL) {
 		dso__set_short_name(self, "[kernel]");
-		self->kernel	 = 1;
+		self->kernel = DSO_TYPE_KERNEL;
+	}
+
+	return self;
+}
+
+static struct dso *dso__new_guest_kernel(struct kernel_info *kerninfo,
+					const char *name)
+{
+	char buff[PATH_MAX];
+	struct dso *self;
+
+	kern_mmap_name(kerninfo, buff);
+	self = dso__new(name ?: buff);
+	if (self != NULL) {
+		dso__set_short_name(self, "[guest.kernel]");
+		self->kernel = DSO_TYPE_GUEST_KERNEL;
 	}
 
 	return self;
 }
 
-void dso__read_running_kernel_build_id(struct dso *self)
+void dso__read_running_kernel_build_id(struct dso *self,
+			struct kernel_info *kerninfo)
 {
-	if (sysfs__read_build_id("/sys/kernel/notes", self->build_id,
+	char path[PATH_MAX];
+
+	if (is_default_guest(kerninfo))
+		return;
+	sprintf(path, "%s/sys/kernel/notes", kerninfo->root_dir);
+	if (sysfs__read_build_id(path, self->build_id,
 				 sizeof(self->build_id)) == 0)
 		self->has_build_id = true;
 }
 
-static struct dso *dsos__create_kernel(const char *vmlinux)
+static struct dso *dsos__create_kernel(struct kernel_info *kerninfo)
 {
-	struct dso *kernel = dso__new_kernel(vmlinux);
+	const char *vmlinux_name = NULL;
+	struct dso *kernel;
 
-	if (kernel != NULL) {
-		dso__read_running_kernel_build_id(kernel);
-		dsos__add(&dsos__kernel, kernel);
+	if (is_host_kernel(kerninfo)) {
+		vmlinux_name = symbol_conf.vmlinux_name;
+		kernel = dso__new_kernel(vmlinux_name);
+	} else {
+		if (is_default_guest(kerninfo))
+			vmlinux_name = symbol_conf.default_guest_vmlinux_name;
+		kernel = dso__new_guest_kernel(kerninfo, vmlinux_name);
 	}
 
+	if (kernel != NULL) {
+		dso__read_running_kernel_build_id(kernel, kerninfo);
+		dsos__add(&kerninfo->dsos__kernel, kernel);
+	}
 	return kernel;
 }
 
@@ -1950,23 +2153,29 @@ out_free_comm_list:
 	return -1;
 }
 
-int map_groups__create_kernel_maps(struct map_groups *self,
-				   struct map *vmlinux_maps[MAP__NR_TYPES])
+int map_groups__create_kernel_maps(struct rb_root *kerninfo_root, pid_t pid)
 {
-	struct dso *kernel = dsos__create_kernel(symbol_conf.vmlinux_name);
+	struct kernel_info *kerninfo;
+	struct dso *kernel;
 
+	kerninfo = kerninfo__findnew(kerninfo_root, pid);
+	if (kerninfo == NULL)
+		return -1;
+	kernel = dsos__create_kernel(kerninfo);
 	if (kernel == NULL)
 		return -1;
 
-	if (__map_groups__create_kernel_maps(self, vmlinux_maps, kernel) < 0)
+	if (__map_groups__create_kernel_maps(&kerninfo->kmaps,
+			kerninfo->vmlinux_maps, kernel) < 0)
 		return -1;
 
-	if (symbol_conf.use_modules && map_groups__create_modules(self) < 0)
+	if (symbol_conf.use_modules &&
+		map_groups__create_modules(kerninfo) < 0)
 		pr_debug("Problems creating module maps, continuing anyway...\n");
 	/*
 	 * Now that we have all the maps created, just set the ->end of them:
 	 */
-	map_groups__fixup_end(self);
+	map_groups__fixup_end(&kerninfo->kmaps);
 	return 0;
 }
 
@@ -2012,3 +2221,46 @@ char *strxfrchar(char *s, char from, char to)
 
 	return s;
 }
+
+int map_groups__create_guest_kernel_maps(struct rb_root *kerninfo_root)
+{
+	int ret = 0;
+	struct dirent **namelist = NULL;
+	int i, items = 0;
+	char path[PATH_MAX];
+	pid_t pid;
+
+	if (symbol_conf.default_guest_vmlinux_name ||
+	    symbol_conf.default_guest_modules ||
+	    symbol_conf.default_guest_kallsyms) {
+		map_groups__create_kernel_maps(kerninfo_root,
+					DEFAULT_GUEST_KERNEL_ID);
+	}
+
+	if (symbol_conf.guestmount) {
+		items = scandir(symbol_conf.guestmount, &namelist, NULL, NULL);
+		if (items <= 0)
+			return -ENOENT;
+		for (i = 0; i < items; i++) {
+			if (!isdigit(namelist[i]->d_name[0])) {
+				/* Filter out . and .. */
+				continue;
+			}
+			pid = atoi(namelist[i]->d_name);
+			sprintf(path, "%s/%s/proc/kallsyms",
+				symbol_conf.guestmount,
+				namelist[i]->d_name);
+			ret = access(path, R_OK);
+			if (ret) {
+				pr_debug("Can't access file %s\n", path);
+				goto failure;
+			}
+			map_groups__create_kernel_maps(kerninfo_root,
+							pid);
+		}
+failure:
+		free(namelist);
+	}
+
+	return ret;
+}
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 757fae3f5ee..478f5ab3778 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -69,10 +69,15 @@ struct symbol_conf {
 			show_nr_samples,
 			use_callchain,
 			exclude_other,
-			full_paths;
+			full_paths,
+			show_cpu_utilization;
 	const char	*vmlinux_name,
 			*field_sep;
-	char            *dso_list_str,
+	const char	*default_guest_vmlinux_name,
+			*default_guest_kallsyms,
+			*default_guest_modules;
+	const char	*guestmount;
+	char		*dso_list_str,
 			*comm_list_str,
 			*sym_list_str,
 			*col_width_list_str;
@@ -106,6 +111,13 @@ struct addr_location {
 	u64	      addr;
 	char	      level;
 	bool	      filtered;
+	unsigned int  cpumode;
+};
+
+enum dso_kernel_type {
+	DSO_TYPE_USER = 0,
+	DSO_TYPE_KERNEL,
+	DSO_TYPE_GUEST_KERNEL
 };
 
 struct dso {
@@ -115,7 +127,7 @@ struct dso {
 	u8		 adjust_symbols:1;
 	u8		 slen_calculated:1;
 	u8		 has_build_id:1;
-	u8		 kernel:1;
+	enum dso_kernel_type	kernel;
 	u8		 hit:1;
 	u8		 annotate_warned:1;
 	unsigned char	 origin;
@@ -143,34 +155,30 @@ static inline void dso__set_loaded(struct dso *self, enum map_type type)
 
 void dso__sort_by_name(struct dso *self, enum map_type type);
 
-extern struct list_head dsos__user, dsos__kernel;
-
 struct dso *__dsos__findnew(struct list_head *head, const char *name);
 
-static inline struct dso *dsos__findnew(const char *name)
-{
-	return __dsos__findnew(&dsos__user, name);
-}
-
 int dso__load(struct dso *self, struct map *map, symbol_filter_t filter);
 int dso__load_vmlinux_path(struct dso *self, struct map *map,
 			   symbol_filter_t filter);
 int dso__load_kallsyms(struct dso *self, const char *filename, struct map *map,
 		       symbol_filter_t filter);
-void dsos__fprintf(FILE *fp);
-size_t dsos__fprintf_buildid(FILE *fp, bool with_hits);
+void dsos__fprintf(struct rb_root *kerninfo_root, FILE *fp);
+size_t dsos__fprintf_buildid(struct rb_root *kerninfo_root,
+		FILE *fp, bool with_hits);
 
 size_t dso__fprintf_buildid(struct dso *self, FILE *fp);
 size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp);
 
 enum dso_origin {
 	DSO__ORIG_KERNEL = 0,
+	DSO__ORIG_GUEST_KERNEL,
 	DSO__ORIG_JAVA_JIT,
 	DSO__ORIG_BUILD_ID_CACHE,
 	DSO__ORIG_FEDORA,
 	DSO__ORIG_UBUNTU,
 	DSO__ORIG_BUILDID,
 	DSO__ORIG_DSO,
+	DSO__ORIG_GUEST_KMODULE,
 	DSO__ORIG_KMODULE,
 	DSO__ORIG_NOT_FOUND,
 };
@@ -178,19 +186,26 @@ enum dso_origin {
 char dso__symtab_origin(const struct dso *self);
 void dso__set_long_name(struct dso *self, char *name);
 void dso__set_build_id(struct dso *self, void *build_id);
-void dso__read_running_kernel_build_id(struct dso *self);
+void dso__read_running_kernel_build_id(struct dso *self,
+		struct kernel_info *kerninfo);
 struct symbol *dso__find_symbol(struct dso *self, enum map_type type, u64 addr);
 struct symbol *dso__find_symbol_by_name(struct dso *self, enum map_type type,
 					const char *name);
 
 int filename__read_build_id(const char *filename, void *bf, size_t size);
 int sysfs__read_build_id(const char *filename, void *bf, size_t size);
-bool dsos__read_build_ids(bool with_hits);
+bool __dsos__read_build_ids(struct list_head *head, bool with_hits);
 int build_id__sprintf(const u8 *self, int len, char *bf);
 int kallsyms__parse(const char *filename, void *arg,
 		    int (*process_symbol)(void *arg, const char *name,
 					  char type, u64 start));
 
+int __map_groups__create_kernel_maps(struct map_groups *self,
+			struct map *vmlinux_maps[MAP__NR_TYPES],
+			struct dso *kernel);
+int map_groups__create_kernel_maps(struct rb_root *kerninfo_root, pid_t pid);
+int map_groups__create_guest_kernel_maps(struct rb_root *kerninfo_root);
+
 int symbol__init(void);
 bool symbol_type__is_a(char symbol_type, enum map_type map_type);
 
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 9c488fcadec..1dfd9ff8bdc 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -33,12 +33,12 @@ static inline struct map *thread__find_map(struct thread *self,
 
 void thread__find_addr_map(struct thread *self,
 			   struct perf_session *session, u8 cpumode,
-			   enum map_type type, u64 addr,
+			   enum map_type type, pid_t pid, u64 addr,
 			   struct addr_location *al);
 
 void thread__find_addr_location(struct thread *self,
 				struct perf_session *session, u8 cpumode,
-				enum map_type type, u64 addr,
+				enum map_type type, pid_t pid, u64 addr,
 				struct addr_location *al,
 				symbol_filter_t filter);
 #endif	/* __PERF_THREAD_H */
-- 
cgit v1.2.3


From dcf46b9443ad48a227a61713adea001228925adf Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Tue, 20 Apr 2010 10:13:58 +0800
Subject: perf & kvm: Clean up some of the guest profiling callback API details

Fix some build bug and programming style issues:

 - use valid C
 - fix up various style details

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Sheng Yang <sheng@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: oerg Roedel <joro@8bytes.org>
Cc: Jes Sorensen <Jes.Sorensen@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Zachary Amsden <zamsden@redhat.com>
Cc: zhiteng.huang@intel.com
Cc: tim.c.chen@intel.com
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
LKML-Reference: <1271729638.2078.624.camel@ymzhang.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 20 ++++++++++++++------
 arch/x86/kvm/x86.c               |  4 ++++
 include/linux/perf_event.h       | 10 ++++------
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2ea78abf69d..7de70613e6c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1752,23 +1752,31 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
 	unsigned long ip;
+
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
 		ip = perf_guest_cbs->get_guest_ip();
 	else
 		ip = instruction_pointer(regs);
+
 	return ip;
 }
 
 unsigned long perf_misc_flags(struct pt_regs *regs)
 {
 	int misc = 0;
+
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-		misc |= perf_guest_cbs->is_user_mode() ?
-			PERF_RECORD_MISC_GUEST_USER :
-			PERF_RECORD_MISC_GUEST_KERNEL;
-	} else
-		misc |= user_mode(regs) ? PERF_RECORD_MISC_USER :
-			PERF_RECORD_MISC_KERNEL;
+		if (perf_guest_cbs->is_user_mode())
+			misc |= PERF_RECORD_MISC_GUEST_USER;
+		else
+			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+	} else {
+		if (user_mode(regs))
+			misc |= PERF_RECORD_MISC_USER;
+		else
+			misc |= PERF_RECORD_MISC_KERNEL;
+	}
+
 	if (regs->flags & PERF_EFLAGS_EXACT)
 		misc |= PERF_RECORD_MISC_EXACT;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c3a33b2bb16..21b9b6aa3e8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3776,16 +3776,20 @@ static int kvm_is_in_guest(void)
 static int kvm_is_user_mode(void)
 {
 	int user_mode = 3;
+
 	if (percpu_read(current_vcpu))
 		user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
+
 	return user_mode != 0;
 }
 
 static unsigned long kvm_get_guest_ip(void)
 {
 	unsigned long ip = 0;
+
 	if (percpu_read(current_vcpu))
 		ip = kvm_rip_read(percpu_read(current_vcpu));
+
 	return ip;
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 24de5f181a4..ace31fbac51 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -941,10 +941,8 @@ static inline void perf_event_mmap(struct vm_area_struct *vma)
 }
 
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
-extern int perf_register_guest_info_callbacks(
-		struct perf_guest_info_callbacks *);
-extern int perf_unregister_guest_info_callbacks(
-		struct perf_guest_info_callbacks *);
+extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
+extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
 
 extern void perf_event_comm(struct task_struct *tsk);
 extern void perf_event_fork(struct task_struct *tsk);
@@ -1016,9 +1014,9 @@ static inline void
 perf_bp_event(struct perf_event *event, void *data)			{ }
 
 static inline int perf_register_guest_info_callbacks
-(struct perf_guest_info_callbacks *) {return 0; }
+(struct perf_guest_info_callbacks *callbacks) { return 0; }
 static inline int perf_unregister_guest_info_callbacks
-(struct perf_guest_info_callbacks *) {return 0; }
+(struct perf_guest_info_callbacks *callbacks) { return 0; }
 
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_comm(struct task_struct *tsk)		{ }
-- 
cgit v1.2.3


From 315e995c63a15cb4d4efdbfd70fe2db191917f7a Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggen@suse.de>
Date: Wed, 21 Apr 2010 03:18:28 +0000
Subject: [CIFS] use add_to_page_cache_lru

add_to_page_cache_lru is exported, so it should be used. Benefits over
using a private pagevec: neater code, 128 bytes fewer stack used, percpu
lru ordering is preserved, and finally don't need to flush pagevec
before returning so batching may be shared with other LRU insertions.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Reviewed-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9b11a8f56f3..1361d67f68f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1931,8 +1931,7 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 
 static void cifs_copy_cache_pages(struct address_space *mapping,
-	struct list_head *pages, int bytes_read, char *data,
-	struct pagevec *plru_pvec)
+	struct list_head *pages, int bytes_read, char *data)
 {
 	struct page *page;
 	char *target;
@@ -1944,7 +1943,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		page = list_entry(pages->prev, struct page, lru);
 		list_del(&page->lru);
 
-		if (add_to_page_cache(page, mapping, page->index,
+		if (add_to_page_cache_lru(page, mapping, page->index,
 				      GFP_KERNEL)) {
 			page_cache_release(page);
 			cFYI(1, ("Add page cache failed"));
@@ -1970,8 +1969,6 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		flush_dcache_page(page);
 		SetPageUptodate(page);
 		unlock_page(page);
-		if (!pagevec_add(plru_pvec, page))
-			__pagevec_lru_add_file(plru_pvec);
 		data += PAGE_CACHE_SIZE;
 	}
 	return;
@@ -1990,7 +1987,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	unsigned int read_size, i;
 	char *smb_read_data = NULL;
 	struct smb_com_read_rsp *pSMBr;
-	struct pagevec lru_pvec;
 	struct cifsFileInfo *open_file;
 	int buf_type = CIFS_NO_BUFFER;
 
@@ -2004,7 +2000,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	pTcon = cifs_sb->tcon;
 
-	pagevec_init(&lru_pvec, 0);
 	cFYI(DBG2, ("rpages: num pages %d", num_pages));
 	for (i = 0; i < num_pages; ) {
 		unsigned contig_pages;
@@ -2073,7 +2068,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 			pSMBr = (struct smb_com_read_rsp *)smb_read_data;
 			cifs_copy_cache_pages(mapping, page_list, bytes_read,
 				smb_read_data + 4 /* RFC1001 hdr */ +
-				le16_to_cpu(pSMBr->DataOffset), &lru_pvec);
+				le16_to_cpu(pSMBr->DataOffset));
 
 			i +=  bytes_read >> PAGE_CACHE_SHIFT;
 			cifs_stats_bytes_read(pTcon, bytes_read);
@@ -2106,8 +2101,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		bytes_read = 0;
 	}
 
-	pagevec_lru_add_file(&lru_pvec);
-
 /* need to free smb_read_data buf before exit */
 	if (smb_read_data) {
 		if (buf_type == CIFS_SMALL_BUFFER)
-- 
cgit v1.2.3


From b6b38f704a8193daba520493ebdaf7e819962fc8 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Wed, 21 Apr 2010 03:50:45 +0000
Subject: [CIFS] Neaten cERROR and cFYI macros, reduce text space

Neaten cERROR and cFYI macros, reduce text space
~2.5K

Convert '__FILE__ ": " fmt' to '"%s: " fmt', __FILE__' to save text space
Surround macros with do {} while
Add parentheses to macros
Make statement expression macro from macro with assign
Remove now unnecessary parentheses from cFYI and cERROR uses

defconfig with CIFS support old
$ size fs/cifs/built-in.o
   text	   data	    bss	    dec	    hex	filename
 156012	   1760	    148	 157920	  268e0	fs/cifs/built-in.o

defconfig with CIFS support old
$ size fs/cifs/built-in.o
   text	   data	    bss	    dec	    hex	filename
 153508	   1760	    148	 155416	  25f18	fs/cifs/built-in.o

allyesconfig old:
$ size fs/cifs/built-in.o
   text	   data	    bss	    dec	    hex	filename
 309138	   3864	  74824	 387826	  5eaf2	fs/cifs/built-in.o

allyesconfig new
$ size fs/cifs/built-in.o
   text	   data	    bss	    dec	    hex	filename
 305655	   3864	  74824	 384343	  5dd57	fs/cifs/built-in.o

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/asn1.c         |  73 +++++----
 fs/cifs/cifs_debug.c   |  34 ++---
 fs/cifs/cifs_debug.h   |  42 ++++--
 fs/cifs/cifs_dfs_ref.c |  34 ++---
 fs/cifs/cifs_spnego.c  |   2 +-
 fs/cifs/cifs_unicode.c |   5 +-
 fs/cifs/cifsacl.c      |  76 +++++-----
 fs/cifs/cifsencrypt.c  |   8 +-
 fs/cifs/cifsfs.c       |  37 +++--
 fs/cifs/cifsproto.h    |  16 +-
 fs/cifs/cifssmb.c      | 393 ++++++++++++++++++++++++-------------------------
 fs/cifs/connect.c      | 254 ++++++++++++++++----------------
 fs/cifs/dir.c          |  43 +++---
 fs/cifs/dns_resolve.c  |  16 +-
 fs/cifs/export.c       |   2 +-
 fs/cifs/file.c         | 172 +++++++++++-----------
 fs/cifs/inode.c        |  82 +++++------
 fs/cifs/ioctl.c        |  10 +-
 fs/cifs/link.c         |  10 +-
 fs/cifs/misc.c         |  81 +++++-----
 fs/cifs/netmisc.c      |  16 +-
 fs/cifs/readdir.c      |  85 ++++++-----
 fs/cifs/sess.c         |  58 ++++----
 fs/cifs/transport.c    |  91 ++++++------
 fs/cifs/xattr.c        |  40 ++---
 25 files changed, 845 insertions(+), 835 deletions(-)

diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index a20bea59893..6d555c05dba 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -510,11 +510,11 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 
 	/* GSSAPI header */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding negTokenInit header"));
+		cFYI(1, "Error decoding negTokenInit header");
 		return 0;
 	} else if ((cls != ASN1_APL) || (con != ASN1_CON)
 		   || (tag != ASN1_EOC)) {
-		cFYI(1, ("cls = %d con = %d tag = %d", cls, con, tag));
+		cFYI(1, "cls = %d con = %d tag = %d", cls, con, tag);
 		return 0;
 	}
 
@@ -535,56 +535,52 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 
 	/* SPNEGO OID not present or garbled -- bail out */
 	if (!rc) {
-		cFYI(1, ("Error decoding negTokenInit header"));
+		cFYI(1, "Error decoding negTokenInit header");
 		return 0;
 	}
 
 	/* SPNEGO */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding negTokenInit"));
+		cFYI(1, "Error decoding negTokenInit");
 		return 0;
 	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
 		   || (tag != ASN1_EOC)) {
-		cFYI(1,
-		     ("cls = %d con = %d tag = %d end = %p (%d) exit 0",
-		      cls, con, tag, end, *end));
+		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
+		     cls, con, tag, end, *end);
 		return 0;
 	}
 
 	/* negTokenInit */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding negTokenInit"));
+		cFYI(1, "Error decoding negTokenInit");
 		return 0;
 	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
 		   || (tag != ASN1_SEQ)) {
-		cFYI(1,
-		     ("cls = %d con = %d tag = %d end = %p (%d) exit 1",
-		      cls, con, tag, end, *end));
+		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
+		     cls, con, tag, end, *end);
 		return 0;
 	}
 
 	/* sequence */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding 2nd part of negTokenInit"));
+		cFYI(1, "Error decoding 2nd part of negTokenInit");
 		return 0;
 	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)
 		   || (tag != ASN1_EOC)) {
-		cFYI(1,
-		     ("cls = %d con = %d tag = %d end = %p (%d) exit 0",
-		      cls, con, tag, end, *end));
+		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 0",
+		     cls, con, tag, end, *end);
 		return 0;
 	}
 
 	/* sequence of */
 	if (asn1_header_decode
 	    (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding 2nd part of negTokenInit"));
+		cFYI(1, "Error decoding 2nd part of negTokenInit");
 		return 0;
 	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
 		   || (tag != ASN1_SEQ)) {
-		cFYI(1,
-		     ("cls = %d con = %d tag = %d end = %p (%d) exit 1",
-		      cls, con, tag, end, *end));
+		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d) exit 1",
+		     cls, con, tag, end, *end);
 		return 0;
 	}
 
@@ -592,16 +588,15 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 	while (!asn1_eoc_decode(&ctx, sequence_end)) {
 		rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
 		if (!rc) {
-			cFYI(1,
-			     ("Error decoding negTokenInit hdr exit2"));
+			cFYI(1, "Error decoding negTokenInit hdr exit2");
 			return 0;
 		}
 		if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
 			if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
 
-				cFYI(1, ("OID len = %d oid = 0x%lx 0x%lx "
-					 "0x%lx 0x%lx", oidlen, *oid,
-					 *(oid + 1), *(oid + 2), *(oid + 3)));
+				cFYI(1, "OID len = %d oid = 0x%lx 0x%lx "
+					"0x%lx 0x%lx", oidlen, *oid,
+					*(oid + 1), *(oid + 2), *(oid + 3));
 
 				if (compare_oid(oid, oidlen, MSKRB5_OID,
 						MSKRB5_OID_LEN) &&
@@ -622,7 +617,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 				kfree(oid);
 			}
 		} else {
-			cFYI(1, ("Should be an oid what is going on?"));
+			cFYI(1, "Should be an oid what is going on?");
 		}
 	}
 
@@ -632,47 +627,47 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 		   no mechListMic (e.g. NTLMSSP instead of KRB5) */
 		if (ctx.error == ASN1_ERR_DEC_EMPTY)
 			goto decode_negtoken_exit;
-		cFYI(1, ("Error decoding last part negTokenInit exit3"));
+		cFYI(1, "Error decoding last part negTokenInit exit3");
 		return 0;
 	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
 		/* tag = 3 indicating mechListMIC */
-		cFYI(1, ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
-			 cls, con, tag, end, *end));
+		cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
+			cls, con, tag, end, *end);
 		return 0;
 	}
 
 	/* sequence */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding last part negTokenInit exit5"));
+		cFYI(1, "Error decoding last part negTokenInit exit5");
 		return 0;
 	} else if ((cls != ASN1_UNI) || (con != ASN1_CON)
 		   || (tag != ASN1_SEQ)) {
-		cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)",
-			cls, con, tag, end, *end));
+		cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)",
+			cls, con, tag, end, *end);
 	}
 
 	/* sequence of */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding last part negTokenInit exit 7"));
+		cFYI(1, "Error decoding last part negTokenInit exit 7");
 		return 0;
 	} else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
-		cFYI(1, ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
-			 cls, con, tag, end, *end));
+		cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
+			cls, con, tag, end, *end);
 		return 0;
 	}
 
 	/* general string */
 	if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
-		cFYI(1, ("Error decoding last part negTokenInit exit9"));
+		cFYI(1, "Error decoding last part negTokenInit exit9");
 		return 0;
 	} else if ((cls != ASN1_UNI) || (con != ASN1_PRI)
 		   || (tag != ASN1_GENSTR)) {
-		cFYI(1, ("Exit10 cls = %d con = %d tag = %d end = %p (%d)",
-			 cls, con, tag, end, *end));
+		cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)",
+			cls, con, tag, end, *end);
 		return 0;
 	}
-	cFYI(1, ("Need to call asn1_octets_decode() function for %s",
-		 ctx.pointer));	/* is this UTF-8 or ASCII? */
+	cFYI(1, "Need to call asn1_octets_decode() function for %s",
+		ctx.pointer);	/* is this UTF-8 or ASCII? */
 decode_negtoken_exit:
 	if (use_kerberos)
 		*secType = Kerberos;
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 42cec2a7c0c..54951804734 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -60,10 +60,10 @@ cifs_dump_mem(char *label, void *data, int length)
 #ifdef CONFIG_CIFS_DEBUG2
 void cifs_dump_detail(struct smb_hdr *smb)
 {
-	cERROR(1, ("Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
+	cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
 		  smb->Command, smb->Status.CifsError,
-		  smb->Flags, smb->Flags2, smb->Mid, smb->Pid));
-	cERROR(1, ("smb buf %p len %d", smb, smbCalcSize_LE(smb)));
+		  smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
+	cERROR(1, "smb buf %p len %d", smb, smbCalcSize_LE(smb));
 }
 
 
@@ -75,25 +75,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
 	if (server == NULL)
 		return;
 
-	cERROR(1, ("Dump pending requests:"));
+	cERROR(1, "Dump pending requests:");
 	spin_lock(&GlobalMid_Lock);
 	list_for_each(tmp, &server->pending_mid_q) {
 		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
-		cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
+		cERROR(1, "State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
 			mid_entry->midState,
 			(int)mid_entry->command,
 			mid_entry->pid,
 			mid_entry->tsk,
-			mid_entry->mid));
+			mid_entry->mid);
 #ifdef CONFIG_CIFS_STATS2
-		cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld",
+		cERROR(1, "IsLarge: %d buf: %p time rcv: %ld now: %ld",
 			mid_entry->largeBuf,
 			mid_entry->resp_buf,
 			mid_entry->when_received,
-			jiffies));
+			jiffies);
 #endif /* STATS2 */
-		cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp,
-			  mid_entry->multiEnd));
+		cERROR(1, "IsMult: %d IsEnd: %d", mid_entry->multiRsp,
+			  mid_entry->multiEnd);
 		if (mid_entry->resp_buf) {
 			cifs_dump_detail(mid_entry->resp_buf);
 			cifs_dump_mem("existing buf: ",
@@ -750,7 +750,7 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
 			extended_security = CIFSSEC_MAX;
 			return count;
 		} else if (!isdigit(c)) {
-			cERROR(1, ("invalid flag %c", c));
+			cERROR(1, "invalid flag %c", c);
 			return -EINVAL;
 		}
 	}
@@ -758,16 +758,16 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
 
 	flags = simple_strtoul(flags_string, NULL, 0);
 
-	cFYI(1, ("sec flags 0x%x", flags));
+	cFYI(1, "sec flags 0x%x", flags);
 
 	if (flags <= 0)  {
-		cERROR(1, ("invalid security flags %s", flags_string));
+		cERROR(1, "invalid security flags %s", flags_string);
 		return -EINVAL;
 	}
 
 	if (flags & ~CIFSSEC_MASK) {
-		cERROR(1, ("attempt to set unsupported security flags 0x%x",
-			flags & ~CIFSSEC_MASK));
+		cERROR(1, "attempt to set unsupported security flags 0x%x",
+			flags & ~CIFSSEC_MASK);
 		return -EINVAL;
 	}
 	/* flags look ok - update the global security flags for cifs module */
@@ -775,9 +775,9 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
 	if (extended_security & CIFSSEC_MUST_SIGN) {
 		/* requiring signing implies signing is allowed */
 		extended_security |= CIFSSEC_MAY_SIGN;
-		cFYI(1, ("packet signing now required"));
+		cFYI(1, "packet signing now required");
 	} else if ((extended_security & CIFSSEC_MAY_SIGN) == 0) {
-		cFYI(1, ("packet signing disabled"));
+		cFYI(1, "packet signing disabled");
 	}
 	/* BB should we turn on MAY flags for other MUST options? */
 	return count;
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index 5eb3b83bbfa..aa316891ac0 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -43,34 +43,54 @@ void dump_smb(struct smb_hdr *, int);
  */
 #ifdef CIFS_DEBUG
 
-
 /* information message: e.g., configuration, major event */
 extern int cifsFYI;
-#define cifsfyi(format,arg...) if (cifsFYI & CIFS_INFO) printk(KERN_DEBUG " " __FILE__ ": " format "\n" "" , ## arg)
+#define cifsfyi(fmt, arg...)						\
+do {									\
+	if (cifsFYI & CIFS_INFO)					\
+		printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg);	\
+} while (0)
 
-#define cFYI(button,prspec) if (button) cifsfyi prspec
+#define cFYI(set, fmt, arg...)			\
+do {						\
+	if (set)				\
+		cifsfyi(fmt, ##arg);		\
+} while (0)
 
-#define cifswarn(format, arg...) printk(KERN_WARNING ": " format "\n" , ## arg)
+#define cifswarn(fmt, arg...)			\
+	printk(KERN_WARNING fmt "\n", ##arg)
 
 /* debug event message: */
 extern int cifsERROR;
 
-#define cEVENT(format,arg...) if (cifsERROR) printk(KERN_EVENT __FILE__ ": " format "\n" , ## arg)
+#define cEVENT(fmt, arg...)						\
+do {									\
+	if (cifsERROR)							\
+		printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg);	\
+} while (0)
 
 /* error event message: e.g., i/o error */
-#define cifserror(format,arg...) if (cifsERROR) printk(KERN_ERR " CIFS VFS: " format "\n" "" , ## arg)
+#define cifserror(fmt, arg...)					\
+do {								\
+	if (cifsERROR)						\
+		printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg);	\
+} while (0)
 
-#define cERROR(button, prspec) if (button) cifserror prspec
+#define cERROR(set, fmt, arg...)		\
+do {						\
+	if (set)				\
+		cifserror(fmt, ##arg);		\
+} while (0)
 
 /*
  *	debug OFF
  *	---------
  */
 #else		/* _CIFS_DEBUG */
-#define cERROR(button, prspec)
-#define cEVENT(format, arg...)
-#define cFYI(button, prspec)
-#define cifserror(format, arg...)
+#define cERROR(set, fmt, arg...)
+#define cEVENT(fmt, arg...)
+#define cFYI(set, fmt, arg...)
+#define cifserror(fmt, arg...)
 #endif		/* _CIFS_DEBUG */
 
 #endif				/* _H_CIFS_DEBUG */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 78e4d2a3a68..ac19a6f3dae 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -85,8 +85,8 @@ static char *cifs_get_share_name(const char *node_name)
 	/* find server name end */
 	pSep = memchr(UNC+2, '\\', len-2);
 	if (!pSep) {
-		cERROR(1, ("%s: no server name end in node name: %s",
-			__func__, node_name));
+		cERROR(1, "%s: no server name end in node name: %s",
+			__func__, node_name);
 		kfree(UNC);
 		return ERR_PTR(-EINVAL);
 	}
@@ -142,8 +142,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 
 	rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
 	if (rc != 0) {
-		cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
-			  __func__, *devname, rc));
+		cERROR(1, "%s: Failed to resolve server part of %s to IP: %d",
+			  __func__, *devname, rc);
 		goto compose_mount_options_err;
 	}
 	/* md_len = strlen(...) + 12 for 'sep+prefixpath='
@@ -217,8 +217,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 		strcat(mountdata, fullpath + ref->path_consumed);
 	}
 
-	/*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/
-	/*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/
+	/*cFYI(1, "%s: parent mountdata: %s", __func__,sb_mountdata);*/
+	/*cFYI(1, "%s: submount mountdata: %s", __func__, mountdata );*/
 
 compose_mount_options_out:
 	kfree(srvIP);
@@ -294,11 +294,11 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
 
 static void dump_referral(const struct dfs_info3_param *ref)
 {
-	cFYI(1, ("DFS: ref path: %s", ref->path_name));
-	cFYI(1, ("DFS: node path: %s", ref->node_name));
-	cFYI(1, ("DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type));
-	cFYI(1, ("DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
-				ref->path_consumed));
+	cFYI(1, "DFS: ref path: %s", ref->path_name);
+	cFYI(1, "DFS: node path: %s", ref->node_name);
+	cFYI(1, "DFS: fl: %hd, srv_type: %hd", ref->flags, ref->server_type);
+	cFYI(1, "DFS: ref_flags: %hd, path_consumed: %hd", ref->ref_flag,
+				ref->path_consumed);
 }
 
 
@@ -314,7 +314,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	int rc = 0;
 	struct vfsmount *mnt = ERR_PTR(-ENOENT);
 
-	cFYI(1, ("in %s", __func__));
+	cFYI(1, "in %s", __func__);
 	BUG_ON(IS_ROOT(dentry));
 
 	xid = GetXid();
@@ -352,15 +352,15 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 		/* connect to a node */
 		len = strlen(referrals[i].node_name);
 		if (len < 2) {
-			cERROR(1, ("%s: Net Address path too short: %s",
-					__func__, referrals[i].node_name));
+			cERROR(1, "%s: Net Address path too short: %s",
+					__func__, referrals[i].node_name);
 			rc = -EINVAL;
 			goto out_err;
 		}
 		mnt = cifs_dfs_do_refmount(nd->path.mnt,
 				nd->path.dentry, referrals + i);
-		cFYI(1, ("%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
-					referrals[i].node_name, mnt));
+		cFYI(1, "%s: cifs_dfs_do_refmount:%s , mnt:%p", __func__,
+					referrals[i].node_name, mnt);
 
 		/* complete mount procedure if we accured submount */
 		if (!IS_ERR(mnt))
@@ -378,7 +378,7 @@ out:
 	FreeXid(xid);
 	free_dfs_info_array(referrals, num_referrals);
 	kfree(full_path);
-	cFYI(1, ("leaving %s" , __func__));
+	cFYI(1, "leaving %s" , __func__);
 	return ERR_PTR(rc);
 out_err:
 	path_put(&nd->path);
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 310d12f69a9..c53587b8330 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -149,7 +149,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	dp = description + strlen(description);
 	sprintf(dp, ";pid=0x%x", current->pid);
 
-	cFYI(1, ("key description = %s", description));
+	cFYI(1, "key description = %s", description);
 	spnego_key = request_key(&cifs_spnego_key_type, description, "");
 
 #ifdef CONFIG_CIFS_DEBUG2
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index d07676bd76d..430f510a172 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -200,9 +200,8 @@ cifs_strtoUCS(__le16 *to, const char *from, int len,
 		/* works for 2.4.0 kernel or later */
 		charlen = codepage->char2uni(from, len, &wchar_to[i]);
 		if (charlen < 1) {
-			cERROR(1,
-			       ("strtoUCS: char2uni of %d returned %d",
-				(int)*from, charlen));
+			cERROR(1, "strtoUCS: char2uni of %d returned %d",
+				(int)*from, charlen);
 			/* A question mark */
 			to[i] = cpu_to_le16(0x003f);
 			charlen = 1;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 9b716d044bb..85d7cf7ff2c 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -87,11 +87,11 @@ int match_sid(struct cifs_sid *ctsid)
 				continue; /* all sub_auth values do not match */
 		}
 
-		cFYI(1, ("matching sid: %s\n", wksidarr[i].sidname));
+		cFYI(1, "matching sid: %s\n", wksidarr[i].sidname);
 		return 0; /* sids compare/match */
 	}
 
-	cFYI(1, ("No matching sid"));
+	cFYI(1, "No matching sid");
 	return -1;
 }
 
@@ -208,14 +208,14 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
 			*pbits_to_set &= ~S_IXUGO;
 		return;
 	} else if (type != ACCESS_ALLOWED) {
-		cERROR(1, ("unknown access control type %d", type));
+		cERROR(1, "unknown access control type %d", type);
 		return;
 	}
 	/* else ACCESS_ALLOWED type */
 
 	if (flags & GENERIC_ALL) {
 		*pmode |= (S_IRWXUGO & (*pbits_to_set));
-		cFYI(DBG2, ("all perms"));
+		cFYI(DBG2, "all perms");
 		return;
 	}
 	if ((flags & GENERIC_WRITE) ||
@@ -228,7 +228,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
 			((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
 		*pmode |= (S_IXUGO & (*pbits_to_set));
 
-	cFYI(DBG2, ("access flags 0x%x mode now 0x%x", flags, *pmode));
+	cFYI(DBG2, "access flags 0x%x mode now 0x%x", flags, *pmode);
 	return;
 }
 
@@ -257,7 +257,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
 	if (mode & S_IXUGO)
 		*pace_flags |= SET_FILE_EXEC_RIGHTS;
 
-	cFYI(DBG2, ("mode: 0x%x, access flags now 0x%x", mode, *pace_flags));
+	cFYI(DBG2, "mode: 0x%x, access flags now 0x%x", mode, *pace_flags);
 	return;
 }
 
@@ -297,24 +297,24 @@ static void dump_ace(struct cifs_ace *pace, char *end_of_acl)
 	/* validate that we do not go past end of acl */
 
 	if (le16_to_cpu(pace->size) < 16) {
-		cERROR(1, ("ACE too small, %d", le16_to_cpu(pace->size)));
+		cERROR(1, "ACE too small %d", le16_to_cpu(pace->size));
 		return;
 	}
 
 	if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) {
-		cERROR(1, ("ACL too small to parse ACE"));
+		cERROR(1, "ACL too small to parse ACE");
 		return;
 	}
 
 	num_subauth = pace->sid.num_subauth;
 	if (num_subauth) {
 		int i;
-		cFYI(1, ("ACE revision %d num_auth %d type %d flags %d size %d",
+		cFYI(1, "ACE revision %d num_auth %d type %d flags %d size %d",
 			pace->sid.revision, pace->sid.num_subauth, pace->type,
-			pace->flags, le16_to_cpu(pace->size)));
+			pace->flags, le16_to_cpu(pace->size));
 		for (i = 0; i < num_subauth; ++i) {
-			cFYI(1, ("ACE sub_auth[%d]: 0x%x", i,
-				le32_to_cpu(pace->sid.sub_auth[i])));
+			cFYI(1, "ACE sub_auth[%d]: 0x%x", i,
+				le32_to_cpu(pace->sid.sub_auth[i]));
 		}
 
 		/* BB add length check to make sure that we do not have huge
@@ -347,13 +347,13 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
 
 	/* validate that we do not go past end of acl */
 	if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
-		cERROR(1, ("ACL too small to parse DACL"));
+		cERROR(1, "ACL too small to parse DACL");
 		return;
 	}
 
-	cFYI(DBG2, ("DACL revision %d size %d num aces %d",
+	cFYI(DBG2, "DACL revision %d size %d num aces %d",
 		le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
-		le32_to_cpu(pdacl->num_aces)));
+		le32_to_cpu(pdacl->num_aces));
 
 	/* reset rwx permissions for user/group/other.
 	   Also, if num_aces is 0 i.e. DACL has no ACEs,
@@ -437,25 +437,25 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
 	/* validate that we do not go past end of ACL - sid must be at least 8
 	   bytes long (assuming no sub-auths - e.g. the null SID */
 	if (end_of_acl < (char *)psid + 8) {
-		cERROR(1, ("ACL too small to parse SID %p", psid));
+		cERROR(1, "ACL too small to parse SID %p", psid);
 		return -EINVAL;
 	}
 
 	if (psid->num_subauth) {
 #ifdef CONFIG_CIFS_DEBUG2
 		int i;
-		cFYI(1, ("SID revision %d num_auth %d",
-			psid->revision, psid->num_subauth));
+		cFYI(1, "SID revision %d num_auth %d",
+			psid->revision, psid->num_subauth);
 
 		for (i = 0; i < psid->num_subauth; i++) {
-			cFYI(1, ("SID sub_auth[%d]: 0x%x ", i,
-				le32_to_cpu(psid->sub_auth[i])));
+			cFYI(1, "SID sub_auth[%d]: 0x%x ", i,
+				le32_to_cpu(psid->sub_auth[i]));
 		}
 
 		/* BB add length check to make sure that we do not have huge
 			num auths and therefore go off the end */
-		cFYI(1, ("RID 0x%x",
-			le32_to_cpu(psid->sub_auth[psid->num_subauth-1])));
+		cFYI(1, "RID 0x%x",
+			le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
 #endif
 	}
 
@@ -482,11 +482,11 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 				le32_to_cpu(pntsd->gsidoffset));
 	dacloffset = le32_to_cpu(pntsd->dacloffset);
 	dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset);
-	cFYI(DBG2, ("revision %d type 0x%x ooffset 0x%x goffset 0x%x "
+	cFYI(DBG2, "revision %d type 0x%x ooffset 0x%x goffset 0x%x "
 		 "sacloffset 0x%x dacloffset 0x%x",
 		 pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
 		 le32_to_cpu(pntsd->gsidoffset),
-		 le32_to_cpu(pntsd->sacloffset), dacloffset));
+		 le32_to_cpu(pntsd->sacloffset), dacloffset);
 /*	cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
 	rc = parse_sid(owner_sid_ptr, end_of_acl);
 	if (rc)
@@ -500,7 +500,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
 		parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
 			   group_sid_ptr, fattr);
 	else
-		cFYI(1, ("no ACL")); /* BB grant all or default perms? */
+		cFYI(1, "no ACL"); /* BB grant all or default perms? */
 
 /*	cifscred->uid = owner_sid_ptr->rid;
 	cifscred->gid = group_sid_ptr->rid;
@@ -563,7 +563,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
 	FreeXid(xid);
 
 
-	cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
+	cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
 	return pntsd;
 }
 
@@ -581,12 +581,12 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
 			 &fid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
-		cERROR(1, ("Unable to open file to get ACL"));
+		cERROR(1, "Unable to open file to get ACL");
 		goto out;
 	}
 
 	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
-	cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
+	cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen);
 
 	CIFSSMBClose(xid, cifs_sb->tcon, fid);
  out:
@@ -621,7 +621,7 @@ static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
 	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
 	FreeXid(xid);
 
-	cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
+	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
 	return rc;
 }
 
@@ -638,12 +638,12 @@ static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
 			 &fid, &oplock, NULL, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
-		cERROR(1, ("Unable to open file to set ACL"));
+		cERROR(1, "Unable to open file to set ACL");
 		goto out;
 	}
 
 	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
-	cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
+	cFYI(DBG2, "SetCIFSACL rc = %d", rc);
 
 	CIFSSMBClose(xid, cifs_sb->tcon, fid);
  out:
@@ -659,7 +659,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 	struct cifsFileInfo *open_file;
 	int rc;
 
-	cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
+	cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
 
 	open_file = find_readable_file(CIFS_I(inode));
 	if (!open_file)
@@ -679,7 +679,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
 	u32 acllen = 0;
 	int rc = 0;
 
-	cFYI(DBG2, ("converting ACL to mode for %s", path));
+	cFYI(DBG2, "converting ACL to mode for %s", path);
 
 	if (pfid)
 		pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
@@ -690,7 +690,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
 	if (pntsd)
 		rc = parse_sec_desc(pntsd, acllen, fattr);
 	if (rc)
-		cFYI(1, ("parse sec desc failed rc = %d", rc));
+		cFYI(1, "parse sec desc failed rc = %d", rc);
 
 	kfree(pntsd);
 	return;
@@ -704,7 +704,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
 	struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
 	struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
 
-	cFYI(DBG2, ("set ACL from mode for %s", path));
+	cFYI(DBG2, "set ACL from mode for %s", path);
 
 	/* Get the security descriptor */
 	pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
@@ -721,19 +721,19 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
 					DEFSECDESCLEN : secdesclen;
 		pnntsd = kmalloc(secdesclen, GFP_KERNEL);
 		if (!pnntsd) {
-			cERROR(1, ("Unable to allocate security descriptor"));
+			cERROR(1, "Unable to allocate security descriptor");
 			kfree(pntsd);
 			return -ENOMEM;
 		}
 
 		rc = build_sec_desc(pntsd, pnntsd, inode, nmode);
 
-		cFYI(DBG2, ("build_sec_desc rc: %d", rc));
+		cFYI(DBG2, "build_sec_desc rc: %d", rc);
 
 		if (!rc) {
 			/* Set the security descriptor */
 			rc = set_cifs_acl(pnntsd, secdesclen, inode, path);
-			cFYI(DBG2, ("set_cifs_acl rc: %d", rc));
+			cFYI(DBG2, "set_cifs_acl rc: %d", rc);
 		}
 
 		kfree(pnntsd);
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index fbe986430d0..61e41525206 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -103,7 +103,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 		if (iov[i].iov_len == 0)
 			continue;
 		if (iov[i].iov_base == NULL) {
-			cERROR(1, ("null iovec entry"));
+			cERROR(1, "null iovec entry");
 			return -EIO;
 		}
 		/* The first entry includes a length field (which does not get
@@ -181,8 +181,8 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 
 	/* Do not need to verify session setups with signature "BSRSPYL "  */
 	if (memcmp(cifs_pdu->Signature.SecuritySignature, "BSRSPYL ", 8) == 0)
-		cFYI(1, ("dummy signature received for smb command 0x%x",
-			cifs_pdu->Command));
+		cFYI(1, "dummy signature received for smb command 0x%x",
+			cifs_pdu->Command);
 
 	/* save off the origiginal signature so we can modify the smb and check
 		its signature against what the server sent */
@@ -398,7 +398,7 @@ void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
 	/* calculate buf->ntlmv2_hash */
 	rc = calc_ntlmv2_hash(ses, nls_cp);
 	if (rc)
-		cERROR(1, ("could not get v2 hash rc %d", rc));
+		cERROR(1, "could not get v2 hash rc %d", rc);
 	CalcNTLMv2_response(ses, resp_buf);
 
 	/* now calculate the MAC key for NTLMv2 */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ded66be6597..53e794131c2 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -128,8 +128,7 @@ cifs_read_super(struct super_block *sb, void *data,
 
 	if (rc) {
 		if (!silent)
-			cERROR(1,
-			       ("cifs_mount failed w/return code = %d", rc));
+			cERROR(1, "cifs_mount failed w/return code = %d", rc);
 		goto out_mount_failed;
 	}
 
@@ -160,7 +159,7 @@ cifs_read_super(struct super_block *sb, void *data,
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
-		cFYI(1, ("export ops supported"));
+		cFYI(1, "export ops supported");
 		sb->s_export_op = &cifs_export_ops;
 	}
 #endif /* EXPERIMENTAL */
@@ -168,7 +167,7 @@ cifs_read_super(struct super_block *sb, void *data,
 	return 0;
 
 out_no_root:
-	cERROR(1, ("cifs_read_super: get root inode failed"));
+	cERROR(1, "cifs_read_super: get root inode failed");
 	if (inode)
 		iput(inode);
 
@@ -194,10 +193,10 @@ cifs_put_super(struct super_block *sb)
 	int rc = 0;
 	struct cifs_sb_info *cifs_sb;
 
-	cFYI(1, ("In cifs_put_super"));
+	cFYI(1, "In cifs_put_super");
 	cifs_sb = CIFS_SB(sb);
 	if (cifs_sb == NULL) {
-		cFYI(1, ("Empty cifs superblock info passed to unmount"));
+		cFYI(1, "Empty cifs superblock info passed to unmount");
 		return;
 	}
 
@@ -205,7 +204,7 @@ cifs_put_super(struct super_block *sb)
 
 	rc = cifs_umount(sb, cifs_sb);
 	if (rc)
-		cERROR(1, ("cifs_umount failed with return code %d", rc));
+		cERROR(1, "cifs_umount failed with return code %d", rc);
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	if (cifs_sb->mountdata) {
 		kfree(cifs_sb->mountdata);
@@ -439,7 +438,7 @@ int cifs_xquota_set(struct super_block *sb, int quota_type, qid_t qid,
 
 	xid = GetXid();
 	if (pTcon) {
-		cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
+		cFYI(1, "set type: 0x%x id: %d", quota_type, qid);
 	} else
 		rc = -EIO;
 
@@ -462,7 +461,7 @@ int cifs_xquota_get(struct super_block *sb, int quota_type, qid_t qid,
 
 	xid = GetXid();
 	if (pTcon) {
-		cFYI(1, ("set type: 0x%x id: %d", quota_type, qid));
+		cFYI(1, "set type: 0x%x id: %d", quota_type, qid);
 	} else
 		rc = -EIO;
 
@@ -484,7 +483,7 @@ int cifs_xstate_set(struct super_block *sb, unsigned int flags, int operation)
 
 	xid = GetXid();
 	if (pTcon) {
-		cFYI(1, ("flags: 0x%x operation: 0x%x", flags, operation));
+		cFYI(1, "flags: 0x%x operation: 0x%x", flags, operation);
 	} else
 		rc = -EIO;
 
@@ -506,7 +505,7 @@ int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
 
 	xid = GetXid();
 	if (pTcon) {
-		cFYI(1, ("pqstats %p", qstats));
+		cFYI(1, "pqstats %p", qstats);
 	} else
 		rc = -EIO;
 
@@ -548,7 +547,7 @@ static void cifs_umount_begin(struct super_block *sb)
 	/* cancel_brl_requests(tcon); */ /* BB mark all brl mids as exiting */
 	/* cancel_notify_requests(tcon); */
 	if (tcon->ses && tcon->ses->server) {
-		cFYI(1, ("wake up tasks now - umount begin not complete"));
+		cFYI(1, "wake up tasks now - umount begin not complete");
 		wake_up_all(&tcon->ses->server->request_q);
 		wake_up_all(&tcon->ses->server->response_q);
 		msleep(1); /* yield */
@@ -599,7 +598,7 @@ cifs_get_sb(struct file_system_type *fs_type,
 	int rc;
 	struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL);
 
-	cFYI(1, ("Devname: %s flags: %d ", dev_name, flags));
+	cFYI(1, "Devname: %s flags: %d ", dev_name, flags);
 
 	if (IS_ERR(sb))
 		return PTR_ERR(sb);
@@ -868,7 +867,7 @@ cifs_init_request_bufs(void)
 	} else {
 		CIFSMaxBufSize &= 0x1FE00; /* Round size to even 512 byte mult*/
 	}
-/*	cERROR(1,("CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize)); */
+/*	cERROR(1, "CIFSMaxBufSize %d 0x%x",CIFSMaxBufSize,CIFSMaxBufSize); */
 	cifs_req_cachep = kmem_cache_create("cifs_request",
 					    CIFSMaxBufSize +
 					    MAX_CIFS_HDR_SIZE, 0,
@@ -880,7 +879,7 @@ cifs_init_request_bufs(void)
 		cifs_min_rcv = 1;
 	else if (cifs_min_rcv > 64) {
 		cifs_min_rcv = 64;
-		cERROR(1, ("cifs_min_rcv set to maximum (64)"));
+		cERROR(1, "cifs_min_rcv set to maximum (64)");
 	}
 
 	cifs_req_poolp = mempool_create_slab_pool(cifs_min_rcv,
@@ -911,7 +910,7 @@ cifs_init_request_bufs(void)
 		cifs_min_small = 2;
 	else if (cifs_min_small > 256) {
 		cifs_min_small = 256;
-		cFYI(1, ("cifs_min_small set to maximum (256)"));
+		cFYI(1, "cifs_min_small set to maximum (256)");
 	}
 
 	cifs_sm_req_poolp = mempool_create_slab_pool(cifs_min_small,
@@ -1009,10 +1008,10 @@ init_cifs(void)
 
 	if (cifs_max_pending < 2) {
 		cifs_max_pending = 2;
-		cFYI(1, ("cifs_max_pending set to min of 2"));
+		cFYI(1, "cifs_max_pending set to min of 2");
 	} else if (cifs_max_pending > 256) {
 		cifs_max_pending = 256;
-		cFYI(1, ("cifs_max_pending set to max of 256"));
+		cFYI(1, "cifs_max_pending set to max of 256");
 	}
 
 	rc = cifs_init_inodecache();
@@ -1070,7 +1069,7 @@ init_cifs(void)
 static void __exit
 exit_cifs(void)
 {
-	cFYI(DBG2, ("exit_cifs"));
+	cFYI(DBG2, "exit_cifs");
 	cifs_proc_clean();
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	cifs_dfs_release_automount_timer();
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 39e47f46dea..32262e15be3 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -39,8 +39,20 @@ extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
 			unsigned int /* length */);
 extern unsigned int _GetXid(void);
 extern void _FreeXid(unsigned int);
-#define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
-#define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));}
+#define GetXid()						\
+({								\
+	int __xid = (int)_GetXid();				\
+	cFYI(1, "CIFS VFS: in %s as Xid: %d with uid: %d",	\
+	     __func__, __xid, current_fsuid());			\
+	__xid;							\
+})
+
+#define FreeXid(curr_xid)					\
+do {								\
+	_FreeXid(curr_xid);					\
+	cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d",	\
+	     __func__, curr_xid, (int)rc);			\
+} while (0)
 extern char *build_path_from_dentry(struct dentry *);
 extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5d3f29fef53..be23e426ffb 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -130,8 +130,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 		if (smb_command != SMB_COM_WRITE_ANDX &&
 		    smb_command != SMB_COM_OPEN_ANDX &&
 		    smb_command != SMB_COM_TREE_DISCONNECT) {
-			cFYI(1, ("can not send cmd %d while umounting",
-				smb_command));
+			cFYI(1, "can not send cmd %d while umounting",
+				smb_command);
 			return -ENODEV;
 		}
 	}
@@ -157,7 +157,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 		 * back on-line
 		 */
 		if (!tcon->retry || ses->status == CifsExiting) {
-			cFYI(1, ("gave up waiting on reconnect in smb_init"));
+			cFYI(1, "gave up waiting on reconnect in smb_init");
 			return -EHOSTDOWN;
 		}
 	}
@@ -184,7 +184,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 	mark_open_files_invalid(tcon);
 	rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
 	mutex_unlock(&ses->session_mutex);
-	cFYI(1, ("reconnect tcon rc = %d", rc));
+	cFYI(1, "reconnect tcon rc = %d", rc);
 
 	if (rc)
 		goto out;
@@ -374,7 +374,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	else /* if override flags set only sign/seal OR them with global auth */
 		secFlags = extended_security | ses->overrideSecFlg;
 
-	cFYI(1, ("secFlags 0x%x", secFlags));
+	cFYI(1, "secFlags 0x%x", secFlags);
 
 	pSMB->hdr.Mid = GetNextMid(server);
 	pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
@@ -382,14 +382,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if ((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5)
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_KRB5) {
-		cFYI(1, ("Kerberos only mechanism, enable extended security"));
+		cFYI(1, "Kerberos only mechanism, enable extended security");
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	}
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	else if ((secFlags & CIFSSEC_MUST_NTLMSSP) == CIFSSEC_MUST_NTLMSSP)
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	else if ((secFlags & CIFSSEC_AUTH_MASK) == CIFSSEC_MAY_NTLMSSP) {
-		cFYI(1, ("NTLMSSP only mechanism, enable extended security"));
+		cFYI(1, "NTLMSSP only mechanism, enable extended security");
 		pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC;
 	}
 #endif
@@ -409,7 +409,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		goto neg_err_exit;
 
 	dialect = le16_to_cpu(pSMBr->DialectIndex);
-	cFYI(1, ("Dialect: %d", dialect));
+	cFYI(1, "Dialect: %d", dialect);
 	/* Check wct = 1 error case */
 	if ((pSMBr->hdr.WordCount < 13) || (dialect == BAD_PROT)) {
 		/* core returns wct = 1, but we do not ask for core - otherwise
@@ -428,8 +428,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			(secFlags & CIFSSEC_MAY_PLNTXT))
 			server->secType = LANMAN;
 		else {
-			cERROR(1, ("mount failed weak security disabled"
-				   " in /proc/fs/cifs/SecurityFlags"));
+			cERROR(1, "mount failed weak security disabled"
+				   " in /proc/fs/cifs/SecurityFlags");
 			rc = -EOPNOTSUPP;
 			goto neg_err_exit;
 		}
@@ -462,9 +462,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			utc = CURRENT_TIME;
 			ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
 					    rsp->SrvTime.Time, 0);
-			cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d",
+			cFYI(1, "SrvTime %d sec since 1970 (utc: %d) diff: %d",
 				(int)ts.tv_sec, (int)utc.tv_sec,
-				(int)(utc.tv_sec - ts.tv_sec)));
+				(int)(utc.tv_sec - ts.tv_sec));
 			val = (int)(utc.tv_sec - ts.tv_sec);
 			seconds = abs(val);
 			result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
@@ -478,7 +478,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			server->timeAdj = (int)tmp;
 			server->timeAdj *= 60; /* also in seconds */
 		}
-		cFYI(1, ("server->timeAdj: %d seconds", server->timeAdj));
+		cFYI(1, "server->timeAdj: %d seconds", server->timeAdj);
 
 
 		/* BB get server time for time conversions and add
@@ -512,14 +512,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	/* else wct == 17 NTLM */
 	server->secMode = pSMBr->SecurityMode;
 	if ((server->secMode & SECMODE_USER) == 0)
-		cFYI(1, ("share mode security"));
+		cFYI(1, "share mode security");
 
 	if ((server->secMode & SECMODE_PW_ENCRYPT) == 0)
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0)
 #endif /* CIFS_WEAK_PW_HASH */
-			cERROR(1, ("Server requests plain text password"
-				  " but client support disabled"));
+			cERROR(1, "Server requests plain text password"
+				  " but client support disabled");
 
 	if ((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2)
 		server->secType = NTLMv2;
@@ -539,7 +539,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 #endif */
 	else {
 		rc = -EOPNOTSUPP;
-		cERROR(1, ("Invalid security type"));
+		cERROR(1, "Invalid security type");
 		goto neg_err_exit;
 	}
 	/* else ... any others ...? */
@@ -551,7 +551,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
 			(__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
 	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
-	cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf));
+	cFYI(DBG2, "Max buf = %d", ses->server->maxBuf);
 	GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
 	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
@@ -582,7 +582,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			if (memcmp(server->server_GUID,
 				   pSMBr->u.extended_response.
 				   GUID, 16) != 0) {
-				cFYI(1, ("server UID changed"));
+				cFYI(1, "server UID changed");
 				memcpy(server->server_GUID,
 					pSMBr->u.extended_response.GUID,
 					16);
@@ -614,22 +614,21 @@ signing_check:
 	if ((secFlags & CIFSSEC_MAY_SIGN) == 0) {
 		/* MUST_SIGN already includes the MAY_SIGN FLAG
 		   so if this is zero it means that signing is disabled */
-		cFYI(1, ("Signing disabled"));
+		cFYI(1, "Signing disabled");
 		if (server->secMode & SECMODE_SIGN_REQUIRED) {
-			cERROR(1, ("Server requires "
+			cERROR(1, "Server requires "
 				   "packet signing to be enabled in "
-				   "/proc/fs/cifs/SecurityFlags."));
+				   "/proc/fs/cifs/SecurityFlags.");
 			rc = -EOPNOTSUPP;
 		}
 		server->secMode &=
 			~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED);
 	} else if ((secFlags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
 		/* signing required */
-		cFYI(1, ("Must sign - secFlags 0x%x", secFlags));
+		cFYI(1, "Must sign - secFlags 0x%x", secFlags);
 		if ((server->secMode &
 			(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) {
-			cERROR(1,
-				("signing required but server lacks support"));
+			cERROR(1, "signing required but server lacks support");
 			rc = -EOPNOTSUPP;
 		} else
 			server->secMode |= SECMODE_SIGN_REQUIRED;
@@ -643,7 +642,7 @@ signing_check:
 neg_err_exit:
 	cifs_buf_release(pSMB);
 
-	cFYI(1, ("negprot rc %d", rc));
+	cFYI(1, "negprot rc %d", rc);
 	return rc;
 }
 
@@ -653,7 +652,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 	struct smb_hdr *smb_buffer;
 	int rc = 0;
 
-	cFYI(1, ("In tree disconnect"));
+	cFYI(1, "In tree disconnect");
 
 	/* BB: do we need to check this? These should never be NULL. */
 	if ((tcon->ses == NULL) || (tcon->ses->server == NULL))
@@ -675,7 +674,7 @@ CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon)
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, smb_buffer, 0);
 	if (rc)
-		cFYI(1, ("Tree disconnect failed %d", rc));
+		cFYI(1, "Tree disconnect failed %d", rc);
 
 	/* No need to return error on this operation if tid invalidated and
 	   closed on server already e.g. due to tcp session crashing */
@@ -691,7 +690,7 @@ CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses)
 	LOGOFF_ANDX_REQ *pSMB;
 	int rc = 0;
 
-	cFYI(1, ("In SMBLogoff for session disconnect"));
+	cFYI(1, "In SMBLogoff for session disconnect");
 
 	/*
 	 * BB: do we need to check validity of ses and server? They should
@@ -744,7 +743,7 @@ CIFSPOSIXDelFile(const int xid, struct cifsTconInfo *tcon, const char *fileName,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In POSIX delete"));
+	cFYI(1, "In POSIX delete");
 PsxDelete:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -796,7 +795,7 @@ PsxDelete:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("Posix delete returned %d", rc));
+		cFYI(1, "Posix delete returned %d", rc);
 	cifs_buf_release(pSMB);
 
 	cifs_stats_inc(&tcon->num_deletes);
@@ -843,7 +842,7 @@ DelFileRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_deletes);
 	if (rc)
-		cFYI(1, ("Error in RMFile = %d", rc));
+		cFYI(1, "Error in RMFile = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -862,7 +861,7 @@ CIFSSMBRmDir(const int xid, struct cifsTconInfo *tcon, const char *dirName,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, ("In CIFSSMBRmDir"));
+	cFYI(1, "In CIFSSMBRmDir");
 RmDirRetry:
 	rc = smb_init(SMB_COM_DELETE_DIRECTORY, 0, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -887,7 +886,7 @@ RmDirRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_rmdirs);
 	if (rc)
-		cFYI(1, ("Error in RMDir = %d", rc));
+		cFYI(1, "Error in RMDir = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -905,7 +904,7 @@ CIFSSMBMkDir(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, ("In CIFSSMBMkDir"));
+	cFYI(1, "In CIFSSMBMkDir");
 MkDirRetry:
 	rc = smb_init(SMB_COM_CREATE_DIRECTORY, 0, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -930,7 +929,7 @@ MkDirRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_mkdirs);
 	if (rc)
-		cFYI(1, ("Error in Mkdir = %d", rc));
+		cFYI(1, "Error in Mkdir = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -953,7 +952,7 @@ CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon, __u32 posix_flags,
 	OPEN_PSX_REQ *pdata;
 	OPEN_PSX_RSP *psx_rsp;
 
-	cFYI(1, ("In POSIX Create"));
+	cFYI(1, "In POSIX Create");
 PsxCreat:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -1007,11 +1006,11 @@ PsxCreat:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Posix create returned %d", rc));
+		cFYI(1, "Posix create returned %d", rc);
 		goto psx_create_err;
 	}
 
-	cFYI(1, ("copying inode info"));
+	cFYI(1, "copying inode info");
 	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 	if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
@@ -1033,11 +1032,11 @@ PsxCreat:
 	/* check to make sure response data is there */
 	if (psx_rsp->ReturnedLevel != cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC)) {
 		pRetData->Type = cpu_to_le32(-1); /* unknown */
-		cFYI(DBG2, ("unknown type"));
+		cFYI(DBG2, "unknown type");
 	} else {
 		if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
 					+ sizeof(FILE_UNIX_BASIC_INFO)) {
-			cERROR(1, ("Open response data too small"));
+			cERROR(1, "Open response data too small");
 			pRetData->Type = cpu_to_le32(-1);
 			goto psx_create_err;
 		}
@@ -1084,7 +1083,7 @@ static __u16 convert_disposition(int disposition)
 			ofun = SMBOPEN_OCREATE | SMBOPEN_OTRUNC;
 			break;
 		default:
-			cFYI(1, ("unknown disposition %d", disposition));
+			cFYI(1, "unknown disposition %d", disposition);
 			ofun =  SMBOPEN_OAPPEND; /* regular open */
 	}
 	return ofun;
@@ -1175,7 +1174,7 @@ OldOpenRetry:
 			(struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
 	cifs_stats_inc(&tcon->num_opens);
 	if (rc) {
-		cFYI(1, ("Error in Open = %d", rc));
+		cFYI(1, "Error in Open = %d", rc);
 	} else {
 	/* BB verify if wct == 15 */
 
@@ -1288,7 +1287,7 @@ openRetry:
 			(struct smb_hdr *)pSMBr, &bytes_returned, CIFS_LONG_OP);
 	cifs_stats_inc(&tcon->num_opens);
 	if (rc) {
-		cFYI(1, ("Error in Open = %d", rc));
+		cFYI(1, "Error in Open = %d", rc);
 	} else {
 		*pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */
 		*netfid = pSMBr->Fid;	/* cifs fid stays in le */
@@ -1326,7 +1325,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 	int resp_buf_type = 0;
 	struct kvec iov[1];
 
-	cFYI(1, ("Reading %d bytes on fid %d", count, netfid));
+	cFYI(1, "Reading %d bytes on fid %d", count, netfid);
 	if (tcon->ses->capabilities & CAP_LARGE_FILES)
 		wct = 12;
 	else {
@@ -1371,7 +1370,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 	cifs_stats_inc(&tcon->num_reads);
 	pSMBr = (READ_RSP *)iov[0].iov_base;
 	if (rc) {
-		cERROR(1, ("Send error in read = %d", rc));
+		cERROR(1, "Send error in read = %d", rc);
 	} else {
 		int data_length = le16_to_cpu(pSMBr->DataLengthHigh);
 		data_length = data_length << 16;
@@ -1381,15 +1380,15 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
 		/*check that DataLength would not go beyond end of SMB */
 		if ((data_length > CIFSMaxBufSize)
 				|| (data_length > count)) {
-			cFYI(1, ("bad length %d for count %d",
-				 data_length, count));
+			cFYI(1, "bad length %d for count %d",
+				 data_length, count);
 			rc = -EIO;
 			*nbytes = 0;
 		} else {
 			pReadData = (char *) (&pSMBr->hdr.Protocol) +
 					le16_to_cpu(pSMBr->DataOffset);
 /*			if (rc = copy_to_user(buf, pReadData, data_length)) {
-				cERROR(1,("Faulting on read rc = %d",rc));
+				cERROR(1, "Faulting on read rc = %d",rc);
 				rc = -EFAULT;
 			}*/ /* can not use copy_to_user when using page cache*/
 			if (*buf)
@@ -1433,7 +1432,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 
 	*nbytes = 0;
 
-	/* cFYI(1, ("write at %lld %d bytes", offset, count));*/
+	/* cFYI(1, "write at %lld %d bytes", offset, count);*/
 	if (tcon->ses == NULL)
 		return -ECONNABORTED;
 
@@ -1551,7 +1550,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 
 	*nbytes = 0;
 
-	cFYI(1, ("write2 at %lld %d bytes", (long long)offset, count));
+	cFYI(1, "write2 at %lld %d bytes", (long long)offset, count);
 
 	if (tcon->ses->capabilities & CAP_LARGE_FILES) {
 		wct = 14;
@@ -1606,7 +1605,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
 			  long_op);
 	cifs_stats_inc(&tcon->num_writes);
 	if (rc) {
-		cFYI(1, ("Send error Write2 = %d", rc));
+		cFYI(1, "Send error Write2 = %d", rc);
 	} else if (resp_buf_type == 0) {
 		/* presumably this can not happen, but best to be safe */
 		rc = -EIO;
@@ -1651,7 +1650,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 	int timeout = 0;
 	__u16 count;
 
-	cFYI(1, ("CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock));
+	cFYI(1, "CIFSSMBLock timeout %d numLock %d", (int)waitFlag, numLock);
 	rc = small_smb_init(SMB_COM_LOCKING_ANDX, 8, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -1699,7 +1698,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
 	}
 	cifs_stats_inc(&tcon->num_locks);
 	if (rc)
-		cFYI(1, ("Send error in Lock = %d", rc));
+		cFYI(1, "Send error in Lock = %d", rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 	since file handle passed in no longer valid */
@@ -1722,7 +1721,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 	__u16 params, param_offset, offset, byte_count, count;
 	struct kvec iov[1];
 
-	cFYI(1, ("Posix Lock"));
+	cFYI(1, "Posix Lock");
 
 	if (pLockData == NULL)
 		return -EINVAL;
@@ -1792,7 +1791,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
 	}
 
 	if (rc) {
-		cFYI(1, ("Send error in Posix Lock = %d", rc));
+		cFYI(1, "Send error in Posix Lock = %d", rc);
 	} else if (get_flag) {
 		/* lock structure can be returned on get */
 		__u16 data_offset;
@@ -1849,7 +1848,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 {
 	int rc = 0;
 	CLOSE_REQ *pSMB = NULL;
-	cFYI(1, ("In CIFSSMBClose"));
+	cFYI(1, "In CIFSSMBClose");
 
 /* do not retry on dead session on close */
 	rc = small_smb_init(SMB_COM_CLOSE, 3, tcon, (void **) &pSMB);
@@ -1866,7 +1865,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 	if (rc) {
 		if (rc != -EINTR) {
 			/* EINTR is expected when user ctl-c to kill app */
-			cERROR(1, ("Send error in Close = %d", rc));
+			cERROR(1, "Send error in Close = %d", rc);
 		}
 	}
 
@@ -1882,7 +1881,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 {
 	int rc = 0;
 	FLUSH_REQ *pSMB = NULL;
-	cFYI(1, ("In CIFSSMBFlush"));
+	cFYI(1, "In CIFSSMBFlush");
 
 	rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
 	if (rc)
@@ -1893,7 +1892,7 @@ CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	cifs_stats_inc(&tcon->num_flushes);
 	if (rc)
-		cERROR(1, ("Send error in Flush = %d", rc));
+		cERROR(1, "Send error in Flush = %d", rc);
 
 	return rc;
 }
@@ -1910,7 +1909,7 @@ CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
 	int name_len, name_len2;
 	__u16 count;
 
-	cFYI(1, ("In CIFSSMBRename"));
+	cFYI(1, "In CIFSSMBRename");
 renameRetry:
 	rc = smb_init(SMB_COM_RENAME, 1, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -1956,7 +1955,7 @@ renameRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_renames);
 	if (rc)
-		cFYI(1, ("Send error in rename = %d", rc));
+		cFYI(1, "Send error in rename = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -1980,7 +1979,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
 	int len_of_str;
 	__u16 params, param_offset, offset, count, byte_count;
 
-	cFYI(1, ("Rename to File by handle"));
+	cFYI(1, "Rename to File by handle");
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, pTcon, (void **) &pSMB,
 			(void **) &pSMBr);
 	if (rc)
@@ -2035,7 +2034,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&pTcon->num_t2renames);
 	if (rc)
-		cFYI(1, ("Send error in Rename (by file handle) = %d", rc));
+		cFYI(1, "Send error in Rename (by file handle) = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -2057,7 +2056,7 @@ CIFSSMBCopy(const int xid, struct cifsTconInfo *tcon, const char *fromName,
 	int name_len, name_len2;
 	__u16 count;
 
-	cFYI(1, ("In CIFSSMBCopy"));
+	cFYI(1, "In CIFSSMBCopy");
 copyRetry:
 	rc = smb_init(SMB_COM_COPY, 1, tcon, (void **) &pSMB,
 			(void **) &pSMBr);
@@ -2102,8 +2101,8 @@ copyRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in copy = %d with %d files copied",
-			rc, le16_to_cpu(pSMBr->CopyCount)));
+		cFYI(1, "Send error in copy = %d with %d files copied",
+			rc, le16_to_cpu(pSMBr->CopyCount));
 	}
 	cifs_buf_release(pSMB);
 
@@ -2127,7 +2126,7 @@ CIFSUnixCreateSymLink(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In Symlink Unix style"));
+	cFYI(1, "In Symlink Unix style");
 createSymLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2192,7 +2191,7 @@ createSymLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_symlinks);
 	if (rc)
-		cFYI(1, ("Send error in SetPathInfo create symlink = %d", rc));
+		cFYI(1, "Send error in SetPathInfo create symlink = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -2216,7 +2215,7 @@ CIFSUnixCreateHardLink(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In Create Hard link Unix style"));
+	cFYI(1, "In Create Hard link Unix style");
 createHardLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2278,7 +2277,7 @@ createHardLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_hardlinks);
 	if (rc)
-		cFYI(1, ("Send error in SetPathInfo (hard link) = %d", rc));
+		cFYI(1, "Send error in SetPathInfo (hard link) = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -2299,7 +2298,7 @@ CIFSCreateHardLink(const int xid, struct cifsTconInfo *tcon,
 	int name_len, name_len2;
 	__u16 count;
 
-	cFYI(1, ("In CIFSCreateHardLink"));
+	cFYI(1, "In CIFSCreateHardLink");
 winCreateHardLinkRetry:
 
 	rc = smb_init(SMB_COM_NT_RENAME, 4, tcon, (void **) &pSMB,
@@ -2350,7 +2349,7 @@ winCreateHardLinkRetry:
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_hardlinks);
 	if (rc)
-		cFYI(1, ("Send error in hard link (NT rename) = %d", rc));
+		cFYI(1, "Send error in hard link (NT rename) = %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -2373,7 +2372,7 @@ CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
 	__u16 params, byte_count;
 	char *data_start;
 
-	cFYI(1, ("In QPathSymLinkInfo (Unix) for path %s", searchName));
+	cFYI(1, "In QPathSymLinkInfo (Unix) for path %s", searchName);
 
 querySymLinkRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2420,7 +2419,7 @@ querySymLinkRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QuerySymLinkInfo = %d", rc));
+		cFYI(1, "Send error in QuerySymLinkInfo = %d", rc);
 	} else {
 		/* decode response */
 
@@ -2521,21 +2520,21 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
 
 	/* should we also check that parm and data areas do not overlap? */
 	if (*ppparm > end_of_smb) {
-		cFYI(1, ("parms start after end of smb"));
+		cFYI(1, "parms start after end of smb");
 		return -EINVAL;
 	} else if (parm_count + *ppparm > end_of_smb) {
-		cFYI(1, ("parm end after end of smb"));
+		cFYI(1, "parm end after end of smb");
 		return -EINVAL;
 	} else if (*ppdata > end_of_smb) {
-		cFYI(1, ("data starts after end of smb"));
+		cFYI(1, "data starts after end of smb");
 		return -EINVAL;
 	} else if (data_count + *ppdata > end_of_smb) {
-		cFYI(1, ("data %p + count %d (%p) ends after end of smb %p start %p",
+		cFYI(1, "data %p + count %d (%p) ends after end of smb %p start %p",
 			*ppdata, data_count, (data_count + *ppdata),
-			end_of_smb, pSMBr));
+			end_of_smb, pSMBr);
 		return -EINVAL;
 	} else if (parm_count + data_count > pSMBr->ByteCount) {
-		cFYI(1, ("parm count and data count larger than SMB"));
+		cFYI(1, "parm count and data count larger than SMB");
 		return -EINVAL;
 	}
 	*pdatalen = data_count;
@@ -2554,7 +2553,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 	struct smb_com_transaction_ioctl_req *pSMB;
 	struct smb_com_transaction_ioctl_rsp *pSMBr;
 
-	cFYI(1, ("In Windows reparse style QueryLink for path %s", searchName));
+	cFYI(1, "In Windows reparse style QueryLink for path %s", searchName);
 	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
 	if (rc)
@@ -2583,7 +2582,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QueryReparseLinkInfo = %d", rc));
+		cFYI(1, "Send error in QueryReparseLinkInfo = %d", rc);
 	} else {		/* decode response */
 		__u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
 		__u32 data_count = le32_to_cpu(pSMBr->DataCount);
@@ -2607,7 +2606,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 			if ((reparse_buf->LinkNamesBuf +
 				reparse_buf->TargetNameOffset +
 				reparse_buf->TargetNameLen) > end_of_smb) {
-				cFYI(1, ("reparse buf beyond SMB"));
+				cFYI(1, "reparse buf beyond SMB");
 				rc = -EIO;
 				goto qreparse_out;
 			}
@@ -2628,12 +2627,12 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
 			}
 		} else {
 			rc = -EIO;
-			cFYI(1, ("Invalid return data count on "
-				 "get reparse info ioctl"));
+			cFYI(1, "Invalid return data count on "
+				 "get reparse info ioctl");
 		}
 		symlinkinfo[buflen] = 0; /* just in case so the caller
 					does not go off the end of the buffer */
-		cFYI(1, ("readlink result - %s", symlinkinfo));
+		cFYI(1, "readlink result - %s", symlinkinfo);
 	}
 
 qreparse_out:
@@ -2656,7 +2655,7 @@ static void cifs_convert_ace(posix_acl_xattr_entry *ace,
 	ace->e_perm = cpu_to_le16(cifs_ace->cifs_e_perm);
 	ace->e_tag  = cpu_to_le16(cifs_ace->cifs_e_tag);
 	ace->e_id   = cpu_to_le32(le64_to_cpu(cifs_ace->cifs_uid));
-	/* cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id)); */
+	/* cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id); */
 
 	return;
 }
@@ -2682,8 +2681,8 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
 		size += sizeof(struct cifs_posix_ace) * count;
 		/* check if we would go beyond end of SMB */
 		if (size_of_data_area < size) {
-			cFYI(1, ("bad CIFS POSIX ACL size %d vs. %d",
-				size_of_data_area, size));
+			cFYI(1, "bad CIFS POSIX ACL size %d vs. %d",
+				size_of_data_area, size);
 			return -EINVAL;
 		}
 	} else if (acl_type & ACL_TYPE_DEFAULT) {
@@ -2730,7 +2729,7 @@ static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
 		cifs_ace->cifs_uid = cpu_to_le64(-1);
 	} else
 		cifs_ace->cifs_uid = cpu_to_le64(le32_to_cpu(local_ace->e_id));
-	/*cFYI(1,("perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id));*/
+	/*cFYI(1, "perm %d tag %d id %d",ace->e_perm,ace->e_tag,ace->e_id);*/
 	return rc;
 }
 
@@ -2748,12 +2747,12 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
 		return 0;
 
 	count = posix_acl_xattr_count((size_t)buflen);
-	cFYI(1, ("setting acl with %d entries from buf of length %d and "
+	cFYI(1, "setting acl with %d entries from buf of length %d and "
 		"version of %d",
-		count, buflen, le32_to_cpu(local_acl->a_version)));
+		count, buflen, le32_to_cpu(local_acl->a_version));
 	if (le32_to_cpu(local_acl->a_version) != 2) {
-		cFYI(1, ("unknown POSIX ACL version %d",
-		     le32_to_cpu(local_acl->a_version)));
+		cFYI(1, "unknown POSIX ACL version %d",
+		     le32_to_cpu(local_acl->a_version));
 		return 0;
 	}
 	cifs_acl->version = cpu_to_le16(1);
@@ -2762,7 +2761,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
 	else if (acl_type == ACL_TYPE_DEFAULT)
 		cifs_acl->default_entry_count = cpu_to_le16(count);
 	else {
-		cFYI(1, ("unknown ACL type %d", acl_type));
+		cFYI(1, "unknown ACL type %d", acl_type);
 		return 0;
 	}
 	for (i = 0; i < count; i++) {
@@ -2795,7 +2794,7 @@ CIFSSMBGetPosixACL(const int xid, struct cifsTconInfo *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In GetPosixACL (Unix) for path %s", searchName));
+	cFYI(1, "In GetPosixACL (Unix) for path %s", searchName);
 
 queryAclRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -2847,7 +2846,7 @@ queryAclRetry:
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	cifs_stats_inc(&tcon->num_acl_get);
 	if (rc) {
-		cFYI(1, ("Send error in Query POSIX ACL = %d", rc));
+		cFYI(1, "Send error in Query POSIX ACL = %d", rc);
 	} else {
 		/* decode response */
 
@@ -2884,7 +2883,7 @@ CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	__u16 params, byte_count, data_count, param_offset, offset;
 
-	cFYI(1, ("In SetPosixACL (Unix) for path %s", fileName));
+	cFYI(1, "In SetPosixACL (Unix) for path %s", fileName);
 setAclRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -2939,7 +2938,7 @@ setAclRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("Set POSIX ACL returned %d", rc));
+		cFYI(1, "Set POSIX ACL returned %d", rc);
 
 setACLerrorExit:
 	cifs_buf_release(pSMB);
@@ -2959,7 +2958,7 @@ CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In GetExtAttr"));
+	cFYI(1, "In GetExtAttr");
 	if (tcon == NULL)
 		return -ENODEV;
 
@@ -2998,7 +2997,7 @@ GetExtAttrRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("error %d in GetExtAttr", rc));
+		cFYI(1, "error %d in GetExtAttr", rc);
 	} else {
 		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3013,7 +3012,7 @@ GetExtAttrRetry:
 			struct file_chattr_info *pfinfo;
 			/* BB Do we need a cast or hash here ? */
 			if (count != 16) {
-				cFYI(1, ("Illegal size ret in GetExtAttr"));
+				cFYI(1, "Illegal size ret in GetExtAttr");
 				rc = -EIO;
 				goto GetExtAttrOut;
 			}
@@ -3043,7 +3042,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 	QUERY_SEC_DESC_REQ *pSMB;
 	struct kvec iov[1];
 
-	cFYI(1, ("GetCifsACL"));
+	cFYI(1, "GetCifsACL");
 
 	*pbuflen = 0;
 	*acl_inf = NULL;
@@ -3068,7 +3067,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 			 CIFS_STD_OP);
 	cifs_stats_inc(&tcon->num_acl_get);
 	if (rc) {
-		cFYI(1, ("Send error in QuerySecDesc = %d", rc));
+		cFYI(1, "Send error in QuerySecDesc = %d", rc);
 	} else {                /* decode response */
 		__le32 *parm;
 		__u32 parm_len;
@@ -3083,7 +3082,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 			goto qsec_out;
 		pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
 
-		cFYI(1, ("smb %p parm %p data %p", pSMBr, parm, *acl_inf));
+		cFYI(1, "smb %p parm %p data %p", pSMBr, parm, *acl_inf);
 
 		if (le32_to_cpu(pSMBr->ParameterCount) != 4) {
 			rc = -EIO;      /* bad smb */
@@ -3095,8 +3094,8 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 
 		acl_len = le32_to_cpu(*parm);
 		if (acl_len != *pbuflen) {
-			cERROR(1, ("acl length %d does not match %d",
-				   acl_len, *pbuflen));
+			cERROR(1, "acl length %d does not match %d",
+				   acl_len, *pbuflen);
 			if (*pbuflen > acl_len)
 				*pbuflen = acl_len;
 		}
@@ -3105,7 +3104,7 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
 		   header followed by the smallest SID */
 		if ((*pbuflen < sizeof(struct cifs_ntsd) + 8) ||
 		    (*pbuflen >= 64 * 1024)) {
-			cERROR(1, ("bad acl length %d", *pbuflen));
+			cERROR(1, "bad acl length %d", *pbuflen);
 			rc = -EINVAL;
 			*pbuflen = 0;
 		} else {
@@ -3179,9 +3178,9 @@ setCifsAclRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 
-	cFYI(1, ("SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc));
+	cFYI(1, "SetCIFSACL bytes_returned: %d, rc: %d", bytes_returned, rc);
 	if (rc)
-		cFYI(1, ("Set CIFS ACL returned %d", rc));
+		cFYI(1, "Set CIFS ACL returned %d", rc);
 	cifs_buf_release(pSMB);
 
 	if (rc == -EAGAIN)
@@ -3205,7 +3204,7 @@ int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, ("In SMBQPath path %s", searchName));
+	cFYI(1, "In SMBQPath path %s", searchName);
 QInfRetry:
 	rc = smb_init(SMB_COM_QUERY_INFORMATION, 0, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -3231,7 +3230,7 @@ QInfRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QueryInfo = %d", rc));
+		cFYI(1, "Send error in QueryInfo = %d", rc);
 	} else if (pFinfo) {
 		struct timespec ts;
 		__u32 time = le32_to_cpu(pSMBr->last_write_time);
@@ -3343,7 +3342,7 @@ CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-/* cFYI(1, ("In QPathInfo path %s", searchName)); */
+/* cFYI(1, "In QPathInfo path %s", searchName); */
 QPathInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -3393,7 +3392,7 @@ QPathInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QPathInfo = %d", rc));
+		cFYI(1, "Send error in QPathInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -3512,7 +3511,7 @@ CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QPathInfo (Unix) the path %s", searchName));
+	cFYI(1, "In QPathInfo (Unix) the path %s", searchName);
 UnixQPathInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -3559,14 +3558,14 @@ UnixQPathInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QPathInfo = %d", rc));
+		cFYI(1, "Send error in QPathInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
-			cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
+			cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
 				   "Unix Extensions can be disabled on mount "
-				   "by specifying the nosfu mount option."));
+				   "by specifying the nosfu mount option.");
 			rc = -EIO;	/* bad smb */
 		} else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3600,7 +3599,7 @@ CIFSFindFirst(const int xid, struct cifsTconInfo *tcon,
 	int name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In FindFirst for %s", searchName));
+	cFYI(1, "In FindFirst for %s", searchName);
 
 findFirstRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -3677,7 +3676,7 @@ findFirstRetry:
 	if (rc) {/* BB add logic to retry regular search if Unix search
 			rejected unexpectedly by server */
 		/* BB Add code to handle unsupported level rc */
-		cFYI(1, ("Error in FindFirst = %d", rc));
+		cFYI(1, "Error in FindFirst = %d", rc);
 
 		cifs_buf_release(pSMB);
 
@@ -3716,7 +3715,7 @@ findFirstRetry:
 			lnoff = le16_to_cpu(parms->LastNameOffset);
 			if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
 			      lnoff) {
-				cERROR(1, ("ignoring corrupt resume name"));
+				cERROR(1, "ignoring corrupt resume name");
 				psrch_inf->last_entry = NULL;
 				return rc;
 			}
@@ -3744,7 +3743,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned, name_len;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In FindNext"));
+	cFYI(1, "In FindNext");
 
 	if (psrch_inf->endOfSearch)
 		return -ENOENT;
@@ -3808,7 +3807,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 			cifs_buf_release(pSMB);
 			rc = 0; /* search probably was closed at end of search*/
 		} else
-			cFYI(1, ("FindNext returned = %d", rc));
+			cFYI(1, "FindNext returned = %d", rc);
 	} else {                /* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -3844,15 +3843,15 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
 			lnoff = le16_to_cpu(parms->LastNameOffset);
 			if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
 			      lnoff) {
-				cERROR(1, ("ignoring corrupt resume name"));
+				cERROR(1, "ignoring corrupt resume name");
 				psrch_inf->last_entry = NULL;
 				return rc;
 			} else
 				psrch_inf->last_entry =
 					psrch_inf->srch_entries_start + lnoff;
 
-/*  cFYI(1,("fnxt2 entries in buf %d index_of_last %d",
-	    psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */
+/*  cFYI(1, "fnxt2 entries in buf %d index_of_last %d",
+	    psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry); */
 
 			/* BB fixme add unlock here */
 		}
@@ -3877,7 +3876,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	FINDCLOSE_REQ *pSMB = NULL;
 
-	cFYI(1, ("In CIFSSMBFindClose"));
+	cFYI(1, "In CIFSSMBFindClose");
 	rc = small_smb_init(SMB_COM_FIND_CLOSE2, 1, tcon, (void **)&pSMB);
 
 	/* no sense returning error if session restarted
@@ -3891,7 +3890,7 @@ CIFSFindClose(const int xid, struct cifsTconInfo *tcon,
 	pSMB->ByteCount = 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cERROR(1, ("Send error in FindClose = %d", rc));
+		cERROR(1, "Send error in FindClose = %d", rc);
 
 	cifs_stats_inc(&tcon->num_fclose);
 
@@ -3914,7 +3913,7 @@ CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
 	int name_len, bytes_returned;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In GetSrvInodeNum for %s", searchName));
+	cFYI(1, "In GetSrvInodeNum for %s", searchName);
 	if (tcon == NULL)
 		return -ENODEV;
 
@@ -3964,7 +3963,7 @@ GetInodeNumberRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("error %d in QueryInternalInfo", rc));
+		cFYI(1, "error %d in QueryInternalInfo", rc);
 	} else {
 		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -3979,7 +3978,7 @@ GetInodeNumberRetry:
 			struct file_internal_info *pfinfo;
 			/* BB Do we need a cast or hash here ? */
 			if (count < 8) {
-				cFYI(1, ("Illegal size ret in QryIntrnlInf"));
+				cFYI(1, "Illegal size ret in QryIntrnlInf");
 				rc = -EIO;
 				goto GetInodeNumOut;
 			}
@@ -4020,16 +4019,16 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 	*num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
 
 	if (*num_of_nodes < 1) {
-		cERROR(1, ("num_referrals: must be at least > 0,"
-			"but we get num_referrals = %d\n", *num_of_nodes));
+		cERROR(1, "num_referrals: must be at least > 0,"
+			"but we get num_referrals = %d\n", *num_of_nodes);
 		rc = -EINVAL;
 		goto parse_DFS_referrals_exit;
 	}
 
 	ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals);
 	if (ref->VersionNumber != cpu_to_le16(3)) {
-		cERROR(1, ("Referrals of V%d version are not supported,"
-			"should be V3", le16_to_cpu(ref->VersionNumber)));
+		cERROR(1, "Referrals of V%d version are not supported,"
+			"should be V3", le16_to_cpu(ref->VersionNumber));
 		rc = -EINVAL;
 		goto parse_DFS_referrals_exit;
 	}
@@ -4038,14 +4037,14 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 	data_end = (char *)(&(pSMBr->PathConsumed)) +
 				le16_to_cpu(pSMBr->t2.DataCount);
 
-	cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n",
+	cFYI(1, "num_referrals: %d dfs flags: 0x%x ... \n",
 			*num_of_nodes,
-			le32_to_cpu(pSMBr->DFSFlags)));
+			le32_to_cpu(pSMBr->DFSFlags));
 
 	*target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
 			*num_of_nodes, GFP_KERNEL);
 	if (*target_nodes == NULL) {
-		cERROR(1, ("Failed to allocate buffer for target_nodes\n"));
+		cERROR(1, "Failed to allocate buffer for target_nodes\n");
 		rc = -ENOMEM;
 		goto parse_DFS_referrals_exit;
 	}
@@ -4121,7 +4120,7 @@ CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
 	*num_of_nodes = 0;
 	*target_nodes = NULL;
 
-	cFYI(1, ("In GetDFSRefer the path %s", searchName));
+	cFYI(1, "In GetDFSRefer the path %s", searchName);
 	if (ses == NULL)
 		return -ENODEV;
 getDFSRetry:
@@ -4188,7 +4187,7 @@ getDFSRetry:
 	rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in GetDFSRefer = %d", rc));
+		cFYI(1, "Send error in GetDFSRefer = %d", rc);
 		goto GetDFSRefExit;
 	}
 	rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4199,9 +4198,9 @@ getDFSRetry:
 		goto GetDFSRefExit;
 	}
 
-	cFYI(1, ("Decoding GetDFSRefer response BCC: %d  Offset %d",
+	cFYI(1, "Decoding GetDFSRefer response BCC: %d  Offset %d",
 				pSMBr->ByteCount,
-				le16_to_cpu(pSMBr->t2.DataOffset)));
+				le16_to_cpu(pSMBr->t2.DataOffset));
 
 	/* parse returned result into more usable form */
 	rc = parse_DFS_referrals(pSMBr, num_of_nodes,
@@ -4229,7 +4228,7 @@ SMBOldQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("OldQFSInfo"));
+	cFYI(1, "OldQFSInfo");
 oldQFSInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		(void **) &pSMBr);
@@ -4262,7 +4261,7 @@ oldQFSInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 		(struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSInfo = %d", rc));
+		cFYI(1, "Send error in QFSInfo = %d", rc);
 	} else {                /* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4270,8 +4269,8 @@ oldQFSInfoRetry:
 			rc = -EIO;      /* bad smb */
 		else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
-			cFYI(1, ("qfsinf resp BCC: %d  Offset %d",
-				 pSMBr->ByteCount, data_offset));
+			cFYI(1, "qfsinf resp BCC: %d  Offset %d",
+				 pSMBr->ByteCount, data_offset);
 
 			response_data = (FILE_SYSTEM_ALLOC_INFO *)
 				(((char *) &pSMBr->hdr.Protocol) + data_offset);
@@ -4283,11 +4282,10 @@ oldQFSInfoRetry:
 			       le32_to_cpu(response_data->TotalAllocationUnits);
 			FSData->f_bfree = FSData->f_bavail =
 				le32_to_cpu(response_data->FreeAllocationUnits);
-			cFYI(1,
-			     ("Blocks: %lld  Free: %lld Block size %ld",
-			      (unsigned long long)FSData->f_blocks,
-			      (unsigned long long)FSData->f_bfree,
-			      FSData->f_bsize));
+			cFYI(1, "Blocks: %lld  Free: %lld Block size %ld",
+			     (unsigned long long)FSData->f_blocks,
+			     (unsigned long long)FSData->f_bfree,
+			     FSData->f_bsize);
 		}
 	}
 	cifs_buf_release(pSMB);
@@ -4309,7 +4307,7 @@ CIFSSMBQFSInfo(const int xid, struct cifsTconInfo *tcon, struct kstatfs *FSData)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSInfo"));
+	cFYI(1, "In QFSInfo");
 QFSInfoRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4342,7 +4340,7 @@ QFSInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSInfo = %d", rc));
+		cFYI(1, "Send error in QFSInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4363,11 +4361,10 @@ QFSInfoRetry:
 			    le64_to_cpu(response_data->TotalAllocationUnits);
 			FSData->f_bfree = FSData->f_bavail =
 			    le64_to_cpu(response_data->FreeAllocationUnits);
-			cFYI(1,
-			     ("Blocks: %lld  Free: %lld Block size %ld",
-			      (unsigned long long)FSData->f_blocks,
-			      (unsigned long long)FSData->f_bfree,
-			      FSData->f_bsize));
+			cFYI(1, "Blocks: %lld  Free: %lld Block size %ld",
+			     (unsigned long long)FSData->f_blocks,
+			     (unsigned long long)FSData->f_bfree,
+			     FSData->f_bsize);
 		}
 	}
 	cifs_buf_release(pSMB);
@@ -4389,7 +4386,7 @@ CIFSSMBQFSAttributeInfo(const int xid, struct cifsTconInfo *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSAttributeInfo"));
+	cFYI(1, "In QFSAttributeInfo");
 QFSAttributeRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4423,7 +4420,7 @@ QFSAttributeRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, ("Send error in QFSAttributeInfo = %d", rc));
+		cERROR(1, "Send error in QFSAttributeInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4459,7 +4456,7 @@ CIFSSMBQFSDeviceInfo(const int xid, struct cifsTconInfo *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSDeviceInfo"));
+	cFYI(1, "In QFSDeviceInfo");
 QFSDeviceRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4494,7 +4491,7 @@ QFSDeviceRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSDeviceInfo = %d", rc));
+		cFYI(1, "Send error in QFSDeviceInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4529,7 +4526,7 @@ CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon)
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSUnixInfo"));
+	cFYI(1, "In QFSUnixInfo");
 QFSUnixRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4563,7 +4560,7 @@ QFSUnixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, ("Send error in QFSUnixInfo = %d", rc));
+		cERROR(1, "Send error in QFSUnixInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4598,7 +4595,7 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap)
 	int bytes_returned = 0;
 	__u16 params, param_offset, offset, byte_count;
 
-	cFYI(1, ("In SETFSUnixInfo"));
+	cFYI(1, "In SETFSUnixInfo");
 SETFSUnixRetry:
 	/* BB switch to small buf init to save memory */
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -4646,7 +4643,7 @@ SETFSUnixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cERROR(1, ("Send error in SETFSUnixInfo = %d", rc));
+		cERROR(1, "Send error in SETFSUnixInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 		if (rc)
@@ -4674,7 +4671,7 @@ CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
 	int bytes_returned = 0;
 	__u16 params, byte_count;
 
-	cFYI(1, ("In QFSPosixInfo"));
+	cFYI(1, "In QFSPosixInfo");
 QFSPosixRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4708,7 +4705,7 @@ QFSPosixRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QFSUnixInfo = %d", rc));
+		cFYI(1, "Send error in QFSUnixInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4768,7 +4765,7 @@ CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon, const char *fileName,
 	int bytes_returned = 0;
 	__u16 params, byte_count, data_count, param_offset, offset;
 
-	cFYI(1, ("In SetEOF"));
+	cFYI(1, "In SetEOF");
 SetEOFRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -4834,7 +4831,7 @@ SetEOFRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (file size) returned %d", rc));
+		cFYI(1, "SetPathInfo (file size) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -4854,8 +4851,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("SetFileSize (via SetFileInfo) %lld",
-			(long long)size));
+	cFYI(1, "SetFileSize (via SetFileInfo) %lld",
+			(long long)size);
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -4914,9 +4911,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc) {
-		cFYI(1,
-		     ("Send error in SetFileInfo (SetFileSize) = %d",
-		      rc));
+		cFYI(1, "Send error in SetFileInfo (SetFileSize) = %d", rc);
 	}
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
@@ -4940,7 +4935,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("Set Times (via SetFileInfo)"));
+	cFYI(1, "Set Times (via SetFileInfo)");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -4985,7 +4980,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc));
+		cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 		since file handle passed in no longer valid */
@@ -5002,7 +4997,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("Set File Disposition (via SetFileInfo)"));
+	cFYI(1, "Set File Disposition (via SetFileInfo)");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -5044,7 +5039,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
 	*data_offset = delete_file ? 1 : 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cFYI(1, ("Send error in SetFileDisposition = %d", rc));
+		cFYI(1, "Send error in SetFileDisposition = %d", rc);
 
 	return rc;
 }
@@ -5062,7 +5057,7 @@ CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
 	char *data_offset;
 	__u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("In SetTimes"));
+	cFYI(1, "In SetTimes");
 
 SetTimesRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
@@ -5118,7 +5113,7 @@ SetTimesRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (times) returned %d", rc));
+		cFYI(1, "SetPathInfo (times) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -5143,7 +5138,7 @@ CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, char *fileName,
 	int bytes_returned;
 	int name_len;
 
-	cFYI(1, ("In SetAttrLegacy"));
+	cFYI(1, "In SetAttrLegacy");
 
 SetAttrLgcyRetry:
 	rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB,
@@ -5169,7 +5164,7 @@ SetAttrLgcyRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("Error in LegacySetAttr = %d", rc));
+		cFYI(1, "Error in LegacySetAttr = %d", rc);
 
 	cifs_buf_release(pSMB);
 
@@ -5231,7 +5226,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 	int rc = 0;
 	u16 params, param_offset, offset, byte_count, count;
 
-	cFYI(1, ("Set Unix Info (via SetFileInfo)"));
+	cFYI(1, "Set Unix Info (via SetFileInfo)");
 	rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
 
 	if (rc)
@@ -5276,7 +5271,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
 	if (rc)
-		cFYI(1, ("Send error in Set Time (SetFileInfo) = %d", rc));
+		cFYI(1, "Send error in Set Time (SetFileInfo) = %d", rc);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 		since file handle passed in no longer valid */
@@ -5297,7 +5292,7 @@ CIFSSMBUnixSetPathInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
 	FILE_UNIX_BASIC_INFO *data_offset;
 	__u16 params, param_offset, offset, count, byte_count;
 
-	cFYI(1, ("In SetUID/GID/Mode"));
+	cFYI(1, "In SetUID/GID/Mode");
 setPermsRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5353,7 +5348,7 @@ setPermsRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (perms) returned %d", rc));
+		cFYI(1, "SetPathInfo (perms) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 	if (rc == -EAGAIN)
@@ -5372,7 +5367,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 	struct dir_notify_req *dnotify_req;
 	int bytes_returned;
 
-	cFYI(1, ("In CIFSSMBNotify for file handle %d", (int)netfid));
+	cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
 	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
 	if (rc)
@@ -5406,7 +5401,7 @@ int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
 			 (struct smb_hdr *)pSMBr, &bytes_returned,
 			 CIFS_ASYNC_OP);
 	if (rc) {
-		cFYI(1, ("Error in Notify = %d", rc));
+		cFYI(1, "Error in Notify = %d", rc);
 	} else {
 		/* Add file to outstanding requests */
 		/* BB change to kmem cache alloc */
@@ -5462,7 +5457,7 @@ CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
 	char *end_of_smb;
 	__u16 params, byte_count, data_offset;
 
-	cFYI(1, ("In Query All EAs path %s", searchName));
+	cFYI(1, "In Query All EAs path %s", searchName);
 QAllEAsRetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5509,7 +5504,7 @@ QAllEAsRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QueryAllEAs = %d", rc));
+		cFYI(1, "Send error in QueryAllEAs = %d", rc);
 		goto QAllEAsOut;
 	}
 
@@ -5537,16 +5532,16 @@ QAllEAsRetry:
 				(((char *) &pSMBr->hdr.Protocol) + data_offset);
 
 	list_len = le32_to_cpu(ea_response_data->list_len);
-	cFYI(1, ("ea length %d", list_len));
+	cFYI(1, "ea length %d", list_len);
 	if (list_len <= 8) {
-		cFYI(1, ("empty EA list returned from server"));
+		cFYI(1, "empty EA list returned from server");
 		goto QAllEAsOut;
 	}
 
 	/* make sure list_len doesn't go past end of SMB */
 	end_of_smb = (char *)pByteArea(&pSMBr->hdr) + BCC(&pSMBr->hdr);
 	if ((char *)ea_response_data + list_len > end_of_smb) {
-		cFYI(1, ("EA list appears to go beyond SMB"));
+		cFYI(1, "EA list appears to go beyond SMB");
 		rc = -EIO;
 		goto QAllEAsOut;
 	}
@@ -5563,7 +5558,7 @@ QAllEAsRetry:
 		temp_ptr += 4;
 		/* make sure we can read name_len and value_len */
 		if (list_len < 0) {
-			cFYI(1, ("EA entry goes beyond length of list"));
+			cFYI(1, "EA entry goes beyond length of list");
 			rc = -EIO;
 			goto QAllEAsOut;
 		}
@@ -5572,7 +5567,7 @@ QAllEAsRetry:
 		value_len = le16_to_cpu(temp_fea->value_len);
 		list_len -= name_len + 1 + value_len;
 		if (list_len < 0) {
-			cFYI(1, ("EA entry goes beyond length of list"));
+			cFYI(1, "EA entry goes beyond length of list");
 			rc = -EIO;
 			goto QAllEAsOut;
 		}
@@ -5639,7 +5634,7 @@ CIFSSMBSetEA(const int xid, struct cifsTconInfo *tcon, const char *fileName,
 	int bytes_returned = 0;
 	__u16 params, param_offset, byte_count, offset, count;
 
-	cFYI(1, ("In SetEA"));
+	cFYI(1, "In SetEA");
 SetEARetry:
 	rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
@@ -5721,7 +5716,7 @@ SetEARetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc)
-		cFYI(1, ("SetPathInfo (EA) returned %d", rc));
+		cFYI(1, "SetPathInfo (EA) returned %d", rc);
 
 	cifs_buf_release(pSMB);
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d9566bf8f91..58a2109e7b3 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -135,7 +135,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	spin_unlock(&GlobalMid_Lock);
 	server->maxBuf = 0;
 
-	cFYI(1, ("Reconnecting tcp session"));
+	cFYI(1, "Reconnecting tcp session");
 
 	/* before reconnecting the tcp session, mark the smb session (uid)
 		and the tid bad so they are not used until reconnected */
@@ -153,12 +153,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
 	/* do not want to be sending data on a socket we are freeing */
 	mutex_lock(&server->srv_mutex);
 	if (server->ssocket) {
-		cFYI(1, ("State: 0x%x Flags: 0x%lx", server->ssocket->state,
-			server->ssocket->flags));
+		cFYI(1, "State: 0x%x Flags: 0x%lx", server->ssocket->state,
+			server->ssocket->flags);
 		kernel_sock_shutdown(server->ssocket, SHUT_WR);
-		cFYI(1, ("Post shutdown state: 0x%x Flags: 0x%lx",
+		cFYI(1, "Post shutdown state: 0x%x Flags: 0x%lx",
 			server->ssocket->state,
-			server->ssocket->flags));
+			server->ssocket->flags);
 		sock_release(server->ssocket);
 		server->ssocket = NULL;
 	}
@@ -187,7 +187,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		else
 			rc = ipv4_connect(server);
 		if (rc) {
-			cFYI(1, ("reconnect error %d", rc));
+			cFYI(1, "reconnect error %d", rc);
 			msleep(3000);
 		} else {
 			atomic_inc(&tcpSesReconnectCount);
@@ -223,7 +223,7 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
 	/* check for plausible wct, bcc and t2 data and parm sizes */
 	/* check for parm and data offset going beyond end of smb */
 	if (pSMB->WordCount != 10) { /* coalesce_t2 depends on this */
-		cFYI(1, ("invalid transact2 word count"));
+		cFYI(1, "invalid transact2 word count");
 		return -EINVAL;
 	}
 
@@ -237,15 +237,15 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
 	if (remaining == 0)
 		return 0;
 	else if (remaining < 0) {
-		cFYI(1, ("total data %d smaller than data in frame %d",
-			total_data_size, data_in_this_rsp));
+		cFYI(1, "total data %d smaller than data in frame %d",
+			total_data_size, data_in_this_rsp);
 		return -EINVAL;
 	} else {
-		cFYI(1, ("missing %d bytes from transact2, check next response",
-			remaining));
+		cFYI(1, "missing %d bytes from transact2, check next response",
+			remaining);
 		if (total_data_size > maxBufSize) {
-			cERROR(1, ("TotalDataSize %d is over maximum buffer %d",
-				total_data_size, maxBufSize));
+			cERROR(1, "TotalDataSize %d is over maximum buffer %d",
+				total_data_size, maxBufSize);
 			return -EINVAL;
 		}
 		return remaining;
@@ -267,7 +267,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 	total_data_size = le16_to_cpu(pSMBt->t2_rsp.TotalDataCount);
 
 	if (total_data_size != le16_to_cpu(pSMB2->t2_rsp.TotalDataCount)) {
-		cFYI(1, ("total data size of primary and secondary t2 differ"));
+		cFYI(1, "total data size of primary and secondary t2 differ");
 	}
 
 	total_in_buf = le16_to_cpu(pSMBt->t2_rsp.DataCount);
@@ -282,7 +282,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 
 	total_in_buf2 = le16_to_cpu(pSMB2->t2_rsp.DataCount);
 	if (remaining < total_in_buf2) {
-		cFYI(1, ("transact2 2nd response contains too much data"));
+		cFYI(1, "transact2 2nd response contains too much data");
 	}
 
 	/* find end of first SMB data area */
@@ -311,7 +311,7 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 	pTargetSMB->smb_buf_length = byte_count;
 
 	if (remaining == total_in_buf2) {
-		cFYI(1, ("found the last secondary response"));
+		cFYI(1, "found the last secondary response");
 		return 0; /* we are done */
 	} else /* more responses to go */
 		return 1;
@@ -339,7 +339,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 	int reconnect;
 
 	current->flags |= PF_MEMALLOC;
-	cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current)));
+	cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
 
 	length = atomic_inc_return(&tcpSesAllocCount);
 	if (length > 1)
@@ -353,7 +353,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 		if (bigbuf == NULL) {
 			bigbuf = cifs_buf_get();
 			if (!bigbuf) {
-				cERROR(1, ("No memory for large SMB response"));
+				cERROR(1, "No memory for large SMB response");
 				msleep(3000);
 				/* retry will check if exiting */
 				continue;
@@ -366,7 +366,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
 		if (smallbuf == NULL) {
 			smallbuf = cifs_small_buf_get();
 			if (!smallbuf) {
-				cERROR(1, ("No memory for SMB response"));
+				cERROR(1, "No memory for SMB response");
 				msleep(1000);
 				/* retry will check if exiting */
 				continue;
@@ -391,9 +391,9 @@ incomplete_rcv:
 		if (server->tcpStatus == CifsExiting) {
 			break;
 		} else if (server->tcpStatus == CifsNeedReconnect) {
-			cFYI(1, ("Reconnect after server stopped responding"));
+			cFYI(1, "Reconnect after server stopped responding");
 			cifs_reconnect(server);
-			cFYI(1, ("call to reconnect done"));
+			cFYI(1, "call to reconnect done");
 			csocket = server->ssocket;
 			continue;
 		} else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) {
@@ -411,7 +411,7 @@ incomplete_rcv:
 				continue;
 		} else if (length <= 0) {
 			if (server->tcpStatus == CifsNew) {
-				cFYI(1, ("tcp session abend after SMBnegprot"));
+				cFYI(1, "tcp session abend after SMBnegprot");
 				/* some servers kill the TCP session rather than
 				   returning an SMB negprot error, in which
 				   case reconnecting here is not going to help,
@@ -419,18 +419,18 @@ incomplete_rcv:
 				break;
 			}
 			if (!try_to_freeze() && (length == -EINTR)) {
-				cFYI(1, ("cifsd thread killed"));
+				cFYI(1, "cifsd thread killed");
 				break;
 			}
-			cFYI(1, ("Reconnect after unexpected peek error %d",
-				length));
+			cFYI(1, "Reconnect after unexpected peek error %d",
+				length);
 			cifs_reconnect(server);
 			csocket = server->ssocket;
 			wake_up(&server->response_q);
 			continue;
 		} else if (length < pdu_length) {
-			cFYI(1, ("requested %d bytes but only got %d bytes",
-				  pdu_length, length));
+			cFYI(1, "requested %d bytes but only got %d bytes",
+				  pdu_length, length);
 			pdu_length -= length;
 			msleep(1);
 			goto incomplete_rcv;
@@ -450,18 +450,18 @@ incomplete_rcv:
 		pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length);
 		smb_buffer->smb_buf_length = pdu_length;
 
-		cFYI(1, ("rfc1002 length 0x%x", pdu_length+4));
+		cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
 
 		if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
 			continue;
 		} else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
-			cFYI(1, ("Good RFC 1002 session rsp"));
+			cFYI(1, "Good RFC 1002 session rsp");
 			continue;
 		} else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
 			/* we get this from Windows 98 instead of
 			   an error on SMB negprot response */
-			cFYI(1, ("Negative RFC1002 Session Response Error 0x%x)",
-				pdu_length));
+			cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
+				pdu_length);
 			if (server->tcpStatus == CifsNew) {
 				/* if nack on negprot (rather than
 				ret of smb negprot error) reconnecting
@@ -484,7 +484,7 @@ incomplete_rcv:
 				continue;
 			}
 		} else if (temp != (char) 0) {
-			cERROR(1, ("Unknown RFC 1002 frame"));
+			cERROR(1, "Unknown RFC 1002 frame");
 			cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
 				      length);
 			cifs_reconnect(server);
@@ -495,8 +495,8 @@ incomplete_rcv:
 		/* else we have an SMB response */
 		if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
 			    (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
-			cERROR(1, ("Invalid size SMB length %d pdu_length %d",
-					length, pdu_length+4));
+			cERROR(1, "Invalid size SMB length %d pdu_length %d",
+					length, pdu_length+4);
 			cifs_reconnect(server);
 			csocket = server->ssocket;
 			wake_up(&server->response_q);
@@ -539,8 +539,8 @@ incomplete_rcv:
 				length = 0;
 				continue;
 			} else if (length <= 0) {
-				cERROR(1, ("Received no data, expecting %d",
-					      pdu_length - total_read));
+				cERROR(1, "Received no data, expecting %d",
+					      pdu_length - total_read);
 				cifs_reconnect(server);
 				csocket = server->ssocket;
 				reconnect = 1;
@@ -588,7 +588,7 @@ incomplete_rcv:
 						}
 					} else {
 						if (!isLargeBuf) {
-							cERROR(1,("1st trans2 resp needs bigbuf"));
+							cERROR(1, "1st trans2 resp needs bigbuf");
 					/* BB maybe we can fix this up,  switch
 					   to already allocated large buffer? */
 						} else {
@@ -630,8 +630,8 @@ multi_t2_fnd:
 			wake_up_process(task_to_wake);
 		} else if (!is_valid_oplock_break(smb_buffer, server) &&
 			   !isMultiRsp) {
-			cERROR(1, ("No task to wake, unknown frame received! "
-				   "NumMids %d", midCount.counter));
+			cERROR(1, "No task to wake, unknown frame received! "
+				   "NumMids %d", midCount.counter);
 			cifs_dump_mem("Received Data is: ", (char *)smb_buffer,
 				      sizeof(struct smb_hdr));
 #ifdef CONFIG_CIFS_DEBUG2
@@ -708,8 +708,8 @@ multi_t2_fnd:
 		list_for_each(tmp, &server->pending_mid_q) {
 		mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
 			if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
-				cFYI(1, ("Clearing Mid 0x%x - waking up ",
-					 mid_entry->mid));
+				cFYI(1, "Clearing Mid 0x%x - waking up ",
+					 mid_entry->mid);
 				task_to_wake = mid_entry->tsk;
 				if (task_to_wake)
 					wake_up_process(task_to_wake);
@@ -728,7 +728,7 @@ multi_t2_fnd:
 		to wait at least 45 seconds before giving up
 		on a request getting a response and going ahead
 		and killing cifsd */
-		cFYI(1, ("Wait for exit from demultiplex thread"));
+		cFYI(1, "Wait for exit from demultiplex thread");
 		msleep(46000);
 		/* if threads still have not exited they are probably never
 		coming home not much else we can do but free the memory */
@@ -849,7 +849,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			separator[0] = options[4];
 			options += 5;
 		} else {
-			cFYI(1, ("Null separator not allowed"));
+			cFYI(1, "Null separator not allowed");
 		}
 	}
 
@@ -974,7 +974,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			}
 		} else if (strnicmp(data, "sec", 3) == 0) {
 			if (!value || !*value) {
-				cERROR(1, ("no security value specified"));
+				cERROR(1, "no security value specified");
 				continue;
 			} else if (strnicmp(value, "krb5i", 5) == 0) {
 				vol->secFlg |= CIFSSEC_MAY_KRB5 |
@@ -982,7 +982,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			} else if (strnicmp(value, "krb5p", 5) == 0) {
 				/* vol->secFlg |= CIFSSEC_MUST_SEAL |
 					CIFSSEC_MAY_KRB5; */
-				cERROR(1, ("Krb5 cifs privacy not supported"));
+				cERROR(1, "Krb5 cifs privacy not supported");
 				return 1;
 			} else if (strnicmp(value, "krb5", 4) == 0) {
 				vol->secFlg |= CIFSSEC_MAY_KRB5;
@@ -1014,7 +1014,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			} else if (strnicmp(value, "none", 4) == 0) {
 				vol->nullauth = 1;
 			} else {
-				cERROR(1, ("bad security option: %s", value));
+				cERROR(1, "bad security option: %s", value);
 				return 1;
 			}
 		} else if ((strnicmp(data, "unc", 3) == 0)
@@ -1053,7 +1053,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 			a domain name and need special handling? */
 			if (strnlen(value, 256) < 256) {
 				vol->domainname = value;
-				cFYI(1, ("Domain name set"));
+				cFYI(1, "Domain name set");
 			} else {
 				printk(KERN_WARNING "CIFS: domain name too "
 						    "long\n");
@@ -1076,7 +1076,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 					strcpy(vol->prepath+1, value);
 				} else
 					strcpy(vol->prepath, value);
-				cFYI(1, ("prefix path %s", vol->prepath));
+				cFYI(1, "prefix path %s", vol->prepath);
 			} else {
 				printk(KERN_WARNING "CIFS: prefix too long\n");
 				return 1;
@@ -1092,7 +1092,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 					vol->iocharset = value;
 				/* if iocharset not set then load_nls_default
 				   is used by caller */
-				cFYI(1, ("iocharset set to %s", value));
+				cFYI(1, "iocharset set to %s", value);
 			} else {
 				printk(KERN_WARNING "CIFS: iocharset name "
 						    "too long.\n");
@@ -1144,14 +1144,14 @@ cifs_parse_mount_options(char *options, const char *devname,
 			}
 		} else if (strnicmp(data, "sockopt", 5) == 0) {
 			if (!value || !*value) {
-				cERROR(1, ("no socket option specified"));
+				cERROR(1, "no socket option specified");
 				continue;
 			} else if (strnicmp(value, "TCP_NODELAY", 11) == 0) {
 				vol->sockopt_tcp_nodelay = 1;
 			}
 		} else if (strnicmp(data, "netbiosname", 4) == 0) {
 			if (!value || !*value || (*value == ' ')) {
-				cFYI(1, ("invalid (empty) netbiosname"));
+				cFYI(1, "invalid (empty) netbiosname");
 			} else {
 				memset(vol->source_rfc1001_name, 0x20, 15);
 				for (i = 0; i < 15; i++) {
@@ -1175,7 +1175,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 		} else if (strnicmp(data, "servern", 7) == 0) {
 			/* servernetbiosname specified override *SMBSERVER */
 			if (!value || !*value || (*value == ' ')) {
-				cFYI(1, ("empty server netbiosname specified"));
+				cFYI(1, "empty server netbiosname specified");
 			} else {
 				/* last byte, type, is 0x20 for servr type */
 				memset(vol->target_rfc1001_name, 0x20, 16);
@@ -1434,7 +1434,7 @@ cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
 
 		++server->srv_count;
 		write_unlock(&cifs_tcp_ses_lock);
-		cFYI(1, ("Existing tcp session with server found"));
+		cFYI(1, "Existing tcp session with server found");
 		return server;
 	}
 	write_unlock(&cifs_tcp_ses_lock);
@@ -1475,7 +1475,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 
 	memset(&addr, 0, sizeof(struct sockaddr_storage));
 
-	cFYI(1, ("UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip));
+	cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
 
 	if (volume_info->UNCip && volume_info->UNC) {
 		rc = cifs_convert_address(volume_info->UNCip, &addr);
@@ -1487,13 +1487,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	} else if (volume_info->UNCip) {
 		/* BB using ip addr as tcp_ses name to connect to the
 		   DFS root below */
-		cERROR(1, ("Connecting to DFS root not implemented yet"));
+		cERROR(1, "Connecting to DFS root not implemented yet");
 		rc = -EINVAL;
 		goto out_err;
 	} else /* which tcp_sess DFS root would we conect to */ {
-		cERROR(1,
-		       ("CIFS mount error: No UNC path (e.g. -o "
-			"unc=//192.168.1.100/public) specified"));
+		cERROR(1, "CIFS mount error: No UNC path (e.g. -o "
+			"unc=//192.168.1.100/public) specified");
 		rc = -EINVAL;
 		goto out_err;
 	}
@@ -1540,7 +1539,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	++tcp_ses->srv_count;
 
 	if (addr.ss_family == AF_INET6) {
-		cFYI(1, ("attempting ipv6 connect"));
+		cFYI(1, "attempting ipv6 connect");
 		/* BB should we allow ipv6 on port 139? */
 		/* other OS never observed in Wild doing 139 with v6 */
 		sin_server6->sin6_port = htons(volume_info->port);
@@ -1554,7 +1553,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		rc = ipv4_connect(tcp_ses);
 	}
 	if (rc < 0) {
-		cERROR(1, ("Error connecting to socket. Aborting operation"));
+		cERROR(1, "Error connecting to socket. Aborting operation");
 		goto out_err;
 	}
 
@@ -1567,7 +1566,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 				  tcp_ses, "cifsd");
 	if (IS_ERR(tcp_ses->tsk)) {
 		rc = PTR_ERR(tcp_ses->tsk);
-		cERROR(1, ("error %d create cifsd thread", rc));
+		cERROR(1, "error %d create cifsd thread", rc);
 		module_put(THIS_MODULE);
 		goto out_err;
 	}
@@ -1703,8 +1702,7 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 		strcpy(temp_unc + 2, pSesInfo->serverName);
 		strcpy(temp_unc + 2 + strlen(pSesInfo->serverName), "\\IPC$");
 		rc = CIFSTCon(xid, pSesInfo, temp_unc, NULL, nls_codepage);
-		cFYI(1,
-		     ("CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid));
+		cFYI(1, "CIFS Tcon rc = %d ipc_tid = %d", rc, pSesInfo->ipc_tid);
 		kfree(temp_unc);
 	}
 	if (rc == 0)
@@ -1777,12 +1775,12 @@ ipv4_connect(struct TCP_Server_Info *server)
 		rc = sock_create_kern(PF_INET, SOCK_STREAM,
 				      IPPROTO_TCP, &socket);
 		if (rc < 0) {
-			cERROR(1, ("Error %d creating socket", rc));
+			cERROR(1, "Error %d creating socket", rc);
 			return rc;
 		}
 
 		/* BB other socket options to set KEEPALIVE, NODELAY? */
-		cFYI(1, ("Socket created"));
+		cFYI(1, "Socket created");
 		server->ssocket = socket;
 		socket->sk->sk_allocation = GFP_NOFS;
 		cifs_reclassify_socket4(socket);
@@ -1827,7 +1825,7 @@ ipv4_connect(struct TCP_Server_Info *server)
 	if (!connected) {
 		if (orig_port)
 			server->addr.sockAddr.sin_port = orig_port;
-		cFYI(1, ("Error %d connecting to server via ipv4", rc));
+		cFYI(1, "Error %d connecting to server via ipv4", rc);
 		sock_release(socket);
 		server->ssocket = NULL;
 		return rc;
@@ -1855,12 +1853,12 @@ ipv4_connect(struct TCP_Server_Info *server)
 		rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
 				(char *)&val, sizeof(val));
 		if (rc)
-			cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
+			cFYI(1, "set TCP_NODELAY socket option error %d", rc);
 	}
 
-	 cFYI(1, ("sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
+	 cFYI(1, "sndbuf %d rcvbuf %d rcvtimeo 0x%lx",
 		 socket->sk->sk_sndbuf,
-		 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo));
+		 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
 
 	/* send RFC1001 sessinit */
 	if (server->addr.sockAddr.sin_port == htons(RFC1001_PORT)) {
@@ -1938,13 +1936,13 @@ ipv6_connect(struct TCP_Server_Info *server)
 		rc = sock_create_kern(PF_INET6, SOCK_STREAM,
 				      IPPROTO_TCP, &socket);
 		if (rc < 0) {
-			cERROR(1, ("Error %d creating ipv6 socket", rc));
+			cERROR(1, "Error %d creating ipv6 socket", rc);
 			socket = NULL;
 			return rc;
 		}
 
 		/* BB other socket options to set KEEPALIVE, NODELAY? */
-		cFYI(1, ("ipv6 Socket created"));
+		cFYI(1, "ipv6 Socket created");
 		server->ssocket = socket;
 		socket->sk->sk_allocation = GFP_NOFS;
 		cifs_reclassify_socket6(socket);
@@ -1988,7 +1986,7 @@ ipv6_connect(struct TCP_Server_Info *server)
 	if (!connected) {
 		if (orig_port)
 			server->addr.sockAddr6.sin6_port = orig_port;
-		cFYI(1, ("Error %d connecting to server via ipv6", rc));
+		cFYI(1, "Error %d connecting to server via ipv6", rc);
 		sock_release(socket);
 		server->ssocket = NULL;
 		return rc;
@@ -2007,7 +2005,7 @@ ipv6_connect(struct TCP_Server_Info *server)
 		rc = kernel_setsockopt(socket, SOL_TCP, TCP_NODELAY,
 				(char *)&val, sizeof(val));
 		if (rc)
-			cFYI(1, ("set TCP_NODELAY socket option error %d", rc));
+			cFYI(1, "set TCP_NODELAY socket option error %d", rc);
 	}
 
 	server->ssocket = socket;
@@ -2032,13 +2030,13 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 	if (vol_info && vol_info->no_linux_ext) {
 		tcon->fsUnixInfo.Capability = 0;
 		tcon->unix_ext = 0; /* Unix Extensions disabled */
-		cFYI(1, ("Linux protocol extensions disabled"));
+		cFYI(1, "Linux protocol extensions disabled");
 		return;
 	} else if (vol_info)
 		tcon->unix_ext = 1; /* Unix Extensions supported */
 
 	if (tcon->unix_ext == 0) {
-		cFYI(1, ("Unix extensions disabled so not set on reconnect"));
+		cFYI(1, "Unix extensions disabled so not set on reconnect");
 		return;
 	}
 
@@ -2054,12 +2052,11 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 				cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
 			if ((saved_cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
 				if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
-					cERROR(1, ("POSIXPATH support change"));
+					cERROR(1, "POSIXPATH support change");
 				cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
 			} else if ((cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) == 0) {
-				cERROR(1, ("possible reconnect error"));
-				cERROR(1,
-					("server disabled POSIX path support"));
+				cERROR(1, "possible reconnect error");
+				cERROR(1, "server disabled POSIX path support");
 			}
 		}
 
@@ -2067,7 +2064,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 		if (vol_info && vol_info->no_psx_acl)
 			cap &= ~CIFS_UNIX_POSIX_ACL_CAP;
 		else if (CIFS_UNIX_POSIX_ACL_CAP & cap) {
-			cFYI(1, ("negotiated posix acl support"));
+			cFYI(1, "negotiated posix acl support");
 			if (sb)
 				sb->s_flags |= MS_POSIXACL;
 		}
@@ -2075,7 +2072,7 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 		if (vol_info && vol_info->posix_paths == 0)
 			cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP;
 		else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) {
-			cFYI(1, ("negotiate posix pathnames"));
+			cFYI(1, "negotiate posix pathnames");
 			if (sb)
 				CIFS_SB(sb)->mnt_cifs_flags |=
 					CIFS_MOUNT_POSIX_PATHS;
@@ -2090,39 +2087,38 @@ void reset_cifs_unix_caps(int xid, struct cifsTconInfo *tcon,
 		if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) {
 			if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
 				CIFS_SB(sb)->rsize = 127 * 1024;
-				cFYI(DBG2,
-					("larger reads not supported by srv"));
+				cFYI(DBG2, "larger reads not supported by srv");
 			}
 		}
 
 
-		cFYI(1, ("Negotiate caps 0x%x", (int)cap));
+		cFYI(1, "Negotiate caps 0x%x", (int)cap);
 #ifdef CONFIG_CIFS_DEBUG2
 		if (cap & CIFS_UNIX_FCNTL_CAP)
-			cFYI(1, ("FCNTL cap"));
+			cFYI(1, "FCNTL cap");
 		if (cap & CIFS_UNIX_EXTATTR_CAP)
-			cFYI(1, ("EXTATTR cap"));
+			cFYI(1, "EXTATTR cap");
 		if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP)
-			cFYI(1, ("POSIX path cap"));
+			cFYI(1, "POSIX path cap");
 		if (cap & CIFS_UNIX_XATTR_CAP)
-			cFYI(1, ("XATTR cap"));
+			cFYI(1, "XATTR cap");
 		if (cap & CIFS_UNIX_POSIX_ACL_CAP)
-			cFYI(1, ("POSIX ACL cap"));
+			cFYI(1, "POSIX ACL cap");
 		if (cap & CIFS_UNIX_LARGE_READ_CAP)
-			cFYI(1, ("very large read cap"));
+			cFYI(1, "very large read cap");
 		if (cap & CIFS_UNIX_LARGE_WRITE_CAP)
-			cFYI(1, ("very large write cap"));
+			cFYI(1, "very large write cap");
 #endif /* CIFS_DEBUG2 */
 		if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) {
 			if (vol_info == NULL) {
-				cFYI(1, ("resetting capabilities failed"));
+				cFYI(1, "resetting capabilities failed");
 			} else
-				cERROR(1, ("Negotiating Unix capabilities "
+				cERROR(1, "Negotiating Unix capabilities "
 					   "with the server failed.  Consider "
 					   "mounting with the Unix Extensions\n"
 					   "disabled, if problems are found, "
 					   "by specifying the nounix mount "
-					   "option."));
+					   "option.");
 
 		}
 	}
@@ -2152,8 +2148,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 			  struct cifs_sb_info *cifs_sb)
 {
 	if (pvolume_info->rsize > CIFSMaxBufSize) {
-		cERROR(1, ("rsize %d too large, using MaxBufSize",
-			pvolume_info->rsize));
+		cERROR(1, "rsize %d too large, using MaxBufSize",
+			pvolume_info->rsize);
 		cifs_sb->rsize = CIFSMaxBufSize;
 	} else if ((pvolume_info->rsize) &&
 			(pvolume_info->rsize <= CIFSMaxBufSize))
@@ -2162,8 +2158,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 		cifs_sb->rsize = CIFSMaxBufSize;
 
 	if (pvolume_info->wsize > PAGEVEC_SIZE * PAGE_CACHE_SIZE) {
-		cERROR(1, ("wsize %d too large, using 4096 instead",
-			  pvolume_info->wsize));
+		cERROR(1, "wsize %d too large, using 4096 instead",
+			  pvolume_info->wsize);
 		cifs_sb->wsize = 4096;
 	} else if (pvolume_info->wsize)
 		cifs_sb->wsize = pvolume_info->wsize;
@@ -2181,7 +2177,7 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 	if (cifs_sb->rsize < 2048) {
 		cifs_sb->rsize = 2048;
 		/* Windows ME may prefer this */
-		cFYI(1, ("readsize set to minimum: 2048"));
+		cFYI(1, "readsize set to minimum: 2048");
 	}
 	/* calculate prepath */
 	cifs_sb->prepath = pvolume_info->prepath;
@@ -2199,8 +2195,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 	cifs_sb->mnt_gid = pvolume_info->linux_gid;
 	cifs_sb->mnt_file_mode = pvolume_info->file_mode;
 	cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
-	cFYI(1, ("file mode: 0x%x  dir mode: 0x%x",
-		cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode));
+	cFYI(1, "file mode: 0x%x  dir mode: 0x%x",
+		cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
 
 	if (pvolume_info->noperm)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM;
@@ -2229,13 +2225,13 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 	if (pvolume_info->dynperm)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
 	if (pvolume_info->direct_io) {
-		cFYI(1, ("mounting share using direct i/o"));
+		cFYI(1, "mounting share using direct i/o");
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
 	}
 
 	if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
-		cERROR(1, ("mount option dynperm ignored if cifsacl "
-			   "mount option supported"));
+		cERROR(1, "mount option dynperm ignored if cifsacl "
+			   "mount option supported");
 }
 
 static int
@@ -2344,11 +2340,11 @@ try_mount_again:
 	}
 
 	if (volume_info->nullauth) {
-		cFYI(1, ("null user"));
+		cFYI(1, "null user");
 		volume_info->username = "";
 	} else if (volume_info->username) {
 		/* BB fixme parse for domain name here */
-		cFYI(1, ("Username: %s", volume_info->username));
+		cFYI(1, "Username: %s", volume_info->username);
 	} else {
 		cifserror("No username specified");
 	/* In userspace mount helper we can get user name from alternate
@@ -2365,8 +2361,8 @@ try_mount_again:
 	} else {
 		cifs_sb->local_nls = load_nls(volume_info->iocharset);
 		if (cifs_sb->local_nls == NULL) {
-			cERROR(1, ("CIFS mount error: iocharset %s not found",
-				 volume_info->iocharset));
+			cERROR(1, "CIFS mount error: iocharset %s not found",
+				 volume_info->iocharset);
 			rc = -ELIBACC;
 			goto out;
 		}
@@ -2381,8 +2377,8 @@ try_mount_again:
 
 	pSesInfo = cifs_find_smb_ses(srvTcp, volume_info->username);
 	if (pSesInfo) {
-		cFYI(1, ("Existing smb sess found (status=%d)",
-			pSesInfo->status));
+		cFYI(1, "Existing smb sess found (status=%d)",
+			pSesInfo->status);
 		/*
 		 * The existing SMB session already has a reference to srvTcp,
 		 * so we can put back the extra one we got before
@@ -2391,13 +2387,13 @@ try_mount_again:
 
 		mutex_lock(&pSesInfo->session_mutex);
 		if (pSesInfo->need_reconnect) {
-			cFYI(1, ("Session needs reconnect"));
+			cFYI(1, "Session needs reconnect");
 			rc = cifs_setup_session(xid, pSesInfo,
 						cifs_sb->local_nls);
 		}
 		mutex_unlock(&pSesInfo->session_mutex);
 	} else if (!rc) {
-		cFYI(1, ("Existing smb sess not found"));
+		cFYI(1, "Existing smb sess not found");
 		pSesInfo = sesInfoAlloc();
 		if (pSesInfo == NULL) {
 			rc = -ENOMEM;
@@ -2452,12 +2448,12 @@ try_mount_again:
 
 		tcon = cifs_find_tcon(pSesInfo, volume_info->UNC);
 		if (tcon) {
-			cFYI(1, ("Found match on UNC path"));
+			cFYI(1, "Found match on UNC path");
 			/* existing tcon already has a reference */
 			cifs_put_smb_ses(pSesInfo);
 			if (tcon->seal != volume_info->seal)
-				cERROR(1, ("transport encryption setting "
-					   "conflicts with existing tid"));
+				cERROR(1, "transport encryption setting "
+					   "conflicts with existing tid");
 		} else {
 			tcon = tconInfoAlloc();
 			if (tcon == NULL) {
@@ -2477,7 +2473,7 @@ try_mount_again:
 
 			if ((strchr(volume_info->UNC + 3, '\\') == NULL)
 			    && (strchr(volume_info->UNC + 3, '/') == NULL)) {
-				cERROR(1, ("Missing share name"));
+				cERROR(1, "Missing share name");
 				rc = -ENODEV;
 				goto mount_fail_check;
 			} else {
@@ -2486,11 +2482,11 @@ try_mount_again:
 				 * we do on SessSetup and reconnect? */
 				rc = CIFSTCon(xid, pSesInfo, volume_info->UNC,
 					      tcon, cifs_sb->local_nls);
-				cFYI(1, ("CIFS Tcon rc = %d", rc));
+				cFYI(1, "CIFS Tcon rc = %d", rc);
 				if (volume_info->nodfs) {
 					tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
-					cFYI(1, ("DFS disabled (%d)",
-						tcon->Flags));
+					cFYI(1, "DFS disabled (%d)",
+						tcon->Flags);
 				}
 			}
 			if (rc)
@@ -2544,7 +2540,7 @@ try_mount_again:
 
 	if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
 		cifs_sb->rsize = 1024 * 127;
-		cFYI(DBG2, ("no very large read support, rsize now 127K"));
+		cFYI(DBG2, "no very large read support, rsize now 127K");
 	}
 	if (!(tcon->ses->capabilities & CAP_LARGE_WRITE_X))
 		cifs_sb->wsize = min(cifs_sb->wsize,
@@ -2593,7 +2589,7 @@ remote_path_check:
 			goto mount_fail_check;
 		}
 
-		cFYI(1, ("Getting referral for: %s", full_path));
+		cFYI(1, "Getting referral for: %s", full_path);
 		rc = get_dfs_path(xid, pSesInfo , full_path + 1,
 			cifs_sb->local_nls, &num_referrals, &referrals,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -2778,13 +2774,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 		if (length == 3) {
 			if ((bcc_ptr[0] == 'I') && (bcc_ptr[1] == 'P') &&
 			    (bcc_ptr[2] == 'C')) {
-				cFYI(1, ("IPC connection"));
+				cFYI(1, "IPC connection");
 				tcon->ipc = 1;
 			}
 		} else if (length == 2) {
 			if ((bcc_ptr[0] == 'A') && (bcc_ptr[1] == ':')) {
 				/* the most common case */
-				cFYI(1, ("disk share connection"));
+				cFYI(1, "disk share connection");
 			}
 		}
 		bcc_ptr += length + 1;
@@ -2797,7 +2793,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 						      bytes_left, is_unicode,
 						      nls_codepage);
 
-		cFYI(1, ("nativeFileSystem=%s", tcon->nativeFileSystem));
+		cFYI(1, "nativeFileSystem=%s", tcon->nativeFileSystem);
 
 		if ((smb_buffer_response->WordCount == 3) ||
 			 (smb_buffer_response->WordCount == 7))
@@ -2805,7 +2801,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 			tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
 		else
 			tcon->Flags = 0;
-		cFYI(1, ("Tcon flags: 0x%x ", tcon->Flags));
+		cFYI(1, "Tcon flags: 0x%x ", tcon->Flags);
 	} else if ((rc == 0) && tcon == NULL) {
 		/* all we need to save for IPC$ connection */
 		ses->ipc_tid = smb_buffer_response->Tid;
@@ -2869,14 +2865,14 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 	if (linuxExtEnabled == 0)
 		pSesInfo->capabilities &= (~CAP_UNIX);
 
-	cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
-		 server->secMode, server->capabilities, server->timeAdj));
+	cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
+		 server->secMode, server->capabilities, server->timeAdj);
 
 	rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);
 	if (rc) {
-		cERROR(1, ("Send error in SessSetup = %d", rc));
+		cERROR(1, "Send error in SessSetup = %d", rc);
 	} else {
-		cFYI(1, ("CIFS Session Established successfully"));
+		cFYI(1, "CIFS Session Established successfully");
 		spin_lock(&GlobalMid_Lock);
 		pSesInfo->status = CifsGood;
 		pSesInfo->need_reconnect = false;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index e9f7ecc2714..4aa2fe3f535 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -73,7 +73,7 @@ cifs_bp_rename_retry:
 		namelen += (1 + temp->d_name.len);
 		temp = temp->d_parent;
 		if (temp == NULL) {
-			cERROR(1, ("corrupt dentry"));
+			cERROR(1, "corrupt dentry");
 			return NULL;
 		}
 	}
@@ -90,19 +90,18 @@ cifs_bp_rename_retry:
 			full_path[namelen] = dirsep;
 			strncpy(full_path + namelen + 1, temp->d_name.name,
 				temp->d_name.len);
-			cFYI(0, ("name: %s", full_path + namelen));
+			cFYI(0, "name: %s", full_path + namelen);
 		}
 		temp = temp->d_parent;
 		if (temp == NULL) {
-			cERROR(1, ("corrupt dentry"));
+			cERROR(1, "corrupt dentry");
 			kfree(full_path);
 			return NULL;
 		}
 	}
 	if (namelen != pplen + dfsplen) {
-		cERROR(1,
-		       ("did not end path lookup where expected namelen is %d",
-			namelen));
+		cERROR(1, "did not end path lookup where expected namelen is %d",
+			namelen);
 		/* presumably this is only possible if racing with a rename
 		of one of the parent directories  (we can not lock the dentries
 		above us to prevent this, but retrying should be harmless) */
@@ -173,7 +172,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
 		if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 			pCifsInode->clientCanCacheAll = true;
 			pCifsInode->clientCanCacheRead = true;
-			cFYI(1, ("Exclusive Oplock inode %p", newinode));
+			cFYI(1, "Exclusive Oplock inode %p", newinode);
 		} else if ((oplock & 0xF) == OPLOCK_READ)
 				pCifsInode->clientCanCacheRead = true;
 	}
@@ -192,7 +191,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
 	struct cifs_fattr fattr;
 
-	cFYI(1, ("posix open %s", full_path));
+	cFYI(1, "posix open %s", full_path);
 
 	presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
 	if (presp_data == NULL)
@@ -358,7 +357,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		else if ((oflags & O_CREAT) == O_CREAT)
 			disposition = FILE_OPEN_IF;
 		else
-			cFYI(1, ("Create flag not set in create function"));
+			cFYI(1, "Create flag not set in create function");
 	}
 
 	/* BB add processing to set equivalent of mode - e.g. via CreateX with
@@ -394,7 +393,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	}
 	if (rc) {
-		cFYI(1, ("cifs_create returned 0x%x", rc));
+		cFYI(1, "cifs_create returned 0x%x", rc);
 		goto cifs_create_out;
 	}
 
@@ -457,7 +456,7 @@ cifs_create_set_dentry:
 	if (rc == 0)
 		setup_cifs_dentry(tcon, direntry, newinode);
 	else
-		cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc));
+		cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
 
 	/* nfsd case - nfs srv does not set nd */
 	if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
@@ -531,7 +530,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 			u16 fileHandle;
 			FILE_ALL_INFO *buf;
 
-			cFYI(1, ("sfu compat create special file"));
+			cFYI(1, "sfu compat create special file");
 
 			buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
 			if (buf == NULL) {
@@ -616,8 +615,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 
 	xid = GetXid();
 
-	cFYI(1, ("parent inode = 0x%p name is: %s and dentry = 0x%p",
-	      parent_dir_inode, direntry->d_name.name, direntry));
+	cFYI(1, "parent inode = 0x%p name is: %s and dentry = 0x%p",
+	      parent_dir_inode, direntry->d_name.name, direntry);
 
 	/* check whether path exists */
 
@@ -632,7 +631,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 		int i;
 		for (i = 0; i < direntry->d_name.len; i++)
 			if (direntry->d_name.name[i] == '\\') {
-				cFYI(1, ("Invalid file name"));
+				cFYI(1, "Invalid file name");
 				FreeXid(xid);
 				return ERR_PTR(-EINVAL);
 			}
@@ -657,11 +656,11 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	}
 
 	if (direntry->d_inode != NULL) {
-		cFYI(1, ("non-NULL inode in lookup"));
+		cFYI(1, "non-NULL inode in lookup");
 	} else {
-		cFYI(1, ("NULL inode in lookup"));
+		cFYI(1, "NULL inode in lookup");
 	}
-	cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode));
+	cFYI(1, "Full path: %s inode = 0x%p", full_path, direntry->d_inode);
 
 	/* Posix open is only called (at lookup time) for file create now.
 	 * For opens (rather than creates), because we do not know if it
@@ -723,7 +722,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	/*	if it was once a directory (but how can we tell?) we could do
 		shrink_dcache_parent(direntry); */
 	} else if (rc != -EACCES) {
-		cERROR(1, ("Unexpected lookup error %d", rc));
+		cERROR(1, "Unexpected lookup error %d", rc);
 		/* We special case check for Access Denied - since that
 		is a common return code */
 	}
@@ -742,8 +741,8 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 		if (cifs_revalidate_dentry(direntry))
 			return 0;
 	} else {
-		cFYI(1, ("neg dentry 0x%p name = %s",
-			 direntry, direntry->d_name.name));
+		cFYI(1, "neg dentry 0x%p name = %s",
+			 direntry, direntry->d_name.name);
 		if (time_after(jiffies, direntry->d_time + HZ) ||
 			!lookupCacheEnabled) {
 			d_drop(direntry);
@@ -758,7 +757,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 {
 	int rc = 0;
 
-	cFYI(1, ("In cifs d_delete, name = %s", direntry->d_name.name));
+	cFYI(1, "In cifs d_delete, name = %s", direntry->d_name.name);
 
 	return rc;
 }     */
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 6f8a0e3fb25..4db2c5e7283 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -106,14 +106,14 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 	/* search for server name delimiter */
 	len = strlen(unc);
 	if (len < 3) {
-		cFYI(1, ("%s: unc is too short: %s", __func__, unc));
+		cFYI(1, "%s: unc is too short: %s", __func__, unc);
 		return -EINVAL;
 	}
 	len -= 2;
 	name = memchr(unc+2, '\\', len);
 	if (!name) {
-		cFYI(1, ("%s: probably server name is whole unc: %s",
-					__func__, unc));
+		cFYI(1, "%s: probably server name is whole unc: %s",
+					__func__, unc);
 	} else {
 		len = (name - unc) - 2/* leading // */;
 	}
@@ -127,8 +127,8 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 	name[len] = 0;
 
 	if (is_ip(name)) {
-		cFYI(1, ("%s: it is IP, skipping dns upcall: %s",
-					__func__, name));
+		cFYI(1, "%s: it is IP, skipping dns upcall: %s",
+					__func__, name);
 		data = name;
 		goto skip_upcall;
 	}
@@ -138,7 +138,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 		len = rkey->type_data.x[0];
 		data = rkey->payload.data;
 	} else {
-		cERROR(1, ("%s: unable to resolve: %s", __func__, name));
+		cERROR(1, "%s: unable to resolve: %s", __func__, name);
 		goto out;
 	}
 
@@ -148,10 +148,10 @@ skip_upcall:
 		if (*ip_addr) {
 			memcpy(*ip_addr, data, len + 1);
 			if (!IS_ERR(rkey))
-				cFYI(1, ("%s: resolved: %s to %s", __func__,
+				cFYI(1, "%s: resolved: %s to %s", __func__,
 							name,
 							*ip_addr
-					));
+					);
 			rc = 0;
 		} else {
 			rc = -ENOMEM;
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 6177f7cca16..993f82045bf 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -49,7 +49,7 @@
 static struct dentry *cifs_get_parent(struct dentry *dentry)
 {
 	/* BB need to add code here eventually to enable export via NFSD */
-	cFYI(1, ("get parent for %p", dentry));
+	cFYI(1, "get parent for %p", dentry);
 	return ERR_PTR(-EACCES);
 }
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1361d67f68f..ed3689e6617 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -136,15 +136,15 @@ cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
 	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
 			   (file->f_path.dentry->d_inode->i_size ==
 			    (loff_t)le64_to_cpu(buf->EndOfFile))) {
-		cFYI(1, ("inode unchanged on server"));
+		cFYI(1, "inode unchanged on server");
 	} else {
 		if (file->f_path.dentry->d_inode->i_mapping) {
 			rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
 			if (rc != 0)
 				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
 		}
-		cFYI(1, ("invalidating remote inode since open detected it "
-			 "changed"));
+		cFYI(1, "invalidating remote inode since open detected it "
+			 "changed");
 		invalidate_remote_inode(file->f_path.dentry->d_inode);
 	} */
 
@@ -152,8 +152,8 @@ psx_client_can_cache:
 	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 		pCifsInode->clientCanCacheAll = true;
 		pCifsInode->clientCanCacheRead = true;
-		cFYI(1, ("Exclusive Oplock granted on inode %p",
-			 file->f_path.dentry->d_inode));
+		cFYI(1, "Exclusive Oplock granted on inode %p",
+			 file->f_path.dentry->d_inode);
 	} else if ((oplock & 0xF) == OPLOCK_READ)
 		pCifsInode->clientCanCacheRead = true;
 
@@ -190,8 +190,8 @@ cifs_fill_filedata(struct file *file)
 	if (file->private_data != NULL) {
 		return pCifsFile;
 	} else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL))
-			cERROR(1, ("could not find file instance for "
-				   "new file %p", file));
+			cERROR(1, "could not find file instance for "
+				   "new file %p", file);
 	return NULL;
 }
 
@@ -217,7 +217,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
 			   (file->f_path.dentry->d_inode->i_size ==
 			    (loff_t)le64_to_cpu(buf->EndOfFile))) {
-		cFYI(1, ("inode unchanged on server"));
+		cFYI(1, "inode unchanged on server");
 	} else {
 		if (file->f_path.dentry->d_inode->i_mapping) {
 			/* BB no need to lock inode until after invalidate
@@ -226,8 +226,8 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 			if (rc != 0)
 				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
 		}
-		cFYI(1, ("invalidating remote inode since open detected it "
-			 "changed"));
+		cFYI(1, "invalidating remote inode since open detected it "
+			 "changed");
 		invalidate_remote_inode(file->f_path.dentry->d_inode);
 	}
 
@@ -242,8 +242,8 @@ client_can_cache:
 	if ((*oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 		pCifsInode->clientCanCacheAll = true;
 		pCifsInode->clientCanCacheRead = true;
-		cFYI(1, ("Exclusive Oplock granted on inode %p",
-			 file->f_path.dentry->d_inode));
+		cFYI(1, "Exclusive Oplock granted on inode %p",
+			 file->f_path.dentry->d_inode);
 	} else if ((*oplock & 0xF) == OPLOCK_READ)
 		pCifsInode->clientCanCacheRead = true;
 
@@ -285,8 +285,8 @@ int cifs_open(struct inode *inode, struct file *file)
 		return rc;
 	}
 
-	cFYI(1, ("inode = 0x%p file flags are 0x%x for %s",
-		 inode, file->f_flags, full_path));
+	cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
+		 inode, file->f_flags, full_path);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
@@ -303,7 +303,7 @@ int cifs_open(struct inode *inode, struct file *file)
 				     cifs_sb->mnt_file_mode /* ignored */,
 				     oflags, &oplock, &netfid, xid);
 		if (rc == 0) {
-			cFYI(1, ("posix open succeeded"));
+			cFYI(1, "posix open succeeded");
 			/* no need for special case handling of setting mode
 			   on read only files needed here */
 
@@ -313,12 +313,12 @@ int cifs_open(struct inode *inode, struct file *file)
 			goto out;
 		} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			if (tcon->ses->serverNOS)
-				cERROR(1, ("server %s of type %s returned"
+				cERROR(1, "server %s of type %s returned"
 					   " unexpected error on SMB posix open"
 					   ", disabling posix open support."
 					   " Check if server update available.",
 					   tcon->ses->serverName,
-					   tcon->ses->serverNOS));
+					   tcon->ses->serverNOS);
 			tcon->broken_posix_open = true;
 		} else if ((rc != -EIO) && (rc != -EREMOTE) &&
 			 (rc != -EOPNOTSUPP)) /* path not found or net err */
@@ -386,7 +386,7 @@ int cifs_open(struct inode *inode, struct file *file)
 				& CIFS_MOUNT_MAP_SPECIAL_CHR);
 	}
 	if (rc) {
-		cFYI(1, ("cifs_open returned 0x%x", rc));
+		cFYI(1, "cifs_open returned 0x%x", rc);
 		goto out;
 	}
 
@@ -469,7 +469,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	}
 
 	if (file->f_path.dentry == NULL) {
-		cERROR(1, ("no valid name if dentry freed"));
+		cERROR(1, "no valid name if dentry freed");
 		dump_stack();
 		rc = -EBADF;
 		goto reopen_error_exit;
@@ -477,7 +477,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 
 	inode = file->f_path.dentry->d_inode;
 	if (inode == NULL) {
-		cERROR(1, ("inode not valid"));
+		cERROR(1, "inode not valid");
 		dump_stack();
 		rc = -EBADF;
 		goto reopen_error_exit;
@@ -499,8 +499,8 @@ reopen_error_exit:
 		return rc;
 	}
 
-	cFYI(1, ("inode = 0x%p file flags 0x%x for %s",
-		 inode, file->f_flags, full_path));
+	cFYI(1, "inode = 0x%p file flags 0x%x for %s",
+		 inode, file->f_flags, full_path);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
@@ -516,7 +516,7 @@ reopen_error_exit:
 				     cifs_sb->mnt_file_mode /* ignored */,
 				     oflags, &oplock, &netfid, xid);
 		if (rc == 0) {
-			cFYI(1, ("posix reopen succeeded"));
+			cFYI(1, "posix reopen succeeded");
 			goto reopen_success;
 		}
 		/* fallthrough to retry open the old way on errors, especially
@@ -537,8 +537,8 @@ reopen_error_exit:
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
 		mutex_unlock(&pCifsFile->fh_mutex);
-		cFYI(1, ("cifs_open returned 0x%x", rc));
-		cFYI(1, ("oplock: %d", oplock));
+		cFYI(1, "cifs_open returned 0x%x", rc);
+		cFYI(1, "oplock: %d", oplock);
 	} else {
 reopen_success:
 		pCifsFile->netfid = netfid;
@@ -570,8 +570,8 @@ reopen_success:
 			if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 				pCifsInode->clientCanCacheAll = true;
 				pCifsInode->clientCanCacheRead = true;
-				cFYI(1, ("Exclusive Oplock granted on inode %p",
-					 file->f_path.dentry->d_inode));
+				cFYI(1, "Exclusive Oplock granted on inode %p",
+					 file->f_path.dentry->d_inode);
 			} else if ((oplock & 0xF) == OPLOCK_READ) {
 				pCifsInode->clientCanCacheRead = true;
 				pCifsInode->clientCanCacheAll = false;
@@ -619,8 +619,7 @@ int cifs_close(struct inode *inode, struct file *file)
 					the struct would be in each open file,
 					but this should give enough time to
 					clear the socket */
-					cFYI(DBG2,
-						("close delay, write pending"));
+					cFYI(DBG2, "close delay, write pending");
 					msleep(timeout);
 					timeout *= 4;
 				}
@@ -653,7 +652,7 @@ int cifs_close(struct inode *inode, struct file *file)
 
 	read_lock(&GlobalSMBSeslock);
 	if (list_empty(&(CIFS_I(inode)->openFileList))) {
-		cFYI(1, ("closing last open instance for inode %p", inode));
+		cFYI(1, "closing last open instance for inode %p", inode);
 		/* if the file is not open we do not know if we can cache info
 		   on this inode, much less write behind and read ahead */
 		CIFS_I(inode)->clientCanCacheRead = false;
@@ -674,7 +673,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
 	    (struct cifsFileInfo *)file->private_data;
 	char *ptmp;
 
-	cFYI(1, ("Closedir inode = 0x%p", inode));
+	cFYI(1, "Closedir inode = 0x%p", inode);
 
 	xid = GetXid();
 
@@ -685,22 +684,22 @@ int cifs_closedir(struct inode *inode, struct file *file)
 
 		pTcon = cifs_sb->tcon;
 
-		cFYI(1, ("Freeing private data in close dir"));
+		cFYI(1, "Freeing private data in close dir");
 		write_lock(&GlobalSMBSeslock);
 		if (!pCFileStruct->srch_inf.endOfSearch &&
 		    !pCFileStruct->invalidHandle) {
 			pCFileStruct->invalidHandle = true;
 			write_unlock(&GlobalSMBSeslock);
 			rc = CIFSFindClose(xid, pTcon, pCFileStruct->netfid);
-			cFYI(1, ("Closing uncompleted readdir with rc %d",
-				 rc));
+			cFYI(1, "Closing uncompleted readdir with rc %d",
+				 rc);
 			/* not much we can do if it fails anyway, ignore rc */
 			rc = 0;
 		} else
 			write_unlock(&GlobalSMBSeslock);
 		ptmp = pCFileStruct->srch_inf.ntwrk_buf_start;
 		if (ptmp) {
-			cFYI(1, ("closedir free smb buf in srch struct"));
+			cFYI(1, "closedir free smb buf in srch struct");
 			pCFileStruct->srch_inf.ntwrk_buf_start = NULL;
 			if (pCFileStruct->srch_inf.smallBuf)
 				cifs_small_buf_release(ptmp);
@@ -748,49 +747,49 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 	rc = -EACCES;
 	xid = GetXid();
 
-	cFYI(1, ("Lock parm: 0x%x flockflags: "
+	cFYI(1, "Lock parm: 0x%x flockflags: "
 		 "0x%x flocktype: 0x%x start: %lld end: %lld",
 		cmd, pfLock->fl_flags, pfLock->fl_type, pfLock->fl_start,
-		pfLock->fl_end));
+		pfLock->fl_end);
 
 	if (pfLock->fl_flags & FL_POSIX)
-		cFYI(1, ("Posix"));
+		cFYI(1, "Posix");
 	if (pfLock->fl_flags & FL_FLOCK)
-		cFYI(1, ("Flock"));
+		cFYI(1, "Flock");
 	if (pfLock->fl_flags & FL_SLEEP) {
-		cFYI(1, ("Blocking lock"));
+		cFYI(1, "Blocking lock");
 		wait_flag = true;
 	}
 	if (pfLock->fl_flags & FL_ACCESS)
-		cFYI(1, ("Process suspended by mandatory locking - "
-			 "not implemented yet"));
+		cFYI(1, "Process suspended by mandatory locking - "
+			 "not implemented yet");
 	if (pfLock->fl_flags & FL_LEASE)
-		cFYI(1, ("Lease on file - not implemented yet"));
+		cFYI(1, "Lease on file - not implemented yet");
 	if (pfLock->fl_flags &
 	    (~(FL_POSIX | FL_FLOCK | FL_SLEEP | FL_ACCESS | FL_LEASE)))
-		cFYI(1, ("Unknown lock flags 0x%x", pfLock->fl_flags));
+		cFYI(1, "Unknown lock flags 0x%x", pfLock->fl_flags);
 
 	if (pfLock->fl_type == F_WRLCK) {
-		cFYI(1, ("F_WRLCK "));
+		cFYI(1, "F_WRLCK ");
 		numLock = 1;
 	} else if (pfLock->fl_type == F_UNLCK) {
-		cFYI(1, ("F_UNLCK"));
+		cFYI(1, "F_UNLCK");
 		numUnlock = 1;
 		/* Check if unlock includes more than
 		one lock range */
 	} else if (pfLock->fl_type == F_RDLCK) {
-		cFYI(1, ("F_RDLCK"));
+		cFYI(1, "F_RDLCK");
 		lockType |= LOCKING_ANDX_SHARED_LOCK;
 		numLock = 1;
 	} else if (pfLock->fl_type == F_EXLCK) {
-		cFYI(1, ("F_EXLCK"));
+		cFYI(1, "F_EXLCK");
 		numLock = 1;
 	} else if (pfLock->fl_type == F_SHLCK) {
-		cFYI(1, ("F_SHLCK"));
+		cFYI(1, "F_SHLCK");
 		lockType |= LOCKING_ANDX_SHARED_LOCK;
 		numLock = 1;
 	} else
-		cFYI(1, ("Unknown type of lock"));
+		cFYI(1, "Unknown type of lock");
 
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	tcon = cifs_sb->tcon;
@@ -833,8 +832,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 					 0 /* wait flag */ );
 			pfLock->fl_type = F_UNLCK;
 			if (rc != 0)
-				cERROR(1, ("Error unlocking previously locked "
-					   "range %d during test of lock", rc));
+				cERROR(1, "Error unlocking previously locked "
+					   "range %d during test of lock", rc);
 			rc = 0;
 
 		} else {
@@ -988,9 +987,8 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
 
 	pTcon = cifs_sb->tcon;
 
-	/* cFYI(1,
-	   (" write %d bytes to offset %lld of %s", write_size,
-	   *poffset, file->f_path.dentry->d_name.name)); */
+	/* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
+	   *poffset, file->f_path.dentry->d_name.name); */
 
 	if (file->private_data == NULL)
 		return -EBADF;
@@ -1091,8 +1089,8 @@ static ssize_t cifs_write(struct file *file, const char *write_data,
 
 	pTcon = cifs_sb->tcon;
 
-	cFYI(1, ("write %zd bytes to offset %lld of %s", write_size,
-	   *poffset, file->f_path.dentry->d_name.name));
+	cFYI(1, "write %zd bytes to offset %lld of %s", write_size,
+	   *poffset, file->f_path.dentry->d_name.name);
 
 	if (file->private_data == NULL)
 		return -EBADF;
@@ -1233,7 +1231,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
 	it being zero) during stress testcases so we need to check for it */
 
 	if (cifs_inode == NULL) {
-		cERROR(1, ("Null inode passed to cifs_writeable_file"));
+		cERROR(1, "Null inode passed to cifs_writeable_file");
 		dump_stack();
 		return NULL;
 	}
@@ -1277,7 +1275,7 @@ refind_writable:
 			again. Note that it would be bad
 			to hold up writepages here (rather than
 			in caller) with continuous retries */
-			cFYI(1, ("wp failed on reopen file"));
+			cFYI(1, "wp failed on reopen file");
 			read_lock(&GlobalSMBSeslock);
 			/* can not use this handle, no write
 			   pending on this one after all */
@@ -1353,7 +1351,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 		else if (bytes_written < 0)
 			rc = bytes_written;
 	} else {
-		cFYI(1, ("No writeable filehandles for inode"));
+		cFYI(1, "No writeable filehandles for inode");
 		rc = -EIO;
 	}
 
@@ -1525,7 +1523,7 @@ retry:
 			 */
 			open_file = find_writable_file(CIFS_I(mapping->host));
 			if (!open_file) {
-				cERROR(1, ("No writable handles for inode"));
+				cERROR(1, "No writable handles for inode");
 				rc = -EBADF;
 			} else {
 				long_op = cifs_write_timeout(cifsi, offset);
@@ -1538,8 +1536,8 @@ retry:
 				cifs_update_eof(cifsi, offset, bytes_written);
 
 				if (rc || bytes_written < bytes_to_write) {
-					cERROR(1, ("Write2 ret %d, wrote %d",
-						  rc, bytes_written));
+					cERROR(1, "Write2 ret %d, wrote %d",
+						  rc, bytes_written);
 					/* BB what if continued retry is
 					   requested via mount flags? */
 					if (rc == -ENOSPC)
@@ -1600,7 +1598,7 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
 /* BB add check for wbc flags */
 	page_cache_get(page);
 	if (!PageUptodate(page))
-		cFYI(1, ("ppw - page not up to date"));
+		cFYI(1, "ppw - page not up to date");
 
 	/*
 	 * Set the "writeback" flag, and clear "dirty" in the radix tree.
@@ -1629,8 +1627,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	int rc;
 	struct inode *inode = mapping->host;
 
-	cFYI(1, ("write_end for page %p from pos %lld with %d bytes",
-		 page, pos, copied));
+	cFYI(1, "write_end for page %p from pos %lld with %d bytes",
+		 page, pos, copied);
 
 	if (PageChecked(page)) {
 		if (copied == len)
@@ -1686,8 +1684,8 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 
 	xid = GetXid();
 
-	cFYI(1, ("Sync file - name: %s datasync: 0x%x",
-		dentry->d_name.name, datasync));
+	cFYI(1, "Sync file - name: %s datasync: 0x%x",
+		dentry->d_name.name, datasync);
 
 	rc = filemap_write_and_wait(inode->i_mapping);
 	if (rc == 0) {
@@ -1711,7 +1709,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	unsigned int rpages = 0;
 	int rc = 0;
 
-	cFYI(1, ("sync page %p",page));
+	cFYI(1, "sync page %p",page);
 	mapping = page->mapping;
 	if (!mapping)
 		return 0;
@@ -1722,7 +1720,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 /*	fill in rpages then
 	result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
 
-/*	cFYI(1, ("rpages is %d for sync page of Index %ld", rpages, index));
+/*	cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
 
 #if 0
 	if (rc < 0)
@@ -1756,7 +1754,7 @@ int cifs_flush(struct file *file, fl_owner_t id)
 		CIFS_I(inode)->write_behind_rc = 0;
 	}
 
-	cFYI(1, ("Flush inode %p file %p rc %d", inode, file, rc));
+	cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc);
 
 	return rc;
 }
@@ -1788,7 +1786,7 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data,
 	open_file = (struct cifsFileInfo *)file->private_data;
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
-		cFYI(1, ("attempting read on write only file instance"));
+		cFYI(1, "attempting read on write only file instance");
 
 	for (total_read = 0, current_offset = read_data;
 	     read_size > total_read;
@@ -1869,7 +1867,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
 	open_file = (struct cifsFileInfo *)file->private_data;
 
 	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
-		cFYI(1, ("attempting read on write only file instance"));
+		cFYI(1, "attempting read on write only file instance");
 
 	for (total_read = 0, current_offset = read_data;
 	     read_size > total_read;
@@ -1920,7 +1918,7 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	xid = GetXid();
 	rc = cifs_revalidate_file(file);
 	if (rc) {
-		cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
+		cFYI(1, "Validation prior to mmap failed, error=%d", rc);
 		FreeXid(xid);
 		return rc;
 	}
@@ -1946,7 +1944,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		if (add_to_page_cache_lru(page, mapping, page->index,
 				      GFP_KERNEL)) {
 			page_cache_release(page);
-			cFYI(1, ("Add page cache failed"));
+			cFYI(1, "Add page cache failed");
 			data += PAGE_CACHE_SIZE;
 			bytes_read -= PAGE_CACHE_SIZE;
 			continue;
@@ -2033,8 +2031,8 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		/* Read size needs to be in multiples of one page */
 		read_size = min_t(const unsigned int, read_size,
 				  cifs_sb->rsize & PAGE_CACHE_MASK);
-		cFYI(DBG2, ("rpages: read size 0x%x  contiguous pages %d",
-				read_size, contig_pages));
+		cFYI(DBG2, "rpages: read size 0x%x  contiguous pages %d",
+				read_size, contig_pages);
 		rc = -EAGAIN;
 		while (rc == -EAGAIN) {
 			if ((open_file->invalidHandle) &&
@@ -2061,7 +2059,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 			}
 		}
 		if ((rc < 0) || (smb_read_data == NULL)) {
-			cFYI(1, ("Read error in readpages: %d", rc));
+			cFYI(1, "Read error in readpages: %d", rc);
 			break;
 		} else if (bytes_read > 0) {
 			task_io_account_read(bytes_read);
@@ -2084,9 +2082,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 				/* break; */
 			}
 		} else {
-			cFYI(1, ("No bytes read (%d) at offset %lld . "
-				 "Cleaning remaining pages from readahead list",
-				 bytes_read, offset));
+			cFYI(1, "No bytes read (%d) at offset %lld . "
+			        "Cleaning remaining pages from readahead list",
+				bytes_read, offset);
 			/* BB turn off caching and do new lookup on
 			   file size at server? */
 			break;
@@ -2129,7 +2127,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 	if (rc < 0)
 		goto io_error;
 	else
-		cFYI(1, ("Bytes read %d", rc));
+		cFYI(1, "Bytes read %d", rc);
 
 	file->f_path.dentry->d_inode->i_atime =
 		current_fs_time(file->f_path.dentry->d_inode->i_sb);
@@ -2161,8 +2159,8 @@ static int cifs_readpage(struct file *file, struct page *page)
 		return rc;
 	}
 
-	cFYI(1, ("readpage %p at offset %d 0x%x\n",
-		 page, (int)offset, (int)offset));
+	cFYI(1, "readpage %p at offset %d 0x%x\n",
+		 page, (int)offset, (int)offset);
 
 	rc = cifs_readpage_worker(file, page, &offset);
 
@@ -2232,7 +2230,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 	struct page *page;
 	int rc = 0;
 
-	cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
+	cFYI(1, "write_begin from %lld len %d", (long long)pos, len);
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
@@ -2319,7 +2317,7 @@ cifs_oplock_break(struct slow_work *work)
 			rc = waitrc;
 		if (rc)
 			cinode->write_behind_rc = rc;
-		cFYI(1, ("Oplock flush inode %p rc %d", inode, rc));
+		cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
 	}
 
 	/*
@@ -2331,7 +2329,7 @@ cifs_oplock_break(struct slow_work *work)
 	if (!cfile->closePend && !cfile->oplock_break_cancelled) {
 		rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
 				 LOCKING_ANDX_OPLOCK_RELEASE, false);
-		cFYI(1, ("Oplock release rc = %d", rc));
+		cFYI(1, "Oplock release rc = %d", rc);
 	}
 }
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 35ec1171621..7524a90aa94 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -144,8 +144,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	else
 		cifs_i->time = jiffies;
 
-	cFYI(1, ("inode 0x%p old_time=%ld new_time=%ld", inode,
-		 oldtime, cifs_i->time));
+	cFYI(1, "inode 0x%p old_time=%ld new_time=%ld", inode,
+		 oldtime, cifs_i->time);
 
 	cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
 
@@ -227,7 +227,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
 		/* safest to call it a file if we do not know */
 		fattr->cf_mode |= S_IFREG;
 		fattr->cf_dtype = DT_REG;
-		cFYI(1, ("unknown type %d", le32_to_cpu(info->Type)));
+		cFYI(1, "unknown type %d", le32_to_cpu(info->Type));
 		break;
 	}
 
@@ -256,7 +256,7 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
-	cFYI(1, ("creating fake fattr for DFS referral"));
+	cFYI(1, "creating fake fattr for DFS referral");
 
 	memset(fattr, 0, sizeof(*fattr));
 	fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
@@ -305,7 +305,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
 	tcon = cifs_sb->tcon;
-	cFYI(1, ("Getting info on %s", full_path));
+	cFYI(1, "Getting info on %s", full_path);
 
 	/* could have done a find first instead but this returns more info */
 	rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data,
@@ -373,7 +373,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 				 &bytes_read, &pbuf, &buf_type);
 		if ((rc == 0) && (bytes_read >= 8)) {
 			if (memcmp("IntxBLK", pbuf, 8) == 0) {
-				cFYI(1, ("Block device"));
+				cFYI(1, "Block device");
 				fattr->cf_mode |= S_IFBLK;
 				fattr->cf_dtype = DT_BLK;
 				if (bytes_read == 24) {
@@ -385,7 +385,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 					fattr->cf_rdev = MKDEV(mjr, mnr);
 				}
 			} else if (memcmp("IntxCHR", pbuf, 8) == 0) {
-				cFYI(1, ("Char device"));
+				cFYI(1, "Char device");
 				fattr->cf_mode |= S_IFCHR;
 				fattr->cf_dtype = DT_CHR;
 				if (bytes_read == 24) {
@@ -397,7 +397,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
 					fattr->cf_rdev = MKDEV(mjr, mnr);
 				}
 			} else if (memcmp("IntxLNK", pbuf, 7) == 0) {
-				cFYI(1, ("Symlink"));
+				cFYI(1, "Symlink");
 				fattr->cf_mode |= S_IFLNK;
 				fattr->cf_dtype = DT_LNK;
 			} else {
@@ -439,10 +439,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 	else if (rc > 3) {
 		mode = le32_to_cpu(*((__le32 *)ea_value));
 		fattr->cf_mode &= ~SFBITS_MASK;
-		cFYI(1, ("special bits 0%o org mode 0%o", mode,
-			 fattr->cf_mode));
+		cFYI(1, "special bits 0%o org mode 0%o", mode,
+			 fattr->cf_mode);
 		fattr->cf_mode = (mode & SFBITS_MASK) | fattr->cf_mode;
-		cFYI(1, ("special mode bits 0%o", mode));
+		cFYI(1, "special mode bits 0%o", mode);
 	}
 
 	return 0;
@@ -548,11 +548,11 @@ int cifs_get_inode_info(struct inode **pinode,
 	struct cifs_fattr fattr;
 
 	pTcon = cifs_sb->tcon;
-	cFYI(1, ("Getting info on %s", full_path));
+	cFYI(1, "Getting info on %s", full_path);
 
 	if ((pfindData == NULL) && (*pinode != NULL)) {
 		if (CIFS_I(*pinode)->clientCanCacheRead) {
-			cFYI(1, ("No need to revalidate cached inode sizes"));
+			cFYI(1, "No need to revalidate cached inode sizes");
 			return rc;
 		}
 	}
@@ -618,7 +618,7 @@ int cifs_get_inode_info(struct inode **pinode,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 			if (rc1 || !fattr.cf_uniqueid) {
-				cFYI(1, ("GetSrvInodeNum rc %d", rc1));
+				cFYI(1, "GetSrvInodeNum rc %d", rc1);
 				fattr.cf_uniqueid = iunique(sb, ROOT_I);
 				cifs_autodisable_serverino(cifs_sb);
 			}
@@ -634,13 +634,13 @@ int cifs_get_inode_info(struct inode **pinode,
 	    cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
 		tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
 		if (tmprc)
-			cFYI(1, ("cifs_sfu_type failed: %d", tmprc));
+			cFYI(1, "cifs_sfu_type failed: %d", tmprc);
 	}
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	/* fill in 0777 bits from ACL */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
-		cFYI(1, ("Getting mode bits from ACL"));
+		cFYI(1, "Getting mode bits from ACL");
 		cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid);
 	}
 #endif
@@ -734,7 +734,7 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
 	unsigned long hash;
 	struct inode *inode;
 
-	cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid));
+	cFYI(1, "looking for uniqueid=%llu", fattr->cf_uniqueid);
 
 	/* hash down to 32-bits on 32-bit arch */
 	hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
@@ -780,7 +780,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 		return ERR_PTR(-ENOMEM);
 
 	if (rc && cifs_sb->tcon->ipc) {
-		cFYI(1, ("ipc connection - fake read inode"));
+		cFYI(1, "ipc connection - fake read inode");
 		inode->i_mode |= S_IFDIR;
 		inode->i_nlink = 2;
 		inode->i_op = &cifs_ipc_inode_ops;
@@ -842,7 +842,7 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	 * server times.
 	 */
 	if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
-		cFYI(1, ("CIFS - CTIME changed"));
+		cFYI(1, "CIFS - CTIME changed");
 		info_buf.ChangeTime =
 		    cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
 	} else
@@ -877,8 +877,8 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 			goto out;
 	}
 
-	cFYI(1, ("calling SetFileInfo since SetPathInfo for "
-		 "times not supported by this server"));
+	cFYI(1, "calling SetFileInfo since SetPathInfo for "
+		 "times not supported by this server");
 	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
 			 SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
 			 CREATE_NOT_DIR, &netfid, &oplock,
@@ -1036,7 +1036,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 	struct iattr *attrs = NULL;
 	__u32 dosattr = 0, origattr = 0;
 
-	cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry));
+	cFYI(1, "cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry);
 
 	xid = GetXid();
 
@@ -1055,7 +1055,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
 		rc = CIFSPOSIXDelFile(xid, tcon, full_path,
 			SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-		cFYI(1, ("posix del rc %d", rc));
+		cFYI(1, "posix del rc %d", rc);
 		if ((rc == 0) || (rc == -ENOENT))
 			goto psx_del_no_retry;
 	}
@@ -1129,7 +1129,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 	struct inode *newinode = NULL;
 	struct cifs_fattr fattr;
 
-	cFYI(1, ("In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode));
+	cFYI(1, "In cifs_mkdir, mode = 0x%x inode = 0x%p", mode, inode);
 
 	xid = GetXid();
 
@@ -1164,7 +1164,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			kfree(pInfo);
 			goto mkdir_retry_old;
 		} else if (rc) {
-			cFYI(1, ("posix mkdir returned 0x%x", rc));
+			cFYI(1, "posix mkdir returned 0x%x", rc);
 			d_drop(direntry);
 		} else {
 			if (pInfo->Type == cpu_to_le32(-1)) {
@@ -1190,12 +1190,12 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			d_instantiate(direntry, newinode);
 
 #ifdef CONFIG_CIFS_DEBUG2
-			cFYI(1, ("instantiated dentry %p %s to inode %p",
-				direntry, direntry->d_name.name, newinode));
+			cFYI(1, "instantiated dentry %p %s to inode %p",
+				direntry, direntry->d_name.name, newinode);
 
 			if (newinode->i_nlink != 2)
-				cFYI(1, ("unexpected number of links %d",
-					newinode->i_nlink));
+				cFYI(1, "unexpected number of links %d",
+					newinode->i_nlink);
 #endif
 		}
 		kfree(pInfo);
@@ -1206,7 +1206,7 @@ mkdir_retry_old:
 	rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls,
 			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc) {
-		cFYI(1, ("cifs_mkdir returned 0x%x", rc));
+		cFYI(1, "cifs_mkdir returned 0x%x", rc);
 		d_drop(direntry);
 	} else {
 mkdir_get_info:
@@ -1309,7 +1309,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	char *full_path = NULL;
 	struct cifsInodeInfo *cifsInode;
 
-	cFYI(1, ("cifs_rmdir, inode = 0x%p", inode));
+	cFYI(1, "cifs_rmdir, inode = 0x%p", inode);
 
 	xid = GetXid();
 
@@ -1673,12 +1673,12 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 		rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
 					npid, false);
 		cifsFileInfo_put(open_file);
-		cFYI(1, ("SetFSize for attrs rc = %d", rc));
+		cFYI(1, "SetFSize for attrs rc = %d", rc);
 		if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			unsigned int bytes_written;
 			rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size,
 					  &bytes_written, NULL, NULL, 1);
-			cFYI(1, ("Wrt seteof rc %d", rc));
+			cFYI(1, "Wrt seteof rc %d", rc);
 		}
 	} else
 		rc = -EINVAL;
@@ -1692,7 +1692,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 				   false, cifs_sb->local_nls,
 				   cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc));
+		cFYI(1, "SetEOF by path (setattrs) rc = %d", rc);
 		if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			__u16 netfid;
 			int oplock = 0;
@@ -1709,7 +1709,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 						  attrs->ia_size,
 						  &bytes_written, NULL,
 						  NULL, 1);
-				cFYI(1, ("wrt seteof rc %d", rc));
+				cFYI(1, "wrt seteof rc %d", rc);
 				CIFSSMBClose(xid, pTcon, netfid);
 			}
 		}
@@ -1737,8 +1737,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	struct cifs_unix_set_info_args *args = NULL;
 	struct cifsFileInfo *open_file;
 
-	cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x",
-		 direntry->d_name.name, attrs->ia_valid));
+	cFYI(1, "setattr_unix on file %s attrs->ia_valid=0x%x",
+		 direntry->d_name.name, attrs->ia_valid);
 
 	xid = GetXid();
 
@@ -1868,8 +1868,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 
 	xid = GetXid();
 
-	cFYI(1, ("setattr on file %s attrs->iavalid 0x%x",
-		 direntry->d_name.name, attrs->ia_valid));
+	cFYI(1, "setattr on file %s attrs->iavalid 0x%x",
+		 direntry->d_name.name, attrs->ia_valid);
 
 	if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) {
 		/* check if we have permission to change attrs */
@@ -1926,7 +1926,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 		attrs->ia_valid &= ~ATTR_MODE;
 
 	if (attrs->ia_valid & ATTR_MODE) {
-		cFYI(1, ("Mode changed to 0%o", attrs->ia_mode));
+		cFYI(1, "Mode changed to 0%o", attrs->ia_mode);
 		mode = attrs->ia_mode;
 	}
 
@@ -2012,7 +2012,7 @@ cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 #if 0
 void cifs_delete_inode(struct inode *inode)
 {
-	cFYI(1, ("In cifs_delete_inode, inode = 0x%p", inode));
+	cFYI(1, "In cifs_delete_inode, inode = 0x%p", inode);
 	/* may have to add back in if and when safe distributed caching of
 	   directories added e.g. via FindNotify */
 }
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index f94650683a0..505926f1ee6 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -47,7 +47,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 
 	xid = GetXid();
 
-	cFYI(1, ("ioctl file %p  cmd %u  arg %lu", filep, command, arg));
+	cFYI(1, "ioctl file %p  cmd %u  arg %lu", filep, command, arg);
 
 	cifs_sb = CIFS_SB(inode->i_sb);
 
@@ -64,12 +64,12 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 
 	switch (command) {
 		case CIFS_IOC_CHECKUMOUNT:
-			cFYI(1, ("User unmount attempted"));
+			cFYI(1, "User unmount attempted");
 			if (cifs_sb->mnt_uid == current_uid())
 				rc = 0;
 			else {
 				rc = -EACCES;
-				cFYI(1, ("uids do not match"));
+				cFYI(1, "uids do not match");
 			}
 			break;
 #ifdef CONFIG_CIFS_POSIX
@@ -97,11 +97,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 				/* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid,
 					extAttrBits, &ExtAttrMask);*/
 			}
-			cFYI(1, ("set flags not implemented yet"));
+			cFYI(1, "set flags not implemented yet");
 			break;
 #endif /* CONFIG_CIFS_POSIX */
 		default:
-			cFYI(1, ("unsupported ioctl"));
+			cFYI(1, "unsupported ioctl");
 			break;
 	}
 
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index c1a9d4236a8..473ca803365 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -139,7 +139,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 	if (!full_path)
 		goto out;
 
-	cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode));
+	cFYI(1, "Full path: %s inode = 0x%p", full_path, inode);
 
 	rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
 				     cifs_sb->local_nls);
@@ -178,8 +178,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 		return rc;
 	}
 
-	cFYI(1, ("Full path: %s", full_path));
-	cFYI(1, ("symname is %s", symname));
+	cFYI(1, "Full path: %s", full_path);
+	cFYI(1, "symname is %s", symname);
 
 	/* BB what if DFS and this volume is on different share? BB */
 	if (pTcon->unix_ext)
@@ -198,8 +198,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
 						 inode->i_sb, xid, NULL);
 
 		if (rc != 0) {
-			cFYI(1, ("Create symlink ok, getinodeinfo fail rc = %d",
-			      rc));
+			cFYI(1, "Create symlink ok, getinodeinfo fail rc = %d",
+			      rc);
 		} else {
 			if (pTcon->nocase)
 				direntry->d_op = &cifs_ci_dentry_ops;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d1474996a81..1394aa37f26 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -51,7 +51,7 @@ _GetXid(void)
 	if (GlobalTotalActiveXid > GlobalMaxActiveXid)
 		GlobalMaxActiveXid = GlobalTotalActiveXid;
 	if (GlobalTotalActiveXid > 65000)
-		cFYI(1, ("warning: more than 65000 requests active"));
+		cFYI(1, "warning: more than 65000 requests active");
 	xid = GlobalCurrentXid++;
 	spin_unlock(&GlobalMid_Lock);
 	return xid;
@@ -88,7 +88,7 @@ void
 sesInfoFree(struct cifsSesInfo *buf_to_free)
 {
 	if (buf_to_free == NULL) {
-		cFYI(1, ("Null buffer passed to sesInfoFree"));
+		cFYI(1, "Null buffer passed to sesInfoFree");
 		return;
 	}
 
@@ -126,7 +126,7 @@ void
 tconInfoFree(struct cifsTconInfo *buf_to_free)
 {
 	if (buf_to_free == NULL) {
-		cFYI(1, ("Null buffer passed to tconInfoFree"));
+		cFYI(1, "Null buffer passed to tconInfoFree");
 		return;
 	}
 	atomic_dec(&tconInfoAllocCount);
@@ -166,7 +166,7 @@ void
 cifs_buf_release(void *buf_to_free)
 {
 	if (buf_to_free == NULL) {
-		/* cFYI(1, ("Null buffer passed to cifs_buf_release"));*/
+		/* cFYI(1, "Null buffer passed to cifs_buf_release");*/
 		return;
 	}
 	mempool_free(buf_to_free, cifs_req_poolp);
@@ -202,7 +202,7 @@ cifs_small_buf_release(void *buf_to_free)
 {
 
 	if (buf_to_free == NULL) {
-		cFYI(1, ("Null buffer passed to cifs_small_buf_release"));
+		cFYI(1, "Null buffer passed to cifs_small_buf_release");
 		return;
 	}
 	mempool_free(buf_to_free, cifs_sm_req_poolp);
@@ -345,19 +345,19 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 		/*      with userid/password pairs found on the smb session   */
 		/*	for other target tcp/ip addresses 		BB    */
 				if (current_fsuid() != treeCon->ses->linux_uid) {
-					cFYI(1, ("Multiuser mode and UID "
-						 "did not match tcon uid"));
+					cFYI(1, "Multiuser mode and UID "
+						 "did not match tcon uid");
 					read_lock(&cifs_tcp_ses_lock);
 					list_for_each(temp_item, &treeCon->ses->server->smb_ses_list) {
 						ses = list_entry(temp_item, struct cifsSesInfo, smb_ses_list);
 						if (ses->linux_uid == current_fsuid()) {
 							if (ses->server == treeCon->ses->server) {
-								cFYI(1, ("found matching uid substitute right smb_uid"));
+								cFYI(1, "found matching uid substitute right smb_uid");
 								buffer->Uid = ses->Suid;
 								break;
 							} else {
 				/* BB eventually call cifs_setup_session here */
-								cFYI(1, ("local UID found but no smb sess with this server exists"));
+								cFYI(1, "local UID found but no smb sess with this server exists");
 							}
 						}
 					}
@@ -394,17 +394,16 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid)
 			if (smb->Command == SMB_COM_LOCKING_ANDX)
 				return 0;
 			else
-				cERROR(1, ("Received Request not response"));
+				cERROR(1, "Received Request not response");
 		}
 	} else { /* bad signature or mid */
 		if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
-			cERROR(1,
-			       ("Bad protocol string signature header %x",
-				*(unsigned int *) smb->Protocol));
+			cERROR(1, "Bad protocol string signature header %x",
+				*(unsigned int *) smb->Protocol);
 		if (mid != smb->Mid)
-			cERROR(1, ("Mids do not match"));
+			cERROR(1, "Mids do not match");
 	}
-	cERROR(1, ("bad smb detected. The Mid=%d", smb->Mid));
+	cERROR(1, "bad smb detected. The Mid=%d", smb->Mid);
 	return 1;
 }
 
@@ -413,7 +412,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 {
 	__u32 len = smb->smb_buf_length;
 	__u32 clc_len;  /* calculated length */
-	cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len));
+	cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
 
 	if (length < 2 + sizeof(struct smb_hdr)) {
 		if ((length >= sizeof(struct smb_hdr) - 1)
@@ -437,15 +436,15 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 				tmp[sizeof(struct smb_hdr)+1] = 0;
 				return 0;
 			}
-			cERROR(1, ("rcvd invalid byte count (bcc)"));
+			cERROR(1, "rcvd invalid byte count (bcc)");
 		} else {
-			cERROR(1, ("Length less than smb header size"));
+			cERROR(1, "Length less than smb header size");
 		}
 		return 1;
 	}
 	if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
-		cERROR(1, ("smb length greater than MaxBufSize, mid=%d",
-				   smb->Mid));
+		cERROR(1, "smb length greater than MaxBufSize, mid=%d",
+				   smb->Mid);
 		return 1;
 	}
 
@@ -454,8 +453,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 	clc_len = smbCalcSize_LE(smb);
 
 	if (4 + len != length) {
-		cERROR(1, ("Length read does not match RFC1001 length %d",
-			   len));
+		cERROR(1, "Length read does not match RFC1001 length %d",
+			   len);
 		return 1;
 	}
 
@@ -466,8 +465,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 			if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
 				return 0; /* bcc wrapped */
 		}
-		cFYI(1, ("Calculated size %d vs length %d mismatch for mid %d",
-				clc_len, 4 + len, smb->Mid));
+		cFYI(1, "Calculated size %d vs length %d mismatch for mid %d",
+				clc_len, 4 + len, smb->Mid);
 		/* Windows XP can return a few bytes too much, presumably
 		an illegal pad, at the end of byte range lock responses
 		so we allow for that three byte pad, as long as actual
@@ -482,8 +481,8 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 		if ((4+len > clc_len) && (len <= clc_len + 512))
 			return 0;
 		else {
-			cERROR(1, ("RFC1001 size %d bigger than SMB for Mid=%d",
-					len, smb->Mid));
+			cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
+					len, smb->Mid);
 			return 1;
 		}
 	}
@@ -501,7 +500,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 	struct cifsFileInfo *netfile;
 	int rc;
 
-	cFYI(1, ("Checking for oplock break or dnotify response"));
+	cFYI(1, "Checking for oplock break or dnotify response");
 	if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
 	   (pSMB->hdr.Flags & SMBFLG_RESPONSE)) {
 		struct smb_com_transaction_change_notify_rsp *pSMBr =
@@ -513,15 +512,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 
 			pnotify = (struct file_notify_information *)
 				((char *)&pSMBr->hdr.Protocol + data_offset);
-			cFYI(1, ("dnotify on %s Action: 0x%x",
-				 pnotify->FileName, pnotify->Action));
+			cFYI(1, "dnotify on %s Action: 0x%x",
+				 pnotify->FileName, pnotify->Action);
 			/*   cifs_dump_mem("Rcvd notify Data: ",buf,
 				sizeof(struct smb_hdr)+60); */
 			return true;
 		}
 		if (pSMBr->hdr.Status.CifsError) {
-			cFYI(1, ("notify err 0x%d",
-				pSMBr->hdr.Status.CifsError));
+			cFYI(1, "notify err 0x%d",
+				pSMBr->hdr.Status.CifsError);
 			return true;
 		}
 		return false;
@@ -535,7 +534,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 		   large dirty files cached on the client */
 		if ((NT_STATUS_INVALID_HANDLE) ==
 		   le32_to_cpu(pSMB->hdr.Status.CifsError)) {
-			cFYI(1, ("invalid handle on oplock break"));
+			cFYI(1, "invalid handle on oplock break");
 			return true;
 		} else if (ERRbadfid ==
 		   le16_to_cpu(pSMB->hdr.Status.DosError.Error)) {
@@ -547,8 +546,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 	if (pSMB->hdr.WordCount != 8)
 		return false;
 
-	cFYI(1, ("oplock type 0x%d level 0x%d",
-		 pSMB->LockType, pSMB->OplockLevel));
+	cFYI(1, "oplock type 0x%d level 0x%d",
+		 pSMB->LockType, pSMB->OplockLevel);
 	if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
 		return false;
 
@@ -579,15 +578,15 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 					return true;
 				}
 
-				cFYI(1, ("file id match, oplock break"));
+				cFYI(1, "file id match, oplock break");
 				pCifsInode = CIFS_I(netfile->pInode);
 				pCifsInode->clientCanCacheAll = false;
 				if (pSMB->OplockLevel == 0)
 					pCifsInode->clientCanCacheRead = false;
 				rc = slow_work_enqueue(&netfile->oplock_break);
 				if (rc) {
-					cERROR(1, ("failed to enqueue oplock "
-						   "break: %d\n", rc));
+					cERROR(1, "failed to enqueue oplock "
+						   "break: %d\n", rc);
 				} else {
 					netfile->oplock_break_cancelled = false;
 				}
@@ -597,12 +596,12 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 			}
 			read_unlock(&GlobalSMBSeslock);
 			read_unlock(&cifs_tcp_ses_lock);
-			cFYI(1, ("No matching file for oplock break"));
+			cFYI(1, "No matching file for oplock break");
 			return true;
 		}
 	}
 	read_unlock(&cifs_tcp_ses_lock);
-	cFYI(1, ("Can not process oplock break for non-existent connection"));
+	cFYI(1, "Can not process oplock break for non-existent connection");
 	return true;
 }
 
@@ -721,11 +720,11 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
 {
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
 		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
-		cERROR(1, ("Autodisabling the use of server inode numbers on "
+		cERROR(1, "Autodisabling the use of server inode numbers on "
 			   "%s. This server doesn't seem to support them "
 			   "properly. Hardlinks will not be recognized on this "
 			   "mount. Consider mounting with the \"noserverino\" "
 			   "option to silence this message.",
-			   cifs_sb->tcon->treeName));
+			   cifs_sb->tcon->treeName);
 	}
 }
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index bd6d6895730..d35d52889cb 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -149,7 +149,7 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
 	else if (address_family == AF_INET6)
 		ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
 
-	cFYI(DBG2, ("address conversion returned %d for %s", ret, cp));
+	cFYI(DBG2, "address conversion returned %d for %s", ret, cp);
 	if (ret > 0)
 		ret = 1;
 	return ret;
@@ -870,8 +870,8 @@ map_smb_to_linux_error(struct smb_hdr *smb, int logErr)
 	}
 	/* else ERRHRD class errors or junk  - return EIO */
 
-	cFYI(1, ("Mapping smb error code %d to POSIX err %d",
-		 smberrcode, rc));
+	cFYI(1, "Mapping smb error code %d to POSIX err %d",
+		 smberrcode, rc);
 
 	/* generic corrective action e.g. reconnect SMB session on
 	 * ERRbaduid could be added */
@@ -940,20 +940,20 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
 	SMB_TIME *st = (SMB_TIME *)&time;
 	SMB_DATE *sd = (SMB_DATE *)&date;
 
-	cFYI(1, ("date %d time %d", date, time));
+	cFYI(1, "date %d time %d", date, time);
 
 	sec = 2 * st->TwoSeconds;
 	min = st->Minutes;
 	if ((sec > 59) || (min > 59))
-		cERROR(1, ("illegal time min %d sec %d", min, sec));
+		cERROR(1, "illegal time min %d sec %d", min, sec);
 	sec += (min * 60);
 	sec += 60 * 60 * st->Hours;
 	if (st->Hours > 24)
-		cERROR(1, ("illegal hours %d", st->Hours));
+		cERROR(1, "illegal hours %d", st->Hours);
 	days = sd->Day;
 	month = sd->Month;
 	if ((days > 31) || (month > 12)) {
-		cERROR(1, ("illegal date, month %d day: %d", month, days));
+		cERROR(1, "illegal date, month %d day: %d", month, days);
 		if (month > 12)
 			month = 12;
 	}
@@ -979,7 +979,7 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
 
 	ts.tv_sec = sec + offset;
 
-	/* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */
+	/* cFYI(1, "sec after cnvrt dos to unix time %d",sec); */
 
 	ts.tv_nsec = 0;
 	return ts;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 18e0bc1fb59..daf1753af67 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -47,15 +47,15 @@ static void dump_cifs_file_struct(struct file *file, char *label)
 	if (file) {
 		cf = file->private_data;
 		if (cf == NULL) {
-			cFYI(1, ("empty cifs private file data"));
+			cFYI(1, "empty cifs private file data");
 			return;
 		}
 		if (cf->invalidHandle)
-			cFYI(1, ("invalid handle"));
+			cFYI(1, "invalid handle");
 		if (cf->srch_inf.endOfSearch)
-			cFYI(1, ("end of search"));
+			cFYI(1, "end of search");
 		if (cf->srch_inf.emptyDir)
-			cFYI(1, ("empty dir"));
+			cFYI(1, "empty dir");
 	}
 }
 #else
@@ -76,7 +76,7 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
 	struct inode *inode;
 	struct super_block *sb = parent->d_inode->i_sb;
 
-	cFYI(1, ("For %s", name->name));
+	cFYI(1, "For %s", name->name);
 
 	if (parent->d_op && parent->d_op->d_hash)
 		parent->d_op->d_hash(parent, name);
@@ -214,7 +214,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
 				fid,
 				cifs_sb->local_nls);
 		if (CIFSSMBClose(xid, ptcon, fid)) {
-			cFYI(1, ("Error closing temporary reparsepoint open)"));
+			cFYI(1, "Error closing temporary reparsepoint open");
 		}
 	}
 }
@@ -252,7 +252,7 @@ static int initiate_cifs_search(const int xid, struct file *file)
 	if (full_path == NULL)
 		return -ENOMEM;
 
-	cFYI(1, ("Full path: %s start at: %lld", full_path, file->f_pos));
+	cFYI(1, "Full path: %s start at: %lld", full_path, file->f_pos);
 
 ffirst_retry:
 	/* test for Unix extensions */
@@ -297,7 +297,7 @@ static int cifs_unicode_bytelen(char *str)
 		if (ustr[len] == 0)
 			return len << 1;
 	}
-	cFYI(1, ("Unicode string longer than PATH_MAX found"));
+	cFYI(1, "Unicode string longer than PATH_MAX found");
 	return len << 1;
 }
 
@@ -314,19 +314,18 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
 				pfData->FileNameLength;
 	} else
 		new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
-	cFYI(1, ("new entry %p old entry %p", new_entry, old_entry));
+	cFYI(1, "new entry %p old entry %p", new_entry, old_entry);
 	/* validate that new_entry is not past end of SMB */
 	if (new_entry >= end_of_smb) {
-		cERROR(1,
-		      ("search entry %p began after end of SMB %p old entry %p",
-			new_entry, end_of_smb, old_entry));
+		cERROR(1, "search entry %p began after end of SMB %p old entry %p",
+			new_entry, end_of_smb, old_entry);
 		return NULL;
 	} else if (((level == SMB_FIND_FILE_INFO_STANDARD) &&
 		    (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb))
 		  || ((level != SMB_FIND_FILE_INFO_STANDARD) &&
 		   (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb)))  {
-		cERROR(1, ("search entry %p extends after end of SMB %p",
-			new_entry, end_of_smb));
+		cERROR(1, "search entry %p extends after end of SMB %p",
+			new_entry, end_of_smb);
 		return NULL;
 	} else
 		return new_entry;
@@ -380,8 +379,8 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile)
 		filename = &pFindData->FileName[0];
 		len = pFindData->FileNameLength;
 	} else {
-		cFYI(1, ("Unknown findfirst level %d",
-			 cfile->srch_inf.info_level));
+		cFYI(1, "Unknown findfirst level %d",
+			 cfile->srch_inf.info_level);
 	}
 
 	if (filename) {
@@ -481,7 +480,7 @@ static int cifs_save_resume_key(const char *current_entry,
 		len = (unsigned int)pFindData->FileNameLength;
 		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
 	} else {
-		cFYI(1, ("Unknown findfirst level %d", level));
+		cFYI(1, "Unknown findfirst level %d", level);
 		return -EINVAL;
 	}
 	cifsFile->srch_inf.resume_name_len = len;
@@ -525,7 +524,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 	     is_dir_changed(file)) ||
 	   (index_to_find < first_entry_in_buffer)) {
 		/* close and restart search */
-		cFYI(1, ("search backing up - close and restart search"));
+		cFYI(1, "search backing up - close and restart search");
 		write_lock(&GlobalSMBSeslock);
 		if (!cifsFile->srch_inf.endOfSearch &&
 		    !cifsFile->invalidHandle) {
@@ -535,7 +534,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		} else
 			write_unlock(&GlobalSMBSeslock);
 		if (cifsFile->srch_inf.ntwrk_buf_start) {
-			cFYI(1, ("freeing SMB ff cache buf on search rewind"));
+			cFYI(1, "freeing SMB ff cache buf on search rewind");
 			if (cifsFile->srch_inf.smallBuf)
 				cifs_small_buf_release(cifsFile->srch_inf.
 						ntwrk_buf_start);
@@ -546,8 +545,8 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		}
 		rc = initiate_cifs_search(xid, file);
 		if (rc) {
-			cFYI(1, ("error %d reinitiating a search on rewind",
-				 rc));
+			cFYI(1, "error %d reinitiating a search on rewind",
+				 rc);
 			return rc;
 		}
 		cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -555,7 +554,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 
 	while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
 	      (rc == 0) && !cifsFile->srch_inf.endOfSearch) {
-		cFYI(1, ("calling findnext2"));
+		cFYI(1, "calling findnext2");
 		rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
 				  &cifsFile->srch_inf);
 		cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
@@ -575,7 +574,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry
 					- cifsFile->srch_inf.entries_in_buffer;
 		pos_in_buf = index_to_find - first_entry_in_buffer;
-		cFYI(1, ("found entry - pos_in_buf %d", pos_in_buf));
+		cFYI(1, "found entry - pos_in_buf %d", pos_in_buf);
 
 		for (i = 0; (i < (pos_in_buf)) && (current_entry != NULL); i++) {
 			/* go entry by entry figuring out which is first */
@@ -584,19 +583,19 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 		}
 		if ((current_entry == NULL) && (i < pos_in_buf)) {
 			/* BB fixme - check if we should flag this error */
-			cERROR(1, ("reached end of buf searching for pos in buf"
+			cERROR(1, "reached end of buf searching for pos in buf"
 			  " %d index to find %lld rc %d",
-			  pos_in_buf, index_to_find, rc));
+			  pos_in_buf, index_to_find, rc);
 		}
 		rc = 0;
 		*ppCurrentEntry = current_entry;
 	} else {
-		cFYI(1, ("index not in buffer - could not findnext into it"));
+		cFYI(1, "index not in buffer - could not findnext into it");
 		return 0;
 	}
 
 	if (pos_in_buf >= cifsFile->srch_inf.entries_in_buffer) {
-		cFYI(1, ("can not return entries pos_in_buf beyond last"));
+		cFYI(1, "can not return entries pos_in_buf beyond last");
 		*num_to_ret = 0;
 	} else
 		*num_to_ret = cifsFile->srch_inf.entries_in_buffer - pos_in_buf;
@@ -656,12 +655,12 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
 		/* one byte length, no name conversion */
 		len = (unsigned int)pFindData->FileNameLength;
 	} else {
-		cFYI(1, ("Unknown findfirst level %d", level));
+		cFYI(1, "Unknown findfirst level %d", level);
 		return -EINVAL;
 	}
 
 	if (len > max_len) {
-		cERROR(1, ("bad search response length %d past smb end", len));
+		cERROR(1, "bad search response length %d past smb end", len);
 		return -EINVAL;
 	}
 
@@ -754,7 +753,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
 	 * case already. Why should we be clobbering other errors from it?
 	 */
 	if (rc) {
-		cFYI(1, ("filldir rc = %d", rc));
+		cFYI(1, "filldir rc = %d", rc);
 		rc = -EOVERFLOW;
 	}
 	dput(tmp_dentry);
@@ -786,7 +785,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	case 0:
 		if (filldir(direntry, ".", 1, file->f_pos,
 		     file->f_path.dentry->d_inode->i_ino, DT_DIR) < 0) {
-			cERROR(1, ("Filldir for current dir failed"));
+			cERROR(1, "Filldir for current dir failed");
 			rc = -ENOMEM;
 			break;
 		}
@@ -794,7 +793,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 	case 1:
 		if (filldir(direntry, "..", 2, file->f_pos,
 		     file->f_path.dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) {
-			cERROR(1, ("Filldir for parent dir failed"));
+			cERROR(1, "Filldir for parent dir failed");
 			rc = -ENOMEM;
 			break;
 		}
@@ -807,7 +806,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 
 		if (file->private_data == NULL) {
 			rc = initiate_cifs_search(xid, file);
-			cFYI(1, ("initiate cifs search rc %d", rc));
+			cFYI(1, "initiate cifs search rc %d", rc);
 			if (rc) {
 				FreeXid(xid);
 				return rc;
@@ -821,7 +820,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 		cifsFile = file->private_data;
 		if (cifsFile->srch_inf.endOfSearch) {
 			if (cifsFile->srch_inf.emptyDir) {
-				cFYI(1, ("End of search, empty dir"));
+				cFYI(1, "End of search, empty dir");
 				rc = 0;
 				break;
 			}
@@ -833,16 +832,16 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 		rc = find_cifs_entry(xid, pTcon, file,
 				&current_entry, &num_to_fill);
 		if (rc) {
-			cFYI(1, ("fce error %d", rc));
+			cFYI(1, "fce error %d", rc);
 			goto rddir2_exit;
 		} else if (current_entry != NULL) {
-			cFYI(1, ("entry %lld found", file->f_pos));
+			cFYI(1, "entry %lld found", file->f_pos);
 		} else {
-			cFYI(1, ("could not find entry"));
+			cFYI(1, "could not find entry");
 			goto rddir2_exit;
 		}
-		cFYI(1, ("loop through %d times filling dir for net buf %p",
-			num_to_fill, cifsFile->srch_inf.ntwrk_buf_start));
+		cFYI(1, "loop through %d times filling dir for net buf %p",
+			num_to_fill, cifsFile->srch_inf.ntwrk_buf_start);
 		max_len = smbCalcSize((struct smb_hdr *)
 				cifsFile->srch_inf.ntwrk_buf_start);
 		end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len;
@@ -851,8 +850,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 		for (i = 0; (i < num_to_fill) && (rc == 0); i++) {
 			if (current_entry == NULL) {
 				/* evaluate whether this case is an error */
-				cERROR(1, ("past SMB end,  num to fill %d i %d",
-					  num_to_fill, i));
+				cERROR(1, "past SMB end,  num to fill %d i %d",
+					  num_to_fill, i);
 				break;
 			}
 			/* if buggy server returns . and .. late do
@@ -867,8 +866,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
 			file->f_pos++;
 			if (file->f_pos ==
 				cifsFile->srch_inf.index_of_last_entry) {
-				cFYI(1, ("last entry in buf at pos %lld %s",
-					file->f_pos, tmp_buf));
+				cFYI(1, "last entry in buf at pos %lld %s",
+					file->f_pos, tmp_buf);
 				cifs_save_resume_key(current_entry, cifsFile);
 				break;
 			} else
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 7c3fd7463f4..da9729da03e 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -284,7 +284,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 	int len;
 	char *data = *pbcc_area;
 
-	cFYI(1, ("bleft %d", bleft));
+	cFYI(1, "bleft %d", bleft);
 
 	/*
 	 * Windows servers do not always double null terminate their final
@@ -301,7 +301,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 
 	kfree(ses->serverOS);
 	ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
-	cFYI(1, ("serverOS=%s", ses->serverOS));
+	cFYI(1, "serverOS=%s", ses->serverOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
 	bleft -= len;
@@ -310,7 +310,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 
 	kfree(ses->serverNOS);
 	ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
-	cFYI(1, ("serverNOS=%s", ses->serverNOS));
+	cFYI(1, "serverNOS=%s", ses->serverNOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
 	bleft -= len;
@@ -319,7 +319,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifsSesInfo *ses,
 
 	kfree(ses->serverDomain);
 	ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
-	cFYI(1, ("serverDomain=%s", ses->serverDomain));
+	cFYI(1, "serverDomain=%s", ses->serverDomain);
 
 	return;
 }
@@ -332,7 +332,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 	int len;
 	char *bcc_ptr = *pbcc_area;
 
-	cFYI(1, ("decode sessetup ascii. bleft %d", bleft));
+	cFYI(1, "decode sessetup ascii. bleft %d", bleft);
 
 	len = strnlen(bcc_ptr, bleft);
 	if (len >= bleft)
@@ -344,7 +344,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 	if (ses->serverOS)
 		strncpy(ses->serverOS, bcc_ptr, len);
 	if (strncmp(ses->serverOS, "OS/2", 4) == 0) {
-			cFYI(1, ("OS/2 server"));
+			cFYI(1, "OS/2 server");
 			ses->flags |= CIFS_SES_OS2;
 	}
 
@@ -373,7 +373,7 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 	/* BB For newer servers which do not support Unicode,
 	   but thus do return domain here we could add parsing
 	   for it later, but it is not very important */
-	cFYI(1, ("ascii: bytes left %d", bleft));
+	cFYI(1, "ascii: bytes left %d", bleft);
 
 	return rc;
 }
@@ -384,16 +384,16 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 	CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
 
 	if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
-		cERROR(1, ("challenge blob len %d too small", blob_len));
+		cERROR(1, "challenge blob len %d too small", blob_len);
 		return -EINVAL;
 	}
 
 	if (memcmp(pblob->Signature, "NTLMSSP", 8)) {
-		cERROR(1, ("blob signature incorrect %s", pblob->Signature));
+		cERROR(1, "blob signature incorrect %s", pblob->Signature);
 		return -EINVAL;
 	}
 	if (pblob->MessageType != NtLmChallenge) {
-		cERROR(1, ("Incorrect message type %d", pblob->MessageType));
+		cERROR(1, "Incorrect message type %d", pblob->MessageType);
 		return -EINVAL;
 	}
 
@@ -583,7 +583,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 
 	type = ses->server->secType;
 
-	cFYI(1, ("sess setup type %d", type));
+	cFYI(1, "sess setup type %d", type);
 ssetup_ntlmssp_authenticate:
 	if (phase == NtLmChallenge)
 		phase = NtLmAuthenticate; /* if ntlmssp, now final phase */
@@ -664,7 +664,7 @@ ssetup_ntlmssp_authenticate:
 		changed to do higher than lanman dialect and
 		we reconnected would we ever calc signing_key? */
 
-		cFYI(1, ("Negotiating LANMAN setting up strings"));
+		cFYI(1, "Negotiating LANMAN setting up strings");
 		/* Unicode not allowed for LANMAN dialects */
 		ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #endif
@@ -758,17 +758,17 @@ ssetup_ntlmssp_authenticate:
 		/* check version field to make sure that cifs.upcall is
 		   sending us a response in an expected form */
 		if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
-			cERROR(1, ("incorrect version of cifs.upcall (expected"
+			cERROR(1, "incorrect version of cifs.upcall (expected"
 				   " %d but got %d)",
-				   CIFS_SPNEGO_UPCALL_VERSION, msg->version));
+				   CIFS_SPNEGO_UPCALL_VERSION, msg->version);
 			rc = -EKEYREJECTED;
 			goto ssetup_exit;
 		}
 		/* bail out if key is too long */
 		if (msg->sesskey_len >
 		    sizeof(ses->server->mac_signing_key.data.krb5)) {
-			cERROR(1, ("Kerberos signing key too long (%u bytes)",
-				msg->sesskey_len));
+			cERROR(1, "Kerberos signing key too long (%u bytes)",
+				msg->sesskey_len);
 			rc = -EOVERFLOW;
 			goto ssetup_exit;
 		}
@@ -796,7 +796,7 @@ ssetup_ntlmssp_authenticate:
 		/* BB: is this right? */
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
 #else /* ! CONFIG_CIFS_UPCALL */
-		cERROR(1, ("Kerberos negotiated but upcall support disabled!"));
+		cERROR(1, "Kerberos negotiated but upcall support disabled!");
 		rc = -ENOSYS;
 		goto ssetup_exit;
 #endif /* CONFIG_CIFS_UPCALL */
@@ -804,12 +804,12 @@ ssetup_ntlmssp_authenticate:
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 		if (type == RawNTLMSSP) {
 			if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
-				cERROR(1, ("NTLMSSP requires Unicode support"));
+				cERROR(1, "NTLMSSP requires Unicode support");
 				rc = -ENOSYS;
 				goto ssetup_exit;
 			}
 
-			cFYI(1, ("ntlmssp session setup phase %d", phase));
+			cFYI(1, "ntlmssp session setup phase %d", phase);
 			pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
 			capabilities |= CAP_EXTENDED_SECURITY;
 			pSMB->req.Capabilities |= cpu_to_le32(capabilities);
@@ -827,7 +827,7 @@ ssetup_ntlmssp_authenticate:
 				   on the response (challenge) */
 				smb_buf->Uid = ses->Suid;
 			} else {
-				cERROR(1, ("invalid phase %d", phase));
+				cERROR(1, "invalid phase %d", phase);
 				rc = -ENOSYS;
 				goto ssetup_exit;
 			}
@@ -839,12 +839,12 @@ ssetup_ntlmssp_authenticate:
 			}
 			unicode_oslm_strings(&bcc_ptr, nls_cp);
 		} else {
-			cERROR(1, ("secType %d not supported!", type));
+			cERROR(1, "secType %d not supported!", type);
 			rc = -ENOSYS;
 			goto ssetup_exit;
 		}
 #else
-		cERROR(1, ("secType %d not supported!", type));
+		cERROR(1, "secType %d not supported!", type);
 		rc = -ENOSYS;
 		goto ssetup_exit;
 #endif
@@ -862,7 +862,7 @@ ssetup_ntlmssp_authenticate:
 			  CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR);
 	/* SMB request buf freed in SendReceive2 */
 
-	cFYI(1, ("ssetup rc from sendrecv2 is %d", rc));
+	cFYI(1, "ssetup rc from sendrecv2 is %d", rc);
 
 	pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base;
 	smb_buf = (struct smb_hdr *)iov[0].iov_base;
@@ -870,7 +870,7 @@ ssetup_ntlmssp_authenticate:
 	if ((type == RawNTLMSSP) && (smb_buf->Status.CifsError ==
 			cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) {
 		if (phase != NtLmNegotiate) {
-			cERROR(1, ("Unexpected more processing error"));
+			cERROR(1, "Unexpected more processing error");
 			goto ssetup_exit;
 		}
 		/* NTLMSSP Negotiate sent now processing challenge (response) */
@@ -882,14 +882,14 @@ ssetup_ntlmssp_authenticate:
 
 	if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) {
 		rc = -EIO;
-		cERROR(1, ("bad word count %d", smb_buf->WordCount));
+		cERROR(1, "bad word count %d", smb_buf->WordCount);
 		goto ssetup_exit;
 	}
 	action = le16_to_cpu(pSMB->resp.Action);
 	if (action & GUEST_LOGIN)
-		cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */
+		cFYI(1, "Guest login"); /* BB mark SesInfo struct? */
 	ses->Suid = smb_buf->Uid;   /* UID left in wire format (le) */
-	cFYI(1, ("UID = %d ", ses->Suid));
+	cFYI(1, "UID = %d ", ses->Suid);
 	/* response can have either 3 or 4 word count - Samba sends 3 */
 	/* and lanman response is 3 */
 	bytes_remaining = BCC(smb_buf);
@@ -899,7 +899,7 @@ ssetup_ntlmssp_authenticate:
 		__u16 blob_len;
 		blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
 		if (blob_len > bytes_remaining) {
-			cERROR(1, ("bad security blob length %d", blob_len));
+			cERROR(1, "bad security blob length %d", blob_len);
 			rc = -EINVAL;
 			goto ssetup_exit;
 		}
@@ -933,7 +933,7 @@ ssetup_exit:
 	}
 	kfree(str_area);
 	if (resp_buf_type == CIFS_SMALL_BUFFER) {
-		cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
+		cFYI(1, "ssetup freeing small buf %p", iov[0].iov_base);
 		cifs_small_buf_release(iov[0].iov_base);
 	} else if (resp_buf_type == CIFS_LARGE_BUFFER)
 		cifs_buf_release(iov[0].iov_base);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index ad081fe7eb1..28f563cef5d 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -43,7 +43,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 	struct mid_q_entry *temp;
 
 	if (server == NULL) {
-		cERROR(1, ("Null TCP session in AllocMidQEntry"));
+		cERROR(1, "Null TCP session in AllocMidQEntry");
 		return NULL;
 	}
 
@@ -55,7 +55,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 		temp->mid = smb_buffer->Mid;	/* always LE */
 		temp->pid = current->pid;
 		temp->command = smb_buffer->Command;
-		cFYI(1, ("For smb_command %d", temp->command));
+		cFYI(1, "For smb_command %d", temp->command);
 	/*	do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
 		/* when mid allocated can be before when sent */
 		temp->when_alloc = jiffies;
@@ -140,7 +140,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 		total_len += iov[i].iov_len;
 
 	smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
-	cFYI(1, ("Sending smb:  total_len %d", total_len));
+	cFYI(1, "Sending smb:  total_len %d", total_len);
 	dump_smb(smb_buffer, len);
 
 	i = 0;
@@ -168,9 +168,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 			   reconnect which may clear the network problem.
 			*/
 			if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
-				cERROR(1,
-				   ("sends on sock %p stuck for 15 seconds",
-				    ssocket));
+				cERROR(1, "sends on sock %p stuck for 15 seconds",
+				    ssocket);
 				rc = -EAGAIN;
 				break;
 			}
@@ -184,13 +183,13 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 			total_len = 0;
 			break;
 		} else if (rc > total_len) {
-			cERROR(1, ("sent %d requested %d", rc, total_len));
+			cERROR(1, "sent %d requested %d", rc, total_len);
 			break;
 		}
 		if (rc == 0) {
 			/* should never happen, letting socket clear before
 			   retrying is our only obvious option here */
-			cERROR(1, ("tcp sent no data"));
+			cERROR(1, "tcp sent no data");
 			msleep(500);
 			continue;
 		}
@@ -213,8 +212,8 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 	}
 
 	if ((total_len > 0) && (total_len != smb_buf_length + 4)) {
-		cFYI(1, ("partial send (%d remaining), terminating session",
-			total_len));
+		cFYI(1, "partial send (%d remaining), terminating session",
+			total_len);
 		/* If we have only sent part of an SMB then the next SMB
 		   could be taken as the remainder of this one.  We need
 		   to kill the socket so the server throws away the partial
@@ -223,7 +222,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 	}
 
 	if (rc < 0) {
-		cERROR(1, ("Error %d sending data on socket to server", rc));
+		cERROR(1, "Error %d sending data on socket to server", rc);
 	} else
 		rc = 0;
 
@@ -296,7 +295,7 @@ static int allocate_mid(struct cifsSesInfo *ses, struct smb_hdr *in_buf,
 	}
 
 	if (ses->server->tcpStatus == CifsNeedReconnect) {
-		cFYI(1, ("tcp session dead - return to caller to retry"));
+		cFYI(1, "tcp session dead - return to caller to retry");
 		return -EAGAIN;
 	}
 
@@ -348,7 +347,7 @@ static int wait_for_response(struct cifsSesInfo *ses,
 			lrt += time_to_wait;
 			if (time_after(jiffies, lrt)) {
 				/* No replies for time_to_wait. */
-				cERROR(1, ("server not responding"));
+				cERROR(1, "server not responding");
 				return -1;
 			}
 		} else {
@@ -379,7 +378,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
 	iov[0].iov_len = in_buf->smb_buf_length + 4;
 	flags |= CIFS_NO_RESP;
 	rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
-	cFYI(DBG2, ("SendRcvNoRsp flags %d rc %d", flags, rc));
+	cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);
 
 	return rc;
 }
@@ -402,7 +401,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 
 	if ((ses == NULL) || (ses->server == NULL)) {
 		cifs_small_buf_release(in_buf);
-		cERROR(1, ("Null session"));
+		cERROR(1, "Null session");
 		return -EIO;
 	}
 
@@ -471,7 +470,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	else if (long_op == CIFS_BLOCKING_OP)
 		timeout = 0x7FFFFFFF; /*  large, but not so large as to wrap */
 	else {
-		cERROR(1, ("unknown timeout flag %d", long_op));
+		cERROR(1, "unknown timeout flag %d", long_op);
 		rc = -EIO;
 		goto out;
 	}
@@ -490,8 +489,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	spin_lock(&GlobalMid_Lock);
 
 	if (midQ->resp_buf == NULL) {
-		cERROR(1, ("No response to cmd %d mid %d",
-			midQ->command, midQ->mid));
+		cERROR(1, "No response to cmd %d mid %d",
+			midQ->command, midQ->mid);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			if (ses->server->tcpStatus == CifsExiting)
 				rc = -EHOSTDOWN;
@@ -504,7 +503,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		if (rc != -EHOSTDOWN) {
 			if (midQ->midState == MID_RETRY_NEEDED) {
 				rc = -EAGAIN;
-				cFYI(1, ("marking request for retry"));
+				cFYI(1, "marking request for retry");
 			} else {
 				rc = -EIO;
 			}
@@ -521,8 +520,8 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 	receive_len = midQ->resp_buf->smb_buf_length;
 
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-		cERROR(1, ("Frame too large received.  Length: %d  Xid: %d",
-			receive_len, xid));
+		cERROR(1, "Frame too large received.  Length: %d  Xid: %d",
+			receive_len, xid);
 		rc = -EIO;
 		goto out;
 	}
@@ -548,7 +547,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 						&ses->server->mac_signing_key,
 						midQ->sequence_number+1);
 			if (rc) {
-				cERROR(1, ("Unexpected SMB signature"));
+				cERROR(1, "Unexpected SMB signature");
 				/* BB FIXME add code to kill session */
 			}
 		}
@@ -569,7 +568,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 						   DeleteMidQEntry */
 	} else {
 		rc = -EIO;
-		cFYI(1, ("Bad MID state?"));
+		cFYI(1, "Bad MID state?");
 	}
 
 out:
@@ -591,11 +590,11 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	struct mid_q_entry *midQ;
 
 	if (ses == NULL) {
-		cERROR(1, ("Null smb session"));
+		cERROR(1, "Null smb session");
 		return -EIO;
 	}
 	if (ses->server == NULL) {
-		cERROR(1, ("Null tcp session"));
+		cERROR(1, "Null tcp session");
 		return -EIO;
 	}
 
@@ -607,8 +606,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	   use ses->maxReq */
 
 	if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
-		cERROR(1, ("Illegal length, greater than maximum frame, %d",
-			   in_buf->smb_buf_length));
+		cERROR(1, "Illegal length, greater than maximum frame, %d",
+			   in_buf->smb_buf_length);
 		return -EIO;
 	}
 
@@ -665,7 +664,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	else if (long_op == CIFS_BLOCKING_OP)
 		timeout = 0x7FFFFFFF; /* large but no so large as to wrap */
 	else {
-		cERROR(1, ("unknown timeout flag %d", long_op));
+		cERROR(1, "unknown timeout flag %d", long_op);
 		rc = -EIO;
 		goto out;
 	}
@@ -681,8 +680,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 
 	spin_lock(&GlobalMid_Lock);
 	if (midQ->resp_buf == NULL) {
-		cERROR(1, ("No response for cmd %d mid %d",
-			  midQ->command, midQ->mid));
+		cERROR(1, "No response for cmd %d mid %d",
+			  midQ->command, midQ->mid);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			if (ses->server->tcpStatus == CifsExiting)
 				rc = -EHOSTDOWN;
@@ -695,7 +694,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		if (rc != -EHOSTDOWN) {
 			if (midQ->midState == MID_RETRY_NEEDED) {
 				rc = -EAGAIN;
-				cFYI(1, ("marking request for retry"));
+				cFYI(1, "marking request for retry");
 			} else {
 				rc = -EIO;
 			}
@@ -712,8 +711,8 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 	receive_len = midQ->resp_buf->smb_buf_length;
 
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-		cERROR(1, ("Frame too large received.  Length: %d  Xid: %d",
-			receive_len, xid));
+		cERROR(1, "Frame too large received.  Length: %d  Xid: %d",
+			receive_len, xid);
 		rc = -EIO;
 		goto out;
 	}
@@ -736,7 +735,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 						&ses->server->mac_signing_key,
 						midQ->sequence_number+1);
 			if (rc) {
-				cERROR(1, ("Unexpected SMB signature"));
+				cERROR(1, "Unexpected SMB signature");
 				/* BB FIXME add code to kill session */
 			}
 		}
@@ -753,7 +752,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 			BCC(out_buf) = le16_to_cpu(BCC_LE(out_buf));
 	} else {
 		rc = -EIO;
-		cERROR(1, ("Bad MID state?"));
+		cERROR(1, "Bad MID state?");
 	}
 
 out:
@@ -824,13 +823,13 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	struct cifsSesInfo *ses;
 
 	if (tcon == NULL || tcon->ses == NULL) {
-		cERROR(1, ("Null smb session"));
+		cERROR(1, "Null smb session");
 		return -EIO;
 	}
 	ses = tcon->ses;
 
 	if (ses->server == NULL) {
-		cERROR(1, ("Null tcp session"));
+		cERROR(1, "Null tcp session");
 		return -EIO;
 	}
 
@@ -842,8 +841,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	   use ses->maxReq */
 
 	if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
-		cERROR(1, ("Illegal length, greater than maximum frame, %d",
-			   in_buf->smb_buf_length));
+		cERROR(1, "Illegal length, greater than maximum frame, %d",
+			   in_buf->smb_buf_length);
 		return -EIO;
 	}
 
@@ -933,8 +932,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		spin_unlock(&GlobalMid_Lock);
 		receive_len = midQ->resp_buf->smb_buf_length;
 	} else {
-		cERROR(1, ("No response for cmd %d mid %d",
-			  midQ->command, midQ->mid));
+		cERROR(1, "No response for cmd %d mid %d",
+			  midQ->command, midQ->mid);
 		if (midQ->midState == MID_REQUEST_SUBMITTED) {
 			if (ses->server->tcpStatus == CifsExiting)
 				rc = -EHOSTDOWN;
@@ -947,7 +946,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 		if (rc != -EHOSTDOWN) {
 			if (midQ->midState == MID_RETRY_NEEDED) {
 				rc = -EAGAIN;
-				cFYI(1, ("marking request for retry"));
+				cFYI(1, "marking request for retry");
 			} else {
 				rc = -EIO;
 			}
@@ -958,8 +957,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	}
 
 	if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
-		cERROR(1, ("Frame too large received.  Length: %d  Xid: %d",
-			receive_len, xid));
+		cERROR(1, "Frame too large received.  Length: %d  Xid: %d",
+			receive_len, xid);
 		rc = -EIO;
 		goto out;
 	}
@@ -968,7 +967,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 
 	if ((out_buf == NULL) || (midQ->midState != MID_RESPONSE_RECEIVED)) {
 		rc = -EIO;
-		cERROR(1, ("Bad MID state?"));
+		cERROR(1, "Bad MID state?");
 		goto out;
 	}
 
@@ -986,7 +985,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 					   &ses->server->mac_signing_key,
 					   midQ->sequence_number+1);
 		if (rc) {
-			cERROR(1, ("Unexpected SMB signature"));
+			cERROR(1, "Unexpected SMB signature");
 			/* BB FIXME add code to kill session */
 		}
 	}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index f555ce077d4..a1509207bfa 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -70,12 +70,12 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 		return rc;
 	}
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5)
 		&& (strncmp(ea_name, CIFS_XATTR_OS2_PREFIX, 4))) {
 		cFYI(1,
-		    ("illegal xattr request %s (only user namespace supported)",
-			ea_name));
+		     "illegal xattr request %s (only user namespace supported)",
+		     ea_name);
 		/* BB what if no namespace prefix? */
 		/* Should we just pass them to server, except for
 		system and perhaps security prefixes? */
@@ -131,19 +131,19 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 		search server for EAs or streams to
 		returns as xattrs */
 	if (value_size > MAX_EA_VALUE_SIZE) {
-		cFYI(1, ("size of EA value too large"));
+		cFYI(1, "size of EA value too large");
 		kfree(full_path);
 		FreeXid(xid);
 		return -EOPNOTSUPP;
 	}
 
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto set_ea_exit;
 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0)
-			cFYI(1, ("attempt to set cifs inode metadata"));
+			cFYI(1, "attempt to set cifs inode metadata");
 
 		ea_name += 5; /* skip past user. prefix */
 		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
@@ -169,9 +169,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 					ACL_TYPE_ACCESS, cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			cFYI(1, ("set POSIX ACL rc %d", rc));
+			cFYI(1, "set POSIX ACL rc %d", rc);
 #else
-			cFYI(1, ("set POSIX ACL not supported"));
+			cFYI(1, "set POSIX ACL not supported");
 #endif
 		} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
 				   strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -182,13 +182,13 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 					ACL_TYPE_DEFAULT, cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			cFYI(1, ("set POSIX default ACL rc %d", rc));
+			cFYI(1, "set POSIX default ACL rc %d", rc);
 #else
-			cFYI(1, ("set default POSIX ACL not supported"));
+			cFYI(1, "set default POSIX ACL not supported");
 #endif
 		} else {
-			cFYI(1, ("illegal xattr request %s (only user namespace"
-				 " supported)", ea_name));
+			cFYI(1, "illegal xattr request %s (only user namespace"
+				" supported)", ea_name);
 		  /* BB what if no namespace prefix? */
 		  /* Should we just pass them to server, except for
 		  system and perhaps security prefixes? */
@@ -235,13 +235,13 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 	/* return dos attributes as pseudo xattr */
 	/* return alt name if available as pseudo attr */
 	if (ea_name == NULL) {
-		cFYI(1, ("Null xattr names not supported"));
+		cFYI(1, "Null xattr names not supported");
 	} else if (strncmp(ea_name, CIFS_XATTR_USER_PREFIX, 5) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto get_ea_exit;
 
 		if (strncmp(ea_name, CIFS_XATTR_DOS_ATTRIB, 14) == 0) {
-			cFYI(1, ("attempt to query cifs inode metadata"));
+			cFYI(1, "attempt to query cifs inode metadata");
 			/* revalidate/getattr then populate from inode */
 		} /* BB add else when above is implemented */
 		ea_name += 5; /* skip past user. prefix */
@@ -287,7 +287,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 		}
 #endif /* EXPERIMENTAL */
 #else
-		cFYI(1, ("query POSIX ACL not supported yet"));
+		cFYI(1, "query POSIX ACL not supported yet");
 #endif /* CONFIG_CIFS_POSIX */
 	} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
 			  strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
@@ -299,18 +299,18 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 #else
-		cFYI(1, ("query POSIX default ACL not supported yet"));
+		cFYI(1, "query POSIX default ACL not supported yet");
 #endif
 	} else if (strncmp(ea_name,
 		  CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) {
-		cFYI(1, ("Trusted xattr namespace not supported yet"));
+		cFYI(1, "Trusted xattr namespace not supported yet");
 	} else if (strncmp(ea_name,
 		  CIFS_XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) {
-		cFYI(1, ("Security xattr namespace not supported yet"));
+		cFYI(1, "Security xattr namespace not supported yet");
 	} else
 		cFYI(1,
-		    ("illegal xattr request %s (only user namespace supported)",
-			ea_name));
+		    "illegal xattr request %s (only user namespace supported)",
+		     ea_name);
 
 	/* We could add an additional check for streams ie
 	    if proc/fs/cifs/streamstoxattr is set then
-- 
cgit v1.2.3


From f19159dc5ab9ec28c3b8230689101335d98e2d68 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 21 Apr 2010 04:12:10 +0000
Subject: [CIFS] Cleanup various minor breakage in previous cFYI cleanup

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.h  |  2 +-
 fs/cifs/cifssmb.c | 22 +++++++++++-----------
 fs/cifs/file.c    | 12 ++++++------
 fs/cifs/inode.c   | 22 +++++++++++-----------
 4 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7aa57ecdc43..dac76022c5f 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -114,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.62"
+#define CIFS_VERSION   "1.63"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index be23e426ffb..980c38c658b 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/cifssmb.c
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2009
+ *   Copyright (C) International Business Machines  Corp., 2002,2010
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   Contains the routines for constructing the SMB PDUs themselves
@@ -493,14 +493,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			goto neg_err_exit;
 		}
 
-		cFYI(1, ("LANMAN negotiated"));
+		cFYI(1, "LANMAN negotiated");
 		/* we will not end up setting signing flags - as no signing
 		was in LANMAN and server did not return the flags on */
 		goto signing_check;
 #else /* weak security disabled */
 	} else if (pSMBr->hdr.WordCount == 13) {
-		cERROR(1, ("mount failed, cifs module not built "
-			  "with CIFS_WEAK_PW_HASH support"));
+		cERROR(1, "mount failed, cifs module not built "
+			  "with CIFS_WEAK_PW_HASH support");
 		rc = -EOPNOTSUPP;
 #endif /* WEAK_PW_HASH */
 		goto neg_err_exit;
@@ -1513,7 +1513,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, long_op);
 	cifs_stats_inc(&tcon->num_writes);
 	if (rc) {
-		cFYI(1, ("Send error in write = %d", rc));
+		cFYI(1, "Send error in write = %d", rc);
 	} else {
 		*nbytes = le16_to_cpu(pSMBr->CountHigh);
 		*nbytes = (*nbytes) << 16;
@@ -2529,7 +2529,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
 		cFYI(1, "data starts after end of smb");
 		return -EINVAL;
 	} else if (data_count + *ppdata > end_of_smb) {
-		cFYI(1, "data %p + count %d (%p) ends after end of smb %p start %p",
+		cFYI(1, "data %p + count %d (%p) past smb end %p start %p",
 			*ppdata, data_count, (data_count + *ppdata),
 			end_of_smb, pSMBr);
 		return -EINVAL;
@@ -3304,7 +3304,7 @@ QFileInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QPathInfo = %d", rc));
+		cFYI(1, "Send error in QPathInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -3472,14 +3472,14 @@ UnixQFileInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cFYI(1, ("Send error in QPathInfo = %d", rc));
+		cFYI(1, "Send error in QPathInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
 		if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
-			cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
+			cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
 				   "Unix Extensions can be disabled on mount "
-				   "by specifying the nosfu mount option."));
+				   "by specifying the nosfu mount option.");
 			rc = -EIO;	/* bad smb */
 		} else {
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4037,7 +4037,7 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
 	data_end = (char *)(&(pSMBr->PathConsumed)) +
 				le16_to_cpu(pSMBr->t2.DataCount);
 
-	cFYI(1, "num_referrals: %d dfs flags: 0x%x ... \n",
+	cFYI(1, "num_referrals: %d dfs flags: 0x%x ...\n",
 			*num_of_nodes,
 			le32_to_cpu(pSMBr->DFSFlags));
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ed3689e6617..99897e3562a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3,7 +3,7 @@
  *
  *   vfs operations that deal with files
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2007
+ *   Copyright (C) International Business Machines  Corp., 2002,2010
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *              Jeremy Allison (jra@samba.org)
  *
@@ -855,9 +855,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 						0 /* wait flag */);
 					pfLock->fl_type = F_RDLCK;
 					if (rc != 0)
-						cERROR(1, ("Error unlocking "
+						cERROR(1, "Error unlocking "
 						"previously locked range %d "
-						"during test of lock", rc));
+						"during test of lock", rc);
 					rc = 0;
 				} else {
 					pfLock->fl_type = F_WRLCK;
@@ -1709,7 +1709,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	unsigned int rpages = 0;
 	int rc = 0;
 
-	cFYI(1, "sync page %p",page);
+	cFYI(1, "sync page %p", page);
 	mapping = page->mapping;
 	if (!mapping)
 		return 0;
@@ -1998,7 +1998,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	pTcon = cifs_sb->tcon;
 
-	cFYI(DBG2, ("rpages: num pages %d", num_pages));
+	cFYI(DBG2, "rpages: num pages %d", num_pages);
 	for (i = 0; i < num_pages; ) {
 		unsigned contig_pages;
 		struct page *tmp_page;
@@ -2083,7 +2083,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 			}
 		} else {
 			cFYI(1, "No bytes read (%d) at offset %lld . "
-			        "Cleaning remaining pages from readahead list",
+				"Cleaning remaining pages from readahead list",
 				bytes_read, offset);
 			/* BB turn off caching and do new lookup on
 			   file size at server? */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 7524a90aa94..b35cb031c20 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/inode.c
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2008
+ *   Copyright (C) International Business Machines  Corp., 2002,2010
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -86,30 +86,30 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
 {
 	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 
-	cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid));
+	cFYI(1, "%s: revalidating inode %llu", __func__, cifs_i->uniqueid);
 
 	if (inode->i_state & I_NEW) {
-		cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid));
+		cFYI(1, "%s: inode %llu is new", __func__, cifs_i->uniqueid);
 		return;
 	}
 
 	/* don't bother with revalidation if we have an oplock */
 	if (cifs_i->clientCanCacheRead) {
-		cFYI(1, ("%s: inode %llu is oplocked", __func__,
-			 cifs_i->uniqueid));
+		cFYI(1, "%s: inode %llu is oplocked", __func__,
+			 cifs_i->uniqueid);
 		return;
 	}
 
 	 /* revalidate if mtime or size have changed */
 	if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
 	    cifs_i->server_eof == fattr->cf_eof) {
-		cFYI(1, ("%s: inode %llu is unchanged", __func__,
-			 cifs_i->uniqueid));
+		cFYI(1, "%s: inode %llu is unchanged", __func__,
+			 cifs_i->uniqueid);
 		return;
 	}
 
-	cFYI(1, ("%s: invalidating inode %llu mapping", __func__,
-		 cifs_i->uniqueid));
+	cFYI(1, "%s: invalidating inode %llu mapping", __func__,
+		 cifs_i->uniqueid);
 	cifs_i->invalid_mapping = true;
 }
 
@@ -1577,9 +1577,9 @@ int cifs_revalidate_dentry(struct dentry *dentry)
 		goto check_inval;
 	}
 
-	cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
+	cFYI(1, "Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
 		 "jiffies %ld", full_path, inode, inode->i_count.counter,
-		 dentry, dentry->d_time, jiffies));
+		 dentry, dentry->d_time, jiffies);
 
 	if (CIFS_SB(sb)->tcon->unix_ext)
 		rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
-- 
cgit v1.2.3


From 6eca8cc35b50af1037bc919106dd6dd332c959c2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 21 Apr 2010 02:01:05 +0200
Subject: perf: Fix perf probe build error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When we run into dry run mode, we want to make
write_kprobe_trace_event to succeed on writing the event. Let's
initialize it to 0.

Fixes the following build error:
	util/probe-event.c:1266: attention : «ret» may be used uninitialized in this function
	util/probe-event.c:1266: note: «ret» was declared here

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1271808065-25290-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/probe-event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 3967f8f63d0..4fb480367c3 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -1263,7 +1263,7 @@ static struct strlist *get_kprobe_trace_event_names(int fd, bool include_group)
 
 static int write_kprobe_trace_event(int fd, struct kprobe_trace_event *tev)
 {
-	int ret;
+	int ret = 0;
 	char *buf = synthesize_kprobe_trace_command(tev);
 
 	if (!buf) {
-- 
cgit v1.2.3


From 2c964d1f7c87eb71f7902111cd7c8fbba225e4b6 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Wed, 21 Apr 2010 19:44:24 +0000
Subject: [CIFS] Fix losing locks during fork()

When process does fork() private_data of files with lock list stays the same
for file descriptors of the parent and of the child. While finishing the child closes
files and deletes locks from the list even if unlocking fails. When the child process
finishes the parent doesn't have lock in lock list and can't unlock previously before
fork() locked region after the child process finished.

This patch provides behaviour to save locks in lock list if unlocking fails.

Signed-off-by: Pavel Shilovsky <piastryyy@gmail.com>
Reviewed-by: Jeff Layton <jlayton@samba.org>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 99897e3562a..2ba4c41be97 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -922,9 +922,10 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock)
 							1, 0, li->type, false);
 					if (stored_rc)
 						rc = stored_rc;
-
-					list_del(&li->llist);
-					kfree(li);
+					else {
+						list_del(&li->llist);
+						kfree(li);
+					}
 				}
 			}
 			mutex_unlock(&fid->lock_mutex);
-- 
cgit v1.2.3


From cecbca96da387428e220e307a9c945e37e2f4d9e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 18 Apr 2010 19:08:41 +0200
Subject: tracing: Dump either the oops's cpu source or all cpus buffers

The ftrace_dump_on_oops kernel parameter, sysctl and sysrq let one
dump every cpu buffers when an oops or panic happens.

It's nice when you have few cpus but it may take ages if have many,
plus you miss the real origin of the problem in all the cpu traces.

Sometimes, all you need is to dump the cpu buffer that triggered the
opps, most of the time it is our main interest.

This patch modifies ftrace_dump_on_oops to handle this choice.

The ftrace_dump_on_oops kernel parameter, when it comes alone, has
the same behaviour than before. But ftrace_dump_on_oops=orig_cpu
will only dump the buffer of the cpu that oops'ed.

Similarly, sysctl kernel.ftrace_dump_on_oops=1 and
echo 1 > /proc/sys/kernel/ftrace_dump_on_oops keep their previous
behaviour. But setting 2 jumps into cpu origin dump mode.

v2: Fix double setup
v3: Fix spelling issues reported by Randy Dunlap
v4: Also update __ftrace_dump in the selftests

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 Documentation/kernel-parameters.txt |  6 ++++-
 Documentation/trace/ftrace.txt      |  6 +++--
 drivers/char/sysrq.c                |  2 +-
 include/linux/ftrace.h              |  4 ++-
 include/linux/kernel.h              | 11 ++++++--
 kernel/trace/trace.c                | 51 ++++++++++++++++++++++++++++---------
 kernel/trace/trace_selftest.c       |  5 ++--
 7 files changed, 64 insertions(+), 21 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e4cbca58536..ab67b33300f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -789,8 +789,12 @@ and is between 256 and 4096 characters. It is defined in the file
 			as early as possible in order to facilitate early
 			boot debugging.
 
-	ftrace_dump_on_oops
+	ftrace_dump_on_oops[=orig_cpu]
 			[FTRACE] will dump the trace buffers on oops.
+			If no parameter is passed, ftrace will dump
+			buffers of all CPUs, but if you pass orig_cpu, it will
+			dump only the buffer of the CPU that triggered the
+			oops.
 
 	ftrace_filter=[function-list]
 			[FTRACE] Limit the functions traced by the function
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 03485bfbd79..52011815c90 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -1337,12 +1337,14 @@ ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one
 can either use the sysctl function or set it via the proc system
 interface.
 
-  sysctl kernel.ftrace_dump_on_oops=1
+  sysctl kernel.ftrace_dump_on_oops=n
 
 or
 
-  echo 1 > /proc/sys/kernel/ftrace_dump_on_oops
+  echo n > /proc/sys/kernel/ftrace_dump_on_oops
 
+If n = 1, ftrace will dump buffers of all CPUs, if n = 2 ftrace will
+only dump the buffer of the CPU that triggered the oops.
 
 Here's an example of such a dump after a null pointer
 dereference in a kernel module:
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 59de2525d30..d4e8b213a46 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -289,7 +289,7 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = {
 
 static void sysrq_ftrace_dump(int key, struct tty_struct *tty)
 {
-	ftrace_dump();
+	ftrace_dump(DUMP_ALL);
 }
 static struct sysrq_key_op sysrq_ftrace_dump_op = {
 	.handler	= sysrq_ftrace_dump,
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 01e6adea07e..ea5b1aae0e8 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -492,7 +492,9 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk)
 	return tsk->trace & TSK_TRACE_FL_GRAPH;
 }
 
-extern int ftrace_dump_on_oops;
+enum ftrace_dump_mode;
+
+extern enum ftrace_dump_mode ftrace_dump_on_oops;
 
 #ifdef CONFIG_PREEMPT
 #define INIT_TRACE_RECURSION		.trace_recursion = 0,
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 9365227dbaf..9fb1c129903 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -490,6 +490,13 @@ static inline void tracing_off(void) { }
 static inline void tracing_off_permanent(void) { }
 static inline int tracing_is_on(void) { return 0; }
 #endif
+
+enum ftrace_dump_mode {
+	DUMP_NONE,
+	DUMP_ALL,
+	DUMP_ORIG,
+};
+
 #ifdef CONFIG_TRACING
 extern void tracing_start(void);
 extern void tracing_stop(void);
@@ -571,7 +578,7 @@ __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap);
 extern int
 __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
 
-extern void ftrace_dump(void);
+extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
 #else
 static inline void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
@@ -592,7 +599,7 @@ ftrace_vprintk(const char *fmt, va_list ap)
 {
 	return 0;
 }
-static inline void ftrace_dump(void) { }
+static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #endif /* CONFIG_TRACING */
 
 /*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bed83cab6da..7b516c7ef9a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -117,9 +117,12 @@ static cpumask_var_t __read_mostly	tracing_buffer_mask;
  *
  * It is default off, but you can enable it with either specifying
  * "ftrace_dump_on_oops" in the kernel command line, or setting
- * /proc/sys/kernel/ftrace_dump_on_oops to true.
+ * /proc/sys/kernel/ftrace_dump_on_oops
+ * Set 1 if you want to dump buffers of all CPUs
+ * Set 2 if you want to dump the buffer of the CPU that triggered oops
  */
-int ftrace_dump_on_oops;
+
+enum ftrace_dump_mode ftrace_dump_on_oops;
 
 static int tracing_set_tracer(const char *buf);
 
@@ -139,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace);
 
 static int __init set_ftrace_dump_on_oops(char *str)
 {
-	ftrace_dump_on_oops = 1;
-	return 1;
+	if (*str++ != '=' || !*str) {
+		ftrace_dump_on_oops = DUMP_ALL;
+		return 1;
+	}
+
+	if (!strcmp("orig_cpu", str)) {
+		ftrace_dump_on_oops = DUMP_ORIG;
+                return 1;
+        }
+
+        return 0;
 }
 __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
 
@@ -4338,7 +4350,7 @@ static int trace_panic_handler(struct notifier_block *this,
 			       unsigned long event, void *unused)
 {
 	if (ftrace_dump_on_oops)
-		ftrace_dump();
+		ftrace_dump(ftrace_dump_on_oops);
 	return NOTIFY_OK;
 }
 
@@ -4355,7 +4367,7 @@ static int trace_die_handler(struct notifier_block *self,
 	switch (val) {
 	case DIE_OOPS:
 		if (ftrace_dump_on_oops)
-			ftrace_dump();
+			ftrace_dump(ftrace_dump_on_oops);
 		break;
 	default:
 		break;
@@ -4396,7 +4408,8 @@ trace_printk_seq(struct trace_seq *s)
 	trace_seq_init(s);
 }
 
-static void __ftrace_dump(bool disable_tracing)
+static void
+__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
 {
 	static arch_spinlock_t ftrace_dump_lock =
 		(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -4429,12 +4442,25 @@ static void __ftrace_dump(bool disable_tracing)
 	/* don't look at user memory in panic mode */
 	trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
 
-	printk(KERN_TRACE "Dumping ftrace buffer:\n");
-
 	/* Simulate the iterator */
 	iter.tr = &global_trace;
 	iter.trace = current_trace;
-	iter.cpu_file = TRACE_PIPE_ALL_CPU;
+
+	switch (oops_dump_mode) {
+	case DUMP_ALL:
+		iter.cpu_file = TRACE_PIPE_ALL_CPU;
+		break;
+	case DUMP_ORIG:
+		iter.cpu_file = raw_smp_processor_id();
+		break;
+	case DUMP_NONE:
+		goto out_enable;
+	default:
+		printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
+		iter.cpu_file = TRACE_PIPE_ALL_CPU;
+	}
+
+	printk(KERN_TRACE "Dumping ftrace buffer:\n");
 
 	/*
 	 * We need to stop all tracing on all CPUS to read the
@@ -4473,6 +4499,7 @@ static void __ftrace_dump(bool disable_tracing)
 	else
 		printk(KERN_TRACE "---------------------------------\n");
 
+ out_enable:
 	/* Re-enable tracing if requested */
 	if (!disable_tracing) {
 		trace_flags |= old_userobj;
@@ -4489,9 +4516,9 @@ static void __ftrace_dump(bool disable_tracing)
 }
 
 /* By default: disable tracing after the dump */
-void ftrace_dump(void)
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 {
-	__ftrace_dump(true);
+	__ftrace_dump(true, oops_dump_mode);
 }
 
 __init static int tracer_alloc_buffers(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 9398034f814..6a9d36ddfcf 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -256,7 +256,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 /* Maximum number of functions to trace before diagnosing a hang */
 #define GRAPH_MAX_FUNC_TEST	100000000
 
-static void __ftrace_dump(bool disable_tracing);
+static void
+__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
 static unsigned int graph_hang_thresh;
 
 /* Wrap the real function entry probe to avoid possible hanging */
@@ -267,7 +268,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
 		ftrace_graph_stop();
 		printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
 		if (ftrace_dump_on_oops)
-			__ftrace_dump(false);
+			__ftrace_dump(false, DUMP_ALL);
 		return 0;
 	}
 
-- 
cgit v1.2.3


From cd932c593995abee1d1a8a0bfe608f7d103d87ad Mon Sep 17 00:00:00 2001
From: Ian Munsie <imunsie@au.ibm.com>
Date: Tue, 20 Apr 2010 16:58:32 +1000
Subject: perf: Move arch specific code into separate arch directory

The perf userspace tool included some architecture specific code to map
registers from the DWARF register number into the names used by the regs
and stack access API.

This moves the architecture specific code out into a separate
arch/x86 directory along with the infrastructure required to use it.

Signed-off-by: Ian Munsie <imunsie@au.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 tools/perf/Makefile                   | 36 ++++++++++++++---
 tools/perf/arch/x86/Makefile          |  4 ++
 tools/perf/arch/x86/util/dwarf-regs.c | 75 +++++++++++++++++++++++++++++++++++
 tools/perf/util/include/dwarf-regs.h  |  8 ++++
 tools/perf/util/probe-finder.c        | 55 +------------------------
 5 files changed, 119 insertions(+), 59 deletions(-)
 create mode 100644 tools/perf/arch/x86/Makefile
 create mode 100644 tools/perf/arch/x86/util/dwarf-regs.c
 create mode 100644 tools/perf/util/include/dwarf-regs.h

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 3cb3449a964..e8bf2e1ab49 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -173,6 +173,20 @@ uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
 uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
 uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
+				  -e s/arm.*/arm/ -e s/sa110/arm/ \
+				  -e s/s390x/s390/ -e s/parisc64/parisc/ \
+				  -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
+				  -e s/sh[234].*/sh/ )
+
+# Additional ARCH settings for x86
+ifeq ($(ARCH),i386)
+        ARCH := x86
+endif
+ifeq ($(ARCH),x86_64)
+        ARCH := x86
+endif
+
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
 #
@@ -285,7 +299,7 @@ endif
 # Those must not be GNU-specific; they are shared with perl/ which may
 # be built by a different compiler. (Note that this is an artifact now
 # but it still might be nice to keep that distinction.)
-BASIC_CFLAGS = -Iutil/include
+BASIC_CFLAGS = -Iutil/include -Iarch/$(ARCH)/include
 BASIC_LDFLAGS =
 
 # Guard against environment variables
@@ -367,6 +381,7 @@ LIB_H += util/include/asm/byteorder.h
 LIB_H += util/include/asm/swab.h
 LIB_H += util/include/asm/system.h
 LIB_H += util/include/asm/uaccess.h
+LIB_H += util/include/dwarf-regs.h
 LIB_H += perf.h
 LIB_H += util/cache.h
 LIB_H += util/callchain.h
@@ -487,6 +502,15 @@ PERFLIBS = $(LIB_FILE)
 -include config.mak.autogen
 -include config.mak
 
+ifndef NO_DWARF
+ifneq ($(shell sh -c "(echo '\#include <dwarf.h>'; echo '\#include <libdw.h>'; echo 'int main(void) { Dwarf *dbg; dbg = dwarf_begin(0, DWARF_C_READ); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -I/usr/include/elfutils -ldw -lelf -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
+	msg := $(warning No libdw.h found or old libdw.h found, disables dwarf support. Please install elfutils-devel/elfutils-dev);
+	NO_DWARF := 1
+endif # Dwarf support
+endif # NO_DWARF
+
+-include arch/$(ARCH)/Makefile
+
 ifeq ($(uname_S),Darwin)
 	ifndef NO_FINK
 		ifeq ($(shell test -d /sw/lib && echo y),y)
@@ -519,15 +543,15 @@ else
 	msg := $(error No libelf.h/libelf found, please install libelf-dev/elfutils-libelf-devel and glibc-dev[el]);
 endif
 
-ifneq ($(shell sh -c "(echo '\#include <dwarf.h>'; echo '\#include <libdw.h>'; echo 'int main(void) { Dwarf *dbg; dbg = dwarf_begin(0, DWARF_C_READ); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -I/usr/include/elfutils -ldw -lelf -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
-	msg := $(warning No libdw.h found or old libdw.h found, disables dwarf support. Please install elfutils-devel/elfutils-dev);
-else
 ifndef NO_DWARF
+ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined)
+	msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled);
+else
 	BASIC_CFLAGS += -I/usr/include/elfutils -DDWARF_SUPPORT
 	EXTLIBS += -lelf -ldw
 	LIB_OBJS += $(OUTPUT)util/probe-finder.o
-endif
-endif
+endif # PERF_HAVE_DWARF_REGS
+endif # NO_DWARF
 
 ifneq ($(shell sh -c "(echo '\#include <newt.h>'; echo 'int main(void) { newtInit(); newtCls(); return newtFinished(); }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -lnewt -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
 	msg := $(warning newt not found, disables TUI support. Please install newt-devel or libnewt-dev);
diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile
new file mode 100644
index 00000000000..15130b50dfe
--- /dev/null
+++ b/tools/perf/arch/x86/Makefile
@@ -0,0 +1,4 @@
+ifndef NO_DWARF
+PERF_HAVE_DWARF_REGS := 1
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
+endif
diff --git a/tools/perf/arch/x86/util/dwarf-regs.c b/tools/perf/arch/x86/util/dwarf-regs.c
new file mode 100644
index 00000000000..a794d308192
--- /dev/null
+++ b/tools/perf/arch/x86/util/dwarf-regs.c
@@ -0,0 +1,75 @@
+/*
+ * dwarf-regs.c : Mapping of DWARF debug register numbers into register names.
+ * Extracted from probe-finder.c
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+#include <libio.h>
+#include <dwarf-regs.h>
+
+/*
+ * Generic dwarf analysis helpers
+ */
+
+#define X86_32_MAX_REGS 8
+const char *x86_32_regs_table[X86_32_MAX_REGS] = {
+	"%ax",
+	"%cx",
+	"%dx",
+	"%bx",
+	"$stack",	/* Stack address instead of %sp */
+	"%bp",
+	"%si",
+	"%di",
+};
+
+#define X86_64_MAX_REGS 16
+const char *x86_64_regs_table[X86_64_MAX_REGS] = {
+	"%ax",
+	"%dx",
+	"%cx",
+	"%bx",
+	"%si",
+	"%di",
+	"%bp",
+	"%sp",
+	"%r8",
+	"%r9",
+	"%r10",
+	"%r11",
+	"%r12",
+	"%r13",
+	"%r14",
+	"%r15",
+};
+
+/* TODO: switching by dwarf address size */
+#ifdef __x86_64__
+#define ARCH_MAX_REGS X86_64_MAX_REGS
+#define arch_regs_table x86_64_regs_table
+#else
+#define ARCH_MAX_REGS X86_32_MAX_REGS
+#define arch_regs_table x86_32_regs_table
+#endif
+
+/* Return architecture dependent register string (for kprobe-tracer) */
+const char *get_arch_regstr(unsigned int n)
+{
+	return (n <= ARCH_MAX_REGS) ? arch_regs_table[n] : NULL;
+}
diff --git a/tools/perf/util/include/dwarf-regs.h b/tools/perf/util/include/dwarf-regs.h
new file mode 100644
index 00000000000..cf6727e99c4
--- /dev/null
+++ b/tools/perf/util/include/dwarf-regs.h
@@ -0,0 +1,8 @@
+#ifndef _PERF_DWARF_REGS_H_
+#define _PERF_DWARF_REGS_H_
+
+#ifdef DWARF_SUPPORT
+const char *get_arch_regstr(unsigned int n);
+#endif
+
+#endif
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 3e7977560be..e7ee52fd0e0 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -31,6 +31,7 @@
 #include <string.h>
 #include <stdarg.h>
 #include <ctype.h>
+#include <dwarf-regs.h>
 
 #include "string.h"
 #include "event.h"
@@ -38,61 +39,9 @@
 #include "util.h"
 #include "probe-finder.h"
 
-
-/*
- * Generic dwarf analysis helpers
- */
-
-#define X86_32_MAX_REGS 8
-const char *x86_32_regs_table[X86_32_MAX_REGS] = {
-	"%ax",
-	"%cx",
-	"%dx",
-	"%bx",
-	"$stack",	/* Stack address instead of %sp */
-	"%bp",
-	"%si",
-	"%di",
-};
-
-#define X86_64_MAX_REGS 16
-const char *x86_64_regs_table[X86_64_MAX_REGS] = {
-	"%ax",
-	"%dx",
-	"%cx",
-	"%bx",
-	"%si",
-	"%di",
-	"%bp",
-	"%sp",
-	"%r8",
-	"%r9",
-	"%r10",
-	"%r11",
-	"%r12",
-	"%r13",
-	"%r14",
-	"%r15",
-};
-
-/* TODO: switching by dwarf address size */
-#ifdef __x86_64__
-#define ARCH_MAX_REGS X86_64_MAX_REGS
-#define arch_regs_table x86_64_regs_table
-#else
-#define ARCH_MAX_REGS X86_32_MAX_REGS
-#define arch_regs_table x86_32_regs_table
-#endif
-
 /* Kprobe tracer basic type is up to u64 */
 #define MAX_BASIC_TYPE_BITS	64
 
-/* Return architecture dependent register string (for kprobe-tracer) */
-static const char *get_arch_regstr(unsigned int n)
-{
-	return (n <= ARCH_MAX_REGS) ? arch_regs_table[n] : NULL;
-}
-
 /*
  * Compare the tail of two strings.
  * Return 0 if whole of either string is same as another's tail part.
@@ -447,7 +396,7 @@ static int convert_location(Dwarf_Op *op, struct probe_finder *pf)
 
 	regs = get_arch_regstr(regn);
 	if (!regs) {
-		pr_warning("%u exceeds max register number.\n", regn);
+		pr_warning("Mapping for DWARF register number %u missing on this architecture.", regn);
 		return -ERANGE;
 	}
 
-- 
cgit v1.2.3


From fead7960f0b645348fa4757100f4d57654bf9ffa Mon Sep 17 00:00:00 2001
From: Ian Munsie <imunsie@au.ibm.com>
Date: Tue, 20 Apr 2010 16:58:33 +1000
Subject: perf probe: Add PowerPC DWARF register number mappings

This adds mappings from the register numbers from DWARF to the
register names used in the PowerPC Regs and Stack Access API.  This
allows perf probe to be used to record variable contents on PowerPC.

This requires the functionality represented by the config symbol
HAVE_REGS_AND_STACK_ACCESS_API in order to function, although it will
compile without it.  That functionality is added for PowerPC in commit
359e4284 ("powerpc: Add kprobe-based event tracer").

Signed-off-by: Ian Munsie <imunsie@au.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 tools/perf/arch/powerpc/Makefile          |  4 ++
 tools/perf/arch/powerpc/util/dwarf-regs.c | 88 +++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 tools/perf/arch/powerpc/Makefile
 create mode 100644 tools/perf/arch/powerpc/util/dwarf-regs.c

diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile
new file mode 100644
index 00000000000..15130b50dfe
--- /dev/null
+++ b/tools/perf/arch/powerpc/Makefile
@@ -0,0 +1,4 @@
+ifndef NO_DWARF
+PERF_HAVE_DWARF_REGS := 1
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
+endif
diff --git a/tools/perf/arch/powerpc/util/dwarf-regs.c b/tools/perf/arch/powerpc/util/dwarf-regs.c
new file mode 100644
index 00000000000..48ae0c5e3f7
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/dwarf-regs.c
@@ -0,0 +1,88 @@
+/*
+ * Mapping of DWARF debug register numbers into register names.
+ *
+ * Copyright (C) 2010 Ian Munsie, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <libio.h>
+#include <dwarf-regs.h>
+
+
+struct pt_regs_dwarfnum {
+	const char *name;
+	unsigned int dwarfnum;
+};
+
+#define STR(s) #s
+#define REG_DWARFNUM_NAME(r, num) {.name = r, .dwarfnum = num}
+#define GPR_DWARFNUM_NAME(num)	\
+	{.name = STR(%gpr##num), .dwarfnum = num}
+#define REG_DWARFNUM_END {.name = NULL, .dwarfnum = 0}
+
+/*
+ * Reference:
+ * http://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi-1.9.html
+ */
+static const struct pt_regs_dwarfnum regdwarfnum_table[] = {
+	GPR_DWARFNUM_NAME(0),
+	GPR_DWARFNUM_NAME(1),
+	GPR_DWARFNUM_NAME(2),
+	GPR_DWARFNUM_NAME(3),
+	GPR_DWARFNUM_NAME(4),
+	GPR_DWARFNUM_NAME(5),
+	GPR_DWARFNUM_NAME(6),
+	GPR_DWARFNUM_NAME(7),
+	GPR_DWARFNUM_NAME(8),
+	GPR_DWARFNUM_NAME(9),
+	GPR_DWARFNUM_NAME(10),
+	GPR_DWARFNUM_NAME(11),
+	GPR_DWARFNUM_NAME(12),
+	GPR_DWARFNUM_NAME(13),
+	GPR_DWARFNUM_NAME(14),
+	GPR_DWARFNUM_NAME(15),
+	GPR_DWARFNUM_NAME(16),
+	GPR_DWARFNUM_NAME(17),
+	GPR_DWARFNUM_NAME(18),
+	GPR_DWARFNUM_NAME(19),
+	GPR_DWARFNUM_NAME(20),
+	GPR_DWARFNUM_NAME(21),
+	GPR_DWARFNUM_NAME(22),
+	GPR_DWARFNUM_NAME(23),
+	GPR_DWARFNUM_NAME(24),
+	GPR_DWARFNUM_NAME(25),
+	GPR_DWARFNUM_NAME(26),
+	GPR_DWARFNUM_NAME(27),
+	GPR_DWARFNUM_NAME(28),
+	GPR_DWARFNUM_NAME(29),
+	GPR_DWARFNUM_NAME(30),
+	GPR_DWARFNUM_NAME(31),
+	REG_DWARFNUM_NAME("%msr",   66),
+	REG_DWARFNUM_NAME("%ctr",   109),
+	REG_DWARFNUM_NAME("%link",  108),
+	REG_DWARFNUM_NAME("%xer",   101),
+	REG_DWARFNUM_NAME("%dar",   119),
+	REG_DWARFNUM_NAME("%dsisr", 118),
+	REG_DWARFNUM_END,
+};
+
+/**
+ * get_arch_regstr() - lookup register name from it's DWARF register number
+ * @n:	the DWARF register number
+ *
+ * get_arch_regstr() returns the name of the register in struct
+ * regdwarfnum_table from it's DWARF register number. If the register is not
+ * found in the table, this returns NULL;
+ */
+const char *get_arch_regstr(unsigned int n)
+{
+	const struct pt_regs_dwarfnum *roff;
+	for (roff = regdwarfnum_table; roff->name != NULL; roff++)
+		if (roff->dwarfnum == n)
+			return roff->name;
+	return NULL;
+}
-- 
cgit v1.2.3


From fa588e0c57048b3d4bfcd772d80dc0615f83fd35 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 22 Apr 2010 19:21:55 +0000
Subject: [CIFS] Allow null nd (as nfs server uses) on create

While creating a file on a server which supports unix extensions
such as Samba, if a file is being created which does not supply
nameidata (i.e. nd is null), cifs client can oops when calling
cifs_posix_open.

Signed-off-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  6 ++++--
 fs/cifs/dir.c       | 20 ++++++++++++--------
 fs/cifs/file.c      | 11 +++++++----
 3 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 32262e15be3..cc622a735f2 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -107,8 +107,10 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
 				__u16 fileHandle, struct file *file,
 				struct vfsmount *mnt, unsigned int oflags);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
-			   struct vfsmount *mnt, int mode, int oflags,
-			   __u32 *poplock, __u16 *pnetfid, int xid);
+				struct vfsmount *mnt,
+				struct super_block *sb,
+				int mode, int oflags,
+				__u32 *poplock, __u16 *pnetfid, int xid);
 extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
 				     FILE_UNIX_BASIC_INFO *info,
 				     struct cifs_sb_info *cifs_sb);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 4aa2fe3f535..d791d0763a9 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -182,13 +182,14 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
 }
 
 int cifs_posix_open(char *full_path, struct inode **pinode,
-		    struct vfsmount *mnt, int mode, int oflags,
-		    __u32 *poplock, __u16 *pnetfid, int xid)
+			struct vfsmount *mnt, struct super_block *sb,
+			int mode, int oflags,
+			__u32 *poplock, __u16 *pnetfid, int xid)
 {
 	int rc;
 	FILE_UNIX_BASIC_INFO *presp_data;
 	__u32 posix_flags = 0;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifs_fattr fattr;
 
 	cFYI(1, "posix open %s", full_path);
@@ -241,7 +242,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 
 	/* get new inode and set it up */
 	if (*pinode == NULL) {
-		*pinode = cifs_iget(mnt->mnt_sb, &fattr);
+		*pinode = cifs_iget(sb, &fattr);
 		if (!*pinode) {
 			rc = -ENOMEM;
 			goto posix_open_ret;
@@ -250,7 +251,8 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 		cifs_fattr_to_inode(*pinode, &fattr);
 	}
 
-	cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags);
+	if (mnt)
+		cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags);
 
 posix_open_ret:
 	kfree(presp_data);
@@ -314,13 +316,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	if (nd && (nd->flags & LOOKUP_OPEN))
 		oflags = nd->intent.open.flags;
 	else
-		oflags = FMODE_READ;
+		oflags = FMODE_READ | SMB_O_CREAT;
 
 	if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
 			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
-		rc = cifs_posix_open(full_path, &newinode, nd->path.mnt,
-				     mode, oflags, &oplock, &fileHandle, xid);
+		rc = cifs_posix_open(full_path, &newinode,
+			nd ? nd->path.mnt : NULL,
+			inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
 		/* EIO could indicate that (posix open) operation is not
 		   supported, despite what server claimed in capability
 		   negotation.  EREMOTE indicates DFS junction, which is not
@@ -677,6 +680,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 		     (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
 		     (nd->intent.open.flags & O_CREAT)) {
 			rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
+					parent_dir_inode->i_sb,
 					nd->intent.open.create_mode,
 					nd->intent.open.flags, &oplock,
 					&fileHandle, xid);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 2ba4c41be97..5f1f7682256 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -298,10 +298,12 @@ int cifs_open(struct inode *inode, struct file *file)
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
 			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
 		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
+		oflags |= SMB_O_CREAT;
 		/* can not refresh inode info since size could be stale */
 		rc = cifs_posix_open(full_path, &inode, file->f_path.mnt,
-				     cifs_sb->mnt_file_mode /* ignored */,
-				     oflags, &oplock, &netfid, xid);
+				inode->i_sb,
+				cifs_sb->mnt_file_mode /* ignored */,
+				oflags, &oplock, &netfid, xid);
 		if (rc == 0) {
 			cFYI(1, "posix open succeeded");
 			/* no need for special case handling of setting mode
@@ -513,8 +515,9 @@ reopen_error_exit:
 		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
 		/* can not refresh inode info since size could be stale */
 		rc = cifs_posix_open(full_path, NULL, file->f_path.mnt,
-				     cifs_sb->mnt_file_mode /* ignored */,
-				     oflags, &oplock, &netfid, xid);
+				inode->i_sb,
+				cifs_sb->mnt_file_mode /* ignored */,
+				oflags, &oplock, &netfid, xid);
 		if (rc == 0) {
 			cFYI(1, "posix reopen succeeded");
 			goto reopen_success;
-- 
cgit v1.2.3


From b1ab1b4d9ab9812c77843abec79030292ef0a544 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 22 Apr 2010 16:06:58 +0200
Subject: x86, cacheinfo: Unify AMD L3 cache index disable checking

All F10h CPUs starting with model 8 resp. 9, stepping 1, support L3
cache index disable. Concentrate the family, model, stepping checking at
one place and enable the feature implicitly on upcoming Fam10h models.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-2-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b3eeb66c0a5..acfb0838390 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -328,18 +328,22 @@ static unsigned int __cpuinit amd_calc_l3_indices(void)
 static void __cpuinit
 amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
-	if (index < 3)
+	if (boot_cpu_data.x86 != 0x10)
 		return;
 
-	if (boot_cpu_data.x86 == 0x11)
+	if (index < 3)
 		return;
 
 	/* see errata #382 and #388 */
-	if ((boot_cpu_data.x86 == 0x10) &&
-	    ((boot_cpu_data.x86_model < 0x8) ||
-	     (boot_cpu_data.x86_mask  < 0x1)))
+	if (boot_cpu_data.x86_model < 0x8)
 		return;
 
+	if ((boot_cpu_data.x86_model == 0x8 ||
+	     boot_cpu_data.x86_model == 0x9)
+		&&
+	     boot_cpu_data.x86_mask < 0x1)
+			return;
+
 	this_leaf->can_disable = true;
 	this_leaf->l3_indices  = amd_calc_l3_indices();
 }
@@ -443,8 +447,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 		amd_cpuid4(index, &eax, &ebx, &ecx);
-		if (boot_cpu_data.x86 >= 0x10)
-			amd_check_l3_disable(index, this_leaf);
+		amd_check_l3_disable(index, this_leaf);
 	} else {
 		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
 	}
-- 
cgit v1.2.3


From f2b20e41407fccfcfacf927ff91ec888832a37af Mon Sep 17 00:00:00 2001
From: Frank Arnold <frank.arnold@amd.com>
Date: Thu, 22 Apr 2010 16:06:59 +0200
Subject: x86, cacheinfo: Turn off L3 cache index disable feature in
 virtualized environments

When running a quest kernel on xen we get:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000038
IP: [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
PGD 0
Oops: 0000 [#1] SMP
last sysfs file:
CPU 0
Modules linked in:

Pid: 0, comm: swapper Tainted: G        W  2.6.34-rc3 #1 /HVM domU
RIP: 0010:[<ffffffff8142f2fb>]  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x
2ca/0x3df
RSP: 0018:ffff880002203e08  EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000060
RDX: 0000000000000000 RSI: 0000000000000040 RDI: 0000000000000000
RBP: ffff880002203ed8 R08: 00000000000017c0 R09: ffff880002203e38
R10: ffff8800023d5d40 R11: ffffffff81a01e28 R12: ffff880187e6f5c0
R13: ffff880002203e34 R14: ffff880002203e58 R15: ffff880002203e68
FS:  0000000000000000(0000) GS:ffff880002200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000038 CR3: 0000000001a3c000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffffffff81a00000, task ffffffff81a44020)
Stack:
 ffffffff810d7ecb ffff880002203e20 ffffffff81059140 ffff880002203e30
<0> ffffffff810d7ec9 0000000002203e40 000000000050d140 ffff880002203e70
<0> 0000000002008140 0000000000000086 ffff880040020140 ffffffff81068b8b
Call Trace:
 <IRQ>
 [<ffffffff810d7ecb>] ? sync_supers_timer_fn+0x0/0x1c
 [<ffffffff81059140>] ? mod_timer+0x23/0x25
 [<ffffffff810d7ec9>] ? arm_supers_timer+0x34/0x36
 [<ffffffff81068b8b>] ? hrtimer_get_next_event+0xa7/0xc3
 [<ffffffff81058e85>] ? get_next_timer_interrupt+0x19a/0x20d
 [<ffffffff8142fa23>] get_cpu_leaves+0x5c/0x232
 [<ffffffff8106a7b1>] ? sched_clock_local+0x1c/0x82
 [<ffffffff8106a9a0>] ? sched_clock_tick+0x75/0x7a
 [<ffffffff8107748c>] generic_smp_call_function_single_interrupt+0xae/0xd0
 [<ffffffff8101f6ef>] smp_call_function_single_interrupt+0x18/0x27
 [<ffffffff8100a773>] call_function_single_interrupt+0x13/0x20
 <EOI>
 [<ffffffff8143c468>] ? notifier_call_chain+0x14/0x63
 [<ffffffff810295c6>] ? native_safe_halt+0xc/0xd
 [<ffffffff810114eb>] ? default_idle+0x36/0x53
 [<ffffffff81008c22>] cpu_idle+0xaa/0xe4
 [<ffffffff81423a9a>] rest_init+0x7e/0x80
 [<ffffffff81b10dd2>] start_kernel+0x40e/0x419
 [<ffffffff81b102c8>] x86_64_start_reservations+0xb3/0xb7
 [<ffffffff81b103c4>] x86_64_start_kernel+0xf8/0x107
Code: 14 d5 40 ff ae 81 8b 14 02 31 c0 3b 15 47 1c 8b 00 7d 0e 48 8b 05 36 1c 8b
 00 48 63 d2 48 8b 04 d0 c7 85 5c ff ff ff 00 00 00 00 <8b> 70 38 48 8d 8d 5c ff
 ff ff 48 8b 78 10 ba c4 01 00 00 e8 eb
RIP  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
 RSP <ffff880002203e08>
CR2: 0000000000000038
---[ end trace a7919e7f17c0a726 ]---

The L3 cache index disable feature of AMD CPUs has to be disabled if the
kernel is running as guest on top of a hypervisor because northbridge
devices are not available to the guest. Currently, this fixes a boot
crash on top of Xen. In the future this will become an issue on KVM as
well.

Check if northbridge devices are present and do not enable the feature
if there are none.

Signed-off-by: Frank Arnold <frank.arnold@amd.com>
LKML-Reference: <1271945222-5283-3-git-send-email-bp@amd64.org>
Acked-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index acfb0838390..5ab14c86c6e 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -344,6 +344,10 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	     boot_cpu_data.x86_mask < 0x1)
 			return;
 
+	/* not in virtualized environments */
+	if (num_k8_northbridges == 0)
+		return;
+
 	this_leaf->can_disable = true;
 	this_leaf->l3_indices  = amd_calc_l3_indices();
 }
-- 
cgit v1.2.3


From 9350f982e4fe539e83a2d4a13e9b53ad8253c4a8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 22 Apr 2010 16:07:00 +0200
Subject: x86, cacheinfo: Reorganize AMD L3 cache structure

Add a struct representing L3 cache attributes (subcache sizes and
indices count) and move the respective members out of _cpuid4_info.
Also, stash the struct pci_dev ptr into the struct simplifying the code
even more.

There should be no functionality change resulting from this patch except
slightly slimming the _cpuid4_info per-cpu vars.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-4-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 53 +++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 5ab14c86c6e..ff663ca63fd 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -148,13 +148,19 @@ union _cpuid4_leaf_ecx {
 	u32 full;
 };
 
+struct amd_l3_cache {
+	struct	 pci_dev *dev;
+	bool	 can_disable;
+	unsigned indices;
+	u8	 subcaches[4];
+};
+
 struct _cpuid4_info {
 	union _cpuid4_leaf_eax eax;
 	union _cpuid4_leaf_ebx ebx;
 	union _cpuid4_leaf_ecx ecx;
 	unsigned long size;
-	bool can_disable;
-	unsigned int l3_indices;
+	struct amd_l3_cache *l3;
 	DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
 };
 
@@ -164,8 +170,7 @@ struct _cpuid4_info_regs {
 	union _cpuid4_leaf_ebx ebx;
 	union _cpuid4_leaf_ecx ecx;
 	unsigned long size;
-	bool can_disable;
-	unsigned int l3_indices;
+	struct amd_l3_cache *l3;
 };
 
 unsigned short			num_cache_leaves;
@@ -302,7 +307,7 @@ struct _cache_attr {
 };
 
 #ifdef CONFIG_CPU_SUP_AMD
-static unsigned int __cpuinit amd_calc_l3_indices(void)
+static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 {
 	/*
 	 * We're called over smp_call_function_single() and therefore
@@ -317,12 +322,14 @@ static unsigned int __cpuinit amd_calc_l3_indices(void)
 	pci_read_config_dword(dev, 0x1C4, &val);
 
 	/* calculate subcache sizes */
-	sc0 = !(val & BIT(0));
-	sc1 = !(val & BIT(4));
-	sc2 = !(val & BIT(8))  + !(val & BIT(9));
-	sc3 = !(val & BIT(12)) + !(val & BIT(13));
+	l3->subcaches[0] = sc0 = !(val & BIT(0));
+	l3->subcaches[1] = sc1 = !(val & BIT(4));
+	l3->subcaches[2] = sc2 = !(val & BIT(8))  + !(val & BIT(9));
+	l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
+
+	l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
 
-	return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+	l3->dev = dev;
 }
 
 static void __cpuinit
@@ -348,19 +355,23 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	if (num_k8_northbridges == 0)
 		return;
 
-	this_leaf->can_disable = true;
-	this_leaf->l3_indices  = amd_calc_l3_indices();
+	this_leaf->l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
+	if (!this_leaf->l3) {
+		printk(KERN_WARNING "Error allocating L3 struct\n");
+		return;
+	}
+
+	this_leaf->l3->can_disable = true;
+	amd_calc_l3_indices(this_leaf->l3);
 }
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
 				  unsigned int index)
 {
-	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
-	int node = amd_get_nb_id(cpu);
-	struct pci_dev *dev = node_to_k8_nb_misc(node);
+	struct pci_dev *dev = this_leaf->l3->dev;
 	unsigned int reg = 0;
 
-	if (!this_leaf->can_disable)
+	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
 		return -EINVAL;
 
 	if (!dev)
@@ -382,15 +393,14 @@ SHOW_CACHE_DISABLE(1)
 static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 	const char *buf, size_t count, unsigned int index)
 {
+	struct pci_dev *dev = this_leaf->l3->dev;
 	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
-	int node = amd_get_nb_id(cpu);
-	struct pci_dev *dev = node_to_k8_nb_misc(node);
 	unsigned long val = 0;
 
 #define SUBCACHE_MASK	(3UL << 20)
 #define SUBCACHE_INDEX	0xfff
 
-	if (!this_leaf->can_disable)
+	if (!this_leaf->l3 || !this_leaf->l3->can_disable)
 		return -EINVAL;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -404,7 +414,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 
 	/* do not allow writes outside of allowed bits */
 	if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
-	    ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
+	    ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
 		return -EINVAL;
 
 	val |= BIT(30);
@@ -708,6 +718,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
 	for (i = 0; i < num_cache_leaves; i++)
 		cache_remove_shared_cpu_map(cpu, i);
 
+	kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
 	kfree(per_cpu(ici_cpuid4_info, cpu));
 	per_cpu(ici_cpuid4_info, cpu) = NULL;
 }
@@ -992,7 +1003,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 
 		this_leaf = CPUID4_INFO_IDX(cpu, i);
 
-		if (this_leaf->can_disable)
+		if (this_leaf->l3 && this_leaf->l3->can_disable)
 			ktype_cache.default_attrs = default_l3_attrs;
 		else
 			ktype_cache.default_attrs = default_attrs;
-- 
cgit v1.2.3


From ba06edb63f5ef2913aad37070eaec3c9d8ac73b8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 22 Apr 2010 16:07:01 +0200
Subject: x86, cacheinfo: Make L3 cache info per node

Currently, we're allocating L3 cache info and calculating indices for
each online cpu which is clearly superfluous. Instead, we need to do
this per-node as is each L3 cache.

No functional change, only per-cpu memory savings.

-v2: Allocate L3 cache descriptors array dynamically.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-5-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 59 ++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index ff663ca63fd..1346e9c23fc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -307,19 +307,18 @@ struct _cache_attr {
 };
 
 #ifdef CONFIG_CPU_SUP_AMD
+
+/*
+ * L3 cache descriptors
+ */
+static struct amd_l3_cache **__cpuinitdata l3_caches;
+
 static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 {
-	/*
-	 * We're called over smp_call_function_single() and therefore
-	 * are on the correct cpu.
-	 */
-	int cpu = smp_processor_id();
-	int node = cpu_to_node(cpu);
-	struct pci_dev *dev = node_to_k8_nb_misc(node);
 	unsigned int sc0, sc1, sc2, sc3;
 	u32 val = 0;
 
-	pci_read_config_dword(dev, 0x1C4, &val);
+	pci_read_config_dword(l3->dev, 0x1C4, &val);
 
 	/* calculate subcache sizes */
 	l3->subcaches[0] = sc0 = !(val & BIT(0));
@@ -328,13 +327,31 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 	l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
 
 	l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+}
+
+static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
+{
+	struct amd_l3_cache *l3;
+	struct pci_dev *dev = node_to_k8_nb_misc(node);
+
+	l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
+	if (!l3) {
+		printk(KERN_WARNING "Error allocating L3 struct\n");
+		return NULL;
+	}
 
 	l3->dev = dev;
+
+	amd_calc_l3_indices(l3);
+
+	return l3;
 }
 
 static void __cpuinit
 amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
+	int node;
+
 	if (boot_cpu_data.x86 != 0x10)
 		return;
 
@@ -355,14 +372,28 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	if (num_k8_northbridges == 0)
 		return;
 
-	this_leaf->l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
-	if (!this_leaf->l3) {
-		printk(KERN_WARNING "Error allocating L3 struct\n");
-		return;
+	/*
+	 * Strictly speaking, the amount in @size below is leaked since it is
+	 * never freed but this is done only on shutdown so it doesn't matter.
+	 */
+	if (!l3_caches) {
+		int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
+
+		l3_caches = kzalloc(size, GFP_ATOMIC);
+		if (!l3_caches)
+			return;
 	}
 
-	this_leaf->l3->can_disable = true;
-	amd_calc_l3_indices(this_leaf->l3);
+	node = amd_get_nb_id(smp_processor_id());
+
+	if (!l3_caches[node]) {
+		l3_caches[node] = amd_init_l3_cache(node);
+		l3_caches[node]->can_disable = true;
+	}
+
+	WARN_ON(!l3_caches[node]);
+
+	this_leaf->l3 = l3_caches[node];
 }
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
-- 
cgit v1.2.3


From 59d3b388741cf1a5eb7ad27fd4e9ed72643164ae Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 22 Apr 2010 16:07:02 +0200
Subject: x86, cacheinfo: Disable index in all four subcaches

When disabling an L3 cache index, make sure we disable that index in
all four subcaches of the L3. Clarify nomenclature while at it, wrt to
disable slots versus disable index and rename accordingly.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1271945222-5283-6-git-send-email-bp@amd64.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 60 +++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 1346e9c23fc..33eae2062cf 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -397,7 +397,7 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 }
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
-				  unsigned int index)
+				  unsigned int slot)
 {
 	struct pci_dev *dev = this_leaf->l3->dev;
 	unsigned int reg = 0;
@@ -408,21 +408,53 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
 	if (!dev)
 		return -EINVAL;
 
-	pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+	pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
 	return sprintf(buf, "0x%08x\n", reg);
 }
 
-#define SHOW_CACHE_DISABLE(index)					\
+#define SHOW_CACHE_DISABLE(slot)					\
 static ssize_t								\
-show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf)	\
+show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf)	\
 {									\
-	return show_cache_disable(this_leaf, buf, index);		\
+	return show_cache_disable(this_leaf, buf, slot);		\
 }
 SHOW_CACHE_DISABLE(0)
 SHOW_CACHE_DISABLE(1)
 
+static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
+				 unsigned slot, unsigned long idx)
+{
+	int i;
+
+	idx |= BIT(30);
+
+	/*
+	 *  disable index in all 4 subcaches
+	 */
+	for (i = 0; i < 4; i++) {
+		u32 reg = idx | (i << 20);
+
+		if (!l3->subcaches[i])
+			continue;
+
+		pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+
+		/*
+		 * We need to WBINVD on a core on the node containing the L3
+		 * cache which indices we disable therefore a simple wbinvd()
+		 * is not sufficient.
+		 */
+		wbinvd_on_cpu(cpu);
+
+		reg |= BIT(31);
+		pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+	}
+}
+
+
 static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
-	const char *buf, size_t count, unsigned int index)
+				   const char *buf, size_t count,
+				   unsigned int slot)
 {
 	struct pci_dev *dev = this_leaf->l3->dev;
 	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
@@ -448,23 +480,17 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 	    ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
 		return -EINVAL;
 
-	val |= BIT(30);
-	pci_write_config_dword(dev, 0x1BC + index * 4, val);
-	/*
-	 * We need to WBINVD on a core on the node containing the L3 cache which
-	 * indices we disable therefore a simple wbinvd() is not sufficient.
-	 */
-	wbinvd_on_cpu(cpu);
-	pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
+	amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
+
 	return count;
 }
 
-#define STORE_CACHE_DISABLE(index)					\
+#define STORE_CACHE_DISABLE(slot)					\
 static ssize_t								\
-store_cache_disable_##index(struct _cpuid4_info *this_leaf,		\
+store_cache_disable_##slot(struct _cpuid4_info *this_leaf,		\
 			    const char *buf, size_t count)		\
 {									\
-	return store_cache_disable(this_leaf, buf, count, index);	\
+	return store_cache_disable(this_leaf, buf, count, slot);	\
 }
 STORE_CACHE_DISABLE(0)
 STORE_CACHE_DISABLE(1)
-- 
cgit v1.2.3


From 74f5187ac873042f502227701ed1727e7c5fbfa9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 22 Apr 2010 21:50:19 +0200
Subject: sched: Cure load average vs NO_HZ woes

Chase reported that due to us decrementing calc_load_task prematurely
(before the next LOAD_FREQ sample), the load average could be scewed
by as much as the number of CPUs in the machine.

This patch, based on Chase's patch, cures the problem by keeping the
delta of the CPU going into NO_HZ idle separately and folding that in
on the next LOAD_FREQ update.

This restores the balance and we get strict LOAD_FREQ period samples.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Chase Douglas <chase.douglas@canonical.com>
LKML-Reference: <1271934490.1776.343.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c          | 80 +++++++++++++++++++++++++++++++++++++++++--------
 kernel/sched_idletask.c |  3 +-
 2 files changed, 68 insertions(+), 15 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index de0da71daf7..0cc913a8554 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1815,7 +1815,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
-static void calc_load_account_active(struct rq *this_rq);
+static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
 
@@ -2950,6 +2950,61 @@ static unsigned long calc_load_update;
 unsigned long avenrun[3];
 EXPORT_SYMBOL(avenrun);
 
+static long calc_load_fold_active(struct rq *this_rq)
+{
+	long nr_active, delta = 0;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+	}
+
+	return delta;
+}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_tasks_idle;
+
+static void calc_load_account_idle(struct rq *this_rq)
+{
+	long delta;
+
+	delta = calc_load_fold_active(this_rq);
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks_idle);
+}
+
+static long calc_load_fold_idle(void)
+{
+	long delta = 0;
+
+	/*
+	 * Its got a race, we don't care...
+	 */
+	if (atomic_long_read(&calc_load_tasks_idle))
+		delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+
+	return delta;
+}
+#else
+static void calc_load_account_idle(struct rq *this_rq)
+{
+}
+
+static inline long calc_load_fold_idle(void)
+{
+	return 0;
+}
+#endif
+
 /**
  * get_avenrun - get the load average array
  * @loads:	pointer to dest load array
@@ -2996,20 +3051,22 @@ void calc_global_load(void)
 }
 
 /*
- * Either called from update_cpu_load() or from a cpu going idle
+ * Called from update_cpu_load() to periodically update this CPU's
+ * active count.
  */
 static void calc_load_account_active(struct rq *this_rq)
 {
-	long nr_active, delta;
+	long delta;
 
-	nr_active = this_rq->nr_running;
-	nr_active += (long) this_rq->nr_uninterruptible;
+	if (time_before(jiffies, this_rq->calc_load_update))
+		return;
 
-	if (nr_active != this_rq->calc_load_active) {
-		delta = nr_active - this_rq->calc_load_active;
-		this_rq->calc_load_active = nr_active;
+	delta  = calc_load_fold_active(this_rq);
+	delta += calc_load_fold_idle();
+	if (delta)
 		atomic_long_add(delta, &calc_load_tasks);
-	}
+
+	this_rq->calc_load_update += LOAD_FREQ;
 }
 
 /*
@@ -3041,10 +3098,7 @@ static void update_cpu_load(struct rq *this_rq)
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
 
-	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
-		this_rq->calc_load_update += LOAD_FREQ;
-		calc_load_account_active(this_rq);
-	}
+	calc_load_account_active(this_rq);
 }
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index bea2b8f1202..9fa0f402c87 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -23,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
-	/* adjust the active tasks as we might go into a long sleep */
-	calc_load_account_active(rq);
+	calc_load_account_idle(rq);
 	return rq->idle;
 }
 
-- 
cgit v1.2.3


From 669c55e9f99b90e46eaa0f98a67ec53d46dc969a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 16 Apr 2010 14:59:29 +0200
Subject: sched: Pre-compute cpumask_weight(sched_domain_span(sd))

Dave reported that his large SPARC machines spend lots of time in
hweight64(), try and optimize some of those needless cpumask_weight()
invocations (esp. with the large offstack cpumasks these are very
expensive indeed).

Reported-by: David Miller <davem@davemloft.net>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  1 +
 kernel/sched.c        |  3 +++
 kernel/sched_fair.c   | 12 +++++-------
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e3e900f318d..dfea40574b2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -960,6 +960,7 @@ struct sched_domain {
 	char *name;
 #endif
 
+	unsigned int span_weight;
 	/*
 	 * Span of all CPUs in this domain.
 	 *
diff --git a/kernel/sched.c b/kernel/sched.c
index 0cc913a8554..4956ed09283 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6271,6 +6271,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
 
+	for (tmp = sd; tmp; tmp = tmp->parent)
+		tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
+
 	/* Remove the sched domains which do not contribute to scheduling. */
 	for (tmp = sd; tmp; ) {
 		struct sched_domain *parent = tmp->parent;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 88d3053ac7c..0a413c7e3ab 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1508,9 +1508,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		 * Pick the largest domain to update shares over
 		 */
 		tmp = sd;
-		if (affine_sd && (!tmp ||
-				  cpumask_weight(sched_domain_span(affine_sd)) >
-				  cpumask_weight(sched_domain_span(sd))))
+		if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
 			tmp = affine_sd;
 
 		if (tmp) {
@@ -1554,10 +1552,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 
 		/* Now try balancing at a lower domain level of new_cpu */
 		cpu = new_cpu;
-		weight = cpumask_weight(sched_domain_span(sd));
+		weight = sd->span_weight;
 		sd = NULL;
 		for_each_domain(cpu, tmp) {
-			if (weight <= cpumask_weight(sched_domain_span(tmp)))
+			if (weight <= tmp->span_weight)
 				break;
 			if (tmp->flags & sd_flag)
 				sd = tmp;
@@ -2243,7 +2241,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
 
 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
 {
-	unsigned long weight = cpumask_weight(sched_domain_span(sd));
+	unsigned long weight = sd->span_weight;
 	unsigned long smt_gain = sd->smt_gain;
 
 	smt_gain /= weight;
@@ -2276,7 +2274,7 @@ unsigned long scale_rt_power(int cpu)
 
 static void update_cpu_power(struct sched_domain *sd, int cpu)
 {
-	unsigned long weight = cpumask_weight(sched_domain_span(sd));
+	unsigned long weight = sd->span_weight;
 	unsigned long power = SCHED_LOAD_SCALE;
 	struct sched_group *sdg = sd->groups;
 
-- 
cgit v1.2.3


From 99bd5e2f245d8cd17d040c82d40becdb3efd9b69 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed, 31 Mar 2010 16:47:45 -0700
Subject: sched: Fix select_idle_sibling() logic in select_task_rq_fair()

Issues in the current select_idle_sibling() logic in select_task_rq_fair()
in the context of a task wake-up:

a) Once we select the idle sibling, we use that domain (spanning the cpu that
   the task is currently woken-up and the idle sibling that we found) in our
   wake_affine() decisions. This domain is completely different from the
   domain(we are supposed to use) that spans the cpu that the task currently
   woken-up and the cpu where the task previously ran.

b) We do select_idle_sibling() check only for the cpu that the task is
   currently woken-up on. If select_task_rq_fair() selects the previously run
   cpu for waking the task, doing a select_idle_sibling() check
   for that cpu also helps and we don't do this currently.

c) In the scenarios where the cpu that the task is woken-up is busy but
   with its HT siblings are idle, we are selecting the task be woken-up
   on the idle HT sibling instead of a core that it previously ran
   and currently completely idle. i.e., we are not taking decisions based on
   wake_affine() but directly selecting an idle sibling that can cause
   an imbalance at the SMT/MC level which will be later corrected by the
   periodic load balancer.

Fix this by first going through the load imbalance calculations using
wake_affine() and once we make a decision of woken-up cpu vs previously-ran cpu,
then choose a possible idle sibling for waking up the task on.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1270079265.7835.8.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 82 ++++++++++++++++++++++++++---------------------------
 1 file changed, 40 insertions(+), 42 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0a413c7e3ab..cbd8b8a296d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1375,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 /*
  * Try and locate an idle CPU in the sched_domain.
  */
-static int
-select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_sibling(struct task_struct *p, int target)
 {
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
+	struct sched_domain *sd;
 	int i;
 
 	/*
-	 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
-	 * test in select_task_rq_fair) and the prev_cpu is idle then that's
-	 * always a better target than the current cpu.
+	 * If the task is going to be woken-up on this cpu and if it is
+	 * already idle, then it is the right target.
+	 */
+	if (target == cpu && idle_cpu(cpu))
+		return cpu;
+
+	/*
+	 * If the task is going to be woken-up on the cpu where it previously
+	 * ran and if it is currently idle, then it the right target.
 	 */
-	if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
+	if (target == prev_cpu && idle_cpu(prev_cpu))
 		return prev_cpu;
 
 	/*
-	 * Otherwise, iterate the domain and find an elegible idle cpu.
+	 * Otherwise, iterate the domains and find an elegible idle cpu.
 	 */
-	for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
-		if (!cpu_rq(i)->cfs.nr_running) {
-			target = i;
+	for_each_domain(target, sd) {
+		if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
 			break;
+
+		for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+			if (idle_cpu(i)) {
+				target = i;
+				break;
+			}
 		}
+
+		/*
+		 * Lets stop looking for an idle sibling when we reached
+		 * the domain that spans the current cpu and prev_cpu.
+		 */
+		if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
+		    cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+			break;
 	}
 
 	return target;
@@ -1421,7 +1440,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 	int cpu = smp_processor_id();
 	int prev_cpu = task_cpu(p);
 	int new_cpu = cpu;
-	int want_affine = 0, cpu_idle = !current->pid;
+	int want_affine = 0;
 	int want_sd = 1;
 	int sync = wake_flags & WF_SYNC;
 
@@ -1460,36 +1479,13 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 		}
 
 		/*
-		 * While iterating the domains looking for a spanning
-		 * WAKE_AFFINE domain, adjust the affine target to any idle cpu
-		 * in cache sharing domains along the way.
+		 * If both cpu and prev_cpu are part of this domain,
+		 * cpu is a valid SD_WAKE_AFFINE target.
 		 */
-		if (want_affine) {
-			int target = -1;
-
-			/*
-			 * If both cpu and prev_cpu are part of this domain,
-			 * cpu is a valid SD_WAKE_AFFINE target.
-			 */
-			if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
-				target = cpu;
-
-			/*
-			 * If there's an idle sibling in this domain, make that
-			 * the wake_affine target instead of the current cpu.
-			 */
-			if (!cpu_idle && tmp->flags & SD_SHARE_PKG_RESOURCES)
-				target = select_idle_sibling(p, tmp, target);
-
-			if (target >= 0) {
-				if (tmp->flags & SD_WAKE_AFFINE) {
-					affine_sd = tmp;
-					want_affine = 0;
-					if (target != cpu)
-						cpu_idle = 1;
-				}
-				cpu = target;
-			}
+		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+			affine_sd = tmp;
+			want_affine = 0;
 		}
 
 		if (!want_sd && !want_affine)
@@ -1520,8 +1516,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
 #endif
 
 	if (affine_sd) {
-		if (cpu_idle || cpu == prev_cpu || wake_affine(affine_sd, p, sync))
-			return cpu;
+		if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+			return select_idle_sibling(p, cpu);
+		else
+			return select_idle_sibling(p, prev_cpu);
 	}
 
 	while (sd) {
-- 
cgit v1.2.3


From cb6e943ccf19ab6d3189147e9d625a992e016084 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 1 Apr 2010 03:17:25 +0200
Subject: oprofile: remove double ring buffering

oprofile used a double buffer scheme for its cpu event buffer
to avoid races on reading with the old locked ring buffer.

But that is obsolete now with the new ring buffer, so simply
use a single buffer. This greatly simplifies the code and avoids
a lot of sample drops on large runs, especially with call graph.

Based on suggestions from Steven Rostedt

For stable kernels from v2.6.32, but not earlier.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: stable <stable@kernel.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 drivers/oprofile/cpu_buffer.c | 63 +++++++++----------------------------------
 1 file changed, 13 insertions(+), 50 deletions(-)

diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index 166b67ea622..de82183bb9b 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -30,23 +30,7 @@
 
 #define OP_BUFFER_FLAGS	0
 
-/*
- * Read and write access is using spin locking. Thus, writing to the
- * buffer by NMI handler (x86) could occur also during critical
- * sections when reading the buffer. To avoid this, there are 2
- * buffers for independent read and write access. Read access is in
- * process context only, write access only in the NMI handler. If the
- * read buffer runs empty, both buffers are swapped atomically. There
- * is potentially a small window during swapping where the buffers are
- * disabled and samples could be lost.
- *
- * Using 2 buffers is a little bit overhead, but the solution is clear
- * and does not require changes in the ring buffer implementation. It
- * can be changed to a single buffer solution when the ring buffer
- * access is implemented as non-locking atomic code.
- */
-static struct ring_buffer *op_ring_buffer_read;
-static struct ring_buffer *op_ring_buffer_write;
+static struct ring_buffer *op_ring_buffer;
 DEFINE_PER_CPU(struct oprofile_cpu_buffer, op_cpu_buffer);
 
 static void wq_sync_buffer(struct work_struct *work);
@@ -68,12 +52,9 @@ void oprofile_cpu_buffer_inc_smpl_lost(void)
 
 void free_cpu_buffers(void)
 {
-	if (op_ring_buffer_read)
-		ring_buffer_free(op_ring_buffer_read);
-	op_ring_buffer_read = NULL;
-	if (op_ring_buffer_write)
-		ring_buffer_free(op_ring_buffer_write);
-	op_ring_buffer_write = NULL;
+	if (op_ring_buffer)
+		ring_buffer_free(op_ring_buffer);
+	op_ring_buffer = NULL;
 }
 
 #define RB_EVENT_HDR_SIZE 4
@@ -86,11 +67,8 @@ int alloc_cpu_buffers(void)
 	unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
 						 RB_EVENT_HDR_SIZE);
 
-	op_ring_buffer_read = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
-	if (!op_ring_buffer_read)
-		goto fail;
-	op_ring_buffer_write = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
-	if (!op_ring_buffer_write)
+	op_ring_buffer = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
+	if (!op_ring_buffer)
 		goto fail;
 
 	for_each_possible_cpu(i) {
@@ -162,16 +140,11 @@ struct op_sample
 *op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
 {
 	entry->event = ring_buffer_lock_reserve
-		(op_ring_buffer_write, sizeof(struct op_sample) +
+		(op_ring_buffer, sizeof(struct op_sample) +
 		 size * sizeof(entry->sample->data[0]));
-	if (entry->event)
-		entry->sample = ring_buffer_event_data(entry->event);
-	else
-		entry->sample = NULL;
-
-	if (!entry->sample)
+	if (!entry->event)
 		return NULL;
-
+	entry->sample = ring_buffer_event_data(entry->event);
 	entry->size = size;
 	entry->data = entry->sample->data;
 
@@ -180,25 +153,16 @@ struct op_sample
 
 int op_cpu_buffer_write_commit(struct op_entry *entry)
 {
-	return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event);
+	return ring_buffer_unlock_commit(op_ring_buffer, entry->event);
 }
 
 struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
 {
 	struct ring_buffer_event *e;
-	e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
-	if (e)
-		goto event;
-	if (ring_buffer_swap_cpu(op_ring_buffer_read,
-				 op_ring_buffer_write,
-				 cpu))
+	e = ring_buffer_consume(op_ring_buffer, cpu, NULL);
+	if (!e)
 		return NULL;
-	e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
-	if (e)
-		goto event;
-	return NULL;
 
-event:
 	entry->event = e;
 	entry->sample = ring_buffer_event_data(e);
 	entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
@@ -209,8 +173,7 @@ event:
 
 unsigned long op_cpu_buffer_entries(int cpu)
 {
-	return ring_buffer_entries_cpu(op_ring_buffer_read, cpu)
-		+ ring_buffer_entries_cpu(op_ring_buffer_write, cpu);
+	return ring_buffer_entries_cpu(op_ring_buffer, cpu);
 }
 
 static int
-- 
cgit v1.2.3


From 1f9cc3cb6a27521edfe0a21abf97d2bb11c4d237 Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Fri, 23 Apr 2010 10:36:22 -0500
Subject: x86, pat: Update the page flags for memtype atomically instead of
 using memtype_lock

While testing an application using the xpmem (out of kernel) driver, we
noticed a significant page fault rate reduction of x86_64 with respect
to ia64.  For one test running with 32 cpus, one thread per cpu, it
took 01:08 for each of the threads to vm_insert_pfn 2GB worth of pages.
For the same test running on 256 cpus, one thread per cpu, it took 14:48
to vm_insert_pfn 2 GB worth of pages.

The slowdown was tracked to lookup_memtype which acquires the
spinlock memtype_lock.  This heavily contended lock was slowing down
vm_insert_pfn().

With the cmpxchg on page->flags method, both the 32 cpu and 256 cpu
cases take approx 00:01.3 seconds to complete.

Signed-off-by: Robin Holt <holt@sgi.com>
LKML-Reference: <20100423153627.751194346@gulag1.americas.sgi.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@gmail.com>
Cc: Rafael Wysocki <rjw@novell.com>
Reviewed-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/cacheflush.h | 44 ++++++++++++++++++++++-----------------
 arch/x86/mm/pat.c                 |  8 -------
 2 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 634c40a739a..c70068d05f7 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -44,9 +44,6 @@ static inline void copy_from_user_page(struct vm_area_struct *vma,
 	memcpy(dst, src, len);
 }
 
-#define PG_WC				PG_arch_1
-PAGEFLAG(WC, WC)
-
 #ifdef CONFIG_X86_PAT
 /*
  * X86 PAT uses page flags WC and Uncached together to keep track of
@@ -55,16 +52,24 @@ PAGEFLAG(WC, WC)
  * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not
  * been changed from its default (value of -1 used to denote this).
  * Note we do not support _PAGE_CACHE_UC here.
- *
- * Caller must hold memtype_lock for atomicity.
  */
+
+#define _PGMT_DEFAULT		0
+#define _PGMT_WC		(1UL << PG_arch_1)
+#define _PGMT_UC_MINUS		(1UL << PG_uncached)
+#define _PGMT_WB		(1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_CLEAR_MASK	(~_PGMT_MASK)
+
 static inline unsigned long get_page_memtype(struct page *pg)
 {
-	if (!PageUncached(pg) && !PageWC(pg))
+	unsigned long pg_flags = pg->flags & _PGMT_MASK;
+
+	if (pg_flags == _PGMT_DEFAULT)
 		return -1;
-	else if (!PageUncached(pg) && PageWC(pg))
+	else if (pg_flags == _PGMT_WC)
 		return _PAGE_CACHE_WC;
-	else if (PageUncached(pg) && !PageWC(pg))
+	else if (pg_flags == _PGMT_UC_MINUS)
 		return _PAGE_CACHE_UC_MINUS;
 	else
 		return _PAGE_CACHE_WB;
@@ -72,25 +77,26 @@ static inline unsigned long get_page_memtype(struct page *pg)
 
 static inline void set_page_memtype(struct page *pg, unsigned long memtype)
 {
+	unsigned long memtype_flags = _PGMT_DEFAULT;
+	unsigned long old_flags;
+	unsigned long new_flags;
+
 	switch (memtype) {
 	case _PAGE_CACHE_WC:
-		ClearPageUncached(pg);
-		SetPageWC(pg);
+		memtype_flags = _PGMT_WC;
 		break;
 	case _PAGE_CACHE_UC_MINUS:
-		SetPageUncached(pg);
-		ClearPageWC(pg);
+		memtype_flags = _PGMT_UC_MINUS;
 		break;
 	case _PAGE_CACHE_WB:
-		SetPageUncached(pg);
-		SetPageWC(pg);
-		break;
-	default:
-	case -1:
-		ClearPageUncached(pg);
-		ClearPageWC(pg);
+		memtype_flags = _PGMT_WB;
 		break;
 	}
+
+	do {
+		old_flags = pg->flags;
+		new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
+	} while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
 }
 #else
 static inline unsigned long get_page_memtype(struct page *pg) { return -1; }
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 951011166ef..501fc60e5e4 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -190,8 +190,6 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end)
  * Here we do two pass:
  * - Find the memtype of all the pages in the range, look for any conflicts
  * - In case of no conflicts, set the new memtype for pages in the range
- *
- * Caller must hold memtype_lock for atomicity.
  */
 static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
 				  unsigned long *new_type)
@@ -297,9 +295,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 	is_range_ram = pat_pagerange_is_ram(start, end);
 	if (is_range_ram == 1) {
 
-		spin_lock(&memtype_lock);
 		err = reserve_ram_pages_type(start, end, req_type, new_type);
-		spin_unlock(&memtype_lock);
 
 		return err;
 	} else if (is_range_ram < 0) {
@@ -351,9 +347,7 @@ int free_memtype(u64 start, u64 end)
 	is_range_ram = pat_pagerange_is_ram(start, end);
 	if (is_range_ram == 1) {
 
-		spin_lock(&memtype_lock);
 		err = free_ram_pages_type(start, end);
-		spin_unlock(&memtype_lock);
 
 		return err;
 	} else if (is_range_ram < 0) {
@@ -394,10 +388,8 @@ static unsigned long lookup_memtype(u64 paddr)
 
 	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
 		struct page *page;
-		spin_lock(&memtype_lock);
 		page = pfn_to_page(paddr >> PAGE_SHIFT);
 		rettype = get_page_memtype(page);
-		spin_unlock(&memtype_lock);
 		/*
 		 * -1 from get_page_memtype() implies RAM page is in its
 		 * default state and not reserved, and hence of type WB
-- 
cgit v1.2.3


From e4cef1f65061429c3e8b356233e87dc6653a9da5 Mon Sep 17 00:00:00 2001
From: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Date: Wed, 21 Apr 2010 21:23:54 +0900
Subject: perf lock: Fix state machine to recognize lock sequence

Previous state machine of perf lock was really broken.
This patch improves it a little.

This patch prepares the list of state machine that represents
lock sequences for each threads.

These state machines can be one of these sequences:

      1) acquire -> acquired -> release
      2) acquire -> contended -> acquired -> release
      3) acquire (w/ try) -> release
      4) acquire (w/ read) -> release

The case of 4) is a little special.
Double acquire of read lock is allowed, so the state machine
counts read lock number, and permits double acquire and release.

But, things are not so simple. Something in my model is still wrong.
I counted the number of lock instances with bad sequence,
and ratio is like this (case of tracing whoami): bad:233, total:2279

version 2:
 * threads are now identified with tid, not pid
 * prepared SEQ_STATE_READ_ACQUIRED for read lock.
 * bunch of struct lock_seq_stat is now linked list
 * debug information enhanced (this have to be removed someday)
   e.g.
     | === output for debug===
     |
     | bad:233, total:2279
     | bad rate:0.000000
     | histogram of events caused bad sequence
     |     acquire: 165
     |    acquired: 0
     |   contended: 0
     |     release: 68

Signed-off-by: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1271852634-9351-1-git-send-email-mitake@dcl.info.waseda.ac.jp>
[rename SEQ_STATE_UNINITED to SEQ_STATE_UNINITIALIZED]
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 tools/perf/builtin-lock.c | 410 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 342 insertions(+), 68 deletions(-)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 6c38e4febf9..716d8c544a5 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -23,6 +23,8 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 
+static struct perf_session *session;
+
 /* based on kernel/lockdep.c */
 #define LOCKHASH_BITS		12
 #define LOCKHASH_SIZE		(1UL << LOCKHASH_BITS)
@@ -32,9 +34,6 @@ static struct list_head lockhash_table[LOCKHASH_SIZE];
 #define __lockhashfn(key)	hash_long((unsigned long)key, LOCKHASH_BITS)
 #define lockhashentry(key)	(lockhash_table + __lockhashfn((key)))
 
-#define LOCK_STATE_UNLOCKED	0	       /* initial state */
-#define LOCK_STATE_LOCKED	1
-
 struct lock_stat {
 	struct list_head	hash_entry;
 	struct rb_node		rb;		/* used for sorting */
@@ -47,20 +46,151 @@ struct lock_stat {
 	void			*addr;		/* address of lockdep_map, used as ID */
 	char			*name;		/* for strcpy(), we cannot use const */
 
-	int			state;
-	u64			prev_event_time; /* timestamp of previous event */
-
-	unsigned int		nr_acquired;
 	unsigned int		nr_acquire;
+	unsigned int		nr_acquired;
 	unsigned int		nr_contended;
 	unsigned int		nr_release;
 
+	unsigned int		nr_readlock;
+	unsigned int		nr_trylock;
 	/* these times are in nano sec. */
 	u64			wait_time_total;
 	u64			wait_time_min;
 	u64			wait_time_max;
+
+	int			discard; /* flag of blacklist */
+};
+
+/*
+ * States of lock_seq_stat
+ *
+ * UNINITIALIZED is required for detecting first event of acquire.
+ * As the nature of lock events, there is no guarantee
+ * that the first event for the locks are acquire,
+ * it can be acquired, contended or release.
+ */
+#define SEQ_STATE_UNINITIALIZED      0	       /* initial state */
+#define SEQ_STATE_RELEASED	1
+#define SEQ_STATE_ACQUIRING	2
+#define SEQ_STATE_ACQUIRED	3
+#define SEQ_STATE_READ_ACQUIRED	4
+#define SEQ_STATE_CONTENDED	5
+
+/*
+ * MAX_LOCK_DEPTH
+ * Imported from include/linux/sched.h.
+ * Should this be synchronized?
+ */
+#define MAX_LOCK_DEPTH 48
+
+/*
+ * struct lock_seq_stat:
+ * Place to put on state of one lock sequence
+ * 1) acquire -> acquired -> release
+ * 2) acquire -> contended -> acquired -> release
+ * 3) acquire (with read or try) -> release
+ * 4) Are there other patterns?
+ */
+struct lock_seq_stat {
+	struct list_head        list;
+	int			state;
+	u64			prev_event_time;
+	void                    *addr;
+
+	int                     read_count;
 };
 
+struct thread_stat {
+	struct rb_node		rb;
+
+	u32                     tid;
+	struct list_head        seq_list;
+};
+
+static struct rb_root		thread_stats;
+
+static struct thread_stat *thread_stat_find(u32 tid)
+{
+	struct rb_node *node;
+	struct thread_stat *st;
+
+	node = thread_stats.rb_node;
+	while (node) {
+		st = container_of(node, struct thread_stat, rb);
+		if (st->tid == tid)
+			return st;
+		else if (tid < st->tid)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+
+	return NULL;
+}
+
+static void thread_stat_insert(struct thread_stat *new)
+{
+	struct rb_node **rb = &thread_stats.rb_node;
+	struct rb_node *parent = NULL;
+	struct thread_stat *p;
+
+	while (*rb) {
+		p = container_of(*rb, struct thread_stat, rb);
+		parent = *rb;
+
+		if (new->tid < p->tid)
+			rb = &(*rb)->rb_left;
+		else if (new->tid > p->tid)
+			rb = &(*rb)->rb_right;
+		else
+			BUG_ON("inserting invalid thread_stat\n");
+	}
+
+	rb_link_node(&new->rb, parent, rb);
+	rb_insert_color(&new->rb, &thread_stats);
+}
+
+static struct thread_stat *thread_stat_findnew_after_first(u32 tid)
+{
+	struct thread_stat *st;
+
+	st = thread_stat_find(tid);
+	if (st)
+		return st;
+
+	st = zalloc(sizeof(struct thread_stat));
+	if (!st)
+		die("memory allocation failed\n");
+
+	st->tid = tid;
+	INIT_LIST_HEAD(&st->seq_list);
+
+	thread_stat_insert(st);
+
+	return st;
+}
+
+static struct thread_stat *thread_stat_findnew_first(u32 tid);
+static struct thread_stat *(*thread_stat_findnew)(u32 tid) =
+	thread_stat_findnew_first;
+
+static struct thread_stat *thread_stat_findnew_first(u32 tid)
+{
+	struct thread_stat *st;
+
+	st = zalloc(sizeof(struct thread_stat));
+	if (!st)
+		die("memory allocation failed\n");
+	st->tid = tid;
+	INIT_LIST_HEAD(&st->seq_list);
+
+	rb_link_node(&st->rb, NULL, &thread_stats.rb_node);
+	rb_insert_color(&st->rb, &thread_stats);
+
+	thread_stat_findnew = thread_stat_findnew_after_first;
+	return st;
+}
+
 /* build simple key function one is bigger than two */
 #define SINGLE_KEY(member)						\
 	static int lock_stat_key_ ## member(struct lock_stat *one,	\
@@ -175,8 +305,6 @@ static struct lock_stat *lock_stat_findnew(void *addr, const char *name)
 		goto alloc_failed;
 	strcpy(new->name, name);
 
-	/* LOCK_STATE_UNLOCKED == 0 isn't guaranteed forever */
-	new->state = LOCK_STATE_UNLOCKED;
 	new->wait_time_min = ULLONG_MAX;
 
 	list_add(&new->hash_entry, entry);
@@ -198,6 +326,7 @@ struct raw_event_sample {
 struct trace_acquire_event {
 	void			*addr;
 	const char		*name;
+	int			flag;
 };
 
 struct trace_acquired_event {
@@ -241,120 +370,246 @@ struct trace_lock_handler {
 			      struct thread *thread);
 };
 
+static struct lock_seq_stat *get_seq(struct thread_stat *ts, void *addr)
+{
+	struct lock_seq_stat *seq;
+
+	list_for_each_entry(seq, &ts->seq_list, list) {
+		if (seq->addr == addr)
+			return seq;
+	}
+
+	seq = zalloc(sizeof(struct lock_seq_stat));
+	if (!seq)
+		die("Not enough memory\n");
+	seq->state = SEQ_STATE_UNINITIALIZED;
+	seq->addr = addr;
+
+	list_add(&seq->list, &ts->seq_list);
+	return seq;
+}
+
+static int bad_hist[4];
+
 static void
 report_lock_acquire_event(struct trace_acquire_event *acquire_event,
 			struct event *__event __used,
 			int cpu __used,
-			u64 timestamp,
+			u64 timestamp __used,
 			struct thread *thread __used)
 {
-	struct lock_stat *st;
+	struct lock_stat *ls;
+	struct thread_stat *ts;
+	struct lock_seq_stat *seq;
+
+	ls = lock_stat_findnew(acquire_event->addr, acquire_event->name);
+	if (ls->discard)
+		return;
 
-	st = lock_stat_findnew(acquire_event->addr, acquire_event->name);
+	ts = thread_stat_findnew(thread->pid);
+	seq = get_seq(ts, acquire_event->addr);
 
-	switch (st->state) {
-	case LOCK_STATE_UNLOCKED:
+	switch (seq->state) {
+	case SEQ_STATE_UNINITIALIZED:
+	case SEQ_STATE_RELEASED:
+		if (!acquire_event->flag) {
+			seq->state = SEQ_STATE_ACQUIRING;
+		} else {
+			if (acquire_event->flag & 1)
+				ls->nr_trylock++;
+			if (acquire_event->flag & 2)
+				ls->nr_readlock++;
+			seq->state = SEQ_STATE_READ_ACQUIRED;
+			seq->read_count = 1;
+			ls->nr_acquired++;
+		}
+		break;
+	case SEQ_STATE_READ_ACQUIRED:
+		if (acquire_event->flag & 2) {
+			seq->read_count++;
+			ls->nr_acquired++;
+			goto end;
+		} else {
+			goto broken;
+		}
 		break;
-	case LOCK_STATE_LOCKED:
+	case SEQ_STATE_ACQUIRED:
+	case SEQ_STATE_ACQUIRING:
+	case SEQ_STATE_CONTENDED:
+broken:
+		/* broken lock sequence, discard it */
+		ls->discard = 1;
+		bad_hist[0]++;
+		list_del(&seq->list);
+		free(seq);
+		goto end;
 		break;
 	default:
-		BUG_ON(1);
+		BUG_ON("Unknown state of lock sequence found!\n");
 		break;
 	}
 
-	st->prev_event_time = timestamp;
+	ls->nr_acquire++;
+	seq->prev_event_time = timestamp;
+end:
+	return;
 }
 
 static void
 report_lock_acquired_event(struct trace_acquired_event *acquired_event,
 			 struct event *__event __used,
 			 int cpu __used,
-			 u64 timestamp,
+			 u64 timestamp __used,
 			 struct thread *thread __used)
 {
-	struct lock_stat *st;
+	struct lock_stat *ls;
+	struct thread_stat *ts;
+	struct lock_seq_stat *seq;
+	u64 contended_term;
 
-	st = lock_stat_findnew(acquired_event->addr, acquired_event->name);
+	ls = lock_stat_findnew(acquired_event->addr, acquired_event->name);
+	if (ls->discard)
+		return;
+
+	ts = thread_stat_findnew(thread->pid);
+	seq = get_seq(ts, acquired_event->addr);
 
-	switch (st->state) {
-	case LOCK_STATE_UNLOCKED:
-		st->state = LOCK_STATE_LOCKED;
-		st->nr_acquired++;
+	switch (seq->state) {
+	case SEQ_STATE_UNINITIALIZED:
+		/* orphan event, do nothing */
+		return;
+	case SEQ_STATE_ACQUIRING:
 		break;
-	case LOCK_STATE_LOCKED:
+	case SEQ_STATE_CONTENDED:
+		contended_term = timestamp - seq->prev_event_time;
+		ls->wait_time_total += contended_term;
+
+		if (contended_term < ls->wait_time_min)
+			ls->wait_time_min = contended_term;
+		else if (ls->wait_time_max < contended_term)
+			ls->wait_time_max = contended_term;
 		break;
+	case SEQ_STATE_RELEASED:
+	case SEQ_STATE_ACQUIRED:
+	case SEQ_STATE_READ_ACQUIRED:
+		/* broken lock sequence, discard it */
+		ls->discard = 1;
+		bad_hist[1]++;
+		list_del(&seq->list);
+		free(seq);
+		goto end;
+		break;
+
 	default:
-		BUG_ON(1);
+		BUG_ON("Unknown state of lock sequence found!\n");
 		break;
 	}
 
-	st->prev_event_time = timestamp;
+	seq->state = SEQ_STATE_ACQUIRED;
+	ls->nr_acquired++;
+	seq->prev_event_time = timestamp;
+end:
+	return;
 }
 
 static void
 report_lock_contended_event(struct trace_contended_event *contended_event,
 			  struct event *__event __used,
 			  int cpu __used,
-			  u64 timestamp,
+			  u64 timestamp __used,
 			  struct thread *thread __used)
 {
-	struct lock_stat *st;
+	struct lock_stat *ls;
+	struct thread_stat *ts;
+	struct lock_seq_stat *seq;
+
+	ls = lock_stat_findnew(contended_event->addr, contended_event->name);
+	if (ls->discard)
+		return;
 
-	st = lock_stat_findnew(contended_event->addr, contended_event->name);
+	ts = thread_stat_findnew(thread->pid);
+	seq = get_seq(ts, contended_event->addr);
 
-	switch (st->state) {
-	case LOCK_STATE_UNLOCKED:
+	switch (seq->state) {
+	case SEQ_STATE_UNINITIALIZED:
+		/* orphan event, do nothing */
+		return;
+	case SEQ_STATE_ACQUIRING:
 		break;
-	case LOCK_STATE_LOCKED:
-		st->nr_contended++;
+	case SEQ_STATE_RELEASED:
+	case SEQ_STATE_ACQUIRED:
+	case SEQ_STATE_READ_ACQUIRED:
+	case SEQ_STATE_CONTENDED:
+		/* broken lock sequence, discard it */
+		ls->discard = 1;
+		bad_hist[2]++;
+		list_del(&seq->list);
+		free(seq);
+		goto end;
 		break;
 	default:
-		BUG_ON(1);
+		BUG_ON("Unknown state of lock sequence found!\n");
 		break;
 	}
 
-	st->prev_event_time = timestamp;
+	seq->state = SEQ_STATE_CONTENDED;
+	ls->nr_contended++;
+	seq->prev_event_time = timestamp;
+end:
+	return;
 }
 
 static void
 report_lock_release_event(struct trace_release_event *release_event,
 			struct event *__event __used,
 			int cpu __used,
-			u64 timestamp,
+			u64 timestamp __used,
 			struct thread *thread __used)
 {
-	struct lock_stat *st;
-	u64 hold_time;
+	struct lock_stat *ls;
+	struct thread_stat *ts;
+	struct lock_seq_stat *seq;
 
-	st = lock_stat_findnew(release_event->addr, release_event->name);
+	ls = lock_stat_findnew(release_event->addr, release_event->name);
+	if (ls->discard)
+		return;
 
-	switch (st->state) {
-	case LOCK_STATE_UNLOCKED:
-		break;
-	case LOCK_STATE_LOCKED:
-		st->state = LOCK_STATE_UNLOCKED;
-		hold_time = timestamp - st->prev_event_time;
+	ts = thread_stat_findnew(thread->pid);
+	seq = get_seq(ts, release_event->addr);
 
-		if (timestamp < st->prev_event_time) {
-			/* terribly, this can happen... */
+	switch (seq->state) {
+	case SEQ_STATE_UNINITIALIZED:
+		goto end;
+		break;
+	case SEQ_STATE_ACQUIRED:
+		break;
+	case SEQ_STATE_READ_ACQUIRED:
+		seq->read_count--;
+		BUG_ON(seq->read_count < 0);
+		if (!seq->read_count) {
+			ls->nr_release++;
 			goto end;
 		}
-
-		if (st->wait_time_min > hold_time)
-			st->wait_time_min = hold_time;
-		if (st->wait_time_max < hold_time)
-			st->wait_time_max = hold_time;
-		st->wait_time_total += hold_time;
-
-		st->nr_release++;
+		break;
+	case SEQ_STATE_ACQUIRING:
+	case SEQ_STATE_CONTENDED:
+	case SEQ_STATE_RELEASED:
+		/* broken lock sequence, discard it */
+		ls->discard = 1;
+		bad_hist[3]++;
+		goto free_seq;
 		break;
 	default:
-		BUG_ON(1);
+		BUG_ON("Unknown state of lock sequence found!\n");
 		break;
 	}
 
+	ls->nr_release++;
+free_seq:
+	list_del(&seq->list);
+	free(seq);
 end:
-	st->prev_event_time = timestamp;
+	return;
 }
 
 /* lock oriented handlers */
@@ -381,6 +636,7 @@ process_lock_acquire_event(void *data,
 	tmp = raw_field_value(event, "lockdep_addr", data);
 	memcpy(&acquire_event.addr, &tmp, sizeof(void *));
 	acquire_event.name = (char *)raw_field_ptr(event, "name", data);
+	acquire_event.flag = (int)raw_field_value(event, "flag", data);
 
 	if (trace_handler->acquire_event)
 		trace_handler->acquire_event(&acquire_event, event, cpu, timestamp, thread);
@@ -441,8 +697,8 @@ process_lock_release_event(void *data,
 }
 
 static void
-process_raw_event(void *data, int cpu,
-		  u64 timestamp, struct thread *thread)
+process_raw_event(void *data, int cpu __used,
+		  u64 timestamp __used, struct thread *thread __used)
 {
 	struct event *event;
 	int type;
@@ -604,14 +860,15 @@ static void queue_raw_event(void *data, int raw_size, int cpu,
 	}
 }
 
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int process_sample_event(event_t *event, struct perf_session *s)
 {
 	struct thread *thread;
 	struct sample_data data;
 
 	bzero(&data, sizeof(struct sample_data));
-	event__parse_sample(event, session->sample_type, &data);
-	thread = perf_session__findnew(session, data.pid);
+	event__parse_sample(event, s->sample_type, &data);
+	/* CAUTION: using tid as thread.pid */
+	thread = perf_session__findnew(s, data.tid);
 
 	if (thread == NULL) {
 		pr_debug("problem processing %d event, skipping it.\n",
@@ -634,8 +891,8 @@ static void print_result(void)
 {
 	struct lock_stat *st;
 	char cut_name[20];
+	int bad, total;
 
-	printf("%18s ", "ID");
 	printf("%20s ", "Name");
 	printf("%10s ", "acquired");
 	printf("%10s ", "contended");
@@ -646,11 +903,15 @@ static void print_result(void)
 
 	printf("\n\n");
 
+	bad = total = 0;
 	while ((st = pop_from_result())) {
+		total++;
+		if (st->discard) {
+			bad++;
+			continue;
+		}
 		bzero(cut_name, 20);
 
-		printf("%p ", st->addr);
-
 		if (strlen(st->name) < 16) {
 			/* output raw name */
 			printf("%20s ", st->name);
@@ -673,6 +934,21 @@ static void print_result(void)
 		       0 : st->wait_time_min);
 		printf("\n");
 	}
+
+	{
+		/* Output for debug, this have to be removed */
+		int i;
+		const char *name[4] =
+			{ "acquire", "acquired", "contended", "release" };
+
+		printf("\n=== output for debug===\n\n");
+		printf("bad:%d, total:%d\n", bad, total);
+		printf("bad rate:%f\n", (double)(bad / total));
+
+		printf("histogram of events caused bad sequence\n");
+		for (i = 0; i < 4; i++)
+			printf(" %10s: %d\n", name[i], bad_hist[i]);
+	}
 }
 
 static void dump_map(void)
@@ -692,8 +968,6 @@ static struct perf_event_ops eops = {
 	.comm			= event__process_comm,
 };
 
-static struct perf_session *session;
-
 static int read_events(void)
 {
 	session = perf_session__new(input_name, O_RDONLY, 0);
-- 
cgit v1.2.3


From 5710fcad7c367adefe5634dc998f1f88780a8457 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Wed, 21 Apr 2010 18:06:01 +0200
Subject: perf: Fix initialization bug in parse_single_tracepoint_event()

The parse_single_tracepoint_event() was setting some attributes
before it validated the event was indeed a tracepoint event. This
caused problems with other initialization routines like in the
builtin-top.c module whereby sample_period is not set if not 0.

Signed-off-by: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <4bcf232b.698fd80a.6fbe.ffffb737@mx.google.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 tools/perf/util/parse-events.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 3b4ec679756..600d3271425 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -418,12 +418,6 @@ parse_single_tracepoint_event(char *sys_name,
 	u64 id;
 	int fd;
 
-	attr->sample_type |= PERF_SAMPLE_RAW;
-	attr->sample_type |= PERF_SAMPLE_TIME;
-	attr->sample_type |= PERF_SAMPLE_CPU;
-
-	attr->sample_period = 1;
-
 	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path,
 		 sys_name, evt_name);
 
@@ -442,6 +436,13 @@ parse_single_tracepoint_event(char *sys_name,
 	attr->type = PERF_TYPE_TRACEPOINT;
 	*strp = evt_name + evt_length;
 
+	attr->sample_type |= PERF_SAMPLE_RAW;
+	attr->sample_type |= PERF_SAMPLE_TIME;
+	attr->sample_type |= PERF_SAMPLE_CPU;
+
+	attr->sample_period = 1;
+
+
 	return EVT_HANDLED;
 }
 
-- 
cgit v1.2.3


From c61e52ee705f938596d307625dce00cc4345aaf0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 24 Apr 2010 00:04:12 +0200
Subject: perf: Generalize perf lock's sample event reordering to the session
 layer

The sample events recorded by perf record are not time ordered
because we have one buffer per cpu for each event (even demultiplexed
per task/per cpu for task bound events). But when we read trace events
we want them to be ordered by time because many state machines are
involved.

There are currently two ways perf tools deal with that:

- use -M to multiplex every buffers (perf sched, perf kmem)
  But this creates a lot of contention in SMP machines on
  record time.

- use a post-processing time reordering (perf timechart, perf lock)
  The reordering used by timechart is simple but doesn't scale well
  with huge flow of events, in terms of performance and memory use
  (unusable with perf lock for example).
  Perf lock has its own samples reordering that flushes its memory
  use in a regular basis and that uses a sorting based on the
  previous event queued (a new event to be queued is close to the
  previous one most of the time).

This patch proposes to export perf lock's samples reordering facility
to the session layer that reads the events. So if a tool wants to
get ordered sample events, it needs to set its
struct perf_event_ops::ordered_samples to true and that's it.

This prepares tracing based perf tools to get rid of the need to
use buffers multiplexing (-M) or to implement their own
reordering.

Also lower the flush period to 2 as it's sufficient already.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
---
 tools/perf/builtin-lock.c | 197 ++++++----------------------------------------
 tools/perf/util/session.c | 179 ++++++++++++++++++++++++++++++++++++++++-
 tools/perf/util/session.h |  10 +++
 3 files changed, 210 insertions(+), 176 deletions(-)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 716d8c544a5..ce276750b14 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -316,8 +316,6 @@ alloc_failed:
 
 static char			const *input_name = "perf.data";
 
-static int			profile_cpu = -1;
-
 struct raw_event_sample {
 	u32			size;
 	char			data[0];
@@ -697,8 +695,7 @@ process_lock_release_event(void *data,
 }
 
 static void
-process_raw_event(void *data, int cpu __used,
-		  u64 timestamp __used, struct thread *thread __used)
+process_raw_event(void *data, int cpu, u64 timestamp, struct thread *thread)
 {
 	struct event *event;
 	int type;
@@ -716,176 +713,6 @@ process_raw_event(void *data, int cpu __used,
 		process_lock_release_event(data, event, cpu, timestamp, thread);
 }
 
-struct raw_event_queue {
-	u64			timestamp;
-	int			cpu;
-	void			*data;
-	struct thread		*thread;
-	struct list_head	list;
-};
-
-static LIST_HEAD(raw_event_head);
-
-#define FLUSH_PERIOD	(5 * NSEC_PER_SEC)
-
-static u64 flush_limit = ULLONG_MAX;
-static u64 last_flush = 0;
-struct raw_event_queue *last_inserted;
-
-static void flush_raw_event_queue(u64 limit)
-{
-	struct raw_event_queue *tmp, *iter;
-
-	list_for_each_entry_safe(iter, tmp, &raw_event_head, list) {
-		if (iter->timestamp > limit)
-			return;
-
-		if (iter == last_inserted)
-			last_inserted = NULL;
-
-		process_raw_event(iter->data, iter->cpu, iter->timestamp,
-				  iter->thread);
-
-		last_flush = iter->timestamp;
-		list_del(&iter->list);
-		free(iter->data);
-		free(iter);
-	}
-}
-
-static void __queue_raw_event_end(struct raw_event_queue *new)
-{
-	struct raw_event_queue *iter;
-
-	list_for_each_entry_reverse(iter, &raw_event_head, list) {
-		if (iter->timestamp < new->timestamp) {
-			list_add(&new->list, &iter->list);
-			return;
-		}
-	}
-
-	list_add(&new->list, &raw_event_head);
-}
-
-static void __queue_raw_event_before(struct raw_event_queue *new,
-				     struct raw_event_queue *iter)
-{
-	list_for_each_entry_continue_reverse(iter, &raw_event_head, list) {
-		if (iter->timestamp < new->timestamp) {
-			list_add(&new->list, &iter->list);
-			return;
-		}
-	}
-
-	list_add(&new->list, &raw_event_head);
-}
-
-static void __queue_raw_event_after(struct raw_event_queue *new,
-				     struct raw_event_queue *iter)
-{
-	list_for_each_entry_continue(iter, &raw_event_head, list) {
-		if (iter->timestamp > new->timestamp) {
-			list_add_tail(&new->list, &iter->list);
-			return;
-		}
-	}
-	list_add_tail(&new->list, &raw_event_head);
-}
-
-/* The queue is ordered by time */
-static void __queue_raw_event(struct raw_event_queue *new)
-{
-	if (!last_inserted) {
-		__queue_raw_event_end(new);
-		return;
-	}
-
-	/*
-	 * Most of the time the current event has a timestamp
-	 * very close to the last event inserted, unless we just switched
-	 * to another event buffer. Having a sorting based on a list and
-	 * on the last inserted event that is close to the current one is
-	 * probably more efficient than an rbtree based sorting.
-	 */
-	if (last_inserted->timestamp >= new->timestamp)
-		__queue_raw_event_before(new, last_inserted);
-	else
-		__queue_raw_event_after(new, last_inserted);
-}
-
-static void queue_raw_event(void *data, int raw_size, int cpu,
-			    u64 timestamp, struct thread *thread)
-{
-	struct raw_event_queue *new;
-
-	if (flush_limit == ULLONG_MAX)
-		flush_limit = timestamp + FLUSH_PERIOD;
-
-	if (timestamp < last_flush) {
-		printf("Warning: Timestamp below last timeslice flush\n");
-		return;
-	}
-
-	new = malloc(sizeof(*new));
-	if (!new)
-		die("Not enough memory\n");
-
-	new->timestamp = timestamp;
-	new->cpu = cpu;
-	new->thread = thread;
-
-	new->data = malloc(raw_size);
-	if (!new->data)
-		die("Not enough memory\n");
-
-	memcpy(new->data, data, raw_size);
-
-	__queue_raw_event(new);
-	last_inserted = new;
-
-	/*
-	 * We want to have a slice of events covering 2 * FLUSH_PERIOD
-	 * If FLUSH_PERIOD is big enough, it ensures every events that occured
-	 * in the first half of the timeslice have all been buffered and there
-	 * are none remaining (we need that because of the weakly ordered
-	 * event recording we have). Then once we reach the 2 * FLUSH_PERIOD
-	 * timeslice, we flush the first half to be gentle with the memory
-	 * (the second half can still get new events in the middle, so wait
-	 * another period to flush it)
-	 */
-	if (new->timestamp > flush_limit &&
-		new->timestamp - flush_limit > FLUSH_PERIOD) {
-		flush_limit += FLUSH_PERIOD;
-		flush_raw_event_queue(flush_limit);
-	}
-}
-
-static int process_sample_event(event_t *event, struct perf_session *s)
-{
-	struct thread *thread;
-	struct sample_data data;
-
-	bzero(&data, sizeof(struct sample_data));
-	event__parse_sample(event, s->sample_type, &data);
-	/* CAUTION: using tid as thread.pid */
-	thread = perf_session__findnew(s, data.tid);
-
-	if (thread == NULL) {
-		pr_debug("problem processing %d event, skipping it.\n",
-			 event->header.type);
-		return -1;
-	}
-
-	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
-
-	if (profile_cpu != -1 && profile_cpu != (int) data.cpu)
-		return 0;
-
-	queue_raw_event(data.raw_data, data.raw_size, data.cpu, data.time, thread);
-
-	return 0;
-}
-
 /* TODO: various way to print, coloring, nano or milli sec */
 static void print_result(void)
 {
@@ -963,9 +790,30 @@ static void dump_map(void)
 	}
 }
 
+static int process_sample_event(event_t *self, struct perf_session *s)
+{
+	struct sample_data data;
+	struct thread *thread;
+
+	bzero(&data, sizeof(data));
+	event__parse_sample(self, s->sample_type, &data);
+
+	thread = perf_session__findnew(s, data.tid);
+	if (thread == NULL) {
+		pr_debug("problem processing %d event, skipping it.\n",
+			self->header.type);
+		return -1;
+	}
+
+	process_raw_event(data.raw_data, data.cpu, data.time, thread);
+
+	return 0;
+}
+
 static struct perf_event_ops eops = {
 	.sample			= process_sample_event,
 	.comm			= event__process_comm,
+	.ordered_samples	= true,
 };
 
 static int read_events(void)
@@ -994,7 +842,6 @@ static void __cmd_report(void)
 	setup_pager();
 	select_key();
 	read_events();
-	flush_raw_event_queue(ULLONG_MAX);
 	sort_result();
 	print_result();
 }
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 7d88ae5c270..b7aade2184b 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -98,6 +98,8 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
 	self->cwdlen = 0;
 	self->unknown_events = 0;
 	self->kerninfo_root = RB_ROOT;
+	self->ordered_samples.flush_limit = ULLONG_MAX;
+	INIT_LIST_HEAD(&self->ordered_samples.samples_head);
 
 	if (mode == O_RDONLY) {
 		if (perf_session__open(self, force) < 0)
@@ -351,6 +353,178 @@ static event__swap_op event__swap_ops[] = {
 	[PERF_RECORD_HEADER_MAX]    = NULL,
 };
 
+struct sample_queue {
+	u64			timestamp;
+	struct sample_event	*event;
+	struct list_head	list;
+};
+
+#define FLUSH_PERIOD	(2 * NSEC_PER_SEC)
+
+static void flush_sample_queue(struct perf_session *s,
+			       struct perf_event_ops *ops)
+{
+	struct list_head *head = &s->ordered_samples.samples_head;
+	u64 limit = s->ordered_samples.flush_limit;
+	struct sample_queue *tmp, *iter;
+
+	if (!ops->ordered_samples)
+		return;
+
+	list_for_each_entry_safe(iter, tmp, head, list) {
+		if (iter->timestamp > limit)
+			return;
+
+		if (iter == s->ordered_samples.last_inserted)
+			s->ordered_samples.last_inserted = NULL;
+
+		ops->sample((event_t *)iter->event, s);
+
+		s->ordered_samples.last_flush = iter->timestamp;
+		list_del(&iter->list);
+		free(iter->event);
+		free(iter);
+	}
+}
+
+static void __queue_sample_end(struct sample_queue *new, struct list_head *head)
+{
+	struct sample_queue *iter;
+
+	list_for_each_entry_reverse(iter, head, list) {
+		if (iter->timestamp < new->timestamp) {
+			list_add(&new->list, &iter->list);
+			return;
+		}
+	}
+
+	list_add(&new->list, head);
+}
+
+static void __queue_sample_before(struct sample_queue *new,
+				  struct sample_queue *iter,
+				  struct list_head *head)
+{
+	list_for_each_entry_continue_reverse(iter, head, list) {
+		if (iter->timestamp < new->timestamp) {
+			list_add(&new->list, &iter->list);
+			return;
+		}
+	}
+
+	list_add(&new->list, head);
+}
+
+static void __queue_sample_after(struct sample_queue *new,
+				 struct sample_queue *iter,
+				 struct list_head *head)
+{
+	list_for_each_entry_continue(iter, head, list) {
+		if (iter->timestamp > new->timestamp) {
+			list_add_tail(&new->list, &iter->list);
+			return;
+		}
+	}
+	list_add_tail(&new->list, head);
+}
+
+/* The queue is ordered by time */
+static void __queue_sample_event(struct sample_queue *new,
+				 struct perf_session *s)
+{
+	struct sample_queue *last_inserted = s->ordered_samples.last_inserted;
+	struct list_head *head = &s->ordered_samples.samples_head;
+
+
+	if (!last_inserted) {
+		__queue_sample_end(new, head);
+		return;
+	}
+
+	/*
+	 * Most of the time the current event has a timestamp
+	 * very close to the last event inserted, unless we just switched
+	 * to another event buffer. Having a sorting based on a list and
+	 * on the last inserted event that is close to the current one is
+	 * probably more efficient than an rbtree based sorting.
+	 */
+	if (last_inserted->timestamp >= new->timestamp)
+		__queue_sample_before(new, last_inserted, head);
+	else
+		__queue_sample_after(new, last_inserted, head);
+}
+
+static int queue_sample_event(event_t *event, struct sample_data *data,
+			      struct perf_session *s,
+			      struct perf_event_ops *ops)
+{
+	u64 timestamp = data->time;
+	struct sample_queue *new;
+	u64 flush_limit;
+
+
+	if (s->ordered_samples.flush_limit == ULLONG_MAX)
+		s->ordered_samples.flush_limit = timestamp + FLUSH_PERIOD;
+
+	if (timestamp < s->ordered_samples.last_flush) {
+		printf("Warning: Timestamp below last timeslice flush\n");
+		return -EINVAL;
+	}
+
+	new = malloc(sizeof(*new));
+	if (!new)
+		return -ENOMEM;
+
+	new->timestamp = timestamp;
+
+	new->event = malloc(event->header.size);
+	if (!new->event) {
+		free(new);
+		return -ENOMEM;
+	}
+
+	memcpy(new->event, event, event->header.size);
+
+	__queue_sample_event(new, s);
+	s->ordered_samples.last_inserted = new;
+
+	/*
+	 * We want to have a slice of events covering 2 * FLUSH_PERIOD
+	 * If FLUSH_PERIOD is big enough, it ensures every events that occured
+	 * in the first half of the timeslice have all been buffered and there
+	 * are none remaining (we need that because of the weakly ordered
+	 * event recording we have). Then once we reach the 2 * FLUSH_PERIOD
+	 * timeslice, we flush the first half to be gentle with the memory
+	 * (the second half can still get new events in the middle, so wait
+	 * another period to flush it)
+	 */
+	flush_limit = s->ordered_samples.flush_limit;
+
+	if (new->timestamp > flush_limit &&
+		new->timestamp - flush_limit > FLUSH_PERIOD) {
+		s->ordered_samples.flush_limit += FLUSH_PERIOD;
+		flush_sample_queue(s, ops);
+	}
+
+	return 0;
+}
+
+static int perf_session__process_sample(event_t *event, struct perf_session *s,
+					struct perf_event_ops *ops)
+{
+	struct sample_data data;
+
+	if (!ops->ordered_samples)
+		return ops->sample(event, s);
+
+	bzero(&data, sizeof(struct sample_data));
+	event__parse_sample(event, s->sample_type, &data);
+
+	queue_sample_event(event, &data, s, ops);
+
+	return 0;
+}
+
 static int perf_session__process_event(struct perf_session *self,
 				       event_t *event,
 				       struct perf_event_ops *ops,
@@ -371,7 +545,7 @@ static int perf_session__process_event(struct perf_session *self,
 
 	switch (event->header.type) {
 	case PERF_RECORD_SAMPLE:
-		return ops->sample(event, self);
+		return perf_session__process_sample(event, self, ops);
 	case PERF_RECORD_MMAP:
 		return ops->mmap(event, self);
 	case PERF_RECORD_COMM:
@@ -611,6 +785,9 @@ more:
 		goto more;
 done:
 	err = 0;
+	/* do the final flush for ordered samples */
+	self->ordered_samples.flush_limit = ULLONG_MAX;
+	flush_sample_queue(self, ops);
 out_err:
 	ui_progress__delete(progress);
 	return err;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 5e47c87b926..796e2291ebd 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -8,9 +8,17 @@
 #include <linux/rbtree.h>
 #include "../../../include/linux/perf_event.h"
 
+struct sample_queue;
 struct ip_callchain;
 struct thread;
 
+struct ordered_samples {
+	u64			last_flush;
+	u64			flush_limit;
+	struct list_head	samples_head;
+	struct sample_queue	*last_inserted;
+};
+
 struct perf_session {
 	struct perf_header	header;
 	unsigned long		size;
@@ -28,6 +36,7 @@ struct perf_session {
 	bool			fd_pipe;
 	int			cwdlen;
 	char			*cwd;
+	struct ordered_samples	ordered_samples;
 	char filename[0];
 };
 
@@ -47,6 +56,7 @@ struct perf_event_ops {
 		 event_type,
 		 tracing_data,
 		 build_id;
+	bool	ordered_samples;
 };
 
 struct perf_session *perf_session__new(const char *filename, int mode, bool force);
-- 
cgit v1.2.3


From a64eae703b390185abe1d15fa932b48f04fa7fbb Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 24 Apr 2010 00:29:40 +0200
Subject: perf: Use generic sample reordering in perf sched

Use the new generic sample events reordering from perf sched,
this drops the need of multiplexing the buffers on record time,
improving the scalability of perf sched.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
---
 tools/perf/builtin-sched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 09ddc8e6d8e..94453f1e4e0 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1651,9 +1651,10 @@ static int process_lost_event(event_t *event __used,
 }
 
 static struct perf_event_ops event_ops = {
-	.sample	= process_sample_event,
-	.comm	= event__process_comm,
-	.lost	= process_lost_event,
+	.sample			= process_sample_event,
+	.comm			= event__process_comm,
+	.lost			= process_lost_event,
+	.ordered_samples	= true,
 };
 
 static int read_events(void)
@@ -1850,7 +1851,6 @@ static const char *record_args[] = {
 	"record",
 	"-a",
 	"-R",
-	"-M",
 	"-f",
 	"-m", "1024",
 	"-c", "1",
-- 
cgit v1.2.3


From 587570d4cc3cac80da7d569bee9cea3ca104d60e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 24 Apr 2010 00:34:53 +0200
Subject: perf: Use generic sample reordering in perf kmem

Use the new generic sample events reordering from perf kmem,
this drops the need of multiplexing the buffers on record time,
improving the scalability of perf kmem.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 tools/perf/builtin-kmem.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index db474bbf332..ab906cbd5c7 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -335,8 +335,9 @@ static int process_sample_event(event_t *event, struct perf_session *session)
 }
 
 static struct perf_event_ops event_ops = {
-	.sample	= process_sample_event,
-	.comm	= event__process_comm,
+	.sample			= process_sample_event,
+	.comm			= event__process_comm,
+	.ordered_samples	= true,
 };
 
 static double fragmentation(unsigned long n_req, unsigned long n_alloc)
@@ -730,7 +731,6 @@ static const char *record_args[] = {
 	"record",
 	"-a",
 	"-R",
-	"-M",
 	"-f",
 	"-c", "1",
 	"-e", "kmem:kmalloc",
-- 
cgit v1.2.3


From e0a808c65c23f88e48a5fff48775b90e7919c64f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 24 Apr 2010 00:38:33 +0200
Subject: perf: Use generic sample reordering in perf trace

Use the new generic sample events reordering from perf trace.
Before that, the displayed traces were ordered as they were
in the input as recorded by perf record (not time ordered).

This makes eventually perf trace displaying the events as beeing
time ordered.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
---
 tools/perf/builtin-trace.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 2eefb33c967..1d034f6fa28 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -108,6 +108,7 @@ static struct perf_event_ops event_ops = {
 	.event_type = event__process_event_type,
 	.tracing_data = event__process_tracing_data,
 	.build_id = event__process_build_id,
+	.ordered_samples = true,
 };
 
 extern volatile int session_done;
-- 
cgit v1.2.3


From 9df9bbba9f7e2e4ffdc51bbbfa524b67691321d2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 24 Apr 2010 01:18:48 +0200
Subject: perf: Use generic sample reordering in perf timechart

Use the new generic sample events reordering from perf timechart,
this drops the ad hoc sample reordering it was using before.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
---
 tools/perf/builtin-timechart.c | 112 ++---------------------------------------
 1 file changed, 5 insertions(+), 107 deletions(-)

diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 96f4a092df3..c35aa44f82b 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -143,9 +143,6 @@ struct wake_event {
 static struct power_event    *power_events;
 static struct wake_event     *wake_events;
 
-struct sample_wrapper *all_samples;
-
-
 struct process_filter;
 struct process_filter {
 	char			*name;
@@ -566,88 +563,6 @@ static void end_sample_processing(void)
 	}
 }
 
-static u64 sample_time(event_t *event, const struct perf_session *session)
-{
-	int cursor;
-
-	cursor = 0;
-	if (session->sample_type & PERF_SAMPLE_IP)
-		cursor++;
-	if (session->sample_type & PERF_SAMPLE_TID)
-		cursor++;
-	if (session->sample_type & PERF_SAMPLE_TIME)
-		return event->sample.array[cursor];
-	return 0;
-}
-
-
-/*
- * We first queue all events, sorted backwards by insertion.
- * The order will get flipped later.
- */
-static int queue_sample_event(event_t *event, struct perf_session *session)
-{
-	struct sample_wrapper *copy, *prev;
-	int size;
-
-	size = event->sample.header.size + sizeof(struct sample_wrapper) + 8;
-
-	copy = malloc(size);
-	if (!copy)
-		return 1;
-
-	memset(copy, 0, size);
-
-	copy->next = NULL;
-	copy->timestamp = sample_time(event, session);
-
-	memcpy(&copy->data, event, event->sample.header.size);
-
-	/* insert in the right place in the list */
-
-	if (!all_samples) {
-		/* first sample ever */
-		all_samples = copy;
-		return 0;
-	}
-
-	if (all_samples->timestamp < copy->timestamp) {
-		/* insert at the head of the list */
-		copy->next = all_samples;
-		all_samples = copy;
-		return 0;
-	}
-
-	prev = all_samples;
-	while (prev->next) {
-		if (prev->next->timestamp < copy->timestamp) {
-			copy->next = prev->next;
-			prev->next = copy;
-			return 0;
-		}
-		prev = prev->next;
-	}
-	/* insert at the end of the list */
-	prev->next = copy;
-
-	return 0;
-}
-
-static void sort_queued_samples(void)
-{
-	struct sample_wrapper *cursor, *next;
-
-	cursor = all_samples;
-	all_samples = NULL;
-
-	while (cursor) {
-		next = cursor->next;
-		cursor->next = all_samples;
-		all_samples = cursor;
-		cursor = next;
-	}
-}
-
 /*
  * Sort the pid datastructure
  */
@@ -1011,26 +926,12 @@ static void write_svg_file(const char *filename)
 	svg_close();
 }
 
-static void process_samples(struct perf_session *session)
-{
-	struct sample_wrapper *cursor;
-	event_t *event;
-
-	sort_queued_samples();
-
-	cursor = all_samples;
-	while (cursor) {
-		event = (void *)&cursor->data;
-		cursor = cursor->next;
-		process_sample_event(event, session);
-	}
-}
-
 static struct perf_event_ops event_ops = {
-	.comm	= process_comm_event,
-	.fork	= process_fork_event,
-	.exit	= process_exit_event,
-	.sample	= queue_sample_event,
+	.comm			= process_comm_event,
+	.fork			= process_fork_event,
+	.exit			= process_exit_event,
+	.sample			= process_sample_event,
+	.ordered_samples	= true,
 };
 
 static int __cmd_timechart(void)
@@ -1048,8 +949,6 @@ static int __cmd_timechart(void)
 	if (ret)
 		goto out_delete;
 
-	process_samples(session);
-
 	end_sample_processing();
 
 	sort_pids();
@@ -1072,7 +971,6 @@ static const char *record_args[] = {
 	"record",
 	"-a",
 	"-R",
-	"-M",
 	"-f",
 	"-c", "1",
 	"-e", "power:power_start",
-- 
cgit v1.2.3


From e1889d75aff0c3786bc53aeb7d9eaca0691c19c5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 24 Apr 2010 01:55:09 +0200
Subject: perf: Add a perf trace option to check samples ordering reliability

To ensure sample events time reordering is reliable, add a -d option
to perf trace to check that automatically.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
---
 tools/perf/builtin-trace.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 1d034f6fa28..77f556f7604 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -11,6 +11,8 @@
 
 static char const		*script_name;
 static char const		*generate_script_lang;
+static bool			debug_ordering;
+static u64			last_timestamp;
 
 static int default_start_script(const char *script __unused,
 				int argc __unused,
@@ -87,6 +89,14 @@ static int process_sample_event(event_t *event, struct perf_session *session)
 	}
 
 	if (session->sample_type & PERF_SAMPLE_RAW) {
+		if (debug_ordering) {
+			if (data.time < last_timestamp) {
+				pr_err("Samples misordered, previous: %llu "
+					"this: %llu\n", last_timestamp,
+					data.time);
+			}
+			last_timestamp = data.time;
+		}
 		/*
 		 * FIXME: better resolve from pid from the struct trace_entry
 		 * field, although it should be the same than this perf
@@ -532,6 +542,8 @@ static const struct option options[] = {
 		   "generate perf-trace.xx script in specified language"),
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
+	OPT_BOOLEAN('d', "debug-ordering", &debug_ordering,
+		   "check that samples time ordering is monotonic"),
 
 	OPT_END()
 };
-- 
cgit v1.2.3


From cfadf9d4ac4be940595ab73d3def24c23c8b875f Mon Sep 17 00:00:00 2001
From: William Cohen <wcohen@redhat.com>
Date: Fri, 23 Apr 2010 16:36:21 -0400
Subject: perf: Some perf-kvm documentation edits

asciidoc does not allow the "===" to be longer than the line
above it.
Also fix a couple types and formatting errors.

Signed-off-by: William Cohen <wcohen@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <4BD204C5.9000504@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 tools/perf/Documentation/perf-kvm.txt | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tools/perf/Documentation/perf-kvm.txt b/tools/perf/Documentation/perf-kvm.txt
index 93400a0f17f..d004e19fe6d 100644
--- a/tools/perf/Documentation/perf-kvm.txt
+++ b/tools/perf/Documentation/perf-kvm.txt
@@ -1,5 +1,5 @@
 perf-kvm(1)
-==============
+===========
 
 NAME
 ----
@@ -41,9 +41,9 @@ There are a couple of variants of perf kvm:
 OPTIONS
 -------
 --host=::
-        Collect host side perforamnce profile.
+        Collect host side performance profile.
 --guest=::
-        Collect guest side perforamnce profile.
+        Collect guest side performance profile.
 --guestmount=<path>::
 	Guest os root file system mount directory. Users mounts guest os
         root directories under <path> by a specific filesystem access method,
@@ -64,4 +64,5 @@ OPTIONS
 
 SEE ALSO
 --------
-linkperf:perf-top[1] perf-record[1] perf-report[1] perf-diff[1] perf-buildid-list[1]
+linkperf:perf-top[1], linkperf:perf-record[1], linkperf:perf-report[1],
+linkperf:perf-diff[1], linkperf:perf-buildid-list[1]
-- 
cgit v1.2.3


From 0ab061cd523a7f2dbf1b59aab0542cb0ab2e4633 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 21 Apr 2010 15:56:16 -0400
Subject: perf tools: Initialize dso->node member in dso__new

If dso->node member is not initialized, it causes a segmentation fault when
adding to other lists.

It should be initilized in dso__new().

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: : <20100421195616.24664.89980.stgit@localhost6.localdomain6>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/symbol.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index e782e7db16c..e77c33a11de 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -189,6 +189,7 @@ struct dso *dso__new(const char *name)
 		self->sorted_by_name = 0;
 		self->has_build_id = 0;
 		self->kernel = DSO_TYPE_USER;
+		INIT_LIST_HEAD(&self->node);
 	}
 
 	return self;
-- 
cgit v1.2.3


From 15eca306ec95e164d05457f9f27c722f69af6d18 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 21 Apr 2010 15:56:24 -0400
Subject: perf probe: Fix to use symtab only if no debuginfo

Fix perf probe to use symtab only if there is no debuginfo, because debuginfo
has more information than symtab.

If we can't find a function in debuginfo, we never find it in symtab.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20100421195624.24664.46214.stgit@localhost6.localdomain6>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-event.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 4fb480367c3..5d3baec216e 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -180,15 +180,16 @@ static int try_to_find_kprobe_trace_events(struct perf_probe_event *pev,
 		return -ENOENT;
 	}
 	/* Error path : ntevs < 0 */
-	if (need_dwarf) {
-		if (ntevs == -EBADF)
-			pr_warning("No dwarf info found in the vmlinux - "
-				"please rebuild with CONFIG_DEBUG_INFO=y.\n");
-		return ntevs;
+	pr_debug("An error occurred in debuginfo analysis (%d).\n", ntevs);
+	if (ntevs == -EBADF) {
+		pr_warning("Warning: No dwarf info found in the vmlinux - "
+			"please rebuild kernel with CONFIG_DEBUG_INFO=y.\n");
+		if (!need_dwarf) {
+			pr_debug("Trying to use symbols.\nn");
+			return 0;
+		}
 	}
-	pr_debug("An error occurred in debuginfo analysis."
-		 " Try to use symbols.\n");
-	return 0;
+	return ntevs;
 }
 
 #define LINEBUF_SIZE 256
-- 
cgit v1.2.3


From 5d1ee0413c8e2e0aa48510b1edfb3c4d2d43455b Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 21 Apr 2010 15:56:32 -0400
Subject: perf probe: Fix to exit callback soon after finding too many probe
 points

Fix to exit callback soon after finding too many probe points.
Don't try to continue searching because it already failed.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20100421195632.24664.42598.stgit@localhost6.localdomain6>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/probe-finder.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index e7ee52fd0e0..0d795bc3e1a 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -871,6 +871,8 @@ static int probe_point_inline_cb(Dwarf_Die *in_die, void *data)
 			 (uintmax_t)pf->addr);
 
 		param->retval = convert_probe_point(in_die, pf);
+		if (param->retval < 0)
+			return DWARF_CB_ABORT;
 	}
 
 	return DWARF_CB_OK;
@@ -1106,6 +1108,8 @@ static int line_range_funcdecl_cb(Dwarf_Die *sp_die, void *data)
 		return DWARF_CB_OK;
 
 	param->retval = line_range_add_line(src, lineno, lf->lr);
+	if (param->retval < 0)
+		return DWARF_CB_ABORT;
 	return DWARF_CB_OK;
 }
 
-- 
cgit v1.2.3


From ef4a356574426877d569f8b6579325537eb7909b Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Wed, 21 Apr 2010 15:56:40 -0400
Subject: perf probe: Add --max-probes option

Add --max-probes option to change the maximum limit of
findable probe points per event, since inlined function can be
expanded into thousands of probe points. Default value is 128.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Suggested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20100421195640.24664.62984.stgit@localhost6.localdomain6>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-probe.txt |  3 +++
 tools/perf/builtin-probe.c              |  9 ++++++++-
 tools/perf/util/probe-event.c           | 17 ++++++++++-------
 tools/perf/util/probe-event.h           |  4 ++--
 tools/perf/util/probe-finder.c          | 11 ++++++-----
 tools/perf/util/probe-finder.h          |  6 ++++--
 6 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 63c25d30488..94a258c96a4 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -62,6 +62,9 @@ OPTIONS
 	Dry run. With this option, --add and --del doesn't execute actual
 	adding and removal operations.
 
+--max-probes::
+	Set the maximum number of probe points for an event. Default is 128.
+
 PROBE SYNTAX
 ------------
 Probe points are defined by following syntax.
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index c1e54035e8c..61c6d70732c 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -54,6 +54,7 @@ static struct {
 	struct perf_probe_event events[MAX_PROBES];
 	struct strlist *dellist;
 	struct line_range line_range;
+	int max_probe_points;
 } params;
 
 
@@ -179,6 +180,8 @@ static const struct option options[] = {
 		   "file", "vmlinux pathname"),
 #endif
 	OPT__DRY_RUN(&probe_event_dry_run),
+	OPT_INTEGER('\0', "max-probes", &params.max_probe_points,
+		 "Set how many probe points can be found for a probe."),
 	OPT_END()
 };
 
@@ -200,6 +203,9 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 		}
 	}
 
+	if (params.max_probe_points == 0)
+		params.max_probe_points = MAX_PROBES;
+
 	if ((!params.nevents && !params.dellist && !params.list_events &&
 	     !params.show_lines))
 		usage_with_options(probe_usage, options);
@@ -246,7 +252,8 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
 
 	if (params.nevents) {
 		ret = add_perf_probe_events(params.events, params.nevents,
-					    params.force_add);
+					    params.force_add,
+					    params.max_probe_points);
 		if (ret < 0) {
 			pr_err("  Error: Failed to add events. (%d)\n", ret);
 			return ret;
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 5d3baec216e..9ded38ced23 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -150,7 +150,8 @@ static int convert_to_perf_probe_point(struct kprobe_trace_point *tp,
 
 /* Try to find perf_probe_event with debuginfo */
 static int try_to_find_kprobe_trace_events(struct perf_probe_event *pev,
-					   struct kprobe_trace_event **tevs)
+					   struct kprobe_trace_event **tevs,
+					   int max_tevs)
 {
 	bool need_dwarf = perf_probe_event_need_dwarf(pev);
 	int fd, ntevs;
@@ -166,7 +167,7 @@ static int try_to_find_kprobe_trace_events(struct perf_probe_event *pev,
 	}
 
 	/* Searching trace events corresponding to probe event */
-	ntevs = find_kprobe_trace_events(fd, pev, tevs);
+	ntevs = find_kprobe_trace_events(fd, pev, tevs, max_tevs);
 	close(fd);
 
 	if (ntevs > 0) {	/* Succeeded to find trace events */
@@ -318,7 +319,8 @@ static int convert_to_perf_probe_point(struct kprobe_trace_point *tp,
 }
 
 static int try_to_find_kprobe_trace_events(struct perf_probe_event *pev,
-				struct kprobe_trace_event **tevs __unused)
+				struct kprobe_trace_event **tevs __unused,
+				int max_tevs __unused)
 {
 	if (perf_probe_event_need_dwarf(pev)) {
 		pr_warning("Debuginfo-analysis is not supported.\n");
@@ -1408,14 +1410,15 @@ static int __add_kprobe_trace_events(struct perf_probe_event *pev,
 }
 
 static int convert_to_kprobe_trace_events(struct perf_probe_event *pev,
-					  struct kprobe_trace_event **tevs)
+					  struct kprobe_trace_event **tevs,
+					  int max_tevs)
 {
 	struct symbol *sym;
 	int ret = 0, i;
 	struct kprobe_trace_event *tev;
 
 	/* Convert perf_probe_event with debuginfo */
-	ret = try_to_find_kprobe_trace_events(pev, tevs);
+	ret = try_to_find_kprobe_trace_events(pev, tevs, max_tevs);
 	if (ret != 0)
 		return ret;
 
@@ -1487,7 +1490,7 @@ struct __event_package {
 };
 
 int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
-			  bool force_add)
+			  bool force_add, int max_tevs)
 {
 	int i, j, ret;
 	struct __event_package *pkgs;
@@ -1506,7 +1509,7 @@ int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
 		pkgs[i].pev = &pevs[i];
 		/* Convert with or without debuginfo */
 		ret  = convert_to_kprobe_trace_events(pkgs[i].pev,
-						      &pkgs[i].tevs);
+						      &pkgs[i].tevs, max_tevs);
 		if (ret < 0)
 			goto end;
 		pkgs[i].ntevs = ret;
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index e7ff0d02c0d..e9db1a214ca 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -115,8 +115,8 @@ extern void clear_kprobe_trace_event(struct kprobe_trace_event *tev);
 extern int parse_line_range_desc(const char *cmd, struct line_range *lr);
 
 
-extern int add_perf_probe_events(struct perf_probe_event *pevs, int ntevs,
-				 bool force_add);
+extern int add_perf_probe_events(struct perf_probe_event *pevs, int npevs,
+				 bool force_add, int max_probe_points);
 extern int del_perf_probe_events(struct strlist *dellist);
 extern int show_perf_probe_events(void);
 extern int show_line_range(struct line_range *lr);
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index 0d795bc3e1a..562b1443e78 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -626,8 +626,9 @@ static int convert_probe_point(Dwarf_Die *sp_die, struct probe_finder *pf)
 	Dwarf_Attribute fb_attr;
 	size_t nops;
 
-	if (pf->ntevs == MAX_PROBES) {
-		pr_warning("Too many( > %d) probe point found.\n", MAX_PROBES);
+	if (pf->ntevs == pf->max_tevs) {
+		pr_warning("Too many( > %d) probe point found.\n",
+			   pf->max_tevs);
 		return -ERANGE;
 	}
 	tev = &pf->tevs[pf->ntevs++];
@@ -932,9 +933,9 @@ static int find_probe_point_by_func(struct probe_finder *pf)
 
 /* Find kprobe_trace_events specified by perf_probe_event from debuginfo */
 int find_kprobe_trace_events(int fd, struct perf_probe_event *pev,
-			     struct kprobe_trace_event **tevs)
+			     struct kprobe_trace_event **tevs, int max_tevs)
 {
-	struct probe_finder pf = {.pev = pev};
+	struct probe_finder pf = {.pev = pev, .max_tevs = max_tevs};
 	struct perf_probe_point *pp = &pev->point;
 	Dwarf_Off off, noff;
 	size_t cuhl;
@@ -942,7 +943,7 @@ int find_kprobe_trace_events(int fd, struct perf_probe_event *pev,
 	Dwarf *dbg;
 	int ret = 0;
 
-	pf.tevs = zalloc(sizeof(struct kprobe_trace_event) * MAX_PROBES);
+	pf.tevs = zalloc(sizeof(struct kprobe_trace_event) * max_tevs);
 	if (pf.tevs == NULL)
 		return -ENOMEM;
 	*tevs = pf.tevs;
diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h
index 310ce897229..66f1980e385 100644
--- a/tools/perf/util/probe-finder.h
+++ b/tools/perf/util/probe-finder.h
@@ -18,7 +18,8 @@ static inline int is_c_varname(const char *name)
 #ifdef DWARF_SUPPORT
 /* Find kprobe_trace_events specified by perf_probe_event from debuginfo */
 extern int find_kprobe_trace_events(int fd, struct perf_probe_event *pev,
-				    struct kprobe_trace_event **tevs);
+				    struct kprobe_trace_event **tevs,
+				    int max_tevs);
 
 /* Find a perf_probe_point from debuginfo */
 extern int find_perf_probe_point(int fd, unsigned long addr,
@@ -32,7 +33,8 @@ extern int find_line_range(int fd, struct line_range *lr);
 struct probe_finder {
 	struct perf_probe_event	*pev;		/* Target probe event */
 	struct kprobe_trace_event *tevs;	/* Result trace events */
-	int			ntevs;		/* number of trace events */
+	int			ntevs;		/* Number of trace events */
+	int			max_tevs;	/* Max number of trace events */
 
 	/* For function searching */
 	int			lno;		/* Line number */
-- 
cgit v1.2.3


From f93830fbb06b67848c762f2177c06cc3cbb97deb Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Mon, 26 Apr 2010 15:39:54 -0300
Subject: perf tools: Fix libdw-dev package name in error message

The headers required for DWARF support are provided by the libdw-dev
package in Debian-based distros.  This patch corrects the elfutils-dev
package name to libdw-dev in the Makefile error message when libdw.h is
not found.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <1272292023-9869-1-git-send-email-stefanha@linux.vnet.ibm.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index e8bf2e1ab49..3ac6b677bec 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -504,7 +504,7 @@ PERFLIBS = $(LIB_FILE)
 
 ifndef NO_DWARF
 ifneq ($(shell sh -c "(echo '\#include <dwarf.h>'; echo '\#include <libdw.h>'; echo 'int main(void) { Dwarf *dbg; dbg = dwarf_begin(0, DWARF_C_READ); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -I/usr/include/elfutils -ldw -lelf -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
-	msg := $(warning No libdw.h found or old libdw.h found, disables dwarf support. Please install elfutils-devel/elfutils-dev);
+	msg := $(warning No libdw.h found or old libdw.h found, disables dwarf support. Please install elfutils-devel/libdw-dev);
 	NO_DWARF := 1
 endif # Dwarf support
 endif # NO_DWARF
-- 
cgit v1.2.3


From a5fc4ce01867842f6a9cc317035df3081302bffc Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 24 Apr 2010 07:57:42 -0400
Subject: cifs: track local_nls in volume info

Add a local_nls field to the smb_vol struct and keep a pointer to the
local_nls in it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 58a2109e7b3..eb85dd8d510 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -102,6 +102,7 @@ struct smb_vol {
 	bool sockopt_tcp_nodelay:1;
 	unsigned short int port;
 	char *prepath;
+	struct nls_table *local_nls;
 };
 
 static int ipv4_connect(struct TCP_Server_Info *server);
@@ -2353,20 +2354,20 @@ try_mount_again:
 		goto out;
 	}
 
-
 	/* this is needed for ASCII cp to Unicode converts */
 	if (volume_info->iocharset == NULL) {
-		cifs_sb->local_nls = load_nls_default();
-	/* load_nls_default can not return null */
+		/* load_nls_default cannot return null */
+		volume_info->local_nls = load_nls_default();
 	} else {
-		cifs_sb->local_nls = load_nls(volume_info->iocharset);
-		if (cifs_sb->local_nls == NULL) {
+		volume_info->local_nls = load_nls(volume_info->iocharset);
+		if (volume_info->local_nls == NULL) {
 			cERROR(1, "CIFS mount error: iocharset %s not found",
 				 volume_info->iocharset);
 			rc = -ELIBACC;
 			goto out;
 		}
 	}
+	cifs_sb->local_nls = volume_info->local_nls;
 
 	/* get a reference to a tcp session */
 	srvTcp = cifs_get_tcp_session(volume_info);
-- 
cgit v1.2.3


From 36988c76f007738cad5fe1c873a5fb0cda7eb2f6 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 24 Apr 2010 07:57:43 -0400
Subject: cifs: move SMB session creation code into separate function

...it's mostly part of cifs_mount. Break it out into a separate
function.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 158 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 93 insertions(+), 65 deletions(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index eb85dd8d510..74c222ed83d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1616,6 +1616,7 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 	int xid;
 	struct TCP_Server_Info *server = ses->server;
 
+	cFYI(1, "%s: ses_count=%d\n", __func__, ses->ses_count);
 	write_lock(&cifs_tcp_ses_lock);
 	if (--ses->ses_count > 0) {
 		write_unlock(&cifs_tcp_ses_lock);
@@ -1634,6 +1635,92 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 	cifs_put_tcp_session(server);
 }
 
+static struct cifsSesInfo *
+cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
+{
+	int rc = -ENOMEM, xid;
+	struct cifsSesInfo *ses;
+
+	xid = GetXid();
+
+	ses = cifs_find_smb_ses(server, volume_info->username);
+	if (ses) {
+		cFYI(1, "Existing smb sess found (status=%d)", ses->status);
+
+		/* existing SMB ses has a server reference already */
+		cifs_put_tcp_session(server);
+
+		mutex_lock(&ses->session_mutex);
+		if (ses->need_reconnect) {
+			cFYI(1, "Session needs reconnect");
+			rc = cifs_setup_session(xid, ses,
+						volume_info->local_nls);
+			if (rc) {
+				mutex_unlock(&ses->session_mutex);
+				/* problem -- put our reference */
+				cifs_put_smb_ses(ses);
+				FreeXid(xid);
+				return ERR_PTR(rc);
+			}
+		}
+		mutex_unlock(&ses->session_mutex);
+		FreeXid(xid);
+		return ses;
+	}
+
+	cFYI(1, "Existing smb sess not found");
+	ses = sesInfoAlloc();
+	if (ses == NULL)
+		goto get_ses_fail;
+
+	/* new SMB session uses our server ref */
+	ses->server = server;
+	if (server->addr.sockAddr6.sin6_family == AF_INET6)
+		sprintf(ses->serverName, "%pI6",
+			&server->addr.sockAddr6.sin6_addr);
+	else
+		sprintf(ses->serverName, "%pI4",
+			&server->addr.sockAddr.sin_addr.s_addr);
+
+	if (volume_info->username)
+		strncpy(ses->userName, volume_info->username,
+			MAX_USERNAME_SIZE);
+
+	/* volume_info->password freed at unmount */
+	if (volume_info->password) {
+		ses->password = kstrdup(volume_info->password, GFP_KERNEL);
+		if (!ses->password)
+			goto get_ses_fail;
+	}
+	if (volume_info->domainname) {
+		int len = strlen(volume_info->domainname);
+		ses->domainName = kmalloc(len + 1, GFP_KERNEL);
+		if (ses->domainName)
+			strcpy(ses->domainName, volume_info->domainname);
+	}
+	ses->linux_uid = volume_info->linux_uid;
+	ses->overrideSecFlg = volume_info->secFlg;
+
+	mutex_lock(&ses->session_mutex);
+	rc = cifs_setup_session(xid, ses, volume_info->local_nls);
+	mutex_unlock(&ses->session_mutex);
+	if (rc)
+		goto get_ses_fail;
+
+	/* success, put it on the list */
+	write_lock(&cifs_tcp_ses_lock);
+	list_add(&ses->smb_ses_list, &server->smb_ses_list);
+	write_unlock(&cifs_tcp_ses_lock);
+
+	FreeXid(xid);
+	return ses;
+
+get_ses_fail:
+	sesInfoFree(ses);
+	FreeXid(xid);
+	return ERR_PTR(rc);
+}
+
 static struct cifsTconInfo *
 cifs_find_tcon(struct cifsSesInfo *ses, const char *unc)
 {
@@ -2376,71 +2463,12 @@ try_mount_again:
 		goto out;
 	}
 
-	pSesInfo = cifs_find_smb_ses(srvTcp, volume_info->username);
-	if (pSesInfo) {
-		cFYI(1, "Existing smb sess found (status=%d)",
-			pSesInfo->status);
-		/*
-		 * The existing SMB session already has a reference to srvTcp,
-		 * so we can put back the extra one we got before
-		 */
-		cifs_put_tcp_session(srvTcp);
-
-		mutex_lock(&pSesInfo->session_mutex);
-		if (pSesInfo->need_reconnect) {
-			cFYI(1, "Session needs reconnect");
-			rc = cifs_setup_session(xid, pSesInfo,
-						cifs_sb->local_nls);
-		}
-		mutex_unlock(&pSesInfo->session_mutex);
-	} else if (!rc) {
-		cFYI(1, "Existing smb sess not found");
-		pSesInfo = sesInfoAlloc();
-		if (pSesInfo == NULL) {
-			rc = -ENOMEM;
-			goto mount_fail_check;
-		}
-
-		/* new SMB session uses our srvTcp ref */
-		pSesInfo->server = srvTcp;
-		if (srvTcp->addr.sockAddr6.sin6_family == AF_INET6)
-			sprintf(pSesInfo->serverName, "%pI6",
-				&srvTcp->addr.sockAddr6.sin6_addr);
-		else
-			sprintf(pSesInfo->serverName, "%pI4",
-				&srvTcp->addr.sockAddr.sin_addr.s_addr);
-
-		write_lock(&cifs_tcp_ses_lock);
-		list_add(&pSesInfo->smb_ses_list, &srvTcp->smb_ses_list);
-		write_unlock(&cifs_tcp_ses_lock);
-
-		/* volume_info->password freed at unmount */
-		if (volume_info->password) {
-			pSesInfo->password = kstrdup(volume_info->password,
-						     GFP_KERNEL);
-			if (!pSesInfo->password) {
-				rc = -ENOMEM;
-				goto mount_fail_check;
-			}
-		}
-		if (volume_info->username)
-			strncpy(pSesInfo->userName, volume_info->username,
-				MAX_USERNAME_SIZE);
-		if (volume_info->domainname) {
-			int len = strlen(volume_info->domainname);
-			pSesInfo->domainName = kmalloc(len + 1, GFP_KERNEL);
-			if (pSesInfo->domainName)
-				strcpy(pSesInfo->domainName,
-					volume_info->domainname);
-		}
-		pSesInfo->linux_uid = volume_info->linux_uid;
-		pSesInfo->overrideSecFlg = volume_info->secFlg;
-		mutex_lock(&pSesInfo->session_mutex);
-
-		/* BB FIXME need to pass vol->secFlgs BB */
-		rc = cifs_setup_session(xid, pSesInfo,
-					cifs_sb->local_nls);
-		mutex_unlock(&pSesInfo->session_mutex);
+	/* get a reference to a SMB session */
+	pSesInfo = cifs_get_smb_ses(srvTcp, volume_info);
+	if (IS_ERR(pSesInfo)) {
+		rc = PTR_ERR(pSesInfo);
+		pSesInfo = NULL;
+		goto mount_fail_check;
 	}
 
 	/* search for existing tcon to this server share */
-- 
cgit v1.2.3


From d00c28de55a69d13cf2c6a99efc3e81a5d95af74 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 24 Apr 2010 07:57:44 -0400
Subject: cifs: move tcon find/create into separate function

...and out of cifs_mount.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 156 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 86 insertions(+), 70 deletions(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 74c222ed83d..c104f54cadf 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1749,6 +1749,7 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
 	int xid;
 	struct cifsSesInfo *ses = tcon->ses;
 
+	cFYI(1, "%s: tc_count=%d\n", __func__, tcon->tc_count);
 	write_lock(&cifs_tcp_ses_lock);
 	if (--tcon->tc_count > 0) {
 		write_unlock(&cifs_tcp_ses_lock);
@@ -1766,6 +1767,80 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
 	cifs_put_smb_ses(ses);
 }
 
+static struct cifsTconInfo *
+cifs_get_tcon(struct cifsSesInfo *ses, struct smb_vol *volume_info)
+{
+	int rc, xid;
+	struct cifsTconInfo *tcon;
+
+	tcon = cifs_find_tcon(ses, volume_info->UNC);
+	if (tcon) {
+		cFYI(1, "Found match on UNC path");
+		/* existing tcon already has a reference */
+		cifs_put_smb_ses(ses);
+		if (tcon->seal != volume_info->seal)
+			cERROR(1, "transport encryption setting "
+				   "conflicts with existing tid");
+		return tcon;
+	}
+
+	tcon = tconInfoAlloc();
+	if (tcon == NULL) {
+		rc = -ENOMEM;
+		goto out_fail;
+	}
+
+	tcon->ses = ses;
+	if (volume_info->password) {
+		tcon->password = kstrdup(volume_info->password, GFP_KERNEL);
+		if (!tcon->password) {
+			rc = -ENOMEM;
+			goto out_fail;
+		}
+	}
+
+	if (strchr(volume_info->UNC + 3, '\\') == NULL
+	    && strchr(volume_info->UNC + 3, '/') == NULL) {
+		cERROR(1, "Missing share name");
+		rc = -ENODEV;
+		goto out_fail;
+	}
+
+	/* BB Do we need to wrap session_mutex around
+	 * this TCon call and Unix SetFS as
+	 * we do on SessSetup and reconnect? */
+	xid = GetXid();
+	rc = CIFSTCon(xid, ses, volume_info->UNC, tcon, volume_info->local_nls);
+	FreeXid(xid);
+	cFYI(1, "CIFS Tcon rc = %d", rc);
+	if (rc)
+		goto out_fail;
+
+	if (volume_info->nodfs) {
+		tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
+		cFYI(1, "DFS disabled (%d)", tcon->Flags);
+	}
+	tcon->seal = volume_info->seal;
+	/* we can have only one retry value for a connection
+	   to a share so for resources mounted more than once
+	   to the same server share the last value passed in
+	   for the retry flag is used */
+	tcon->retry = volume_info->retry;
+	tcon->nocase = volume_info->nocase;
+	tcon->local_lease = volume_info->local_lease;
+
+	write_lock(&cifs_tcp_ses_lock);
+	list_add(&tcon->tcon_list, &ses->tcon_list);
+	write_unlock(&cifs_tcp_ses_lock);
+
+	return tcon;
+
+out_fail:
+	tconInfoFree(tcon);
+	return ERR_PTR(rc);
+}
+
+
 int
 get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
 	     const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
@@ -2471,81 +2546,22 @@ try_mount_again:
 		goto mount_fail_check;
 	}
 
-	/* search for existing tcon to this server share */
-	if (!rc) {
-		setup_cifs_sb(volume_info, cifs_sb);
-
-		tcon = cifs_find_tcon(pSesInfo, volume_info->UNC);
-		if (tcon) {
-			cFYI(1, "Found match on UNC path");
-			/* existing tcon already has a reference */
-			cifs_put_smb_ses(pSesInfo);
-			if (tcon->seal != volume_info->seal)
-				cERROR(1, "transport encryption setting "
-					   "conflicts with existing tid");
-		} else {
-			tcon = tconInfoAlloc();
-			if (tcon == NULL) {
-				rc = -ENOMEM;
-				goto mount_fail_check;
-			}
-
-			tcon->ses = pSesInfo;
-			if (volume_info->password) {
-				tcon->password = kstrdup(volume_info->password,
-							 GFP_KERNEL);
-				if (!tcon->password) {
-					rc = -ENOMEM;
-					goto mount_fail_check;
-				}
-			}
-
-			if ((strchr(volume_info->UNC + 3, '\\') == NULL)
-			    && (strchr(volume_info->UNC + 3, '/') == NULL)) {
-				cERROR(1, "Missing share name");
-				rc = -ENODEV;
-				goto mount_fail_check;
-			} else {
-				/* BB Do we need to wrap sesSem around
-				 * this TCon call and Unix SetFS as
-				 * we do on SessSetup and reconnect? */
-				rc = CIFSTCon(xid, pSesInfo, volume_info->UNC,
-					      tcon, cifs_sb->local_nls);
-				cFYI(1, "CIFS Tcon rc = %d", rc);
-				if (volume_info->nodfs) {
-					tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
-					cFYI(1, "DFS disabled (%d)",
-						tcon->Flags);
-				}
-			}
-			if (rc)
-				goto remote_path_check;
-			tcon->seal = volume_info->seal;
-			write_lock(&cifs_tcp_ses_lock);
-			list_add(&tcon->tcon_list, &pSesInfo->tcon_list);
-			write_unlock(&cifs_tcp_ses_lock);
-		}
-
-		/* we can have only one retry value for a connection
-		   to a share so for resources mounted more than once
-		   to the same server share the last value passed in
-		   for the retry flag is used */
-		tcon->retry = volume_info->retry;
-		tcon->nocase = volume_info->nocase;
-		tcon->local_lease = volume_info->local_lease;
-	}
-	if (pSesInfo) {
-		if (pSesInfo->capabilities & CAP_LARGE_FILES)
-			sb->s_maxbytes = MAX_LFS_FILESIZE;
-		else
-			sb->s_maxbytes = MAX_NON_LFS;
-	}
+	setup_cifs_sb(volume_info, cifs_sb);
+	if (pSesInfo->capabilities & CAP_LARGE_FILES)
+		sb->s_maxbytes = MAX_LFS_FILESIZE;
+	else
+		sb->s_maxbytes = MAX_NON_LFS;
 
 	/* BB FIXME fix time_gran to be larger for LANMAN sessions */
 	sb->s_time_gran = 100;
 
-	if (rc)
+	/* search for existing tcon to this server share */
+	tcon = cifs_get_tcon(pSesInfo, volume_info);
+	if (IS_ERR(tcon)) {
+		rc = PTR_ERR(tcon);
+		tcon = NULL;
 		goto remote_path_check;
+	}
 
 	cifs_sb->tcon = tcon;
 
-- 
cgit v1.2.3


From 04912d6a20185473db025061b9b2c81fbdffc48b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 24 Apr 2010 07:57:45 -0400
Subject: cifs: rename "extended_security" to "global_secflags"

...since that more accurately describes what that variable holds.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_debug.c  | 14 +++++++-------
 fs/cifs/cifsencrypt.c |  2 +-
 fs/cifs/cifsfs.c      |  2 +-
 fs/cifs/cifsglob.h    |  2 +-
 fs/cifs/cifssmb.c     |  2 +-
 fs/cifs/connect.c     |  2 +-
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 54951804734..4fce6e61b34 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -716,7 +716,7 @@ static const struct file_operations cifs_multiuser_mount_proc_fops = {
 
 static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
 {
-	seq_printf(m, "0x%x\n", extended_security);
+	seq_printf(m, "0x%x\n", global_secflags);
 	return 0;
 }
 
@@ -744,10 +744,10 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
 		/* single char or single char followed by null */
 		c = flags_string[0];
 		if (c == '0' || c == 'n' || c == 'N') {
-			extended_security = CIFSSEC_DEF; /* default */
+			global_secflags = CIFSSEC_DEF; /* default */
 			return count;
 		} else if (c == '1' || c == 'y' || c == 'Y') {
-			extended_security = CIFSSEC_MAX;
+			global_secflags = CIFSSEC_MAX;
 			return count;
 		} else if (!isdigit(c)) {
 			cERROR(1, "invalid flag %c", c);
@@ -771,12 +771,12 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
 		return -EINVAL;
 	}
 	/* flags look ok - update the global security flags for cifs module */
-	extended_security = flags;
-	if (extended_security & CIFSSEC_MUST_SIGN) {
+	global_secflags = flags;
+	if (global_secflags & CIFSSEC_MUST_SIGN) {
 		/* requiring signing implies signing is allowed */
-		extended_security |= CIFSSEC_MAY_SIGN;
+		global_secflags |= CIFSSEC_MAY_SIGN;
 		cFYI(1, "packet signing now required");
-	} else if ((extended_security & CIFSSEC_MAY_SIGN) == 0) {
+	} else if ((global_secflags & CIFSSEC_MAY_SIGN) == 0) {
 		cFYI(1, "packet signing disabled");
 	}
 	/* BB should we turn on MAY flags for other MUST options? */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 61e41525206..847628dfdc4 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -291,7 +291,7 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
 	if (password)
 		strncpy(password_with_pad, password, CIFS_ENCPWD_SIZE);
 
-	if (!encrypt && extended_security & CIFSSEC_MAY_PLNTXT) {
+	if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) {
 		memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
 		memcpy(lnm_session_key, password_with_pad,
 			CIFS_ENCPWD_SIZE);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 53e794131c2..440e98ca01e 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -61,7 +61,7 @@ unsigned int experimEnabled = 0;
 unsigned int linuxExtEnabled = 1;
 unsigned int lookupCacheEnabled = 1;
 unsigned int multiuser_mount = 0;
-unsigned int extended_security = CIFSSEC_DEF;
+unsigned int global_secflags = CIFSSEC_DEF;
 /* unsigned int ntlmv2_support = 0; */
 unsigned int sign_CIFS_PDUs = 1;
 static const struct super_operations cifs_super_ops;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ecf0ffbe2b6..4a2715b389c 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -717,7 +717,7 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
 GLOBAL_EXTERN unsigned int oplockEnabled;
 GLOBAL_EXTERN unsigned int experimEnabled;
 GLOBAL_EXTERN unsigned int lookupCacheEnabled;
-GLOBAL_EXTERN unsigned int extended_security;	/* if on, session setup sent
+GLOBAL_EXTERN unsigned int global_secflags;	/* if on, session setup sent
 				with more secure ntlmssp2 challenge/resp */
 GLOBAL_EXTERN unsigned int sign_CIFS_PDUs;  /* enable smb packet signing */
 GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 980c38c658b..4ecb361640b 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -372,7 +372,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if (ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL)))
 		secFlags = ses->overrideSecFlg;  /* BB FIXME fix sign flags? */
 	else /* if override flags set only sign/seal OR them with global auth */
-		secFlags = extended_security | ses->overrideSecFlg;
+		secFlags = global_secflags | ses->overrideSecFlg;
 
 	cFYI(1, "secFlags 0x%x", secFlags);
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c104f54cadf..c2793bd1db4 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2748,7 +2748,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 		   by Samba (not sure whether other servers allow
 		   NTLMv2 password here) */
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-		if ((extended_security & CIFSSEC_MAY_LANMAN) &&
+		if ((global_secflags & CIFSSEC_MAY_LANMAN) &&
 		    (ses->server->secType == LANMAN))
 			calc_lanman_hash(tcon->password, ses->server->cryptKey,
 					 ses->server->secMode &
-- 
cgit v1.2.3


From ad6cca6d5d0f713e1987e20ed982cfa9eb16b27e Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Mon, 26 Apr 2010 12:10:06 +0200
Subject: cifs: change && to ||

This is a typo, if pvolume_info were NULL it would oops.

This function is used in clean up and error handling.  The current code
never passes a NULL pvolume_info, but it could pass a NULL *pvolume_info
if the kmalloc() failed.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c2793bd1db4..7a47c7c5c7e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2421,7 +2421,7 @@ cleanup_volume_info(struct smb_vol **pvolume_info)
 {
 	struct smb_vol *volume_info;
 
-	if (!pvolume_info && !*pvolume_info)
+	if (!pvolume_info || !*pvolume_info)
 		return;
 
 	volume_info = *pvolume_info;
-- 
cgit v1.2.3


From 9106b69382912ddc403a307b69bf894a6f3004e4 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Fri, 2 Apr 2010 19:01:20 +0200
Subject: tracing: Add ftrace events for graph tracer

Add ftrace events for graph tracer, so the graph output could be shared
with other tracers.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <1270227683-14631-2-git-send-email-jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_functions_graph.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 669b9c31861..db9e06bb766 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -40,7 +40,7 @@ struct fgraph_data {
 #define TRACE_GRAPH_PRINT_OVERHEAD	0x4
 #define TRACE_GRAPH_PRINT_PROC		0x8
 #define TRACE_GRAPH_PRINT_DURATION	0x10
-#define TRACE_GRAPH_PRINT_ABS_TIME	0X20
+#define TRACE_GRAPH_PRINT_ABS_TIME	0x20
 
 static struct tracer_opt trace_opts[] = {
 	/* Display overruns? (for self-debug purpose) */
@@ -1096,6 +1096,12 @@ print_graph_function(struct trace_iterator *iter)
 	return TRACE_TYPE_HANDLED;
 }
 
+static enum print_line_t
+print_graph_function_event(struct trace_iterator *iter, int flags)
+{
+	return print_graph_function(iter);
+}
+
 static void print_lat_header(struct seq_file *s)
 {
 	static const char spaces[] = "                "	/* 16 spaces */
@@ -1199,6 +1205,16 @@ static void graph_trace_close(struct trace_iterator *iter)
 	}
 }
 
+static struct trace_event graph_trace_entry_event = {
+	.type		= TRACE_GRAPH_ENT,
+	.trace		= print_graph_function_event,
+};
+
+static struct trace_event graph_trace_ret_event = {
+	.type		= TRACE_GRAPH_RET,
+	.trace		= print_graph_function_event,
+};
+
 static struct tracer graph_trace __read_mostly = {
 	.name		= "function_graph",
 	.open		= graph_trace_open,
@@ -1220,6 +1236,16 @@ static __init int init_graph_trace(void)
 {
 	max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
 
+	if (!register_ftrace_event(&graph_trace_entry_event)) {
+		pr_warning("Warning: could not register graph trace events\n");
+		return 1;
+	}
+
+	if (!register_ftrace_event(&graph_trace_ret_event)) {
+		pr_warning("Warning: could not register graph trace events\n");
+		return 1;
+	}
+
 	return register_tracer(&graph_trace);
 }
 
-- 
cgit v1.2.3


From d7a8d9e907cc294ec7a4a7046d1886375fbcc82e Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Fri, 2 Apr 2010 19:01:21 +0200
Subject: tracing: Have graph flags passed in to ouput functions

Let the function graph tracer have custom flags passed to its
output functions.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <1270227683-14631-3-git-send-email-jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h                 |   6 +-
 kernel/trace/trace_functions_graph.c | 123 ++++++++++++++++++++---------------
 2 files changed, 73 insertions(+), 56 deletions(-)

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2825ef2c0b1..970004c5fa7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -491,7 +491,9 @@ extern int trace_clock_id;
 
 /* Standard output formatting function used for function return traces */
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+extern enum print_line_t
+print_graph_function_flags(struct trace_iterator *iter, u32 flags);
+extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
 extern enum print_line_t
 trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
 
@@ -524,7 +526,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #else /* CONFIG_FUNCTION_GRAPH_TRACER */
 static inline enum print_line_t
-print_graph_function(struct trace_iterator *iter)
+print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 {
 	return TRACE_TYPE_UNHANDLED;
 }
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index db9e06bb766..de5f6518aba 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -527,17 +527,18 @@ get_return_for_leaf(struct trace_iterator *iter,
 
 /* Signal a overhead of time execution to the output */
 static int
-print_graph_overhead(unsigned long long duration, struct trace_seq *s)
+print_graph_overhead(unsigned long long duration, struct trace_seq *s,
+		     u32 flags)
 {
 	/* If duration disappear, we don't need anything */
-	if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION))
+	if (!(flags & TRACE_GRAPH_PRINT_DURATION))
 		return 1;
 
 	/* Non nested entry or return */
 	if (duration == -1)
 		return trace_seq_printf(s, "  ");
 
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+	if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
 		/* Duration exceeded 100 msecs */
 		if (duration > 100000ULL)
 			return trace_seq_printf(s, "! ");
@@ -563,7 +564,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
 
 static enum print_line_t
 print_graph_irq(struct trace_iterator *iter, unsigned long addr,
-		enum trace_type type, int cpu, pid_t pid)
+		enum trace_type type, int cpu, pid_t pid, u32 flags)
 {
 	int ret;
 	struct trace_seq *s = &iter->seq;
@@ -573,21 +574,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
 		return TRACE_TYPE_UNHANDLED;
 
 	/* Absolute time */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+	if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
 		ret = print_graph_abs_time(iter->ts, s);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
 	/* Cpu */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+	if (flags & TRACE_GRAPH_PRINT_CPU) {
 		ret = print_graph_cpu(s, cpu);
 		if (ret == TRACE_TYPE_PARTIAL_LINE)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
 	/* Proc */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+	if (flags & TRACE_GRAPH_PRINT_PROC) {
 		ret = print_graph_proc(s, pid);
 		if (ret == TRACE_TYPE_PARTIAL_LINE)
 			return TRACE_TYPE_PARTIAL_LINE;
@@ -597,7 +598,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
 	}
 
 	/* No overhead */
-	ret = print_graph_overhead(-1, s);
+	ret = print_graph_overhead(-1, s, flags);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -610,7 +611,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Don't close the duration column if haven't one */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+	if (flags & TRACE_GRAPH_PRINT_DURATION)
 		trace_seq_printf(s, " |");
 	ret = trace_seq_printf(s, "\n");
 
@@ -680,7 +681,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
 static enum print_line_t
 print_graph_entry_leaf(struct trace_iterator *iter,
 		struct ftrace_graph_ent_entry *entry,
-		struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s)
+		struct ftrace_graph_ret_entry *ret_entry,
+		struct trace_seq *s, u32 flags)
 {
 	struct fgraph_data *data = iter->private;
 	struct ftrace_graph_ret *graph_ret;
@@ -712,12 +714,12 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 	}
 
 	/* Overhead */
-	ret = print_graph_overhead(duration, s);
+	ret = print_graph_overhead(duration, s, flags);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Duration */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+	if (flags & TRACE_GRAPH_PRINT_DURATION) {
 		ret = print_graph_duration(duration, s);
 		if (ret == TRACE_TYPE_PARTIAL_LINE)
 			return TRACE_TYPE_PARTIAL_LINE;
@@ -740,7 +742,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 static enum print_line_t
 print_graph_entry_nested(struct trace_iterator *iter,
 			 struct ftrace_graph_ent_entry *entry,
-			 struct trace_seq *s, int cpu)
+			 struct trace_seq *s, int cpu, u32 flags)
 {
 	struct ftrace_graph_ent *call = &entry->graph_ent;
 	struct fgraph_data *data = iter->private;
@@ -760,12 +762,12 @@ print_graph_entry_nested(struct trace_iterator *iter,
 	}
 
 	/* No overhead */
-	ret = print_graph_overhead(-1, s);
+	ret = print_graph_overhead(-1, s, flags);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* No time */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+	if (flags & TRACE_GRAPH_PRINT_DURATION) {
 		ret = trace_seq_printf(s, "            |  ");
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
@@ -791,7 +793,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
 
 static enum print_line_t
 print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
-		     int type, unsigned long addr)
+		     int type, unsigned long addr, u32 flags)
 {
 	struct fgraph_data *data = iter->private;
 	struct trace_entry *ent = iter->ent;
@@ -804,27 +806,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
 
 	if (type) {
 		/* Interrupt */
-		ret = print_graph_irq(iter, addr, type, cpu, ent->pid);
+		ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
 		if (ret == TRACE_TYPE_PARTIAL_LINE)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
 	/* Absolute time */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+	if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
 		ret = print_graph_abs_time(iter->ts, s);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
 	/* Cpu */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+	if (flags & TRACE_GRAPH_PRINT_CPU) {
 		ret = print_graph_cpu(s, cpu);
 		if (ret == TRACE_TYPE_PARTIAL_LINE)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
 	/* Proc */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+	if (flags & TRACE_GRAPH_PRINT_PROC) {
 		ret = print_graph_proc(s, ent->pid);
 		if (ret == TRACE_TYPE_PARTIAL_LINE)
 			return TRACE_TYPE_PARTIAL_LINE;
@@ -846,7 +848,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
 
 static enum print_line_t
 print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
-			struct trace_iterator *iter)
+			struct trace_iterator *iter, u32 flags)
 {
 	struct fgraph_data *data = iter->private;
 	struct ftrace_graph_ent *call = &field->graph_ent;
@@ -854,14 +856,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 	static enum print_line_t ret;
 	int cpu = iter->cpu;
 
-	if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
+	if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	leaf_ret = get_return_for_leaf(iter, field);
 	if (leaf_ret)
-		ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
+		ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
 	else
-		ret = print_graph_entry_nested(iter, field, s, cpu);
+		ret = print_graph_entry_nested(iter, field, s, cpu, flags);
 
 	if (data) {
 		/*
@@ -880,7 +882,8 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 
 static enum print_line_t
 print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
-		   struct trace_entry *ent, struct trace_iterator *iter)
+		   struct trace_entry *ent, struct trace_iterator *iter,
+		   u32 flags)
 {
 	unsigned long long duration = trace->rettime - trace->calltime;
 	struct fgraph_data *data = iter->private;
@@ -910,16 +913,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 		}
 	}
 
-	if (print_graph_prologue(iter, s, 0, 0))
+	if (print_graph_prologue(iter, s, 0, 0, flags))
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Overhead */
-	ret = print_graph_overhead(duration, s);
+	ret = print_graph_overhead(duration, s, flags);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Duration */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+	if (flags & TRACE_GRAPH_PRINT_DURATION) {
 		ret = print_graph_duration(duration, s);
 		if (ret == TRACE_TYPE_PARTIAL_LINE)
 			return TRACE_TYPE_PARTIAL_LINE;
@@ -949,14 +952,15 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 	}
 
 	/* Overrun */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
+	if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
 		ret = trace_seq_printf(s, " (Overruns: %lu)\n",
 					trace->overrun);
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-	ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid);
+	ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
+			      cpu, pid, flags);
 	if (ret == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -964,8 +968,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 }
 
 static enum print_line_t
-print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
-		    struct trace_iterator *iter)
+print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
+		    struct trace_iterator *iter, u32 flags)
 {
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct fgraph_data *data = iter->private;
@@ -977,16 +981,16 @@ print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
 	if (data)
 		depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
 
-	if (print_graph_prologue(iter, s, 0, 0))
+	if (print_graph_prologue(iter, s, 0, 0, flags))
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* No overhead */
-	ret = print_graph_overhead(-1, s);
+	ret = print_graph_overhead(-1, s, flags);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* No time */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+	if (flags & TRACE_GRAPH_PRINT_DURATION) {
 		ret = trace_seq_printf(s, "            |  ");
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
@@ -1041,7 +1045,7 @@ print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
 
 
 enum print_line_t
-print_graph_function(struct trace_iterator *iter)
+print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 {
 	struct ftrace_graph_ent_entry *field;
 	struct fgraph_data *data = iter->private;
@@ -1062,7 +1066,7 @@ print_graph_function(struct trace_iterator *iter)
 	if (data && data->failed) {
 		field = &data->ent;
 		iter->cpu = data->cpu;
-		ret = print_graph_entry(field, s, iter);
+		ret = print_graph_entry(field, s, iter, flags);
 		if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
 			per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
 			ret = TRACE_TYPE_NO_CONSUME;
@@ -1082,38 +1086,44 @@ print_graph_function(struct trace_iterator *iter)
 		struct ftrace_graph_ent_entry saved;
 		trace_assign_type(field, entry);
 		saved = *field;
-		return print_graph_entry(&saved, s, iter);
+		return print_graph_entry(&saved, s, iter, flags);
 	}
 	case TRACE_GRAPH_RET: {
 		struct ftrace_graph_ret_entry *field;
 		trace_assign_type(field, entry);
-		return print_graph_return(&field->ret, s, entry, iter);
+		return print_graph_return(&field->ret, s, entry, iter, flags);
 	}
 	default:
-		return print_graph_comment(s, entry, iter);
+		return print_graph_comment(s, entry, iter, flags);
 	}
 
 	return TRACE_TYPE_HANDLED;
 }
 
+static enum print_line_t
+print_graph_function(struct trace_iterator *iter)
+{
+	return print_graph_function_flags(iter, tracer_flags.val);
+}
+
 static enum print_line_t
 print_graph_function_event(struct trace_iterator *iter, int flags)
 {
 	return print_graph_function(iter);
 }
 
-static void print_lat_header(struct seq_file *s)
+static void print_lat_header(struct seq_file *s, u32 flags)
 {
 	static const char spaces[] = "                "	/* 16 spaces */
 		"    "					/* 4 spaces */
 		"                 ";			/* 17 spaces */
 	int size = 0;
 
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+	if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
 		size += 16;
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
+	if (flags & TRACE_GRAPH_PRINT_CPU)
 		size += 4;
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
+	if (flags & TRACE_GRAPH_PRINT_PROC)
 		size += 17;
 
 	seq_printf(s, "#%.*s  _-----=> irqs-off        \n", size, spaces);
@@ -1124,42 +1134,47 @@ static void print_lat_header(struct seq_file *s)
 	seq_printf(s, "#%.*s|||| /                     \n", size, spaces);
 }
 
-static void print_graph_headers(struct seq_file *s)
+void print_graph_headers_flags(struct seq_file *s, u32 flags)
 {
 	int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
 
 	if (lat)
-		print_lat_header(s);
+		print_lat_header(s, flags);
 
 	/* 1st line */
 	seq_printf(s, "#");
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+	if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
 		seq_printf(s, "     TIME       ");
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
+	if (flags & TRACE_GRAPH_PRINT_CPU)
 		seq_printf(s, " CPU");
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
+	if (flags & TRACE_GRAPH_PRINT_PROC)
 		seq_printf(s, "  TASK/PID       ");
 	if (lat)
 		seq_printf(s, "|||||");
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+	if (flags & TRACE_GRAPH_PRINT_DURATION)
 		seq_printf(s, "  DURATION   ");
 	seq_printf(s, "               FUNCTION CALLS\n");
 
 	/* 2nd line */
 	seq_printf(s, "#");
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+	if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
 		seq_printf(s, "      |         ");
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
+	if (flags & TRACE_GRAPH_PRINT_CPU)
 		seq_printf(s, " |  ");
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
+	if (flags & TRACE_GRAPH_PRINT_PROC)
 		seq_printf(s, "   |    |        ");
 	if (lat)
 		seq_printf(s, "|||||");
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+	if (flags & TRACE_GRAPH_PRINT_DURATION)
 		seq_printf(s, "   |   |      ");
 	seq_printf(s, "               |   |   |   |\n");
 }
 
+static void print_graph_headers(struct seq_file *s)
+{
+	print_graph_headers_flags(s, tracer_flags.val);
+}
+
 static void graph_trace_open(struct trace_iterator *iter)
 {
 	/* pid and depth on the last trace processed */
-- 
cgit v1.2.3


From b8bc1389b74c2b66255651a6fcfae56c78b6e63f Mon Sep 17 00:00:00 2001
From: Alessio Igor Bogani <abogani@texware.it>
Date: Sun, 25 Apr 2010 13:18:48 +0200
Subject: ptrace: Cleanup useless header

BKL isn't present anymore into this file thus we can safely remove
smp_lock.h inclusion.

Signed-off-by: Alessio Igor Bogani <abogani@texware.it>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: James Morris <jmorris@namei.org>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/ptrace.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 53575020f82..2f0f50b450a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -14,7 +14,6 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
 #include <linux/ptrace.h>
 #include <linux/security.h>
 #include <linux/signal.h>
-- 
cgit v1.2.3


From 9bf67e516f16d31f86aa6f063576a959bbf19990 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 24 Apr 2010 07:57:46 -0400
Subject: cifs: save the dialect chosen by server

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h |  1 +
 fs/cifs/cifssmb.c  | 11 +++++------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4a2715b389c..c412568b4a1 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -185,6 +185,7 @@ struct TCP_Server_Info {
 	struct mac_key mac_signing_key;
 	char ntlmv2_hash[16];
 	unsigned long lstrp; /* when we got last response from this server */
+	u16 dialect; /* dialect index that server chose */
 };
 
 /*
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 4ecb361640b..1372253a060 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -355,7 +355,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	struct TCP_Server_Info *server;
 	u16 count;
 	unsigned int secFlags;
-	u16 dialect;
 
 	if (ses->server)
 		server = ses->server;
@@ -408,10 +407,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if (rc != 0)
 		goto neg_err_exit;
 
-	dialect = le16_to_cpu(pSMBr->DialectIndex);
-	cFYI(1, "Dialect: %d", dialect);
+	server->dialect = le16_to_cpu(pSMBr->DialectIndex);
+	cFYI(1, "Dialect: %d", server->dialect);
 	/* Check wct = 1 error case */
-	if ((pSMBr->hdr.WordCount < 13) || (dialect == BAD_PROT)) {
+	if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) {
 		/* core returns wct = 1, but we do not ask for core - otherwise
 		small wct just comes when dialect index is -1 indicating we
 		could not negotiate a common dialect */
@@ -419,8 +418,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		goto neg_err_exit;
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	} else if ((pSMBr->hdr.WordCount == 13)
-			&& ((dialect == LANMAN_PROT)
-				|| (dialect == LANMAN2_PROT))) {
+			&& ((server->dialect == LANMAN_PROT)
+				|| (server->dialect == LANMAN2_PROT))) {
 		__s16 tmp;
 		struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr;
 
-- 
cgit v1.2.3


From d54ff73259a852d4b3886dc586587fdef5e9c8de Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 27 Apr 2010 04:38:15 +0000
Subject: [CIFS] Fix lease break for writes

On lease break we were breaking to readonly leases always
even if write requested.  Also removed experimental
ifdef around setlease code

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c | 12 ------------
 fs/cifs/cifsfs.h |  2 +-
 fs/cifs/file.c   |  6 ++----
 3 files changed, 3 insertions(+), 17 deletions(-)

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 440e98ca01e..80a93562b47 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -645,7 +645,6 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 	return generic_file_llseek_unlocked(file, offset, origin);
 }
 
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 {
 	/* note that this is called by vfs setlease with the BKL held
@@ -674,7 +673,6 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 	else
 		return -EAGAIN;
 }
-#endif
 
 struct file_system_type cifs_fs_type = {
 	.owner = THIS_MODULE,
@@ -751,10 +749,7 @@ const struct file_operations cifs_file_ops = {
 #ifdef CONFIG_CIFS_POSIX
 	.unlocked_ioctl	= cifs_ioctl,
 #endif /* CONFIG_CIFS_POSIX */
-
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 	.setlease = cifs_setlease,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
 const struct file_operations cifs_file_direct_ops = {
@@ -773,9 +768,7 @@ const struct file_operations cifs_file_direct_ops = {
 	.unlocked_ioctl  = cifs_ioctl,
 #endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 	.setlease = cifs_setlease,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 const struct file_operations cifs_file_nobrl_ops = {
 	.read = do_sync_read,
@@ -792,10 +785,7 @@ const struct file_operations cifs_file_nobrl_ops = {
 #ifdef CONFIG_CIFS_POSIX
 	.unlocked_ioctl	= cifs_ioctl,
 #endif /* CONFIG_CIFS_POSIX */
-
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 	.setlease = cifs_setlease,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
 const struct file_operations cifs_file_direct_nobrl_ops = {
@@ -813,9 +803,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 	.unlocked_ioctl  = cifs_ioctl,
 #endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_EXPERIMENTAL
 	.setlease = cifs_setlease,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
 
 const struct file_operations cifs_dir_ops = {
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index dac76022c5f..0242ff9cbf4 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -114,5 +114,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.63"
+#define CIFS_VERSION   "1.64"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 5f1f7682256..d53d6308bf3 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2306,12 +2306,10 @@ cifs_oplock_break(struct slow_work *work)
 	int rc, waitrc = 0;
 
 	if (inode && S_ISREG(inode->i_mode)) {
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-		if (cinode->clientCanCacheAll == 0)
+		if (cinode->clientCanCacheRead)
 			break_lease(inode, O_RDONLY);
-		else if (cinode->clientCanCacheRead == 0)
+		else
 			break_lease(inode, O_WRONLY);
-#endif
 		rc = filemap_fdatawrite(inode->i_mapping);
 		if (cinode->clientCanCacheRead == 0) {
 			waitrc = filemap_fdatawait(inode->i_mapping);
-- 
cgit v1.2.3


From 62b915f1060996a8e1f69be50e3b8e9e43b710cb Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Fri, 2 Apr 2010 19:01:22 +0200
Subject: tracing: Add graph output support for irqsoff tracer

Add function graph output to irqsoff tracer.

The graph output is enabled by setting new 'display-graph' trace option.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <1270227683-14631-4-git-send-email-jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h               |  15 +-
 kernel/trace/trace.c                 |  35 +++--
 kernel/trace/trace.h                 |  21 +++
 kernel/trace/trace_functions_graph.c |  15 +-
 kernel/trace/trace_irqsoff.c         | 271 +++++++++++++++++++++++++++++++++--
 5 files changed, 324 insertions(+), 33 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index ea5b1aae0e8..8415a522f43 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -352,6 +352,10 @@ struct ftrace_graph_ret {
 	int depth;
 };
 
+/* Type of the callback handlers for tracing function graph*/
+typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
+typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
 /* for init task */
@@ -400,10 +404,6 @@ extern char __irqentry_text_end[];
 
 #define FTRACE_RETFUNC_DEPTH 50
 #define FTRACE_RETSTACK_ALLOC_SIZE 32
-/* Type of the callback handlers for tracing function graph*/
-typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
-typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
-
 extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 				trace_func_graph_ent_t entryfunc);
 
@@ -441,6 +441,13 @@ static inline void unpause_graph_tracing(void)
 static inline void ftrace_graph_init_task(struct task_struct *t) { }
 static inline void ftrace_graph_exit_task(struct task_struct *t) { }
 
+static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc,
+			  trace_func_graph_ent_t entryfunc)
+{
+	return -1;
+}
+static inline void unregister_ftrace_graph(void) { }
+
 static inline int task_curr_ret_stack(struct task_struct *tsk)
 {
 	return -1;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7b516c7ef9a..8b9ba41ec14 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1808,7 +1808,7 @@ static void print_func_help_header(struct seq_file *m)
 }
 
 
-static void
+void
 print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 {
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -2017,7 +2017,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 	return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
 }
 
-static int trace_empty(struct trace_iterator *iter)
+int trace_empty(struct trace_iterator *iter)
 {
 	int cpu;
 
@@ -2084,6 +2084,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 	return print_trace_fmt(iter);
 }
 
+void trace_default_header(struct seq_file *m)
+{
+	struct trace_iterator *iter = m->private;
+
+	if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+		/* print nothing if the buffers are empty */
+		if (trace_empty(iter))
+			return;
+		print_trace_header(m, iter);
+		if (!(trace_flags & TRACE_ITER_VERBOSE))
+			print_lat_help_header(m);
+	} else {
+		if (!(trace_flags & TRACE_ITER_VERBOSE))
+			print_func_help_header(m);
+	}
+}
+
 static int s_show(struct seq_file *m, void *v)
 {
 	struct trace_iterator *iter = v;
@@ -2096,17 +2113,9 @@ static int s_show(struct seq_file *m, void *v)
 		}
 		if (iter->trace && iter->trace->print_header)
 			iter->trace->print_header(m);
-		else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
-			/* print nothing if the buffers are empty */
-			if (trace_empty(iter))
-				return 0;
-			print_trace_header(m, iter);
-			if (!(trace_flags & TRACE_ITER_VERBOSE))
-				print_lat_help_header(m);
-		} else {
-			if (!(trace_flags & TRACE_ITER_VERBOSE))
-				print_func_help_header(m);
-		}
+		else
+			trace_default_header(m);
+
 	} else if (iter->leftover) {
 		/*
 		 * If we filled the seq_file buffer earlier, we
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 970004c5fa7..911e9864e94 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -378,6 +378,9 @@ void trace_function(struct trace_array *tr,
 		    unsigned long ip,
 		    unsigned long parent_ip,
 		    unsigned long flags, int pc);
+void trace_default_header(struct seq_file *m);
+void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
+int trace_empty(struct trace_iterator *iter);
 
 void trace_graph_return(struct ftrace_graph_ret *trace);
 int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -491,11 +494,29 @@ extern int trace_clock_id;
 
 /* Standard output formatting function used for function return traces */
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+/* Flag options */
+#define TRACE_GRAPH_PRINT_OVERRUN       0x1
+#define TRACE_GRAPH_PRINT_CPU           0x2
+#define TRACE_GRAPH_PRINT_OVERHEAD      0x4
+#define TRACE_GRAPH_PRINT_PROC          0x8
+#define TRACE_GRAPH_PRINT_DURATION      0x10
+#define TRACE_GRAPH_PRINT_ABS_TIME      0x20
+
 extern enum print_line_t
 print_graph_function_flags(struct trace_iterator *iter, u32 flags);
 extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
 extern enum print_line_t
 trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
+extern void graph_trace_open(struct trace_iterator *iter);
+extern void graph_trace_close(struct trace_iterator *iter);
+extern int __trace_graph_entry(struct trace_array *tr,
+			       struct ftrace_graph_ent *trace,
+			       unsigned long flags, int pc);
+extern void __trace_graph_return(struct trace_array *tr,
+				 struct ftrace_graph_ret *trace,
+				 unsigned long flags, int pc);
+
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 /* TODO: make this variable */
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index de5f6518aba..dd11c830eb8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -179,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
 	return ret;
 }
 
-static int __trace_graph_entry(struct trace_array *tr,
+int __trace_graph_entry(struct trace_array *tr,
 				struct ftrace_graph_ent *trace,
 				unsigned long flags,
 				int pc)
@@ -246,7 +246,7 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
 		return trace_graph_entry(trace);
 }
 
-static void __trace_graph_return(struct trace_array *tr,
+void __trace_graph_return(struct trace_array *tr,
 				struct ftrace_graph_ret *trace,
 				unsigned long flags,
 				int pc)
@@ -1093,6 +1093,11 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
 		trace_assign_type(field, entry);
 		return print_graph_return(&field->ret, s, entry, iter, flags);
 	}
+	case TRACE_STACK:
+	case TRACE_FN:
+		/* dont trace stack and functions as comments */
+		return TRACE_TYPE_UNHANDLED;
+
 	default:
 		return print_graph_comment(s, entry, iter, flags);
 	}
@@ -1170,12 +1175,12 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
 	seq_printf(s, "               |   |   |   |\n");
 }
 
-static void print_graph_headers(struct seq_file *s)
+void print_graph_headers(struct seq_file *s)
 {
 	print_graph_headers_flags(s, tracer_flags.val);
 }
 
-static void graph_trace_open(struct trace_iterator *iter)
+void graph_trace_open(struct trace_iterator *iter)
 {
 	/* pid and depth on the last trace processed */
 	struct fgraph_data *data;
@@ -1210,7 +1215,7 @@ static void graph_trace_open(struct trace_iterator *iter)
 	pr_warning("function graph tracer: not enough memory\n");
 }
 
-static void graph_trace_close(struct trace_iterator *iter)
+void graph_trace_close(struct trace_iterator *iter)
 {
 	struct fgraph_data *data = iter->private;
 
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2974bc7538c..6fd486e0cef 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -34,6 +34,9 @@ static int trace_type __read_mostly;
 
 static int save_lat_flag;
 
+static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
+static int start_irqsoff_tracer(struct trace_array *tr, int graph);
+
 #ifdef CONFIG_PREEMPT_TRACER
 static inline int
 preempt_trace(void)
@@ -55,6 +58,23 @@ irq_trace(void)
 # define irq_trace() (0)
 #endif
 
+#define TRACE_DISPLAY_GRAPH	1
+
+static struct tracer_opt trace_opts[] = {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	/* display latency trace as call graph */
+	{ TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+#endif
+	{ } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+	.val  = 0,
+	.opts = trace_opts,
+};
+
+#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
+
 /*
  * Sequence count - we record it when starting a measurement and
  * skip the latency if the sequence has changed - some other section
@@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly =
 };
 #endif /* CONFIG_FUNCTION_TRACER */
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
+{
+	int cpu;
+
+	if (!(bit & TRACE_DISPLAY_GRAPH))
+		return -EINVAL;
+
+	if (!(is_graph() ^ set))
+		return 0;
+
+	stop_irqsoff_tracer(irqsoff_trace, !set);
+
+	for_each_possible_cpu(cpu)
+		per_cpu(tracing_cpu, cpu) = 0;
+
+	tracing_max_latency = 0;
+	tracing_reset_online_cpus(irqsoff_trace);
+
+	return start_irqsoff_tracer(irqsoff_trace, set);
+}
+
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+{
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int ret;
+	int cpu;
+	int pc;
+
+	cpu = raw_smp_processor_id();
+	if (likely(!per_cpu(tracing_cpu, cpu)))
+		return 0;
+
+	local_save_flags(flags);
+	/* slight chance to get a false positive on tracing_cpu */
+	if (!irqs_disabled_flags(flags))
+		return 0;
+
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1)) {
+		pc = preempt_count();
+		ret = __trace_graph_entry(tr, trace, flags, pc);
+	} else
+		ret = 0;
+
+	atomic_dec(&data->disabled);
+	return ret;
+}
+
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
+{
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+	int pc;
+
+	cpu = raw_smp_processor_id();
+	if (likely(!per_cpu(tracing_cpu, cpu)))
+		return;
+
+	local_save_flags(flags);
+	/* slight chance to get a false positive on tracing_cpu */
+	if (!irqs_disabled_flags(flags))
+		return;
+
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1)) {
+		pc = preempt_count();
+		__trace_graph_return(tr, trace, flags, pc);
+	}
+
+	atomic_dec(&data->disabled);
+}
+
+static void irqsoff_trace_open(struct trace_iterator *iter)
+{
+	if (is_graph())
+		graph_trace_open(iter);
+
+}
+
+static void irqsoff_trace_close(struct trace_iterator *iter)
+{
+	if (iter->private)
+		graph_trace_close(iter);
+}
+
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
+			    TRACE_GRAPH_PRINT_PROC)
+
+static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
+{
+	u32 flags = GRAPH_TRACER_FLAGS;
+
+	if (trace_flags & TRACE_ITER_LATENCY_FMT)
+		flags |= TRACE_GRAPH_PRINT_DURATION;
+	else
+		flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+	/*
+	 * In graph mode call the graph tracer output function,
+	 * otherwise go with the TRACE_FN event handler
+	 */
+	if (is_graph())
+		return print_graph_function_flags(iter, flags);
+
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static void irqsoff_print_header(struct seq_file *s)
+{
+	if (is_graph()) {
+		struct trace_iterator *iter = s->private;
+		u32 flags = GRAPH_TRACER_FLAGS;
+
+		if (trace_flags & TRACE_ITER_LATENCY_FMT) {
+			/* print nothing if the buffers are empty */
+			if (trace_empty(iter))
+				return;
+
+			print_trace_header(s, iter);
+			flags |= TRACE_GRAPH_PRINT_DURATION;
+		} else
+			flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+
+		print_graph_headers_flags(s, flags);
+	} else
+		trace_default_header(s);
+}
+
+static void
+trace_graph_function(struct trace_array *tr,
+		 unsigned long ip, unsigned long flags, int pc)
+{
+	u64 time = trace_clock_local();
+	struct ftrace_graph_ent ent = {
+		.func  = ip,
+		.depth = 0,
+	};
+	struct ftrace_graph_ret ret = {
+		.func     = ip,
+		.depth    = 0,
+		.calltime = time,
+		.rettime  = time,
+	};
+
+	__trace_graph_entry(tr, &ent, flags, pc);
+	__trace_graph_return(tr, &ret, flags, pc);
+}
+
+static void
+__trace_function(struct trace_array *tr,
+		 unsigned long ip, unsigned long parent_ip,
+		 unsigned long flags, int pc)
+{
+	if (!is_graph())
+		trace_function(tr, ip, parent_ip, flags, pc);
+	else {
+		trace_graph_function(tr, parent_ip, flags, pc);
+		trace_graph_function(tr, ip, flags, pc);
+	}
+}
+
+#else
+#define __trace_function trace_function
+
+static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
+{
+	return -EINVAL;
+}
+
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+{
+	return -1;
+}
+
+static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
+{
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
+static void irqsoff_print_header(struct seq_file *s) { }
+static void irqsoff_trace_open(struct trace_iterator *iter) { }
+static void irqsoff_trace_close(struct trace_iterator *iter) { }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 /*
  * Should this new latency be reported/recorded?
  */
@@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr,
 	if (!report_latency(delta))
 		goto out_unlock;
 
-	trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+	__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
 	/* Skip 5 functions to get to the irq/preempt enable function */
 	__trace_stack(tr, flags, 5, pc);
 
@@ -172,7 +388,7 @@ out_unlock:
 out:
 	data->critical_sequence = max_sequence;
 	data->preempt_timestamp = ftrace_now(cpu);
-	trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+	__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
 }
 
 static inline void
@@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 
 	local_save_flags(flags);
 
-	trace_function(tr, ip, parent_ip, flags, preempt_count());
+	__trace_function(tr, ip, parent_ip, flags, preempt_count());
 
 	per_cpu(tracing_cpu, cpu) = 1;
 
@@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 	atomic_inc(&data->disabled);
 
 	local_save_flags(flags);
-	trace_function(tr, ip, parent_ip, flags, preempt_count());
+	__trace_function(tr, ip, parent_ip, flags, preempt_count());
 	check_critical_timing(tr, data, parent_ip ? : ip, cpu);
 	data->critical_start = 0;
 	atomic_dec(&data->disabled);
@@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
 }
 #endif /* CONFIG_PREEMPT_TRACER */
 
-static void start_irqsoff_tracer(struct trace_array *tr)
+static int start_irqsoff_tracer(struct trace_array *tr, int graph)
 {
-	register_ftrace_function(&trace_ops);
-	if (tracing_is_enabled())
+	int ret = 0;
+
+	if (!graph)
+		ret = register_ftrace_function(&trace_ops);
+	else
+		ret = register_ftrace_graph(&irqsoff_graph_return,
+					    &irqsoff_graph_entry);
+
+	if (!ret && tracing_is_enabled())
 		tracer_enabled = 1;
 	else
 		tracer_enabled = 0;
+
+	return ret;
 }
 
-static void stop_irqsoff_tracer(struct trace_array *tr)
+static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
 {
 	tracer_enabled = 0;
-	unregister_ftrace_function(&trace_ops);
+
+	if (!graph)
+		unregister_ftrace_function(&trace_ops);
+	else
+		unregister_ftrace_graph();
 }
 
 static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
 	/* make sure that the tracer is visible */
 	smp_wmb();
 	tracing_reset_online_cpus(tr);
-	start_irqsoff_tracer(tr);
+
+	if (start_irqsoff_tracer(tr, is_graph()))
+		printk(KERN_ERR "failed to start irqsoff tracer\n");
 }
 
 static void irqsoff_tracer_reset(struct trace_array *tr)
 {
-	stop_irqsoff_tracer(tr);
+	stop_irqsoff_tracer(tr, is_graph());
 
 	if (!save_lat_flag)
 		trace_flags &= ~TRACE_ITER_LATENCY_FMT;
@@ -409,9 +640,15 @@ static struct tracer irqsoff_tracer __read_mostly =
 	.start		= irqsoff_tracer_start,
 	.stop		= irqsoff_tracer_stop,
 	.print_max	= 1,
+	.print_header   = irqsoff_print_header,
+	.print_line     = irqsoff_print_line,
+	.flags		= &tracer_flags,
+	.set_flag	= irqsoff_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_irqsoff,
 #endif
+	.open           = irqsoff_trace_open,
+	.close          = irqsoff_trace_close,
 };
 # define register_irqsoff(trace) register_tracer(&trace)
 #else
@@ -435,9 +672,15 @@ static struct tracer preemptoff_tracer __read_mostly =
 	.start		= irqsoff_tracer_start,
 	.stop		= irqsoff_tracer_stop,
 	.print_max	= 1,
+	.print_header   = irqsoff_print_header,
+	.print_line     = irqsoff_print_line,
+	.flags		= &tracer_flags,
+	.set_flag	= irqsoff_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_preemptoff,
 #endif
+	.open		= irqsoff_trace_open,
+	.close		= irqsoff_trace_close,
 };
 # define register_preemptoff(trace) register_tracer(&trace)
 #else
@@ -463,9 +706,15 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
 	.start		= irqsoff_tracer_start,
 	.stop		= irqsoff_tracer_stop,
 	.print_max	= 1,
+	.print_header   = irqsoff_print_header,
+	.print_line     = irqsoff_print_line,
+	.flags		= &tracer_flags,
+	.set_flag	= irqsoff_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_preemptirqsoff,
 #endif
+	.open		= irqsoff_trace_open,
+	.close		= irqsoff_trace_close,
 };
 
 # define register_preemptirqsoff(trace) register_tracer(&trace)
-- 
cgit v1.2.3


From 72c9ddfd4c5bf54ef03cfdf57026416cb678eeba Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Tue, 20 Apr 2010 15:47:11 -0700
Subject: ring-buffer: Make non-consuming read less expensive with lots of
 cpus.

When performing a non-consuming read, a synchronize_sched() is
performed once for every cpu which is actively tracing.

This is very expensive, and can make it take several seconds to open
up the 'trace' file with lots of cpus.

Only one synchronize_sched() call is actually necessary.  What is
desired is for all cpus to see the disabling state change.  So we
transform the existing sequence:

	for_each_cpu() {
		ring_buffer_read_start();
	}

where each ring_buffer_start() call performs a synchronize_sched(),
into the following:

	for_each_cpu() {
		ring_buffer_read_prepare();
	}
	ring_buffer_read_prepare_sync();
	for_each_cpu() {
		ring_buffer_read_start();
	}

wherein only the single ring_buffer_read_prepare_sync() call needs to
do the synchronize_sched().

The first phase, via ring_buffer_read_prepare(), allocates the 'iter'
memory and increments ->record_disabled.

In the second phase, ring_buffer_read_prepare_sync() makes sure this
->record_disabled state is visible fully to all cpus.

And in the final third phase, the ring_buffer_read_start() calls reset
the 'iter' objects allocated in the first phase since we now know that
none of the cpus are adding trace entries any more.

This makes openning the 'trace' file nearly instantaneous on a
sparc64 Niagara2 box with 128 cpus tracing.

Signed-off-by: David S. Miller <davem@davemloft.net>
LKML-Reference: <20100420.154711.11246950.davem@davemloft.net>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  4 ++-
 kernel/trace/ring_buffer.c  | 64 ++++++++++++++++++++++++++++++++++++++-------
 kernel/trace/trace.c        | 11 +++++---
 3 files changed, 65 insertions(+), 14 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index c8297761e41..25b4f686d91 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -127,7 +127,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
 		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
-ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
+ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu);
+void ring_buffer_read_prepare_sync(void);
+void ring_buffer_read_start(struct ring_buffer_iter *iter);
 void ring_buffer_read_finish(struct ring_buffer_iter *iter);
 
 struct ring_buffer_event *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5885cdfc41f..2a090448ef6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3332,23 +3332,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
- * ring_buffer_read_start - start a non consuming read of the buffer
+ * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
  *
- * This starts up an iteration through the buffer. It also disables
- * the recording to the buffer until the reading is finished.
- * This prevents the reading from being corrupted. This is not
- * a consuming read, so a producer is not expected.
+ * This performs the initial preparations necessary to iterate
+ * through the buffer.  Memory is allocated, buffer recording
+ * is disabled, and the iterator pointer is returned to the caller.
  *
- * Must be paired with ring_buffer_finish.
+ * Disabling buffer recordng prevents the reading from being
+ * corrupted. This is not a consuming read, so a producer is not
+ * expected.
+ *
+ * After a sequence of ring_buffer_read_prepare calls, the user is
+ * expected to make at least one call to ring_buffer_prepare_sync.
+ * Afterwards, ring_buffer_read_start is invoked to get things going
+ * for real.
+ *
+ * This overall must be paired with ring_buffer_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_iter *iter;
-	unsigned long flags;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return NULL;
@@ -3362,15 +3369,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 	iter->cpu_buffer = cpu_buffer;
 
 	atomic_inc(&cpu_buffer->record_disabled);
+
+	return iter;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
+
+/**
+ * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
+ *
+ * All previously invoked ring_buffer_read_prepare calls to prepare
+ * iterators will be synchronized.  Afterwards, read_buffer_read_start
+ * calls on those iterators are allowed.
+ */
+void
+ring_buffer_read_prepare_sync(void)
+{
 	synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @iter: The iterator returned by ring_buffer_read_prepare
+ *
+ * This finalizes the startup of an iteration through the buffer.
+ * The iterator comes from a call to ring_buffer_read_prepare and
+ * an intervening ring_buffer_read_prepare_sync must have been
+ * performed.
+ *
+ * Must be paired with ring_buffer_finish.
+ */
+void
+ring_buffer_read_start(struct ring_buffer_iter *iter)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long flags;
+
+	if (!iter)
+		return;
+
+	cpu_buffer = iter->cpu_buffer;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	arch_spin_lock(&cpu_buffer->lock);
 	rb_iter_reset(iter);
 	arch_spin_unlock(&cpu_buffer->lock);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-
-	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8b9ba41ec14..756d7283318 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2201,15 +2201,20 @@ __tracing_open(struct inode *inode, struct file *file)
 
 	if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
 		for_each_tracing_cpu(cpu) {
-
 			iter->buffer_iter[cpu] =
-				ring_buffer_read_start(iter->tr->buffer, cpu);
+				ring_buffer_read_prepare(iter->tr->buffer, cpu);
+		}
+		ring_buffer_read_prepare_sync();
+		for_each_tracing_cpu(cpu) {
+			ring_buffer_read_start(iter->buffer_iter[cpu]);
 			tracing_iter_reset(iter, cpu);
 		}
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-				ring_buffer_read_start(iter->tr->buffer, cpu);
+			ring_buffer_read_prepare(iter->tr->buffer, cpu);
+		ring_buffer_read_prepare_sync();
+		ring_buffer_read_start(iter->buffer_iter[cpu]);
 		tracing_iter_reset(iter, cpu);
 	}
 
-- 
cgit v1.2.3


From a838b2e634405fb89ddbf4fa9412acb33911911f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 27 Apr 2010 13:26:58 -0400
Subject: ring-buffer: Make benchmark handle missed events

With the addition of the "missed events" flags that is stored in the
commit field of the ring buffer page, the ring_buffer_benchmark
was not updated to handle this. If events are missed, then the
missed events flag is set in the ring buffer page, the benchmark
will count that flag as part of the size of the page and will hit the BUG()
when it tries to read beyond the page.

The solution is simply to have the ring buffer benchmark mask off
the extra bits.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index dc56556b55a..302f8a61463 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -113,7 +113,8 @@ static enum event_status read_page(int cpu)
 	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
 	if (ret >= 0) {
 		rpage = bpage;
-		commit = local_read(&rpage->commit);
+		/* The commit may have missed event flags set, clear them */
+		commit = local_read(&rpage->commit) & 0xfffff;
 		for (i = 0; i < commit && !kill_test; i += inc) {
 
 			if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
-- 
cgit v1.2.3


From 07271aa42d13378e67ebd79ea9ca1c4a5e2ad46f Mon Sep 17 00:00:00 2001
From: Chase Douglas <chase.douglas@canonical.com>
Date: Fri, 23 Apr 2010 14:02:39 -0400
Subject: tracing: Add documentation for trace commands mod, traceon/traceoff

The mod command went in as commit
64e7c440618998fd69eee6ab490b042d12248021

The traceon/traceoff commands went in as commit
23b4ff3aa479c9e3bb23cb6b2d0a97878399784a

Signed-off-by: Chase Douglas <chase.douglas@canonical.com>
LKML-Reference: <1272045759-32018-1-git-send-email-chase.douglas@canonical.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 Documentation/trace/ftrace.txt | 44 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 52011815c90..557c1edecca 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -155,6 +155,9 @@ of ftrace. Here is a list of some of the key files:
 	to be traced. Echoing names of functions into this file
 	will limit the trace to only those functions.
 
+	This interface also allows for commands to be used. See the
+	"Filter commands" section for more details.
+
   set_ftrace_notrace:
 
 	This has an effect opposite to that of
@@ -1824,6 +1827,47 @@ this special filter via:
  echo > set_graph_function
 
 
+Filter commands
+---------------
+
+A few commands are supported by the set_ftrace_filter interface.
+Trace commands have the following format:
+
+<function>:<command>:<parameter>
+
+The following commands are supported:
+
+- mod
+  This command enables function filtering per module. The
+  parameter defines the module. For example, if only the write*
+  functions in the ext3 module are desired, run:
+
+   echo 'write*:mod:ext3' > set_ftrace_filter
+
+  This command interacts with the filter in the same way as
+  filtering based on function names. Thus, adding more functions
+  in a different module is accomplished by appending (>>) to the
+  filter file. Remove specific module functions by prepending
+  '!':
+
+   echo '!writeback*:mod:ext3' >> set_ftrace_filter
+
+- traceon/traceoff
+  These commands turn tracing on and off when the specified
+  functions are hit. The parameter determines how many times the
+  tracing system is turned on and off. If unspecified, there is
+  no limit. For example, to disable tracing when a schedule bug
+  is hit the first 5 times, run:
+
+   echo '__schedule_bug:traceoff:5' > set_ftrace_filter
+
+  These commands are cumulative whether or not they are appended
+  to set_ftrace_filter. To remove a command, prepend it by '!'
+  and drop the parameter:
+
+   echo '!__schedule_bug:traceoff' > set_ftrace_filter
+
+
 trace_pipe
 ----------
 
-- 
cgit v1.2.3


From e330b3bcd83199dd63a819d8d12e40f9edae6c77 Mon Sep 17 00:00:00 2001
From: Chase Douglas <chase.douglas@canonical.com>
Date: Mon, 26 Apr 2010 14:02:05 -0400
Subject: tracing: Show sample std dev in function profiling

When combined with function graph tracing the ftrace function profiler
also prints the average run time of functions. While this gives us some
good information, it doesn't tell us anything about the variance of the
run times of the function. This change prints out the s^2 sample
standard deviation alongside the average.

This change adds one entry to the profile record structure. This
increases the memory footprint of the function profiler by 1/3 on a
32-bit system, and by 1/5 on a 64-bit system when function graphing is
enabled, though the memory is only allocated when the profiler is turned
on. During the profiling, one extra line of code adds the squared
calltime to the new record entry, so this should not adversly affect
performance.

Note that the square of the sample standard deviation is printed because
there is no sqrt implementation for unsigned long long in the kernel.

Signed-off-by: Chase Douglas <chase.douglas@canonical.com>
LKML-Reference: <1272304925-2436-1-git-send-email-chase.douglas@canonical.com>

[ fixed comment about ns^2 -> us^2 conversion ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2404b59b309..3bcb340d6f0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -264,6 +264,7 @@ struct ftrace_profile {
 	unsigned long			counter;
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	unsigned long long		time;
+	unsigned long long		time_squared;
 #endif
 };
 
@@ -366,9 +367,9 @@ static int function_stat_headers(struct seq_file *m)
 {
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	seq_printf(m, "  Function                               "
-		   "Hit    Time            Avg\n"
+		   "Hit    Time            Avg             s^2\n"
 		      "  --------                               "
-		   "---    ----            ---\n");
+		   "---    ----            ---             ---\n");
 #else
 	seq_printf(m, "  Function                               Hit\n"
 		      "  --------                               ---\n");
@@ -384,6 +385,7 @@ static int function_stat_show(struct seq_file *m, void *v)
 	static DEFINE_MUTEX(mutex);
 	static struct trace_seq s;
 	unsigned long long avg;
+	unsigned long long stddev;
 #endif
 
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -394,11 +396,25 @@ static int function_stat_show(struct seq_file *m, void *v)
 	avg = rec->time;
 	do_div(avg, rec->counter);
 
+	/* Sample standard deviation (s^2) */
+	if (rec->counter <= 1)
+		stddev = 0;
+	else {
+		stddev = rec->time_squared - rec->counter * avg * avg;
+		/*
+		 * Divide only 1000 for ns^2 -> us^2 conversion.
+		 * trace_print_graph_duration will divide 1000 again.
+		 */
+		do_div(stddev, (rec->counter - 1) * 1000);
+	}
+
 	mutex_lock(&mutex);
 	trace_seq_init(&s);
 	trace_print_graph_duration(rec->time, &s);
 	trace_seq_puts(&s, "    ");
 	trace_print_graph_duration(avg, &s);
+	trace_seq_puts(&s, "    ");
+	trace_print_graph_duration(stddev, &s);
 	trace_print_seq(m, &s);
 	mutex_unlock(&mutex);
 #endif
@@ -668,8 +684,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
 	}
 
 	rec = ftrace_find_profiled_func(stat, trace->func);
-	if (rec)
+	if (rec) {
 		rec->time += calltime;
+		rec->time_squared += calltime * calltime;
+	}
 
  out:
 	local_irq_restore(flags);
-- 
cgit v1.2.3


From 23346f21b277e3aae5e9989e711a11cbe8133a45 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Apr 2010 21:17:50 -0300
Subject: perf tools: Rename "kernel_info" to "machine"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

struct kernel_info and kerninfo__ are too vague, what they really
describe are machines, virtual ones or hosts.

There are more changes to introduce helpers to shorten function calls
and to make more clear what is really being done, but I left that for
subsequent patches.

Cc: Avi Kivity <avi@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Zhang, Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-annotate.c     |   2 +-
 tools/perf/builtin-buildid-list.c |   2 +-
 tools/perf/builtin-kmem.c         |   8 +--
 tools/perf/builtin-record.c       |  36 +++++------
 tools/perf/builtin-report.c       |   2 +-
 tools/perf/builtin-top.c          |  15 ++---
 tools/perf/util/event.c           |  92 +++++++++++++-------------
 tools/perf/util/event.h           |   4 +-
 tools/perf/util/header.c          |  70 ++++++++++----------
 tools/perf/util/header.h          |   2 +-
 tools/perf/util/map.c             | 105 +++++++++++++++---------------
 tools/perf/util/map.h             |  63 ++++++++----------
 tools/perf/util/session.c         |   4 +-
 tools/perf/util/session.h         |  27 +++++++-
 tools/perf/util/symbol.c          | 132 +++++++++++++++++---------------------
 tools/perf/util/symbol.h          |   3 +-
 16 files changed, 280 insertions(+), 287 deletions(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index f924b4332be..986b2efcef2 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -571,7 +571,7 @@ static int __cmd_annotate(void)
 		perf_session__fprintf(session, stdout);
 
 	if (verbose > 2)
-		dsos__fprintf(&session->kerninfo_root, stdout);
+		dsos__fprintf(&session->machines, stdout);
 
 	perf_session__collapse_resort(&session->hists);
 	perf_session__output_resort(&session->hists, session->event_total[0]);
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index 623afe3fdcb..b4a265ae3b9 100644
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -46,7 +46,7 @@ static int __cmd_buildid_list(void)
 	if (with_hits)
 		perf_session__process_events(session, &build_id__mark_dso_hit_ops);
 
-	dsos__fprintf_buildid(&session->kerninfo_root, stdout, with_hits);
+	dsos__fprintf_buildid(&session->machines, stdout, with_hits);
 
 	perf_session__delete(session);
 	return err;
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index ab906cbd5c7..20674759464 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -352,7 +352,7 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 			   int n_lines, int is_caller)
 {
 	struct rb_node *next;
-	struct kernel_info *kerninfo;
+	struct machine *machine;
 
 	printf("%.102s\n", graph_dotted_line);
 	printf(" %-34s |",  is_caller ? "Callsite": "Alloc Ptr");
@@ -361,8 +361,8 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 
 	next = rb_first(root);
 
-	kerninfo = kerninfo__findhost(&session->kerninfo_root);
-	if (!kerninfo) {
+	machine = perf_session__find_host_machine(session);
+	if (!machine) {
 		pr_err("__print_result: couldn't find kernel information\n");
 		return;
 	}
@@ -370,7 +370,7 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 		struct alloc_stat *data = rb_entry(next, struct alloc_stat,
 						   node);
 		struct symbol *sym = NULL;
-		struct map_groups *kmaps = &kerninfo->kmaps;
+		struct map_groups *kmaps = &machine->kmaps;
 		struct map *map;
 		char buf[BUFSIZ];
 		u64 addr;
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 27f992aca8b..83b308a035c 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -456,14 +456,14 @@ static void atexit_header(void)
 	}
 }
 
-static void event__synthesize_guest_os(struct kernel_info *kerninfo,
-		void *data __attribute__((unused)))
+static void event__synthesize_guest_os(struct machine *machine, void *data)
 {
 	int err;
 	char *guest_kallsyms;
 	char path[PATH_MAX];
+	struct perf_session *psession = data;
 
-	if (is_host_kernel(kerninfo))
+	if (machine__is_host(machine))
 		return;
 
 	/*
@@ -475,16 +475,15 @@ static void event__synthesize_guest_os(struct kernel_info *kerninfo,
 	 *in module instead of in guest kernel.
 	 */
 	err = event__synthesize_modules(process_synthesized_event,
-			session,
-			kerninfo);
+					psession, machine);
 	if (err < 0)
 		pr_err("Couldn't record guest kernel [%d]'s reference"
-			" relocation symbol.\n", kerninfo->pid);
+		       " relocation symbol.\n", machine->pid);
 
-	if (is_default_guest(kerninfo))
+	if (machine__is_default_guest(machine))
 		guest_kallsyms = (char *) symbol_conf.default_guest_kallsyms;
 	else {
-		sprintf(path, "%s/proc/kallsyms", kerninfo->root_dir);
+		sprintf(path, "%s/proc/kallsyms", machine->root_dir);
 		guest_kallsyms = path;
 	}
 
@@ -493,13 +492,13 @@ static void event__synthesize_guest_os(struct kernel_info *kerninfo,
 	 * have no _text sometimes.
 	 */
 	err = event__synthesize_kernel_mmap(process_synthesized_event,
-			session, kerninfo, "_text");
+					    psession, machine, "_text");
 	if (err < 0)
 		err = event__synthesize_kernel_mmap(process_synthesized_event,
-				session, kerninfo, "_stext");
+						    psession, machine, "_stext");
 	if (err < 0)
 		pr_err("Couldn't record guest kernel [%d]'s reference"
-			" relocation symbol.\n", kerninfo->pid);
+		       " relocation symbol.\n", machine->pid);
 }
 
 static int __cmd_record(int argc, const char **argv)
@@ -513,7 +512,7 @@ static int __cmd_record(int argc, const char **argv)
 	int child_ready_pipe[2], go_pipe[2];
 	const bool forks = argc > 0;
 	char buf;
-	struct kernel_info *kerninfo;
+	struct machine *machine;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
 
@@ -682,31 +681,30 @@ static int __cmd_record(int argc, const char **argv)
 		advance_output(err);
 	}
 
-	kerninfo = kerninfo__findhost(&session->kerninfo_root);
-	if (!kerninfo) {
+	machine = perf_session__find_host_machine(session);
+	if (!machine) {
 		pr_err("Couldn't find native kernel information.\n");
 		return -1;
 	}
 
 	err = event__synthesize_kernel_mmap(process_synthesized_event,
-			session, kerninfo, "_text");
+					    session, machine, "_text");
 	if (err < 0)
 		err = event__synthesize_kernel_mmap(process_synthesized_event,
-				session, kerninfo, "_stext");
+						    session, machine, "_stext");
 	if (err < 0) {
 		pr_err("Couldn't record kernel reference relocation symbol.\n");
 		return err;
 	}
 
 	err = event__synthesize_modules(process_synthesized_event,
-				session, kerninfo);
+					session, machine);
 	if (err < 0) {
 		pr_err("Couldn't record kernel reference relocation symbol.\n");
 		return err;
 	}
 	if (perf_guest)
-		kerninfo__process_allkernels(&session->kerninfo_root,
-			event__synthesize_guest_os, session);
+		perf_session__process_machines(session, event__synthesize_guest_os);
 
 	if (!system_wide && profile_cpu == -1)
 		event__synthesize_thread(target_tid, process_synthesized_event,
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 816edae7c5b..49cc367d8c3 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -313,7 +313,7 @@ static int __cmd_report(void)
 		perf_session__fprintf(session, stdout);
 
 	if (verbose > 2)
-		dsos__fprintf(&session->kerninfo_root, stdout);
+		dsos__fprintf(&session->machines, stdout);
 
 	next = rb_first(&session->stats_by_id);
 	while (next) {
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index dfd7ea7dabd..c390f340b03 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -854,7 +854,7 @@ static void handle_keypress(struct perf_session *session, int c)
 		case 'Q':
 			printf("exiting.\n");
 			if (dump_symtab)
-				dsos__fprintf(&session->kerninfo_root, stderr);
+				dsos__fprintf(&session->machines, stderr);
 			exit(0);
 		case 's':
 			prompt_symbol(&sym_filter_entry, "Enter details symbol");
@@ -982,7 +982,7 @@ static void event__process_sample(const event_t *self,
 	u64 ip = self->ip.ip;
 	struct sym_entry *syme;
 	struct addr_location al;
-	struct kernel_info *kerninfo;
+	struct machine *machine;
 	u8 origin = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
 
 	++samples;
@@ -992,18 +992,17 @@ static void event__process_sample(const event_t *self,
 		++us_samples;
 		if (hide_user_symbols)
 			return;
-		kerninfo = kerninfo__findhost(&session->kerninfo_root);
+		machine = perf_session__find_host_machine(session);
 		break;
 	case PERF_RECORD_MISC_KERNEL:
 		++kernel_samples;
 		if (hide_kernel_symbols)
 			return;
-		kerninfo = kerninfo__findhost(&session->kerninfo_root);
+		machine = perf_session__find_host_machine(session);
 		break;
 	case PERF_RECORD_MISC_GUEST_KERNEL:
 		++guest_kernel_samples;
-		kerninfo = kerninfo__find(&session->kerninfo_root,
-					  self->ip.pid);
+		machine = perf_session__find_machine(session, self->ip.pid);
 		break;
 	case PERF_RECORD_MISC_GUEST_USER:
 		++guest_us_samples;
@@ -1016,7 +1015,7 @@ static void event__process_sample(const event_t *self,
 		return;
 	}
 
-	if (!kerninfo && perf_guest) {
+	if (!machine && perf_guest) {
 		pr_err("Can't find guest [%d]'s kernel information\n",
 			self->ip.pid);
 		return;
@@ -1041,7 +1040,7 @@ static void event__process_sample(const event_t *self,
 		 * --hide-kernel-symbols, even if the user specifies an
 		 * invalid --vmlinux ;-)
 		 */
-		if (al.map == kerninfo->vmlinux_maps[MAP__FUNCTION] &&
+		if (al.map == machine->vmlinux_maps[MAP__FUNCTION] &&
 		    RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) {
 			pr_err("The %s file can't be used\n",
 			       symbol_conf.vmlinux_name);
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index e3fa8d3d11b..2f33ca9899b 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -172,17 +172,17 @@ static int event__synthesize_mmap_events(pid_t pid, pid_t tgid,
 
 int event__synthesize_modules(event__handler_t process,
 			      struct perf_session *session,
-			      struct kernel_info *kerninfo)
+			      struct machine *machine)
 {
 	struct rb_node *nd;
-	struct map_groups *kmaps = &kerninfo->kmaps;
+	struct map_groups *kmaps = &machine->kmaps;
 	u16 misc;
 
 	/*
 	 * kernel uses 0 for user space maps, see kernel/perf_event.c
 	 * __perf_event_mmap
 	 */
-	if (is_host_kernel(kerninfo))
+	if (machine__is_host(machine))
 		misc = PERF_RECORD_MISC_KERNEL;
 	else
 		misc = PERF_RECORD_MISC_GUEST_KERNEL;
@@ -204,7 +204,7 @@ int event__synthesize_modules(event__handler_t process,
 				        (sizeof(ev.mmap.filename) - size));
 		ev.mmap.start = pos->start;
 		ev.mmap.len   = pos->end - pos->start;
-		ev.mmap.pid   = kerninfo->pid;
+		ev.mmap.pid   = machine->pid;
 
 		memcpy(ev.mmap.filename, pos->dso->long_name,
 		       pos->dso->long_name_len + 1);
@@ -267,7 +267,7 @@ static int find_symbol_cb(void *arg, const char *name, char type, u64 start)
 
 int event__synthesize_kernel_mmap(event__handler_t process,
 				  struct perf_session *session,
-				  struct kernel_info *kerninfo,
+				  struct machine *machine,
 				  const char *symbol_name)
 {
 	size_t size;
@@ -288,8 +288,8 @@ int event__synthesize_kernel_mmap(event__handler_t process,
 	 */
 	struct process_symbol_args args = { .name = symbol_name, };
 
-	mmap_name = kern_mmap_name(kerninfo, name_buff);
-	if (is_host_kernel(kerninfo)) {
+	mmap_name = machine__mmap_name(machine, name_buff);
+	if (machine__is_host(machine)) {
 		/*
 		 * kernel uses PERF_RECORD_MISC_USER for user space maps,
 		 * see kernel/perf_event.c __perf_event_mmap
@@ -298,10 +298,10 @@ int event__synthesize_kernel_mmap(event__handler_t process,
 		filename = "/proc/kallsyms";
 	} else {
 		ev.header.misc = PERF_RECORD_MISC_GUEST_KERNEL;
-		if (is_default_guest(kerninfo))
+		if (machine__is_default_guest(machine))
 			filename = (char *) symbol_conf.default_guest_kallsyms;
 		else {
-			sprintf(path, "%s/proc/kallsyms", kerninfo->root_dir);
+			sprintf(path, "%s/proc/kallsyms", machine->root_dir);
 			filename = path;
 		}
 	}
@@ -309,7 +309,7 @@ int event__synthesize_kernel_mmap(event__handler_t process,
 	if (kallsyms__parse(filename, &args, find_symbol_cb) <= 0)
 		return -ENOENT;
 
-	map = kerninfo->vmlinux_maps[MAP__FUNCTION];
+	map = machine->vmlinux_maps[MAP__FUNCTION];
 	size = snprintf(ev.mmap.filename, sizeof(ev.mmap.filename),
 			"%s%s", mmap_name, symbol_name) + 1;
 	size = ALIGN(size, sizeof(u64));
@@ -318,7 +318,7 @@ int event__synthesize_kernel_mmap(event__handler_t process,
 	ev.mmap.pgoff = args.start;
 	ev.mmap.start = map->start;
 	ev.mmap.len   = map->end - ev.mmap.start;
-	ev.mmap.pid   = kerninfo->pid;
+	ev.mmap.pid   = machine->pid;
 
 	return process(&ev, session);
 }
@@ -389,18 +389,18 @@ static int event__process_kernel_mmap(event_t *self,
 {
 	struct map *map;
 	char kmmap_prefix[PATH_MAX];
-	struct kernel_info *kerninfo;
+	struct machine *machine;
 	enum dso_kernel_type kernel_type;
 	bool is_kernel_mmap;
 
-	kerninfo = kerninfo__findnew(&session->kerninfo_root, self->mmap.pid);
-	if (!kerninfo) {
-		pr_err("Can't find id %d's kerninfo\n", self->mmap.pid);
+	machine = perf_session__findnew_machine(session, self->mmap.pid);
+	if (!machine) {
+		pr_err("Can't find id %d's machine\n", self->mmap.pid);
 		goto out_problem;
 	}
 
-	kern_mmap_name(kerninfo, kmmap_prefix);
-	if (is_host_kernel(kerninfo))
+	machine__mmap_name(machine, kmmap_prefix);
+	if (machine__is_host(machine))
 		kernel_type = DSO_TYPE_KERNEL;
 	else
 		kernel_type = DSO_TYPE_GUEST_KERNEL;
@@ -429,10 +429,9 @@ static int event__process_kernel_mmap(event_t *self,
 		} else
 			strcpy(short_module_name, self->mmap.filename);
 
-		map = map_groups__new_module(&kerninfo->kmaps,
-				self->mmap.start,
-				self->mmap.filename,
-				kerninfo);
+		map = map_groups__new_module(&machine->kmaps,
+					     self->mmap.start,
+					     self->mmap.filename, machine);
 		if (map == NULL)
 			goto out_problem;
 
@@ -449,27 +448,27 @@ static int event__process_kernel_mmap(event_t *self,
 		 * Should be there already, from the build-id table in
 		 * the header.
 		 */
-		struct dso *kernel = __dsos__findnew(&kerninfo->dsos__kernel,
-				kmmap_prefix);
+		struct dso *kernel = __dsos__findnew(&machine->kernel_dsos,
+						     kmmap_prefix);
 		if (kernel == NULL)
 			goto out_problem;
 
 		kernel->kernel = kernel_type;
-		if (__map_groups__create_kernel_maps(&kerninfo->kmaps,
-					kerninfo->vmlinux_maps, kernel) < 0)
+		if (__map_groups__create_kernel_maps(&machine->kmaps,
+						     machine->vmlinux_maps,
+						     kernel) < 0)
 			goto out_problem;
 
-		event_set_kernel_mmap_len(kerninfo->vmlinux_maps, self);
-		perf_session__set_kallsyms_ref_reloc_sym(kerninfo->vmlinux_maps,
-				symbol_name,
-				self->mmap.pgoff);
-		if (is_default_guest(kerninfo)) {
+		event_set_kernel_mmap_len(machine->vmlinux_maps, self);
+		perf_session__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps,
+							 symbol_name,
+							 self->mmap.pgoff);
+		if (machine__is_default_guest(machine)) {
 			/*
 			 * preload dso of guest kernel and modules
 			 */
-			dso__load(kernel,
-				kerninfo->vmlinux_maps[MAP__FUNCTION],
-				NULL);
+			dso__load(kernel, machine->vmlinux_maps[MAP__FUNCTION],
+				  NULL);
 		}
 	}
 	return 0;
@@ -479,7 +478,7 @@ out_problem:
 
 int event__process_mmap(event_t *self, struct perf_session *session)
 {
-	struct kernel_info *kerninfo;
+	struct machine *machine;
 	struct thread *thread;
 	struct map *map;
 	u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
@@ -498,8 +497,8 @@ int event__process_mmap(event_t *self, struct perf_session *session)
 	}
 
 	thread = perf_session__findnew(session, self->mmap.pid);
-	kerninfo = kerninfo__findhost(&session->kerninfo_root);
-	map = map__new(&kerninfo->dsos__user, self->mmap.start,
+	machine = perf_session__find_host_machine(session);
+	map = map__new(&machine->user_dsos, self->mmap.start,
 			self->mmap.len, self->mmap.pgoff,
 			self->mmap.pid, self->mmap.filename,
 			MAP__FUNCTION, session->cwd, session->cwdlen);
@@ -546,7 +545,7 @@ void thread__find_addr_map(struct thread *self,
 			   struct addr_location *al)
 {
 	struct map_groups *mg = &self->mg;
-	struct kernel_info *kerninfo = NULL;
+	struct machine *machine = NULL;
 
 	al->thread = self;
 	al->addr = addr;
@@ -555,19 +554,19 @@ void thread__find_addr_map(struct thread *self,
 
 	if (cpumode == PERF_RECORD_MISC_KERNEL && perf_host) {
 		al->level = 'k';
-		kerninfo = kerninfo__findhost(&session->kerninfo_root);
-		mg = &kerninfo->kmaps;
+		machine = perf_session__find_host_machine(session);
+		mg = &machine->kmaps;
 	} else if (cpumode == PERF_RECORD_MISC_USER && perf_host) {
 		al->level = '.';
-		kerninfo = kerninfo__findhost(&session->kerninfo_root);
+		machine = perf_session__find_host_machine(session);
 	} else if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL && perf_guest) {
 		al->level = 'g';
-		kerninfo = kerninfo__find(&session->kerninfo_root, pid);
-		if (!kerninfo) {
+		machine = perf_session__find_machine(session, pid);
+		if (!machine) {
 			al->map = NULL;
 			return;
 		}
-		mg = &kerninfo->kmaps;
+		mg = &machine->kmaps;
 	} else {
 		/*
 		 * 'u' means guest os user space.
@@ -603,10 +602,9 @@ try_again:
 		 * in the whole kernel symbol list.
 		 */
 		if ((long long)al->addr < 0 &&
-			cpumode == PERF_RECORD_MISC_KERNEL &&
-			kerninfo &&
-			mg != &kerninfo->kmaps)  {
-			mg = &kerninfo->kmaps;
+		    cpumode == PERF_RECORD_MISC_KERNEL &&
+		    machine && mg != &machine->kmaps) {
+			mg = &machine->kmaps;
 			goto try_again;
 		}
 	} else
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 4af2ed5d48a..b364da5b0cb 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -156,12 +156,12 @@ void event__synthesize_threads(event__handler_t process,
 			       struct perf_session *session);
 int event__synthesize_kernel_mmap(event__handler_t process,
 				struct perf_session *session,
-				struct kernel_info *kerninfo,
+				struct machine *machine,
 				const char *symbol_name);
 
 int event__synthesize_modules(event__handler_t process,
 			      struct perf_session *session,
-			      struct kernel_info *kerninfo);
+			      struct machine *machine);
 
 int event__process_comm(event_t *self, struct perf_session *session);
 int event__process_lost(event_t *self, struct perf_session *session);
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 75d01676802..6227dc4cb2c 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -229,10 +229,9 @@ static int dsos__write_buildid_table(struct perf_header *header, int fd)
 	int err = 0;
 	u16 kmisc, umisc;
 
-	for (nd = rb_first(&session->kerninfo_root); nd; nd = rb_next(nd)) {
-		struct kernel_info *pos = rb_entry(nd, struct kernel_info,
-				rb_node);
-		if (is_host_kernel(pos)) {
+	for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
+		struct machine *pos = rb_entry(nd, struct machine, rb_node);
+		if (machine__is_host(pos)) {
 			kmisc = PERF_RECORD_MISC_KERNEL;
 			umisc = PERF_RECORD_MISC_USER;
 		} else {
@@ -240,11 +239,11 @@ static int dsos__write_buildid_table(struct perf_header *header, int fd)
 			umisc = PERF_RECORD_MISC_GUEST_USER;
 		}
 
-		err = __dsos__write_buildid_table(&pos->dsos__kernel, pos->pid,
-				kmisc, fd);
+		err = __dsos__write_buildid_table(&pos->kernel_dsos, pos->pid,
+						  kmisc, fd);
 		if (err == 0)
-			err = __dsos__write_buildid_table(&pos->dsos__user,
-				pos->pid, umisc, fd);
+			err = __dsos__write_buildid_table(&pos->user_dsos,
+							  pos->pid, umisc, fd);
 		if (err)
 			break;
 	}
@@ -378,11 +377,10 @@ static int dsos__cache_build_ids(struct perf_header *self)
 	if (mkdir(debugdir, 0755) != 0 && errno != EEXIST)
 		return -1;
 
-	for (nd = rb_first(&session->kerninfo_root); nd; nd = rb_next(nd)) {
-		struct kernel_info *pos = rb_entry(nd, struct kernel_info,
-				rb_node);
-		ret |= __dsos__cache_build_ids(&pos->dsos__kernel, debugdir);
-		ret |= __dsos__cache_build_ids(&pos->dsos__user, debugdir);
+	for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
+		struct machine *pos = rb_entry(nd, struct machine, rb_node);
+		ret |= __dsos__cache_build_ids(&pos->kernel_dsos, debugdir);
+		ret |= __dsos__cache_build_ids(&pos->user_dsos, debugdir);
 	}
 	return ret ? -1 : 0;
 }
@@ -394,11 +392,10 @@ static bool dsos__read_build_ids(struct perf_header *self, bool with_hits)
 			struct perf_session, header);
 	struct rb_node *nd;
 
-	for (nd = rb_first(&session->kerninfo_root); nd; nd = rb_next(nd)) {
-		struct kernel_info *pos = rb_entry(nd, struct kernel_info,
-				rb_node);
-		ret |= __dsos__read_build_ids(&pos->dsos__kernel, with_hits);
-		ret |= __dsos__read_build_ids(&pos->dsos__user, with_hits);
+	for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
+		struct machine *pos = rb_entry(nd, struct machine, rb_node);
+		ret |= __dsos__read_build_ids(&pos->kernel_dsos, with_hits);
+		ret |= __dsos__read_build_ids(&pos->user_dsos, with_hits);
 	}
 
 	return ret;
@@ -685,13 +682,13 @@ static int __event_process_build_id(struct build_id_event *bev,
 {
 	int err = -1;
 	struct list_head *head;
-	struct kernel_info *kerninfo;
+	struct machine *machine;
 	u16 misc;
 	struct dso *dso;
 	enum dso_kernel_type dso_type;
 
-	kerninfo = kerninfo__findnew(&session->kerninfo_root, bev->pid);
-	if (!kerninfo)
+	machine = perf_session__findnew_machine(session, bev->pid);
+	if (!machine)
 		goto out;
 
 	misc = bev->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
@@ -699,16 +696,16 @@ static int __event_process_build_id(struct build_id_event *bev,
 	switch (misc) {
 	case PERF_RECORD_MISC_KERNEL:
 		dso_type = DSO_TYPE_KERNEL;
-		head = &kerninfo->dsos__kernel;
+		head = &machine->kernel_dsos;
 		break;
 	case PERF_RECORD_MISC_GUEST_KERNEL:
 		dso_type = DSO_TYPE_GUEST_KERNEL;
-		head = &kerninfo->dsos__kernel;
+		head = &machine->kernel_dsos;
 		break;
 	case PERF_RECORD_MISC_USER:
 	case PERF_RECORD_MISC_GUEST_USER:
 		dso_type = DSO_TYPE_USER;
-		head = &kerninfo->dsos__user;
+		head = &machine->user_dsos;
 		break;
 	default:
 		goto out;
@@ -1113,8 +1110,7 @@ int event__process_tracing_data(event_t *self,
 }
 
 int event__synthesize_build_id(struct dso *pos, u16 misc,
-			       event__handler_t process,
-			       struct kernel_info *kerninfo,
+			       event__handler_t process, struct machine *machine,
 			       struct perf_session *session)
 {
 	event_t ev;
@@ -1131,7 +1127,7 @@ int event__synthesize_build_id(struct dso *pos, u16 misc,
 	memcpy(&ev.build_id.build_id, pos->build_id, sizeof(pos->build_id));
 	ev.build_id.header.type = PERF_RECORD_HEADER_BUILD_ID;
 	ev.build_id.header.misc = misc;
-	ev.build_id.pid = kerninfo->pid;
+	ev.build_id.pid = machine->pid;
 	ev.build_id.header.size = sizeof(ev.build_id) + len;
 	memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len);
 
@@ -1142,7 +1138,7 @@ int event__synthesize_build_id(struct dso *pos, u16 misc,
 
 static int __event_synthesize_build_ids(struct list_head *head, u16 misc,
 					event__handler_t process,
-					struct kernel_info *kerninfo,
+					struct machine *machine,
 					struct perf_session *session)
 {
 	struct dso *pos;
@@ -1153,7 +1149,7 @@ static int __event_synthesize_build_ids(struct list_head *head, u16 misc,
 			continue;
 
 		err = event__synthesize_build_id(pos, misc, process,
-					kerninfo, session);
+						 machine, session);
 		if (err < 0)
 			return err;
 	}
@@ -1166,15 +1162,15 @@ int event__synthesize_build_ids(event__handler_t process,
 {
 	int err = 0;
 	u16 kmisc, umisc;
-	struct kernel_info *pos;
+	struct machine *pos;
 	struct rb_node *nd;
 
 	if (!dsos__read_build_ids(&session->header, true))
 		return 0;
 
-	for (nd = rb_first(&session->kerninfo_root); nd; nd = rb_next(nd)) {
-		pos = rb_entry(nd, struct kernel_info, rb_node);
-		if (is_host_kernel(pos)) {
+	for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
+		pos = rb_entry(nd, struct machine, rb_node);
+		if (machine__is_host(pos)) {
 			kmisc = PERF_RECORD_MISC_KERNEL;
 			umisc = PERF_RECORD_MISC_USER;
 		} else {
@@ -1182,11 +1178,11 @@ int event__synthesize_build_ids(event__handler_t process,
 			umisc = PERF_RECORD_MISC_GUEST_USER;
 		}
 
-		err = __event_synthesize_build_ids(&pos->dsos__kernel,
-				kmisc, process, pos, session);
+		err = __event_synthesize_build_ids(&pos->kernel_dsos, kmisc,
+						   process, pos, session);
 		if (err == 0)
-			err = __event_synthesize_build_ids(&pos->dsos__user,
-					umisc, process, pos, session);
+			err = __event_synthesize_build_ids(&pos->user_dsos, umisc,
+							   process, pos, session);
 		if (err)
 			break;
 	}
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 27591545814..f39443db070 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -120,7 +120,7 @@ int event__process_tracing_data(event_t *self,
 
 int event__synthesize_build_id(struct dso *pos, u16 misc,
 			       event__handler_t process,
-			       struct kernel_info *kerninfo,
+			       struct machine *machine,
 			       struct perf_session *session);
 int event__synthesize_build_ids(event__handler_t process,
 				struct perf_session *session);
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 7facd016ec9..da3d4e82623 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -245,7 +245,7 @@ void map_groups__init(struct map_groups *self)
 		self->maps[i] = RB_ROOT;
 		INIT_LIST_HEAD(&self->removed_maps[i]);
 	}
-	 self->this_kerninfo = NULL;
+	self->machine = NULL;
 }
 
 void map_groups__flush(struct map_groups *self)
@@ -513,133 +513,130 @@ struct map *maps__find(struct rb_root *maps, u64 ip)
 	return NULL;
 }
 
-struct kernel_info *add_new_kernel_info(struct rb_root *kerninfo_root,
-			pid_t pid, const char *root_dir)
+struct machine *machines__add(struct rb_root *self, pid_t pid,
+			      const char *root_dir)
 {
-	struct rb_node **p = &kerninfo_root->rb_node;
+	struct rb_node **p = &self->rb_node;
 	struct rb_node *parent = NULL;
-	struct kernel_info *kerninfo, *pos;
+	struct machine *pos, *machine = malloc(sizeof(*machine));
 
-	kerninfo = malloc(sizeof(struct kernel_info));
-	if (!kerninfo)
+	if (!machine)
 		return NULL;
 
-	kerninfo->pid = pid;
-	map_groups__init(&kerninfo->kmaps);
-	kerninfo->root_dir = strdup(root_dir);
-	RB_CLEAR_NODE(&kerninfo->rb_node);
-	INIT_LIST_HEAD(&kerninfo->dsos__user);
-	INIT_LIST_HEAD(&kerninfo->dsos__kernel);
-	kerninfo->kmaps.this_kerninfo = kerninfo;
+	machine->pid = pid;
+	map_groups__init(&machine->kmaps);
+	machine->root_dir = strdup(root_dir);
+	RB_CLEAR_NODE(&machine->rb_node);
+	INIT_LIST_HEAD(&machine->user_dsos);
+	INIT_LIST_HEAD(&machine->kernel_dsos);
+	machine->kmaps.machine = machine;
 
 	while (*p != NULL) {
 		parent = *p;
-		pos = rb_entry(parent, struct kernel_info, rb_node);
+		pos = rb_entry(parent, struct machine, rb_node);
 		if (pid < pos->pid)
 			p = &(*p)->rb_left;
 		else
 			p = &(*p)->rb_right;
 	}
 
-	rb_link_node(&kerninfo->rb_node, parent, p);
-	rb_insert_color(&kerninfo->rb_node, kerninfo_root);
+	rb_link_node(&machine->rb_node, parent, p);
+	rb_insert_color(&machine->rb_node, self);
 
-	return kerninfo;
+	return machine;
 }
 
-struct kernel_info *kerninfo__find(struct rb_root *kerninfo_root, pid_t pid)
+struct machine *machines__find(struct rb_root *self, pid_t pid)
 {
-	struct rb_node **p = &kerninfo_root->rb_node;
+	struct rb_node **p = &self->rb_node;
 	struct rb_node *parent = NULL;
-	struct kernel_info *kerninfo;
-	struct kernel_info *default_kerninfo = NULL;
+	struct machine *machine;
+	struct machine *default_machine = NULL;
 
 	while (*p != NULL) {
 		parent = *p;
-		kerninfo = rb_entry(parent, struct kernel_info, rb_node);
-		if (pid < kerninfo->pid)
+		machine = rb_entry(parent, struct machine, rb_node);
+		if (pid < machine->pid)
 			p = &(*p)->rb_left;
-		else if (pid > kerninfo->pid)
+		else if (pid > machine->pid)
 			p = &(*p)->rb_right;
 		else
-			return kerninfo;
-		if (!kerninfo->pid)
-			default_kerninfo = kerninfo;
+			return machine;
+		if (!machine->pid)
+			default_machine = machine;
 	}
 
-	return default_kerninfo;
+	return default_machine;
 }
 
-struct kernel_info *kerninfo__findhost(struct rb_root *kerninfo_root)
+/*
+ * FIXME: Why repeatedly search for this?
+ */
+struct machine *machines__find_host(struct rb_root *self)
 {
-	struct rb_node **p = &kerninfo_root->rb_node;
+	struct rb_node **p = &self->rb_node;
 	struct rb_node *parent = NULL;
-	struct kernel_info *kerninfo;
+	struct machine *machine;
 	pid_t pid = HOST_KERNEL_ID;
 
 	while (*p != NULL) {
 		parent = *p;
-		kerninfo = rb_entry(parent, struct kernel_info, rb_node);
-		if (pid < kerninfo->pid)
+		machine = rb_entry(parent, struct machine, rb_node);
+		if (pid < machine->pid)
 			p = &(*p)->rb_left;
-		else if (pid > kerninfo->pid)
+		else if (pid > machine->pid)
 			p = &(*p)->rb_right;
 		else
-			return kerninfo;
+			return machine;
 	}
 
 	return NULL;
 }
 
-struct kernel_info *kerninfo__findnew(struct rb_root *kerninfo_root, pid_t pid)
+struct machine *machines__findnew(struct rb_root *self, pid_t pid)
 {
 	char path[PATH_MAX];
 	const char *root_dir;
-	int ret;
-	struct kernel_info *kerninfo = kerninfo__find(kerninfo_root, pid);
+	struct machine *machine = machines__find(self, pid);
 
-	if (!kerninfo || kerninfo->pid != pid) {
+	if (!machine || machine->pid != pid) {
 		if (pid == HOST_KERNEL_ID || pid == DEFAULT_GUEST_KERNEL_ID)
 			root_dir = "";
 		else {
 			if (!symbol_conf.guestmount)
 				goto out;
 			sprintf(path, "%s/%d", symbol_conf.guestmount, pid);
-			ret = access(path, R_OK);
-			if (ret) {
+			if (access(path, R_OK)) {
 				pr_err("Can't access file %s\n", path);
 				goto out;
 			}
 			root_dir = path;
 		}
-		kerninfo = add_new_kernel_info(kerninfo_root, pid, root_dir);
+		machine = machines__add(self, pid, root_dir);
 	}
 
 out:
-	return kerninfo;
+	return machine;
 }
 
-void kerninfo__process_allkernels(struct rb_root *kerninfo_root,
-		process_kernel_info process,
-		void *data)
+void machines__process(struct rb_root *self, machine__process_t process, void *data)
 {
 	struct rb_node *nd;
 
-	for (nd = rb_first(kerninfo_root); nd; nd = rb_next(nd)) {
-		struct kernel_info *pos = rb_entry(nd, struct kernel_info,
-							rb_node);
+	for (nd = rb_first(self); nd; nd = rb_next(nd)) {
+		struct machine *pos = rb_entry(nd, struct machine, rb_node);
 		process(pos, data);
 	}
 }
 
-char *kern_mmap_name(struct kernel_info *kerninfo, char *buff)
+char *machine__mmap_name(struct machine *self, char *buff)
 {
-	if (is_host_kernel(kerninfo))
+	if (machine__is_host(self))
 		sprintf(buff, "[%s]", "kernel.kallsyms");
-	else if (is_default_guest(kerninfo))
+	else if (machine__is_default_guest(self))
 		sprintf(buff, "[%s]", "guest.kernel.kallsyms");
 	else
-		sprintf(buff, "[%s.%d]", "guest.kernel.kallsyms", kerninfo->pid);
+		sprintf(buff, "[%s.%d]", "guest.kernel.kallsyms", self->pid);
 
 	return buff;
 }
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index 30d38d634e0..4c1c2da704b 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -5,6 +5,7 @@
 #include <linux/list.h>
 #include <linux/rbtree.h>
 #include <stdio.h>
+#include <stdbool.h>
 #include "types.h"
 
 enum map_type {
@@ -19,7 +20,7 @@ extern const char *map_type__name[MAP__NR_TYPES];
 struct dso;
 struct ref_reloc_sym;
 struct map_groups;
-struct kernel_info;
+struct machine;
 
 struct map {
 	union {
@@ -46,23 +47,23 @@ struct kmap {
 };
 
 struct map_groups {
-	struct rb_root		maps[MAP__NR_TYPES];
-	struct list_head	removed_maps[MAP__NR_TYPES];
-	struct kernel_info	*this_kerninfo;
+	struct rb_root	 maps[MAP__NR_TYPES];
+	struct list_head removed_maps[MAP__NR_TYPES];
+	struct machine	 *machine;
 };
 
-/* Native host kernel uses -1 as pid index in kernel_info */
+/* Native host kernel uses -1 as pid index in machine */
 #define	HOST_KERNEL_ID			(-1)
 #define	DEFAULT_GUEST_KERNEL_ID		(0)
 
-struct kernel_info {
-	struct rb_node rb_node;
-	pid_t pid;
-	char *root_dir;
-	struct list_head dsos__user;
-	struct list_head dsos__kernel;
+struct machine {
+	struct rb_node	  rb_node;
+	pid_t		  pid;
+	char		  *root_dir;
+	struct list_head  user_dsos;
+	struct list_head  kernel_dsos;
 	struct map_groups kmaps;
-	struct map *vmlinux_maps[MAP__NR_TYPES];
+	struct map	  *vmlinux_maps[MAP__NR_TYPES];
 };
 
 static inline struct kmap *map__kmap(struct map *self)
@@ -124,36 +125,30 @@ int map_groups__clone(struct map_groups *self,
 size_t map_groups__fprintf(struct map_groups *self, int verbose, FILE *fp);
 size_t map_groups__fprintf_maps(struct map_groups *self, int verbose, FILE *fp);
 
-struct kernel_info *add_new_kernel_info(struct rb_root *kerninfo_root,
-			pid_t pid, const char *root_dir);
-struct kernel_info *kerninfo__find(struct rb_root *kerninfo_root, pid_t pid);
-struct kernel_info *kerninfo__findnew(struct rb_root *kerninfo_root, pid_t pid);
-struct kernel_info *kerninfo__findhost(struct rb_root *kerninfo_root);
-char *kern_mmap_name(struct kernel_info *kerninfo, char *buff);
+typedef void (*machine__process_t)(struct machine *self, void *data);
+
+void machines__process(struct rb_root *self, machine__process_t process, void *data);
+struct machine *machines__add(struct rb_root *self, pid_t pid,
+			      const char *root_dir);
+struct machine *machines__find_host(struct rb_root *self);
+struct machine *machines__find(struct rb_root *self, pid_t pid);
+struct machine *machines__findnew(struct rb_root *self, pid_t pid);
+char *machine__mmap_name(struct machine *self, char *buff);
 
 /*
  * Default guest kernel is defined by parameter --guestkallsyms
  * and --guestmodules
  */
-static inline int is_default_guest(struct kernel_info *kerninfo)
+static inline bool machine__is_default_guest(struct machine *self)
 {
-	if (!kerninfo)
-		return 0;
-	return kerninfo->pid == DEFAULT_GUEST_KERNEL_ID;
+	return self ? self->pid == DEFAULT_GUEST_KERNEL_ID : false;
 }
 
-static inline int is_host_kernel(struct kernel_info *kerninfo)
+static inline bool machine__is_host(struct machine *self)
 {
-	if (!kerninfo)
-		return 0;
-	return kerninfo->pid == HOST_KERNEL_ID;
+	return self ? self->pid == HOST_KERNEL_ID : false;
 }
 
-typedef void (*process_kernel_info)(struct kernel_info *kerninfo, void *data);
-void kerninfo__process_allkernels(struct rb_root *kerninfo_root,
-				  process_kernel_info process,
-				  void *data);
-
 static inline void map_groups__insert(struct map_groups *self, struct map *map)
 {
 	maps__insert(&self->maps[map->type], map);
@@ -197,10 +192,8 @@ int map_groups__fixup_overlappings(struct map_groups *self, struct map *map,
 
 struct map *map_groups__find_by_name(struct map_groups *self,
 				     enum map_type type, const char *name);
-struct map *map_groups__new_module(struct map_groups *self,
-				    u64 start,
-				    const char *filename,
-				    struct kernel_info *kerninfo);
+struct map *map_groups__new_module(struct map_groups *self, u64 start,
+				   const char *filename, struct machine *machine);
 
 void map_groups__flush(struct map_groups *self);
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index b7aade2184b..b745c1c0b6c 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -70,7 +70,7 @@ void perf_session__update_sample_type(struct perf_session *self)
 int perf_session__create_kernel_maps(struct perf_session *self)
 {
 	int ret;
-	struct rb_root *root = &self->kerninfo_root;
+	struct rb_root *root = &self->machines;
 
 	ret = map_groups__create_kernel_maps(root, HOST_KERNEL_ID);
 	if (ret >= 0)
@@ -97,7 +97,7 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
 	self->cwd = NULL;
 	self->cwdlen = 0;
 	self->unknown_events = 0;
-	self->kerninfo_root = RB_ROOT;
+	self->machines = RB_ROOT;
 	self->ordered_samples.flush_limit = ULLONG_MAX;
 	INIT_LIST_HEAD(&self->ordered_samples.samples_head);
 
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 796e2291ebd..71252723a17 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -25,7 +25,7 @@ struct perf_session {
 	unsigned long		mmap_window;
 	struct rb_root		threads;
 	struct thread		*last_match;
-	struct rb_root		kerninfo_root;
+	struct rb_root		machines;
 	struct events_stats	events_stats;
 	struct rb_root		stats_by_id;
 	unsigned long		event_total[PERF_RECORD_MAX];
@@ -102,4 +102,29 @@ int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 			       u64 session_total, const char *helpline,
 			       const char *input_name);
 #endif
+
+static inline
+struct machine *perf_session__find_host_machine(struct perf_session *self)
+{
+	return machines__find_host(&self->machines);
+}
+
+static inline
+struct machine *perf_session__find_machine(struct perf_session *self, pid_t pid)
+{
+	return machines__find(&self->machines, pid);
+}
+
+static inline
+struct machine *perf_session__findnew_machine(struct perf_session *self, pid_t pid)
+{
+	return machines__findnew(&self->machines, pid);
+}
+
+static inline
+void perf_session__process_machines(struct perf_session *self,
+				    machine__process_t process)
+{
+	return machines__process(&self->machines, process, self);
+}
 #endif /* __PERF_SESSION_H */
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index e77c33a11de..dc046368b5c 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -485,7 +485,7 @@ static int dso__split_kallsyms(struct dso *self, struct map *map,
 			       symbol_filter_t filter)
 {
 	struct map_groups *kmaps = map__kmap(map)->kmaps;
-	struct kernel_info *kerninfo = kmaps->this_kerninfo;
+	struct machine *machine = kmaps->machine;
 	struct map *curr_map = map;
 	struct symbol *pos;
 	int count = 0;
@@ -508,8 +508,8 @@ static int dso__split_kallsyms(struct dso *self, struct map *map,
 
 			if (strcmp(curr_map->dso->short_name, module)) {
 				if (curr_map != map &&
-					self->kernel == DSO_TYPE_GUEST_KERNEL &&
-					is_default_guest(kerninfo)) {
+				    self->kernel == DSO_TYPE_GUEST_KERNEL &&
+				    machine__is_default_guest(machine)) {
 					/*
 					 * We assume all symbols of a module are
 					 * continuous in * kallsyms, so curr_map
@@ -527,13 +527,13 @@ static int dso__split_kallsyms(struct dso *self, struct map *map,
 					pr_err("%s/proc/{kallsyms,modules} "
 					         "inconsistency while looking "
 						 "for \"%s\" module!\n",
-						 kerninfo->root_dir, module);
+						 machine->root_dir, module);
 					curr_map = map;
 					goto discard_symbol;
 				}
 
 				if (curr_map->dso->loaded &&
-					!is_default_guest(kmaps->this_kerninfo))
+				    !machine__is_default_guest(machine))
 					goto discard_symbol;
 			}
 			/*
@@ -586,7 +586,7 @@ discard_symbol:		rb_erase(&pos->rb_node, root);
 
 	if (curr_map != map &&
 	    self->kernel == DSO_TYPE_GUEST_KERNEL &&
-	    is_default_guest(kmaps->this_kerninfo)) {
+	    machine__is_default_guest(kmaps->machine)) {
 		dso__set_loaded(curr_map->dso, curr_map->type);
 	}
 
@@ -1291,7 +1291,7 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter)
 	char build_id_hex[BUILD_ID_SIZE * 2 + 1];
 	int ret = -1;
 	int fd;
-	struct kernel_info *kerninfo;
+	struct machine *machine;
 	const char *root_dir;
 
 	dso__set_loaded(self, map->type);
@@ -1301,10 +1301,10 @@ int dso__load(struct dso *self, struct map *map, symbol_filter_t filter)
 	else if (self->kernel == DSO_TYPE_GUEST_KERNEL)
 		return dso__load_guest_kernel_sym(self, map, filter);
 
-	if (map->groups && map->groups->this_kerninfo)
-		kerninfo = map->groups->this_kerninfo;
+	if (map->groups && map->groups->machine)
+		machine = map->groups->machine;
 	else
-		kerninfo = NULL;
+		machine = NULL;
 
 	name = malloc(size);
 	if (!name)
@@ -1359,8 +1359,8 @@ more:
 			snprintf(name, size, "%s", self->long_name);
 			break;
 		case DSO__ORIG_GUEST_KMODULE:
-			if (map->groups && map->groups->this_kerninfo)
-				root_dir = map->groups->this_kerninfo->root_dir;
+			if (map->groups && map->groups->machine)
+				root_dir = map->groups->machine->root_dir;
 			else
 				root_dir = "";
 			snprintf(name, size, "%s%s", root_dir, self->long_name);
@@ -1566,12 +1566,12 @@ static struct map *map__new2(u64 start, struct dso *dso, enum map_type type)
 
 struct map *map_groups__new_module(struct map_groups *self, u64 start,
 				const char *filename,
-				struct kernel_info *kerninfo)
+				struct machine *machine)
 {
 	struct map *map;
 	struct dso *dso;
 
-	dso = __dsos__findnew(&kerninfo->dsos__kernel, filename);
+	dso = __dsos__findnew(&machine->kernel_dsos, filename);
 	if (dso == NULL)
 		return NULL;
 
@@ -1579,7 +1579,7 @@ struct map *map_groups__new_module(struct map_groups *self, u64 start,
 	if (map == NULL)
 		return NULL;
 
-	if (is_host_kernel(kerninfo))
+	if (machine__is_host(machine))
 		dso->origin = DSO__ORIG_KMODULE;
 	else
 		dso->origin = DSO__ORIG_GUEST_KMODULE;
@@ -1587,7 +1587,7 @@ struct map *map_groups__new_module(struct map_groups *self, u64 start,
 	return map;
 }
 
-static int map_groups__create_modules(struct kernel_info *kerninfo)
+static int map_groups__create_modules(struct machine *machine)
 {
 	char *line = NULL;
 	size_t n;
@@ -1597,10 +1597,10 @@ static int map_groups__create_modules(struct kernel_info *kerninfo)
 	const char *modules;
 	char path[PATH_MAX];
 
-	if (is_default_guest(kerninfo))
+	if (machine__is_default_guest(machine))
 		modules = symbol_conf.default_guest_modules;
 	else {
-		sprintf(path, "%s/proc/modules", kerninfo->root_dir);
+		sprintf(path, "%s/proc/modules", machine->root_dir);
 		modules = path;
 	}
 
@@ -1608,7 +1608,7 @@ static int map_groups__create_modules(struct kernel_info *kerninfo)
 	if (file == NULL)
 		return -1;
 
-	root_dir = kerninfo->root_dir;
+	root_dir = machine->root_dir;
 
 	while (!feof(file)) {
 		char name[PATH_MAX];
@@ -1638,8 +1638,8 @@ static int map_groups__create_modules(struct kernel_info *kerninfo)
 		*sep = '\0';
 
 		snprintf(name, sizeof(name), "[%s]", line);
-		map = map_groups__new_module(&kerninfo->kmaps,
-				start, name, kerninfo);
+		map = map_groups__new_module(&machine->kmaps, start,
+					     name, machine);
 		if (map == NULL)
 			goto out_delete_line;
 		dso__kernel_module_get_build_id(map->dso, root_dir);
@@ -1648,7 +1648,7 @@ static int map_groups__create_modules(struct kernel_info *kerninfo)
 	free(line);
 	fclose(file);
 
-	return map_groups__set_modules_path(&kerninfo->kmaps, root_dir);
+	return map_groups__set_modules_path(&machine->kmaps, root_dir);
 
 out_delete_line:
 	free(line);
@@ -1820,16 +1820,16 @@ static int dso__load_guest_kernel_sym(struct dso *self, struct map *map,
 {
 	int err;
 	const char *kallsyms_filename = NULL;
-	struct kernel_info *kerninfo;
+	struct machine *machine;
 	char path[PATH_MAX];
 
 	if (!map->groups) {
 		pr_debug("Guest kernel map hasn't the point to groups\n");
 		return -1;
 	}
-	kerninfo = map->groups->this_kerninfo;
+	machine = map->groups->machine;
 
-	if (is_default_guest(kerninfo)) {
+	if (machine__is_default_guest(machine)) {
 		/*
 		 * if the user specified a vmlinux filename, use it and only
 		 * it, reporting errors to the user if it cannot be used.
@@ -1845,7 +1845,7 @@ static int dso__load_guest_kernel_sym(struct dso *self, struct map *map,
 		if (!kallsyms_filename)
 			return -1;
 	} else {
-		sprintf(path, "%s/proc/kallsyms", kerninfo->root_dir);
+		sprintf(path, "%s/proc/kallsyms", machine->root_dir);
 		kallsyms_filename = path;
 	}
 
@@ -1856,9 +1856,8 @@ static int dso__load_guest_kernel_sym(struct dso *self, struct map *map,
 out_try_fixup:
 	if (err > 0) {
 		if (kallsyms_filename != NULL) {
-			kern_mmap_name(kerninfo, path);
-			dso__set_long_name(self,
-				strdup(path));
+			machine__mmap_name(machine, path);
+			dso__set_long_name(self, strdup(path));
 		}
 		map__fixup_start(map);
 		map__fixup_end(map);
@@ -1908,15 +1907,14 @@ static void __dsos__fprintf(struct list_head *head, FILE *fp)
 	}
 }
 
-void dsos__fprintf(struct rb_root *kerninfo_root, FILE *fp)
+void dsos__fprintf(struct rb_root *machines, FILE *fp)
 {
 	struct rb_node *nd;
 
-	for (nd = rb_first(kerninfo_root); nd; nd = rb_next(nd)) {
-		struct kernel_info *pos = rb_entry(nd, struct kernel_info,
-				rb_node);
-		__dsos__fprintf(&pos->dsos__kernel, fp);
-		__dsos__fprintf(&pos->dsos__user, fp);
+	for (nd = rb_first(machines); nd; nd = rb_next(nd)) {
+		struct machine *pos = rb_entry(nd, struct machine, rb_node);
+		__dsos__fprintf(&pos->kernel_dsos, fp);
+		__dsos__fprintf(&pos->user_dsos, fp);
 	}
 }
 
@@ -1935,19 +1933,15 @@ static size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp,
 	return ret;
 }
 
-size_t dsos__fprintf_buildid(struct rb_root *kerninfo_root,
-		FILE *fp, bool with_hits)
+size_t dsos__fprintf_buildid(struct rb_root *machines, FILE *fp, bool with_hits)
 {
 	struct rb_node *nd;
 	size_t ret = 0;
 
-	for (nd = rb_first(kerninfo_root); nd; nd = rb_next(nd)) {
-		struct kernel_info *pos = rb_entry(nd, struct kernel_info,
-				rb_node);
-		ret += __dsos__fprintf_buildid(&pos->dsos__kernel,
-					fp, with_hits);
-		ret += __dsos__fprintf_buildid(&pos->dsos__user,
-					fp, with_hits);
+	for (nd = rb_first(machines); nd; nd = rb_next(nd)) {
+		struct machine *pos = rb_entry(nd, struct machine, rb_node);
+		ret += __dsos__fprintf_buildid(&pos->kernel_dsos, fp, with_hits);
+		ret += __dsos__fprintf_buildid(&pos->user_dsos, fp, with_hits);
 	}
 	return ret;
 }
@@ -1964,14 +1958,12 @@ struct dso *dso__new_kernel(const char *name)
 	return self;
 }
 
-static struct dso *dso__new_guest_kernel(struct kernel_info *kerninfo,
+static struct dso *dso__new_guest_kernel(struct machine *machine,
 					const char *name)
 {
 	char buff[PATH_MAX];
-	struct dso *self;
+	struct dso *self = dso__new(name ?: machine__mmap_name(machine, buff));
 
-	kern_mmap_name(kerninfo, buff);
-	self = dso__new(name ?: buff);
 	if (self != NULL) {
 		dso__set_short_name(self, "[guest.kernel]");
 		self->kernel = DSO_TYPE_GUEST_KERNEL;
@@ -1980,36 +1972,35 @@ static struct dso *dso__new_guest_kernel(struct kernel_info *kerninfo,
 	return self;
 }
 
-void dso__read_running_kernel_build_id(struct dso *self,
-			struct kernel_info *kerninfo)
+void dso__read_running_kernel_build_id(struct dso *self, struct machine *machine)
 {
 	char path[PATH_MAX];
 
-	if (is_default_guest(kerninfo))
+	if (machine__is_default_guest(machine))
 		return;
-	sprintf(path, "%s/sys/kernel/notes", kerninfo->root_dir);
+	sprintf(path, "%s/sys/kernel/notes", machine->root_dir);
 	if (sysfs__read_build_id(path, self->build_id,
 				 sizeof(self->build_id)) == 0)
 		self->has_build_id = true;
 }
 
-static struct dso *dsos__create_kernel(struct kernel_info *kerninfo)
+static struct dso *dsos__create_kernel(struct machine *machine)
 {
 	const char *vmlinux_name = NULL;
 	struct dso *kernel;
 
-	if (is_host_kernel(kerninfo)) {
+	if (machine__is_host(machine)) {
 		vmlinux_name = symbol_conf.vmlinux_name;
 		kernel = dso__new_kernel(vmlinux_name);
 	} else {
-		if (is_default_guest(kerninfo))
+		if (machine__is_default_guest(machine))
 			vmlinux_name = symbol_conf.default_guest_vmlinux_name;
-		kernel = dso__new_guest_kernel(kerninfo, vmlinux_name);
+		kernel = dso__new_guest_kernel(machine, vmlinux_name);
 	}
 
 	if (kernel != NULL) {
-		dso__read_running_kernel_build_id(kernel, kerninfo);
-		dsos__add(&kerninfo->dsos__kernel, kernel);
+		dso__read_running_kernel_build_id(kernel, machine);
+		dsos__add(&machine->kernel_dsos, kernel);
 	}
 	return kernel;
 }
@@ -2154,29 +2145,28 @@ out_free_comm_list:
 	return -1;
 }
 
-int map_groups__create_kernel_maps(struct rb_root *kerninfo_root, pid_t pid)
+int map_groups__create_kernel_maps(struct rb_root *machines, pid_t pid)
 {
-	struct kernel_info *kerninfo;
 	struct dso *kernel;
+	struct machine *machine = machines__findnew(machines, pid);
 
-	kerninfo = kerninfo__findnew(kerninfo_root, pid);
-	if (kerninfo == NULL)
+	if (machine == NULL)
 		return -1;
-	kernel = dsos__create_kernel(kerninfo);
+	kernel = dsos__create_kernel(machine);
 	if (kernel == NULL)
 		return -1;
 
-	if (__map_groups__create_kernel_maps(&kerninfo->kmaps,
-			kerninfo->vmlinux_maps, kernel) < 0)
+	if (__map_groups__create_kernel_maps(&machine->kmaps,
+					     machine->vmlinux_maps, kernel) < 0)
 		return -1;
 
 	if (symbol_conf.use_modules &&
-		map_groups__create_modules(kerninfo) < 0)
+	    map_groups__create_modules(machine) < 0)
 		pr_debug("Problems creating module maps, continuing anyway...\n");
 	/*
 	 * Now that we have all the maps created, just set the ->end of them:
 	 */
-	map_groups__fixup_end(&kerninfo->kmaps);
+	map_groups__fixup_end(&machine->kmaps);
 	return 0;
 }
 
@@ -2223,7 +2213,7 @@ char *strxfrchar(char *s, char from, char to)
 	return s;
 }
 
-int map_groups__create_guest_kernel_maps(struct rb_root *kerninfo_root)
+int map_groups__create_guest_kernel_maps(struct rb_root *machines)
 {
 	int ret = 0;
 	struct dirent **namelist = NULL;
@@ -2234,8 +2224,7 @@ int map_groups__create_guest_kernel_maps(struct rb_root *kerninfo_root)
 	if (symbol_conf.default_guest_vmlinux_name ||
 	    symbol_conf.default_guest_modules ||
 	    symbol_conf.default_guest_kallsyms) {
-		map_groups__create_kernel_maps(kerninfo_root,
-					DEFAULT_GUEST_KERNEL_ID);
+		map_groups__create_kernel_maps(machines, DEFAULT_GUEST_KERNEL_ID);
 	}
 
 	if (symbol_conf.guestmount) {
@@ -2256,8 +2245,7 @@ int map_groups__create_guest_kernel_maps(struct rb_root *kerninfo_root)
 				pr_debug("Can't access file %s\n", path);
 				goto failure;
 			}
-			map_groups__create_kernel_maps(kerninfo_root,
-							pid);
+			map_groups__create_kernel_maps(machines, pid);
 		}
 failure:
 		free(namelist);
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 478f5ab3778..ed885b06a02 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -186,8 +186,7 @@ enum dso_origin {
 char dso__symtab_origin(const struct dso *self);
 void dso__set_long_name(struct dso *self, char *name);
 void dso__set_build_id(struct dso *self, void *build_id);
-void dso__read_running_kernel_build_id(struct dso *self,
-		struct kernel_info *kerninfo);
+void dso__read_running_kernel_build_id(struct dso *self, struct machine *machine);
 struct symbol *dso__find_symbol(struct dso *self, enum map_type type, u64 addr);
 struct symbol *dso__find_symbol_by_name(struct dso *self, enum map_type type,
 					const char *name);
-- 
cgit v1.2.3


From 48ea8f5470aa6f35244d1b218316705ea88c0259 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Apr 2010 21:19:05 -0300
Subject: perf machine: Pass buffer size to machine__mmap_name
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Don't blindly assume that the size of the buffer is enough, use
snprintf.

Cc: Avi Kivity <avi@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Zhang, Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/event.c  |  4 ++--
 tools/perf/util/map.c    | 10 +++++-----
 tools/perf/util/map.h    |  2 +-
 tools/perf/util/symbol.c |  6 +++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 2f33ca9899b..7400e5147e1 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -288,7 +288,7 @@ int event__synthesize_kernel_mmap(event__handler_t process,
 	 */
 	struct process_symbol_args args = { .name = symbol_name, };
 
-	mmap_name = machine__mmap_name(machine, name_buff);
+	mmap_name = machine__mmap_name(machine, name_buff, sizeof(name_buff));
 	if (machine__is_host(machine)) {
 		/*
 		 * kernel uses PERF_RECORD_MISC_USER for user space maps,
@@ -399,7 +399,7 @@ static int event__process_kernel_mmap(event_t *self,
 		goto out_problem;
 	}
 
-	machine__mmap_name(machine, kmmap_prefix);
+	machine__mmap_name(machine, kmmap_prefix, sizeof(kmmap_prefix));
 	if (machine__is_host(machine))
 		kernel_type = DSO_TYPE_KERNEL;
 	else
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index da3d4e82623..ee25ee91504 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -629,14 +629,14 @@ void machines__process(struct rb_root *self, machine__process_t process, void *d
 	}
 }
 
-char *machine__mmap_name(struct machine *self, char *buff)
+char *machine__mmap_name(struct machine *self, char *bf, size_t size)
 {
 	if (machine__is_host(self))
-		sprintf(buff, "[%s]", "kernel.kallsyms");
+		snprintf(bf, size, "[%s]", "kernel.kallsyms");
 	else if (machine__is_default_guest(self))
-		sprintf(buff, "[%s]", "guest.kernel.kallsyms");
+		snprintf(bf, size, "[%s]", "guest.kernel.kallsyms");
 	else
-		sprintf(buff, "[%s.%d]", "guest.kernel.kallsyms", self->pid);
+		snprintf(bf, size, "[%s.%d]", "guest.kernel.kallsyms", self->pid);
 
-	return buff;
+	return bf;
 }
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index 4c1c2da704b..6fabad1fd02 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -133,7 +133,7 @@ struct machine *machines__add(struct rb_root *self, pid_t pid,
 struct machine *machines__find_host(struct rb_root *self);
 struct machine *machines__find(struct rb_root *self, pid_t pid);
 struct machine *machines__findnew(struct rb_root *self, pid_t pid);
-char *machine__mmap_name(struct machine *self, char *buff);
+char *machine__mmap_name(struct machine *self, char *bf, size_t size);
 
 /*
  * Default guest kernel is defined by parameter --guestkallsyms
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index dc046368b5c..c9c0bdd667a 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -1856,7 +1856,7 @@ static int dso__load_guest_kernel_sym(struct dso *self, struct map *map,
 out_try_fixup:
 	if (err > 0) {
 		if (kallsyms_filename != NULL) {
-			machine__mmap_name(machine, path);
+			machine__mmap_name(machine, path, sizeof(path));
 			dso__set_long_name(self, strdup(path));
 		}
 		map__fixup_start(map);
@@ -1961,8 +1961,8 @@ struct dso *dso__new_kernel(const char *name)
 static struct dso *dso__new_guest_kernel(struct machine *machine,
 					const char *name)
 {
-	char buff[PATH_MAX];
-	struct dso *self = dso__new(name ?: machine__mmap_name(machine, buff));
+	char bf[PATH_MAX];
+	struct dso *self = dso__new(name ?: machine__mmap_name(machine, bf, sizeof(bf)));
 
 	if (self != NULL) {
 		dso__set_short_name(self, "[guest.kernel]");
-- 
cgit v1.2.3


From d28c62232e50eab202bcd3f19b5c7a25b8b900b6 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Apr 2010 21:20:43 -0300
Subject: perf machine: Adopt some map_groups functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Those functions operated on members now grouped in 'struct machine', so
move those methods to this new class.

The changes made to 'perf probe' shows that using this abstraction
inserting probes on guests almost got supported for free.

Cc: Avi Kivity <avi@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Zhang, Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-kmem.c     |  4 +--
 tools/perf/util/event.c       |  9 ++----
 tools/perf/util/map.c         | 24 ++++++++++-----
 tools/perf/util/map.h         | 12 ++++----
 tools/perf/util/probe-event.c | 20 ++++++------
 tools/perf/util/session.c     |  7 ++---
 tools/perf/util/symbol.c      | 71 ++++++++++++++++++-------------------------
 tools/perf/util/symbol.h      |  8 ++---
 8 files changed, 74 insertions(+), 81 deletions(-)

diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 20674759464..15635612e59 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -370,7 +370,6 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 		struct alloc_stat *data = rb_entry(next, struct alloc_stat,
 						   node);
 		struct symbol *sym = NULL;
-		struct map_groups *kmaps = &machine->kmaps;
 		struct map *map;
 		char buf[BUFSIZ];
 		u64 addr;
@@ -378,8 +377,7 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 		if (is_caller) {
 			addr = data->call_site;
 			if (!raw_ip)
-				sym = map_groups__find_function(kmaps, addr,
-								&map, NULL);
+				sym = machine__find_function(machine, addr, &map, NULL);
 		} else
 			addr = data->ptr;
 
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 7400e5147e1..1757b0ffeaa 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -429,9 +429,8 @@ static int event__process_kernel_mmap(event_t *self,
 		} else
 			strcpy(short_module_name, self->mmap.filename);
 
-		map = map_groups__new_module(&machine->kmaps,
-					     self->mmap.start,
-					     self->mmap.filename, machine);
+		map = machine__new_module(machine, self->mmap.start,
+					  self->mmap.filename);
 		if (map == NULL)
 			goto out_problem;
 
@@ -454,9 +453,7 @@ static int event__process_kernel_mmap(event_t *self,
 			goto out_problem;
 
 		kernel->kernel = kernel_type;
-		if (__map_groups__create_kernel_maps(&machine->kmaps,
-						     machine->vmlinux_maps,
-						     kernel) < 0)
+		if (__machine__create_kernel_maps(machine, kernel) < 0)
 			goto out_problem;
 
 		event_set_kernel_mmap_len(machine->vmlinux_maps, self);
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index ee25ee91504..44a4df68b3c 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -513,6 +513,19 @@ struct map *maps__find(struct rb_root *maps, u64 ip)
 	return NULL;
 }
 
+int machine__init(struct machine *self, const char *root_dir, pid_t pid)
+{
+	map_groups__init(&self->kmaps);
+	RB_CLEAR_NODE(&self->rb_node);
+	INIT_LIST_HEAD(&self->user_dsos);
+	INIT_LIST_HEAD(&self->kernel_dsos);
+
+	self->kmaps.machine = self;
+	self->pid	    = pid;
+	self->root_dir      = strdup(root_dir);
+	return self->root_dir == NULL ? -ENOMEM : 0;
+}
+
 struct machine *machines__add(struct rb_root *self, pid_t pid,
 			      const char *root_dir)
 {
@@ -523,13 +536,10 @@ struct machine *machines__add(struct rb_root *self, pid_t pid,
 	if (!machine)
 		return NULL;
 
-	machine->pid = pid;
-	map_groups__init(&machine->kmaps);
-	machine->root_dir = strdup(root_dir);
-	RB_CLEAR_NODE(&machine->rb_node);
-	INIT_LIST_HEAD(&machine->user_dsos);
-	INIT_LIST_HEAD(&machine->kernel_dsos);
-	machine->kmaps.machine = machine;
+	if (machine__init(machine, root_dir, pid) != 0) {
+		free(machine);
+		return NULL;
+	}
 
 	while (*p != NULL) {
 		parent = *p;
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index 6fabad1fd02..881dba4f820 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -134,6 +134,7 @@ struct machine *machines__find_host(struct rb_root *self);
 struct machine *machines__find(struct rb_root *self, pid_t pid);
 struct machine *machines__findnew(struct rb_root *self, pid_t pid);
 char *machine__mmap_name(struct machine *self, char *bf, size_t size);
+int machine__init(struct machine *self, const char *root_dir, pid_t pid);
 
 /*
  * Default guest kernel is defined by parameter --guestkallsyms
@@ -172,11 +173,11 @@ struct symbol *map_groups__find_symbol_by_name(struct map_groups *self,
 					       struct map **mapp,
 					       symbol_filter_t filter);
 
-static inline
-struct symbol *map_groups__find_function(struct map_groups *self, u64 addr,
-					 struct map **mapp, symbol_filter_t filter)
+static inline struct symbol *machine__find_function(struct machine *self,
+						    u64 addr, struct map **mapp,
+						    symbol_filter_t filter)
 {
-	return map_groups__find_symbol(self, MAP__FUNCTION, addr, mapp, filter);
+	return map_groups__find_symbol(&self->kmaps, MAP__FUNCTION, addr, mapp, filter);
 }
 
 static inline
@@ -192,8 +193,7 @@ int map_groups__fixup_overlappings(struct map_groups *self, struct map *map,
 
 struct map *map_groups__find_by_name(struct map_groups *self,
 				     enum map_type type, const char *name);
-struct map *map_groups__new_module(struct map_groups *self, u64 start,
-				   const char *filename, struct machine *machine);
+struct map *machine__new_module(struct machine *self, u64 start, const char *filename);
 
 void map_groups__flush(struct map_groups *self);
 
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 9ded38ced23..914c67095d9 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -72,8 +72,7 @@ static int e_snprintf(char *str, size_t size, const char *format, ...)
 }
 
 static char *synthesize_perf_probe_point(struct perf_probe_point *pp);
-static struct map_groups kmap_groups;
-static struct map *kmaps[MAP__NR_TYPES];
+static struct machine machine;
 
 /* Initialize symbol maps and path of vmlinux */
 static int init_vmlinux(void)
@@ -92,12 +91,15 @@ static int init_vmlinux(void)
 		goto out;
 	}
 
+	ret = machine__init(&machine, "/", 0);
+	if (ret < 0)
+		goto out;
+
 	kernel = dso__new_kernel(symbol_conf.vmlinux_name);
 	if (kernel == NULL)
 		die("Failed to create kernel dso.");
 
-	map_groups__init(&kmap_groups);
-	ret = __map_groups__create_kernel_maps(&kmap_groups, kmaps, kernel);
+	ret = __machine__create_kernel_maps(&machine, kernel);
 	if (ret < 0)
 		pr_debug("Failed to create kernel maps.\n");
 
@@ -110,12 +112,12 @@ out:
 #ifdef DWARF_SUPPORT
 static int open_vmlinux(void)
 {
-	if (map__load(kmaps[MAP__FUNCTION], NULL) < 0) {
+	if (map__load(machine.vmlinux_maps[MAP__FUNCTION], NULL) < 0) {
 		pr_debug("Failed to load kernel map.\n");
 		return -EINVAL;
 	}
-	pr_debug("Try to open %s\n", kmaps[MAP__FUNCTION]->dso->long_name);
-	return open(kmaps[MAP__FUNCTION]->dso->long_name, O_RDONLY);
+	pr_debug("Try to open %s\n", machine.vmlinux_maps[MAP__FUNCTION]->dso->long_name);
+	return open(machine.vmlinux_maps[MAP__FUNCTION]->dso->long_name, O_RDONLY);
 }
 
 /* Convert trace point to probe point with debuginfo */
@@ -125,7 +127,7 @@ static int convert_to_perf_probe_point(struct kprobe_trace_point *tp,
 	struct symbol *sym;
 	int fd, ret = -ENOENT;
 
-	sym = map__find_symbol_by_name(kmaps[MAP__FUNCTION],
+	sym = map__find_symbol_by_name(machine.vmlinux_maps[MAP__FUNCTION],
 				       tp->symbol, NULL);
 	if (sym) {
 		fd = open_vmlinux();
@@ -1466,7 +1468,7 @@ static int convert_to_kprobe_trace_events(struct perf_probe_event *pev,
 	}
 
 	/* Currently just checking function name from symbol map */
-	sym = map__find_symbol_by_name(kmaps[MAP__FUNCTION],
+	sym = map__find_symbol_by_name(machine.vmlinux_maps[MAP__FUNCTION],
 				       tev->point.symbol, NULL);
 	if (!sym) {
 		pr_warning("Kernel symbol \'%s\' not found.\n",
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index b745c1c0b6c..a8dd73ed158 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -69,12 +69,11 @@ void perf_session__update_sample_type(struct perf_session *self)
 
 int perf_session__create_kernel_maps(struct perf_session *self)
 {
-	int ret;
-	struct rb_root *root = &self->machines;
+	struct rb_root *machines = &self->machines;
+	int ret = machines__create_kernel_maps(machines, HOST_KERNEL_ID);
 
-	ret = map_groups__create_kernel_maps(root, HOST_KERNEL_ID);
 	if (ret >= 0)
-		ret = map_groups__create_guest_kernel_maps(root);
+		ret = machines__create_guest_kernel_maps(machines);
 	return ret;
 }
 
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index c9c0bdd667a..12359c37240 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -1528,21 +1528,20 @@ static char *get_kernel_version(const char *root_dir)
 	return strdup(name);
 }
 
-static int map_groups__set_modules_path(struct map_groups *self,
-				const char *root_dir)
+static int machine__set_modules_path(struct machine *self)
 {
 	char *version;
 	char modules_path[PATH_MAX];
 
-	version = get_kernel_version(root_dir);
+	version = get_kernel_version(self->root_dir);
 	if (!version)
 		return -1;
 
 	snprintf(modules_path, sizeof(modules_path), "%s/lib/modules/%s/kernel",
-		 root_dir, version);
+		 self->root_dir, version);
 	free(version);
 
-	return map_groups__set_modules_path_dir(self, modules_path);
+	return map_groups__set_modules_path_dir(&self->kmaps, modules_path);
 }
 
 /*
@@ -1564,14 +1563,12 @@ static struct map *map__new2(u64 start, struct dso *dso, enum map_type type)
 	return self;
 }
 
-struct map *map_groups__new_module(struct map_groups *self, u64 start,
-				const char *filename,
-				struct machine *machine)
+struct map *machine__new_module(struct machine *self, u64 start,
+				const char *filename)
 {
 	struct map *map;
-	struct dso *dso;
+	struct dso *dso = __dsos__findnew(&self->kernel_dsos, filename);
 
-	dso = __dsos__findnew(&machine->kernel_dsos, filename);
 	if (dso == NULL)
 		return NULL;
 
@@ -1579,28 +1576,27 @@ struct map *map_groups__new_module(struct map_groups *self, u64 start,
 	if (map == NULL)
 		return NULL;
 
-	if (machine__is_host(machine))
+	if (machine__is_host(self))
 		dso->origin = DSO__ORIG_KMODULE;
 	else
 		dso->origin = DSO__ORIG_GUEST_KMODULE;
-	map_groups__insert(self, map);
+	map_groups__insert(&self->kmaps, map);
 	return map;
 }
 
-static int map_groups__create_modules(struct machine *machine)
+static int machine__create_modules(struct machine *self)
 {
 	char *line = NULL;
 	size_t n;
 	FILE *file;
 	struct map *map;
-	const char *root_dir;
 	const char *modules;
 	char path[PATH_MAX];
 
-	if (machine__is_default_guest(machine))
+	if (machine__is_default_guest(self))
 		modules = symbol_conf.default_guest_modules;
 	else {
-		sprintf(path, "%s/proc/modules", machine->root_dir);
+		sprintf(path, "%s/proc/modules", self->root_dir);
 		modules = path;
 	}
 
@@ -1608,8 +1604,6 @@ static int map_groups__create_modules(struct machine *machine)
 	if (file == NULL)
 		return -1;
 
-	root_dir = machine->root_dir;
-
 	while (!feof(file)) {
 		char name[PATH_MAX];
 		u64 start;
@@ -1638,17 +1632,16 @@ static int map_groups__create_modules(struct machine *machine)
 		*sep = '\0';
 
 		snprintf(name, sizeof(name), "[%s]", line);
-		map = map_groups__new_module(&machine->kmaps, start,
-					     name, machine);
+		map = machine__new_module(self, start, name);
 		if (map == NULL)
 			goto out_delete_line;
-		dso__kernel_module_get_build_id(map->dso, root_dir);
+		dso__kernel_module_get_build_id(map->dso, self->root_dir);
 	}
 
 	free(line);
 	fclose(file);
 
-	return map_groups__set_modules_path(&machine->kmaps, root_dir);
+	return machine__set_modules_path(self);
 
 out_delete_line:
 	free(line);
@@ -2005,25 +1998,23 @@ static struct dso *dsos__create_kernel(struct machine *machine)
 	return kernel;
 }
 
-int __map_groups__create_kernel_maps(struct map_groups *self,
-				     struct map *vmlinux_maps[MAP__NR_TYPES],
-				     struct dso *kernel)
+int __machine__create_kernel_maps(struct machine *self, struct dso *kernel)
 {
 	enum map_type type;
 
 	for (type = 0; type < MAP__NR_TYPES; ++type) {
 		struct kmap *kmap;
 
-		vmlinux_maps[type] = map__new2(0, kernel, type);
-		if (vmlinux_maps[type] == NULL)
+		self->vmlinux_maps[type] = map__new2(0, kernel, type);
+		if (self->vmlinux_maps[type] == NULL)
 			return -1;
 
-		vmlinux_maps[type]->map_ip =
-			vmlinux_maps[type]->unmap_ip = identity__map_ip;
+		self->vmlinux_maps[type]->map_ip =
+			self->vmlinux_maps[type]->unmap_ip = identity__map_ip;
 
-		kmap = map__kmap(vmlinux_maps[type]);
-		kmap->kmaps = self;
-		map_groups__insert(self, vmlinux_maps[type]);
+		kmap = map__kmap(self->vmlinux_maps[type]);
+		kmap->kmaps = &self->kmaps;
+		map_groups__insert(&self->kmaps, self->vmlinux_maps[type]);
 	}
 
 	return 0;
@@ -2145,10 +2136,10 @@ out_free_comm_list:
 	return -1;
 }
 
-int map_groups__create_kernel_maps(struct rb_root *machines, pid_t pid)
+int machines__create_kernel_maps(struct rb_root *self, pid_t pid)
 {
 	struct dso *kernel;
-	struct machine *machine = machines__findnew(machines, pid);
+	struct machine *machine = machines__findnew(self, pid);
 
 	if (machine == NULL)
 		return -1;
@@ -2156,12 +2147,10 @@ int map_groups__create_kernel_maps(struct rb_root *machines, pid_t pid)
 	if (kernel == NULL)
 		return -1;
 
-	if (__map_groups__create_kernel_maps(&machine->kmaps,
-					     machine->vmlinux_maps, kernel) < 0)
+	if (__machine__create_kernel_maps(machine, kernel) < 0)
 		return -1;
 
-	if (symbol_conf.use_modules &&
-	    map_groups__create_modules(machine) < 0)
+	if (symbol_conf.use_modules && machine__create_modules(machine) < 0)
 		pr_debug("Problems creating module maps, continuing anyway...\n");
 	/*
 	 * Now that we have all the maps created, just set the ->end of them:
@@ -2213,7 +2202,7 @@ char *strxfrchar(char *s, char from, char to)
 	return s;
 }
 
-int map_groups__create_guest_kernel_maps(struct rb_root *machines)
+int machines__create_guest_kernel_maps(struct rb_root *self)
 {
 	int ret = 0;
 	struct dirent **namelist = NULL;
@@ -2224,7 +2213,7 @@ int map_groups__create_guest_kernel_maps(struct rb_root *machines)
 	if (symbol_conf.default_guest_vmlinux_name ||
 	    symbol_conf.default_guest_modules ||
 	    symbol_conf.default_guest_kallsyms) {
-		map_groups__create_kernel_maps(machines, DEFAULT_GUEST_KERNEL_ID);
+		machines__create_kernel_maps(self, DEFAULT_GUEST_KERNEL_ID);
 	}
 
 	if (symbol_conf.guestmount) {
@@ -2245,7 +2234,7 @@ int map_groups__create_guest_kernel_maps(struct rb_root *machines)
 				pr_debug("Can't access file %s\n", path);
 				goto failure;
 			}
-			map_groups__create_kernel_maps(machines, pid);
+			machines__create_kernel_maps(self, pid);
 		}
 failure:
 		free(namelist);
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index ed885b06a02..37b717b861c 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -199,11 +199,9 @@ int kallsyms__parse(const char *filename, void *arg,
 		    int (*process_symbol)(void *arg, const char *name,
 					  char type, u64 start));
 
-int __map_groups__create_kernel_maps(struct map_groups *self,
-			struct map *vmlinux_maps[MAP__NR_TYPES],
-			struct dso *kernel);
-int map_groups__create_kernel_maps(struct rb_root *kerninfo_root, pid_t pid);
-int map_groups__create_guest_kernel_maps(struct rb_root *kerninfo_root);
+int __machine__create_kernel_maps(struct machine *self, struct dso *kernel);
+int machines__create_kernel_maps(struct rb_root *self, pid_t pid);
+int machines__create_guest_kernel_maps(struct rb_root *self);
 
 int symbol__init(void);
 bool symbol_type__is_a(char symbol_type, enum map_type map_type);
-- 
cgit v1.2.3


From cbf6968098f89d3216d074f06544b5032b344da4 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Apr 2010 21:22:44 -0300
Subject: perf machines: Make the machines class adopt the dsos__fprintf
 methods
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now those methods don't operate on a global list of dsos, but on lists
of machines, so make this clear by renaming the functions.

Cc: Avi Kivity <avi@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Zhang, Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-annotate.c     |  2 +-
 tools/perf/builtin-buildid-list.c |  2 +-
 tools/perf/builtin-report.c       |  2 +-
 tools/perf/builtin-top.c          |  2 +-
 tools/perf/util/session.h         | 13 +++++++++++++
 tools/perf/util/symbol.c          | 22 ++++++++++++++--------
 tools/perf/util/symbol.h          |  5 ++---
 7 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 986b2efcef2..b57dbcf62af 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -571,7 +571,7 @@ static int __cmd_annotate(void)
 		perf_session__fprintf(session, stdout);
 
 	if (verbose > 2)
-		dsos__fprintf(&session->machines, stdout);
+		perf_session__fprintf_dsos(session, stdout);
 
 	perf_session__collapse_resort(&session->hists);
 	perf_session__output_resort(&session->hists, session->event_total[0]);
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index b4a265ae3b9..7dc3b2e7a5e 100644
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -46,7 +46,7 @@ static int __cmd_buildid_list(void)
 	if (with_hits)
 		perf_session__process_events(session, &build_id__mark_dso_hit_ops);
 
-	dsos__fprintf_buildid(&session->machines, stdout, with_hits);
+	perf_session__fprintf_dsos_buildid(session, stdout, with_hits);
 
 	perf_session__delete(session);
 	return err;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 49cc367d8c3..f1b46eb7ef9 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -313,7 +313,7 @@ static int __cmd_report(void)
 		perf_session__fprintf(session, stdout);
 
 	if (verbose > 2)
-		dsos__fprintf(&session->machines, stdout);
+		perf_session__fprintf_dsos(session, stdout);
 
 	next = rb_first(&session->stats_by_id);
 	while (next) {
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index c390f340b03..d95281f588d 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -854,7 +854,7 @@ static void handle_keypress(struct perf_session *session, int c)
 		case 'Q':
 			printf("exiting.\n");
 			if (dump_symtab)
-				dsos__fprintf(&session->machines, stderr);
+				perf_session__fprintf_dsos(session, stderr);
 			exit(0);
 		case 's':
 			prompt_symbol(&sym_filter_entry, "Enter details symbol");
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 71252723a17..61ca92e58ad 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -127,4 +127,17 @@ void perf_session__process_machines(struct perf_session *self,
 {
 	return machines__process(&self->machines, process, self);
 }
+
+static inline
+size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp)
+{
+	return machines__fprintf_dsos(&self->machines, fp);
+}
+
+static inline
+size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, FILE *fp,
+					  bool with_hits)
+{
+	return machines__fprintf_dsos_buildid(&self->machines, fp, with_hits);
+}
 #endif /* __PERF_SESSION_H */
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 12359c37240..caa890f8e2c 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -1889,26 +1889,32 @@ struct dso *__dsos__findnew(struct list_head *head, const char *name)
 	return dso;
 }
 
-static void __dsos__fprintf(struct list_head *head, FILE *fp)
+static size_t __dsos__fprintf(struct list_head *head, FILE *fp)
 {
 	struct dso *pos;
+	size_t ret = 0;
 
 	list_for_each_entry(pos, head, node) {
 		int i;
 		for (i = 0; i < MAP__NR_TYPES; ++i)
-			dso__fprintf(pos, i, fp);
+			ret += dso__fprintf(pos, i, fp);
 	}
+
+	return ret;
 }
 
-void dsos__fprintf(struct rb_root *machines, FILE *fp)
+size_t machines__fprintf_dsos(struct rb_root *self, FILE *fp)
 {
 	struct rb_node *nd;
+	size_t ret = 0;
 
-	for (nd = rb_first(machines); nd; nd = rb_next(nd)) {
+	for (nd = rb_first(self); nd; nd = rb_next(nd)) {
 		struct machine *pos = rb_entry(nd, struct machine, rb_node);
-		__dsos__fprintf(&pos->kernel_dsos, fp);
-		__dsos__fprintf(&pos->user_dsos, fp);
+		ret += __dsos__fprintf(&pos->kernel_dsos, fp);
+		ret += __dsos__fprintf(&pos->user_dsos, fp);
 	}
+
+	return ret;
 }
 
 static size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp,
@@ -1926,12 +1932,12 @@ static size_t __dsos__fprintf_buildid(struct list_head *head, FILE *fp,
 	return ret;
 }
 
-size_t dsos__fprintf_buildid(struct rb_root *machines, FILE *fp, bool with_hits)
+size_t machines__fprintf_dsos_buildid(struct rb_root *self, FILE *fp, bool with_hits)
 {
 	struct rb_node *nd;
 	size_t ret = 0;
 
-	for (nd = rb_first(machines); nd; nd = rb_next(nd)) {
+	for (nd = rb_first(self); nd; nd = rb_next(nd)) {
 		struct machine *pos = rb_entry(nd, struct machine, rb_node);
 		ret += __dsos__fprintf_buildid(&pos->kernel_dsos, fp, with_hits);
 		ret += __dsos__fprintf_buildid(&pos->user_dsos, fp, with_hits);
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 37b717b861c..2cec6a10716 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -162,9 +162,8 @@ int dso__load_vmlinux_path(struct dso *self, struct map *map,
 			   symbol_filter_t filter);
 int dso__load_kallsyms(struct dso *self, const char *filename, struct map *map,
 		       symbol_filter_t filter);
-void dsos__fprintf(struct rb_root *kerninfo_root, FILE *fp);
-size_t dsos__fprintf_buildid(struct rb_root *kerninfo_root,
-		FILE *fp, bool with_hits);
+size_t machines__fprintf_dsos(struct rb_root *self, FILE *fp);
+size_t machines__fprintf_dsos_buildid(struct rb_root *self, FILE *fp, bool with_hits);
 
 size_t dso__fprintf_buildid(struct dso *self, FILE *fp);
 size_t dso__fprintf(struct dso *self, enum map_type type, FILE *fp);
-- 
cgit v1.2.3


From ebe6aa5ac456a13213ed563863e70dd441618a97 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 24 Apr 2010 07:57:47 -0400
Subject: cifs: eliminate "first_time" parm to CIFS_SessSetup

We can use the is_first_ses_reconnect() function to determine this.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  1 -
 fs/cifs/connect.c   |  4 +---
 fs/cifs/sess.c      | 21 ++++++++++++++-------
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index cc622a735f2..8e9214275e4 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -95,7 +95,6 @@ extern int small_smb_init_no_tc(const int smb_cmd, const int wct,
 				struct cifsSesInfo *ses,
 				void **request_buf);
 extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
-			     const int stage,
 			     const struct nls_table *nls_cp);
 extern __u16 GetNextMid(struct TCP_Server_Info *server);
 extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7a47c7c5c7e..9123c23bd1d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2878,7 +2878,6 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 					   struct nls_table *nls_info)
 {
 	int rc = 0;
-	int first_time = 0;
 	struct TCP_Server_Info *server = pSesInfo->server;
 
 	/* what if server changes its buffer size after dropping the session? */
@@ -2899,7 +2898,6 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 			spin_unlock(&GlobalMid_Lock);
 
 		}
-		first_time = 1;
 	}
 
 	if (rc)
@@ -2913,7 +2911,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 	cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
 		 server->secMode, server->capabilities, server->timeAdj);
 
-	rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);
+	rc = CIFS_SessSetup(xid, pSesInfo, nls_info);
 	if (rc) {
 		cERROR(1, "Send error in SessSetup = %d", rc);
 	} else {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index da9729da03e..84b92dfaf84 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -35,9 +35,11 @@
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
 			 unsigned char *p24);
 
-/* Checks if this is the first smb session to be reconnected after
-   the socket has been reestablished (so we know whether to use vc 0).
-   Called while holding the cifs_tcp_ses_lock, so do not block */
+/*
+ * Checks if this is the first smb session to be reconnected after
+ * the socket has been reestablished (so we know whether to use vc 0).
+ * Called while holding the cifs_tcp_ses_lock, so do not block
+ */
 static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
 {
 	struct list_head *tmp;
@@ -447,7 +449,7 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
    This function returns the length of the data in the blob */
 static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 				   struct cifsSesInfo *ses,
-				   const struct nls_table *nls_cp, int first)
+				   const struct nls_table *nls_cp, bool first)
 {
 	AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
 	__u32 flags;
@@ -546,7 +548,7 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
 
 static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
 				  struct cifsSesInfo *ses,
-				  const struct nls_table *nls, int first_time)
+				  const struct nls_table *nls, bool first_time)
 {
 	int bloblen;
 
@@ -559,8 +561,8 @@ static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
 #endif
 
 int
-CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
-		const struct nls_table *nls_cp)
+CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
+	       const struct nls_table *nls_cp)
 {
 	int rc = 0;
 	int wct;
@@ -577,10 +579,15 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	int bytes_remaining;
 	struct key *spnego_key = NULL;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
+	bool first_time;
 
 	if (ses == NULL)
 		return -EINVAL;
 
+	read_lock(&cifs_tcp_ses_lock);
+	first_time = is_first_ses_reconnect(ses);
+	read_unlock(&cifs_tcp_ses_lock);
+
 	type = ses->server->secType;
 
 	cFYI(1, "sess setup type %d", type);
-- 
cgit v1.2.3


From 37e44bc50d91df1fe7edcf6f02fe168c6d802e64 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 27 Apr 2010 21:04:24 -0400
Subject: tracing: Fix sleep time function profiling

When sleep_time is off the function profiler ignores the time that a task
is scheduled out. When the task is scheduled out a timestamp is taken.
When the task is scheduled back in, the timestamp is compared to the
current time and the saved calltimes are adjusted accordingly.

But when stopping the function profiler, the sched switch hook that
does this adjustment was stopped before shutting down the tracer.
This allowed some tasks to not get their timestamps set when they
scheduled out. When the function profiler started again, this would
skew the times of the scheduler functions.

This patch moves the stopping of the sched switch to after the function
profiler is stopped. It also ignores zero set calltimes, which may
happen on start up.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3bcb340d6f0..8c9c2934c45 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -666,6 +666,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
 	if (!stat->hash || !ftrace_profile_enabled)
 		goto out;
 
+	/* If the calltime was zero'd ignore it */
+	if (!trace->calltime)
+		goto out;
+
 	calltime = trace->rettime - trace->calltime;
 
 	if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
@@ -3357,11 +3361,11 @@ void unregister_ftrace_graph(void)
 		goto out;
 
 	ftrace_graph_active--;
-	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
 	ftrace_graph_entry = ftrace_graph_entry_stub;
 	ftrace_shutdown(FTRACE_STOP_FUNC_RET);
 	unregister_pm_notifier(&ftrace_suspend_notifier);
+	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
 
  out:
 	mutex_unlock(&ftrace_lock);
-- 
cgit v1.2.3


From 18acde52b83bd1c8e1d007db519f46d344aa13ed Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 27 Apr 2010 22:26:51 -0300
Subject: perf tools: Create $(OUTPUT)arch/$(ARCH)/util/ directory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So that "make -C tools/perf O=/tmp/some/path" works again.

Problem introduced in:

cd932c5 "perf: Move arch specific code into separate arch director"

Cc: Ian Munsie <imunsie@au.ibm.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 3ac6b677bec..b86f2ad165a 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -187,6 +187,8 @@ ifeq ($(ARCH),x86_64)
         ARCH := x86
 endif
 
+$(shell sh -c 'mkdir -p $(OUTPUT)arch/$(ARCH)/util/' 2> /dev/null)
+
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
 #
-- 
cgit v1.2.3


From 402af0d7c692ddcfa2333e93d3f275ebd0487926 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 21 Apr 2010 15:21:51 +0100
Subject: x86, asm: Introduce and use percpu_inc()

... generating slightly smaller code.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF261F020000780003B33C@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/hardirq.h   |  2 +-
 arch/x86/include/asm/percpu.h    | 24 ++++++++++++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce.c |  4 ++--
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 0f8576427cf..aeab29aee61 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -35,7 +35,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
 
 #define __ARCH_IRQ_STAT
 
-#define inc_irq_stat(member)	percpu_add(irq_stat.member, 1)
+#define inc_irq_stat(member)	percpu_inc(irq_stat.member)
 
 #define local_softirq_pending()	percpu_read(irq_stat.__softirq_pending)
 
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 66a272dfd8b..0ec6d12d84e 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -190,6 +190,29 @@ do {									\
 	pfo_ret__;					\
 })
 
+#define percpu_unary_op(op, var)			\
+({							\
+	switch (sizeof(var)) {				\
+	case 1:						\
+		asm(op "b "__percpu_arg(0)		\
+		    : "+m" (var));			\
+		break;					\
+	case 2:						\
+		asm(op "w "__percpu_arg(0)		\
+		    : "+m" (var));			\
+		break;					\
+	case 4:						\
+		asm(op "l "__percpu_arg(0)		\
+		    : "+m" (var));			\
+		break;					\
+	case 8:						\
+		asm(op "q "__percpu_arg(0)		\
+		    : "+m" (var));			\
+		break;					\
+	default: __bad_percpu_size();			\
+	}						\
+})
+
 /*
  * percpu_read() makes gcc load the percpu variable every time it is
  * accessed while percpu_read_stable() allows the value to be cached.
@@ -207,6 +230,7 @@ do {									\
 #define percpu_and(var, val)		percpu_to_op("and", var, val)
 #define percpu_or(var, val)		percpu_to_op("or", var, val)
 #define percpu_xor(var, val)		percpu_to_op("xor", var, val)
+#define percpu_inc(var)		percpu_unary_op("inc", var)
 
 #define __this_cpu_read_1(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
 #define __this_cpu_read_2(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0afa767..7a355ddcc64 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -539,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 	struct mce m;
 	int i;
 
-	__get_cpu_var(mce_poll_count)++;
+	percpu_inc(mce_poll_count);
 
 	mce_setup(&m);
 
@@ -934,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 
 	atomic_inc(&mce_entry);
 
-	__get_cpu_var(mce_exception_count)++;
+	percpu_inc(mce_exception_count);
 
 	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 			   18, SIGKILL) == NOTIFY_STOP)
-- 
cgit v1.2.3


From 2e61878698781d6a9a8bfbaa4ea9c5ddb5a178c3 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 21 Apr 2010 16:13:20 +0100
Subject: x86-64: Combine SRAT regions when possible

... i.e. when the hole between two regions isn't occupied by memory on
another node. This reduces the memory->node table size, thus reducing
cache footprint of lookups, which got increased significantly some
time ago, and things go back to how they were before that change on
the systems I looked at.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF3230020000780003B3CA@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/mm/srat_64.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 28c68762648..3ebe6519bd8 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -363,6 +363,54 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 	for (i = 0; i < MAX_NUMNODES; i++)
 		cutoff_node(i, start, end);
 
+	/*
+	 * Join together blocks on the same node, holes between
+	 * which don't overlap with memory on other nodes.
+	 */
+	for (i = 0; i < num_node_memblks; ++i) {
+		int j, k;
+
+		for (j = i + 1; j < num_node_memblks; ++j) {
+			unsigned long start, end;
+
+			if (memblk_nodeid[i] != memblk_nodeid[j])
+				continue;
+			start = min(node_memblk_range[i].end,
+			            node_memblk_range[j].end);
+			end = max(node_memblk_range[i].start,
+			          node_memblk_range[j].start);
+			for (k = 0; k < num_node_memblks; ++k) {
+				if (memblk_nodeid[i] == memblk_nodeid[k])
+					continue;
+				if (start < node_memblk_range[k].end &&
+				    end > node_memblk_range[k].start)
+					break;
+			}
+			if (k < num_node_memblks)
+				continue;
+			start = min(node_memblk_range[i].start,
+			            node_memblk_range[j].start);
+			end = max(node_memblk_range[i].end,
+			          node_memblk_range[j].end);
+			printk(KERN_INFO "SRAT: Node %d "
+			       "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
+			       memblk_nodeid[i],
+			       node_memblk_range[i].start,
+			       node_memblk_range[i].end,
+			       node_memblk_range[j].start,
+			       node_memblk_range[j].end,
+			       start, end);
+			node_memblk_range[i].start = start;
+			node_memblk_range[i].end = end;
+			k = --num_node_memblks - j;
+			memmove(memblk_nodeid + j, memblk_nodeid + j+1,
+				k * sizeof(*memblk_nodeid));
+			memmove(node_memblk_range + j, node_memblk_range + j+1,
+				k * sizeof(*node_memblk_range));
+			--j;
+		}
+	}
+
 	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
 					   memblk_nodeid);
 	if (memnode_shift < 0) {
-- 
cgit v1.2.3


From 5967ed87ade85a421ef814296c3c7f182b08c225 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 21 Apr 2010 16:08:14 +0100
Subject: x86-64: Reduce SMP locks table size

Reduce the SMP locks table size by using relative pointers instead of
absolute ones, thus cutting the table size by half.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF30FE020000780003B3B6@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/alternative-asm.h |  4 +--
 arch/x86/include/asm/alternative.h     |  4 +--
 arch/x86/kernel/alternative.c          | 45 +++++++++++++++++++---------------
 3 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index b97f786a48d..a63a68be1cc 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -6,8 +6,8 @@
 	.macro LOCK_PREFIX
 1:	lock
 	.section .smp_locks,"a"
-	_ASM_ALIGN
-	_ASM_PTR 1b
+	.balign 4
+	.long 1b - .
 	.previous
 	.endm
 #else
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index b09ec55650b..714bf241728 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -30,8 +30,8 @@
 #ifdef CONFIG_SMP
 #define LOCK_PREFIX \
 		".section .smp_locks,\"a\"\n"	\
-		_ASM_ALIGN "\n"			\
-		_ASM_PTR "661f\n" /* address */	\
+		".balign 4\n"			\
+		".long 661f - .\n" /* offset */	\
 		".previous\n"			\
 		"661:\n\tlock; "
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1a160d5d44d..93673842722 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 }
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
 static void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /* Replace instructions with better alternatives for this CPU type.
@@ -235,37 +235,39 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 
 #ifdef CONFIG_SMP
 
-static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+				  u8 *text, u8 *text_end)
 {
-	u8 **ptr;
+	const s32 *poff;
 
 	mutex_lock(&text_mutex);
-	for (ptr = start; ptr < end; ptr++) {
-		if (*ptr < text)
-			continue;
-		if (*ptr > text_end)
+	for (poff = start; poff < end; poff++) {
+		u8 *ptr = (u8 *)poff + *poff;
+
+		if (!*poff || ptr < text || ptr >= text_end)
 			continue;
 		/* turn DS segment override prefix into lock prefix */
-		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+		text_poke(ptr, ((unsigned char []){0xf0}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
 
-static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+				    u8 *text, u8 *text_end)
 {
-	u8 **ptr;
+	const s32 *poff;
 
 	if (noreplace_smp)
 		return;
 
 	mutex_lock(&text_mutex);
-	for (ptr = start; ptr < end; ptr++) {
-		if (*ptr < text)
-			continue;
-		if (*ptr > text_end)
+	for (poff = start; poff < end; poff++) {
+		u8 *ptr = (u8 *)poff + *poff;
+
+		if (!*poff || ptr < text || ptr >= text_end)
 			continue;
 		/* turn lock prefix into DS segment override prefix */
-		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+		text_poke(ptr, ((unsigned char []){0x3E}), 1);
 	};
 	mutex_unlock(&text_mutex);
 }
@@ -276,8 +278,8 @@ struct smp_alt_module {
 	char		*name;
 
 	/* ptrs to lock prefixes */
-	u8		**locks;
-	u8		**locks_end;
+	const s32	*locks;
+	const s32	*locks_end;
 
 	/* .text segment, needed to avoid patching init code ;) */
 	u8		*text;
@@ -398,16 +400,19 @@ void alternatives_smp_switch(int smp)
 int alternatives_text_reserved(void *start, void *end)
 {
 	struct smp_alt_module *mod;
-	u8 **ptr;
+	const s32 *poff;
 	u8 *text_start = start;
 	u8 *text_end = end;
 
 	list_for_each_entry(mod, &smp_alt_modules, next) {
 		if (mod->text > text_end || mod->text_end < text_start)
 			continue;
-		for (ptr = mod->locks; ptr < mod->locks_end; ptr++)
-			if (text_start <= *ptr && text_end >= *ptr)
+		for (poff = mod->locks; poff < mod->locks_end; poff++) {
+			const u8 *ptr = (const u8 *)poff + *poff;
+
+			if (text_start <= ptr && text_end > ptr)
 				return 1;
+		}
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 47f9fe26299ae022ac1e3fa12e7e73def62b7898 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 21 Apr 2010 16:19:57 +0100
Subject: x86-64: Don't export init_level4_pgt

It's not used by any module, and i386 (as well as some other arches)
also doesn't export its equivalent (swapper_pg_dir).

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF33BD020000780003B3E4@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/x8664_ksyms_64.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 693920b2249..1b950d151e5 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__memcpy);
 
 EXPORT_SYMBOL(empty_zero_page);
-EXPORT_SYMBOL(init_level4_pgt);
 #ifndef CONFIG_PARAVIRT
 EXPORT_SYMBOL(native_load_gs_index);
 #endif
-- 
cgit v1.2.3


From 6fc108a08dcddf8f9113cc7102ddaacf7ed37a6b Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Wed, 21 Apr 2010 15:23:44 +0100
Subject: x86: Clean up arch/x86/Kconfig*

No functional change intended.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <4BCF2690020000780003B340@vpn.id2.novell.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig            | 49 ++++++++++++++-------------------------------
 arch/x86/Kconfig.debug      |  2 --
 arch/x86/include/asm/boot.h |  2 +-
 3 files changed, 16 insertions(+), 37 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9458685902b..85e2252625b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -197,20 +197,17 @@ config HAVE_INTEL_TXT
 
 # Use the generic interrupt handling code in kernel/irq/:
 config GENERIC_HARDIRQS
-	bool
-	default y
+	def_bool y
 
 config GENERIC_HARDIRQS_NO__DO_IRQ
        def_bool y
 
 config GENERIC_IRQ_PROBE
-	bool
-	default y
+	def_bool y
 
 config GENERIC_PENDING_IRQ
-	bool
+	def_bool y
 	depends on GENERIC_HARDIRQS && SMP
-	default y
 
 config USE_GENERIC_SMP_HELPERS
 	def_bool y
@@ -225,14 +222,12 @@ config X86_64_SMP
 	depends on X86_64 && SMP
 
 config X86_HT
-	bool
+	def_bool y
 	depends on SMP
-	default y
 
 config X86_TRAMPOLINE
-	bool
+	def_bool y
 	depends on SMP || (64BIT && ACPI_SLEEP)
-	default y
 
 config X86_32_LAZY_GS
 	def_bool y
@@ -447,7 +442,7 @@ config X86_NUMAQ
 	  firmware with - send email to <Martin.Bligh@us.ibm.com>.
 
 config X86_SUPPORTS_MEMORY_FAILURE
-	bool
+	def_bool y
 	# MCE code calls memory_failure():
 	depends on X86_MCE
 	# On 32-bit this adds too big of NODES_SHIFT and we run out of page flags:
@@ -455,7 +450,6 @@ config X86_SUPPORTS_MEMORY_FAILURE
 	# On 32-bit SPARSEMEM adds too big of SECTIONS_WIDTH:
 	depends on X86_64 || !SPARSEMEM
 	select ARCH_SUPPORTS_MEMORY_FAILURE
-	default y
 
 config X86_VISWS
 	bool "SGI 320/540 (Visual Workstation)"
@@ -570,7 +564,6 @@ config PARAVIRT_SPINLOCKS
 
 config PARAVIRT_CLOCK
 	bool
-	default n
 
 endif
 
@@ -749,7 +742,6 @@ config MAXSMP
 	bool "Configure Maximum number of SMP Processors and NUMA Nodes"
 	depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
 	select CPUMASK_OFFSTACK
-	default n
 	---help---
 	  Configure maximum number of CPUS and NUMA Nodes for this architecture.
 	  If unsure, say N.
@@ -829,7 +821,6 @@ config X86_VISWS_APIC
 
 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
 	bool "Reroute for broken boot IRQs"
-	default n
 	depends on X86_IO_APIC
 	---help---
 	  This option enables a workaround that fixes a source of
@@ -876,9 +867,8 @@ config X86_MCE_AMD
 	   the DRAM Error Threshold.
 
 config X86_ANCIENT_MCE
-	def_bool n
+	bool "Support for old Pentium 5 / WinChip machine checks"
 	depends on X86_32 && X86_MCE
-	prompt "Support for old Pentium 5 / WinChip machine checks"
 	---help---
 	  Include support for machine check handling on old Pentium 5 or WinChip
 	  systems. These typically need to be enabled explicitely on the command
@@ -886,8 +876,7 @@ config X86_ANCIENT_MCE
 
 config X86_MCE_THRESHOLD
 	depends on X86_MCE_AMD || X86_MCE_INTEL
-	bool
-	default y
+	def_bool y
 
 config X86_MCE_INJECT
 	depends on X86_MCE
@@ -1026,8 +1015,8 @@ config X86_CPUID
 
 choice
 	prompt "High Memory Support"
-	default HIGHMEM4G if !X86_NUMAQ
 	default HIGHMEM64G if X86_NUMAQ
+	default HIGHMEM4G
 	depends on X86_32
 
 config NOHIGHMEM
@@ -1285,7 +1274,7 @@ source "mm/Kconfig"
 
 config HIGHPTE
 	bool "Allocate 3rd-level pagetables from highmem"
-	depends on X86_32 && (HIGHMEM4G || HIGHMEM64G)
+	depends on HIGHMEM
 	---help---
 	  The VM uses one page table entry for each page of physical memory.
 	  For systems with a lot of RAM, this can be wasteful of precious
@@ -1369,8 +1358,7 @@ config MATH_EMULATION
 	  kernel, it won't hurt.
 
 config MTRR
-	bool
-	default y
+	def_bool y
 	prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
 	---help---
 	  On Intel P6 family processors (Pentium Pro, Pentium II and later)
@@ -1436,8 +1424,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
 	  mtrr_spare_reg_nr=N on the kernel command line.
 
 config X86_PAT
-	bool
-	default y
+	def_bool y
 	prompt "x86 PAT support" if EMBEDDED
 	depends on MTRR
 	---help---
@@ -1605,8 +1592,7 @@ config X86_NEED_RELOCS
 	depends on X86_32 && RELOCATABLE
 
 config PHYSICAL_ALIGN
-	hex
-	prompt "Alignment value to which kernel should be aligned" if X86_32
+	hex "Alignment value to which kernel should be aligned" if X86_32
 	default "0x1000000"
 	range 0x2000 0x1000000
 	---help---
@@ -1653,7 +1639,6 @@ config COMPAT_VDSO
 
 config CMDLINE_BOOL
 	bool "Built-in kernel command line"
-	default n
 	---help---
 	  Allow for specifying boot arguments to the kernel at
 	  build time.  On some systems (e.g. embedded ones), it is
@@ -1687,7 +1672,6 @@ config CMDLINE
 
 config CMDLINE_OVERRIDE
 	bool "Built-in command line overrides boot loader arguments"
-	default n
 	depends on CMDLINE_BOOL
 	---help---
 	  Set this option to 'Y' to have the kernel ignore the boot loader
@@ -1723,8 +1707,7 @@ source "drivers/acpi/Kconfig"
 source "drivers/sfi/Kconfig"
 
 config X86_APM_BOOT
-	bool
-	default y
+	def_bool y
 	depends on APM || APM_MODULE
 
 menuconfig APM
@@ -1953,8 +1936,7 @@ config DMAR_DEFAULT_ON
 	  experimental.
 
 config DMAR_BROKEN_GFX_WA
-	def_bool n
-	prompt "Workaround broken graphics drivers (going away soon)"
+	bool "Workaround broken graphics drivers (going away soon)"
 	depends on DMAR && BROKEN
 	---help---
 	  Current Graphics drivers tend to use physical address
@@ -2052,7 +2034,6 @@ config SCx200HR_TIMER
 config OLPC
 	bool "One Laptop Per Child support"
 	select GPIOLIB
-	default n
 	---help---
 	  Add support for detecting the unique features of the OLPC
 	  XO hardware.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bc01e3ebfeb..e7edd66d866 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -45,7 +45,6 @@ config EARLY_PRINTK
 
 config EARLY_PRINTK_DBGP
 	bool "Early printk via EHCI debug port"
-	default n
 	depends on EARLY_PRINTK && PCI
 	---help---
 	  Write kernel log output directly into the EHCI debug port.
@@ -76,7 +75,6 @@ config DEBUG_PER_CPU_MAPS
 	bool "Debug access to per_cpu maps"
 	depends on DEBUG_KERNEL
 	depends on SMP
-	default n
 	---help---
 	  Say Y to verify that the per_cpu map being accessed has
 	  been setup.  Adds a fair amount of code to kernel memory
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 7a1065958ba..3b62ab56c7a 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -24,7 +24,7 @@
 #define MIN_KERNEL_ALIGN	(_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
 
 #if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
-	(CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2))
+	(CONFIG_PHYSICAL_ALIGN < MIN_KERNEL_ALIGN)
 #error "Invalid value for CONFIG_PHYSICAL_ALIGN"
 #endif
 
-- 
cgit v1.2.3


From 3272c8a57b77a7277a740e211fe12171e4b37e99 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Wed, 21 Apr 2010 12:33:54 +0200
Subject: logfs: testing the wrong variable

There is a typo here.  We should test "last" instead of "first".

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 5866ee6e132..7fc4625c6f0 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -382,7 +382,7 @@ static struct page *find_super_block(struct super_block *sb)
 	if (!first || IS_ERR(first))
 		return NULL;
 	last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
-	if (!last || IS_ERR(first)) {
+	if (!last || IS_ERR(last)) {
 		page_cache_release(first);
 		return NULL;
 	}
-- 
cgit v1.2.3


From 2e531fa0d0868f5114c2b3a782ab02eb9d6f914d Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Thu, 29 Apr 2010 14:56:37 +0200
Subject: LogFS: Fix typo in b6349ac8

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/readwrite.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 3159db6958e..8c663a55b72 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1861,7 +1861,7 @@ int logfs_truncate(struct inode *inode, u64 target)
 			size = target;
 
 		logfs_get_wblocks(sb, NULL, 1);
-		err = __logfs_truncate(inode, target);
+		err = __logfs_truncate(inode, size);
 		if (!err)
 			err = __logfs_write_inode(inode, 0);
 		logfs_put_wblocks(sb, NULL, 1);
-- 
cgit v1.2.3


From 5c0541d53ef3897494768decb09eb8f1087953a5 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 29 Apr 2010 15:25:23 -0300
Subject: perf symbols: Add machine helper routines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created when writing the first 'perf test' regression testing routine.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-kmem.c |  2 +-
 tools/perf/util/map.h     | 25 ++++++++++++---
 tools/perf/util/symbol.c  | 77 +++++++++++++++++++++++++++++++++++------------
 tools/perf/util/symbol.h  |  7 +++++
 4 files changed, 86 insertions(+), 25 deletions(-)

diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 15635612e59..ee05dba9609 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -377,7 +377,7 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
 		if (is_caller) {
 			addr = data->call_site;
 			if (!raw_ip)
-				sym = machine__find_function(machine, addr, &map, NULL);
+				sym = machine__find_kernel_function(machine, addr, &map, NULL);
 		} else
 			addr = data->ptr;
 
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index 881dba4f820..f3913451282 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -30,6 +30,7 @@ struct map {
 	u64			start;
 	u64			end;
 	enum map_type		type;
+	u32			priv;
 	u64			pgoff;
 
 	/* ip -> dso rip */
@@ -66,6 +67,12 @@ struct machine {
 	struct map	  *vmlinux_maps[MAP__NR_TYPES];
 };
 
+static inline
+struct map *machine__kernel_map(struct machine *self, enum map_type type)
+{
+	return self->vmlinux_maps[type];
+}
+
 static inline struct kmap *map__kmap(struct map *self)
 {
 	return (struct kmap *)(self + 1);
@@ -173,11 +180,21 @@ struct symbol *map_groups__find_symbol_by_name(struct map_groups *self,
 					       struct map **mapp,
 					       symbol_filter_t filter);
 
-static inline struct symbol *machine__find_function(struct machine *self,
-						    u64 addr, struct map **mapp,
-						    symbol_filter_t filter)
+static inline
+struct symbol *machine__find_kernel_symbol(struct machine *self,
+					   enum map_type type, u64 addr,
+					   struct map **mapp,
+					   symbol_filter_t filter)
+{
+	return map_groups__find_symbol(&self->kmaps, type, addr, mapp, filter);
+}
+
+static inline
+struct symbol *machine__find_kernel_function(struct machine *self, u64 addr,
+					     struct map **mapp,
+					     symbol_filter_t filter)
 {
-	return map_groups__find_symbol(&self->kmaps, MAP__FUNCTION, addr, mapp, filter);
+	return machine__find_kernel_symbol(self, MAP__FUNCTION, addr, mapp, filter);
 }
 
 static inline
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index caa890f8e2c..4c0146a4906 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -1983,23 +1983,23 @@ void dso__read_running_kernel_build_id(struct dso *self, struct machine *machine
 		self->has_build_id = true;
 }
 
-static struct dso *dsos__create_kernel(struct machine *machine)
+static struct dso *machine__create_kernel(struct machine *self)
 {
 	const char *vmlinux_name = NULL;
 	struct dso *kernel;
 
-	if (machine__is_host(machine)) {
+	if (machine__is_host(self)) {
 		vmlinux_name = symbol_conf.vmlinux_name;
 		kernel = dso__new_kernel(vmlinux_name);
 	} else {
-		if (machine__is_default_guest(machine))
+		if (machine__is_default_guest(self))
 			vmlinux_name = symbol_conf.default_guest_vmlinux_name;
-		kernel = dso__new_guest_kernel(machine, vmlinux_name);
+		kernel = dso__new_guest_kernel(self, vmlinux_name);
 	}
 
 	if (kernel != NULL) {
-		dso__read_running_kernel_build_id(kernel, machine);
-		dsos__add(&machine->kernel_dsos, kernel);
+		dso__read_running_kernel_build_id(kernel, self);
+		dsos__add(&self->kernel_dsos, kernel);
 	}
 	return kernel;
 }
@@ -2026,6 +2026,23 @@ int __machine__create_kernel_maps(struct machine *self, struct dso *kernel)
 	return 0;
 }
 
+int machine__create_kernel_maps(struct machine *self)
+{
+	struct dso *kernel = machine__create_kernel(self);
+
+	if (kernel == NULL ||
+	    __machine__create_kernel_maps(self, kernel) < 0)
+		return -1;
+
+	if (symbol_conf.use_modules && machine__create_modules(self) < 0)
+		pr_debug("Problems creating module maps, continuing anyway...\n");
+	/*
+	 * Now that we have all the maps created, just set the ->end of them:
+	 */
+	map_groups__fixup_end(&self->kmaps);
+	return 0;
+}
+
 static void vmlinux_path__exit(void)
 {
 	while (--vmlinux_path__nr_entries >= 0) {
@@ -2144,25 +2161,12 @@ out_free_comm_list:
 
 int machines__create_kernel_maps(struct rb_root *self, pid_t pid)
 {
-	struct dso *kernel;
 	struct machine *machine = machines__findnew(self, pid);
 
 	if (machine == NULL)
 		return -1;
-	kernel = dsos__create_kernel(machine);
-	if (kernel == NULL)
-		return -1;
-
-	if (__machine__create_kernel_maps(machine, kernel) < 0)
-		return -1;
 
-	if (symbol_conf.use_modules && machine__create_modules(machine) < 0)
-		pr_debug("Problems creating module maps, continuing anyway...\n");
-	/*
-	 * Now that we have all the maps created, just set the ->end of them:
-	 */
-	map_groups__fixup_end(&machine->kmaps);
-	return 0;
+	return machine__create_kernel_maps(machine);
 }
 
 static int hex(char ch)
@@ -2248,3 +2252,36 @@ failure:
 
 	return ret;
 }
+
+int machine__load_kallsyms(struct machine *self, const char *filename,
+			   enum map_type type, symbol_filter_t filter)
+{
+	struct map *map = self->vmlinux_maps[type];
+	int ret = dso__load_kallsyms(map->dso, filename, map, filter);
+
+	if (ret > 0) {
+		dso__set_loaded(map->dso, type);
+		/*
+		 * Since /proc/kallsyms will have multiple sessions for the
+		 * kernel, with modules between them, fixup the end of all
+		 * sections.
+		 */
+		__map_groups__fixup_end(&self->kmaps, type);
+	}
+
+	return ret;
+}
+
+int machine__load_vmlinux_path(struct machine *self, enum map_type type,
+			       symbol_filter_t filter)
+{
+	struct map *map = self->vmlinux_maps[type];
+	int ret = dso__load_vmlinux_path(map->dso, map, filter);
+
+	if (ret > 0) {
+		dso__set_loaded(map->dso, type);
+		map__reloc_vmlinux(map);
+	}
+
+	return ret;
+}
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 2cec6a10716..a517c17407b 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -162,6 +162,11 @@ int dso__load_vmlinux_path(struct dso *self, struct map *map,
 			   symbol_filter_t filter);
 int dso__load_kallsyms(struct dso *self, const char *filename, struct map *map,
 		       symbol_filter_t filter);
+int machine__load_kallsyms(struct machine *self, const char *filename,
+			   enum map_type type, symbol_filter_t filter);
+int machine__load_vmlinux_path(struct machine *self, enum map_type type,
+			       symbol_filter_t filter);
+
 size_t machines__fprintf_dsos(struct rb_root *self, FILE *fp);
 size_t machines__fprintf_dsos_buildid(struct rb_root *self, FILE *fp, bool with_hits);
 
@@ -199,6 +204,8 @@ int kallsyms__parse(const char *filename, void *arg,
 					  char type, u64 start));
 
 int __machine__create_kernel_maps(struct machine *self, struct dso *kernel);
+int machine__create_kernel_maps(struct machine *self);
+
 int machines__create_kernel_maps(struct rb_root *self, pid_t pid);
 int machines__create_guest_kernel_maps(struct rb_root *self);
 
-- 
cgit v1.2.3


From 1c6a800cde3b818fd8320b5d402f2d77d2948c00 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 29 Apr 2010 18:58:32 -0300
Subject: perf test: Initial regression testing command
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

First an example with the first internal test:

[acme@doppio linux-2.6-tip]$ perf test
 1: vmlinux symtab matches kallsyms: Ok

So it run just one test, that is "vmlinux symtab matches kallsyms", and it was
successful.

If we run it in verbose mode, we'll see details about errors and extra warnings
for non-fatal problems:

[acme@doppio linux-2.6-tip]$ perf test -v
 1: vmlinux symtab matches kallsyms:
--- start ---
Looking at the vmlinux_path (5 entries long)
No build_id in vmlinux, ignoring it
No build_id in /boot/vmlinux, ignoring it
No build_id in /boot/vmlinux-2.6.34-rc4-tip+, ignoring it
Using /lib/modules/2.6.34-rc4-tip+/build/vmlinux for symbols
Maps only in vmlinux:
 ffffffff81cb81b1-ffffffff81e1149b 0 [kernel].init.text
 ffffffff81e1149c-ffffffff9fffffff 0 [kernel].exit.text
 ffffffffff600000-ffffffffff6000ff 0 [kernel].vsyscall_0
 ffffffffff600100-ffffffffff6003ff 0 [kernel].vsyscall_fn
 ffffffffff600400-ffffffffff6007ff 0 [kernel].vsyscall_1
 ffffffffff600800-ffffffffffffffff 0 [kernel].vsyscall_2
Maps in vmlinux with a different name in kallsyms:
 ffffffffff600000-ffffffffff6000ff 0 [kernel].vsyscall_0 in kallsyms as [kernel].0
 ffffffffff600100-ffffffffff6003ff 0 [kernel].vsyscall_fn in kallsyms as:
*ffffffffff600100-ffffffffff60012f 0 [kernel].2
 ffffffffff600400-ffffffffff6007ff 0 [kernel].vsyscall_1 in kallsyms as [kernel].6
 ffffffffff600800-ffffffffffffffff 0 [kernel].vsyscall_2 in kallsyms as [kernel].8
Maps only in kallsyms:
 ffffffffff600130-ffffffffff6003ff 0 [kernel].4
---- end ----
vmlinux symtab matches kallsyms: Ok
[acme@doppio linux-2.6-tip]$

In the above case we only know the name of the non contiguous kernel ranges in
the address space when reading the symbol information from the ELF symtab in
vmlinux.

The /proc/kallsyms file lack this, we only notice they are separate because
there are modules after the kernel and after that more kernel functions, so we
need to have a module rbtree backed by the module .ko path to get symtabs in
the vmlinux case.

The tool uses it to match by address to emit appropriate warning, but don't
considers this fatal.

The .init.text and .exit.text ines, of course, aren't in kallsyms, so I left
these cases just as extra info in verbose mode.

The end of the sections also aren't in kallsyms, so we the symbols layer does
another pass and sets the end addresses as the next map start minus one, which
sometimes pads, causing harmless mismatches.

But at least the symbols match, tested it by copying /proc/kallsyms to
/tmp/kallsyms and doing changes to see if they were detected.

This first test also should serve as a first stab at documenting the
symbol library by providing a self contained example that exercises it
together with comments about what is being done.

More tests to check if actions done on a monitored app, like doing mmaps, etc,
makes the kernel generate the expected events should be added next.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-test.txt |  22 +++
 tools/perf/Makefile                    |   1 +
 tools/perf/builtin-test.c              | 281 +++++++++++++++++++++++++++++++++
 tools/perf/builtin.h                   |   1 +
 tools/perf/command-list.txt            |   1 +
 tools/perf/perf.c                      |   1 +
 6 files changed, 307 insertions(+)
 create mode 100644 tools/perf/Documentation/perf-test.txt
 create mode 100644 tools/perf/builtin-test.c

diff --git a/tools/perf/Documentation/perf-test.txt b/tools/perf/Documentation/perf-test.txt
new file mode 100644
index 00000000000..1c4b5f5b7f7
--- /dev/null
+++ b/tools/perf/Documentation/perf-test.txt
@@ -0,0 +1,22 @@
+perf-test(1)
+============
+
+NAME
+----
+perf-test - Runs sanity tests.
+
+SYNOPSIS
+--------
+[verse]
+'perf test <options>'
+
+DESCRIPTION
+-----------
+This command does assorted sanity tests, initially thru linked routines but
+also will look for a directory with more tests in the form of scripts.
+
+OPTIONS
+-------
+-v::
+--verbose::
+	Be more verbose.
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index b86f2ad165a..739c4412b18 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -490,6 +490,7 @@ BUILTIN_OBJS += $(OUTPUT)builtin-probe.o
 BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o
 BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
 BUILTIN_OBJS += $(OUTPUT)builtin-kvm.o
+BUILTIN_OBJS += $(OUTPUT)builtin-test.o
 
 PERFLIBS = $(LIB_FILE)
 
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
new file mode 100644
index 00000000000..0339612e738
--- /dev/null
+++ b/tools/perf/builtin-test.c
@@ -0,0 +1,281 @@
+/*
+ * builtin-test.c
+ *
+ * Builtin regression testing command: ever growing number of sanity tests
+ */
+#include "builtin.h"
+
+#include "util/cache.h"
+#include "util/debug.h"
+#include "util/parse-options.h"
+#include "util/session.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+
+static long page_size;
+
+static int vmlinux_matches_kallsyms_filter(struct map *map __used, struct symbol *sym)
+{
+	bool *visited = symbol__priv(sym);
+	*visited = true;
+	return 0;
+}
+
+static int test__vmlinux_matches_kallsyms(void)
+{
+	int err = -1;
+	struct rb_node *nd;
+	struct symbol *sym;
+	struct map *kallsyms_map, *vmlinux_map;
+	struct machine kallsyms, vmlinux;
+	enum map_type type = MAP__FUNCTION;
+	struct ref_reloc_sym ref_reloc_sym = { .name = "_stext", };
+
+	/*
+	 * Step 1:
+	 *
+	 * Init the machines that will hold kernel, modules obtained from
+	 * both vmlinux + .ko files and from /proc/kallsyms split by modules.
+	 */
+	machine__init(&kallsyms, "", HOST_KERNEL_ID);
+	machine__init(&vmlinux, "", HOST_KERNEL_ID);
+
+	/*
+	 * Step 2:
+	 *
+	 * Create the kernel maps for kallsyms and the DSO where we will then
+	 * load /proc/kallsyms. Also create the modules maps from /proc/modules
+	 * and find the .ko files that match them in /lib/modules/`uname -r`/.
+	 */
+	if (machine__create_kernel_maps(&kallsyms) < 0) {
+		pr_debug("machine__create_kernel_maps ");
+		return -1;
+	}
+
+	/*
+	 * Step 3:
+	 *
+	 * Load and split /proc/kallsyms into multiple maps, one per module.
+	 */
+	if (machine__load_kallsyms(&kallsyms, "/proc/kallsyms", type, NULL) <= 0) {
+		pr_debug("dso__load_kallsyms ");
+		goto out;
+	}
+
+	/*
+	 * Step 4:
+	 *
+	 * kallsyms will be internally on demand sorted by name so that we can
+	 * find the reference relocation * symbol, i.e. the symbol we will use
+	 * to see if the running kernel was relocated by checking if it has the
+	 * same value in the vmlinux file we load.
+	 */
+	kallsyms_map = machine__kernel_map(&kallsyms, type);
+
+	sym = map__find_symbol_by_name(kallsyms_map, ref_reloc_sym.name, NULL);
+	if (sym == NULL) {
+		pr_debug("dso__find_symbol_by_name ");
+		goto out;
+	}
+
+	ref_reloc_sym.addr = sym->start;
+
+	/*
+	 * Step 5:
+	 *
+	 * Now repeat step 2, this time for the vmlinux file we'll auto-locate.
+	 */
+	if (machine__create_kernel_maps(&vmlinux) < 0) {
+		pr_debug("machine__create_kernel_maps ");
+		goto out;
+	}
+
+	vmlinux_map = machine__kernel_map(&vmlinux, type);
+	map__kmap(vmlinux_map)->ref_reloc_sym = &ref_reloc_sym;
+
+	/*
+	 * Step 6:
+	 *
+	 * Locate a vmlinux file in the vmlinux path that has a buildid that
+	 * matches the one of the running kernel.
+	 *
+	 * While doing that look if we find the ref reloc symbol, if we find it
+	 * we'll have its ref_reloc_symbol.unrelocated_addr and then
+	 * maps__reloc_vmlinux will notice and set proper ->[un]map_ip routines
+	 * to fixup the symbols.
+	 */
+	if (machine__load_vmlinux_path(&vmlinux, type,
+				       vmlinux_matches_kallsyms_filter) <= 0) {
+		pr_debug("machine__load_vmlinux_path ");
+		goto out;
+	}
+
+	err = 0;
+	/*
+	 * Step 7:
+	 *
+	 * Now look at the symbols in the vmlinux DSO and check if we find all of them
+	 * in the kallsyms dso. For the ones that are in both, check its names and
+	 * end addresses too.
+	 */
+	for (nd = rb_first(&vmlinux_map->dso->symbols[type]); nd; nd = rb_next(nd)) {
+		struct symbol *pair;
+
+		sym  = rb_entry(nd, struct symbol, rb_node);
+		pair = machine__find_kernel_symbol(&kallsyms, type, sym->start, NULL, NULL);
+
+		if (pair && pair->start == sym->start) {
+next_pair:
+			if (strcmp(sym->name, pair->name) == 0) {
+				/*
+				 * kallsyms don't have the symbol end, so we
+				 * set that by using the next symbol start - 1,
+				 * in some cases we get this up to a page
+				 * wrong, trace_kmalloc when I was developing
+				 * this code was one such example, 2106 bytes
+				 * off the real size. More than that and we
+				 * _really_ have a problem.
+				 */
+				s64 skew = sym->end - pair->end;
+				if (llabs(skew) < page_size)
+					continue;
+
+				pr_debug("%#Lx: diff end addr for %s v: %#Lx k: %#Lx\n",
+					 sym->start, sym->name, sym->end, pair->end);
+			} else {
+				struct rb_node *nnd = rb_prev(&pair->rb_node);
+
+				if (nnd) {
+					struct symbol *next = rb_entry(nnd, struct symbol, rb_node);
+
+					if (next->start == sym->start) {
+						pair = next;
+						goto next_pair;
+					}
+				}
+				pr_debug("%#Lx: diff name v: %s k: %s\n",
+					 sym->start, sym->name, pair->name);
+			}
+		} else
+			pr_debug("%#Lx: %s not on kallsyms\n", sym->start, sym->name);
+
+		err = -1;
+	}
+
+	if (!verbose)
+		goto out;
+
+	pr_info("Maps only in vmlinux:\n");
+
+	for (nd = rb_first(&vmlinux.kmaps.maps[type]); nd; nd = rb_next(nd)) {
+		struct map *pos = rb_entry(nd, struct map, rb_node), *pair;
+		/*
+		 * If it is the kernel, kallsyms is always "[kernel.kallsyms]", while
+		 * the kernel will have the path for the vmlinux file being used,
+		 * so use the short name, less descriptive but the same ("[kernel]" in
+		 * both cases.
+		 */
+		pair = map_groups__find_by_name(&kallsyms.kmaps, type,
+						(pos->dso->kernel ?
+							pos->dso->short_name :
+							pos->dso->name));
+		if (pair)
+			pair->priv = 1;
+		else
+			map__fprintf(pos, stderr);
+	}
+
+	pr_info("Maps in vmlinux with a different name in kallsyms:\n");
+
+	for (nd = rb_first(&vmlinux.kmaps.maps[type]); nd; nd = rb_next(nd)) {
+		struct map *pos = rb_entry(nd, struct map, rb_node), *pair;
+
+		pair = map_groups__find(&kallsyms.kmaps, type, pos->start);
+		if (pair == NULL || pair->priv)
+			continue;
+
+		if (pair->start == pos->start) {
+			pair->priv = 1;
+			pr_info(" %Lx-%Lx %Lx %s in kallsyms as",
+				pos->start, pos->end, pos->pgoff, pos->dso->name);
+			if (pos->pgoff != pair->pgoff || pos->end != pair->end)
+				pr_info(": \n*%Lx-%Lx %Lx",
+					pair->start, pair->end, pair->pgoff);
+			pr_info(" %s\n", pair->dso->name);
+			pair->priv = 1;
+		}
+	}
+
+	pr_info("Maps only in kallsyms:\n");
+
+	for (nd = rb_first(&kallsyms.kmaps.maps[type]);
+	     nd; nd = rb_next(nd)) {
+		struct map *pos = rb_entry(nd, struct map, rb_node);
+
+		if (!pos->priv)
+			map__fprintf(pos, stderr);
+	}
+out:
+	return err;
+}
+
+static struct test {
+	const char *desc;
+	int (*func)(void);
+} tests[] = {
+	{
+		.desc = "vmlinux symtab matches kallsyms",
+		.func = test__vmlinux_matches_kallsyms,
+	},
+	{
+		.func = NULL,
+	},
+};
+
+static int __cmd_test(void)
+{
+	int i = 0;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	while (tests[i].func) {
+		int err;
+		pr_info("%2d: %s:", i + 1, tests[i].desc);
+		pr_debug("\n--- start ---\n");
+		err = tests[i].func();
+		pr_debug("---- end ----\n%s:", tests[i].desc);
+		pr_info(" %s\n", err ? "FAILED!\n" : "Ok");
+		++i;
+	}
+
+	return 0;
+}
+
+static const char * const test_usage[] = {
+	"perf test [<options>]",
+	NULL,
+};
+
+static const struct option test_options[] = {
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_END()
+};
+
+int cmd_test(int argc, const char **argv, const char *prefix __used)
+{
+	argc = parse_options(argc, argv, test_options, test_usage, 0);
+	if (argc)
+		usage_with_options(test_usage, test_options);
+
+	symbol_conf.priv_size = sizeof(int);
+	symbol_conf.sort_by_name = true;
+	symbol_conf.try_vmlinux_path = true;
+
+	if (symbol__init() < 0)
+		return -1;
+
+	setup_pager();
+
+	return __cmd_test();
+}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index ab28bca92e5..34a8a9ab961 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -33,5 +33,6 @@ extern int cmd_probe(int argc, const char **argv, const char *prefix);
 extern int cmd_kmem(int argc, const char **argv, const char *prefix);
 extern int cmd_lock(int argc, const char **argv, const char *prefix);
 extern int cmd_kvm(int argc, const char **argv, const char *prefix);
+extern int cmd_test(int argc, const char **argv, const char *prefix);
 
 #endif
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index 2a1162d413a..80a1a446ce3 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -20,3 +20,4 @@ perf-probe			mainporcelain common
 perf-kmem			mainporcelain common
 perf-lock			mainporcelain common
 perf-kvm			mainporcelain common
+perf-test			mainporcelain common
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 985cdb4bd00..5ff9b5b4697 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -308,6 +308,7 @@ static void handle_internal_command(int argc, const char **argv)
 		{ "kmem",	cmd_kmem,	0 },
 		{ "lock",	cmd_lock,	0 },
 		{ "kvm",	cmd_kvm,	0 },
+		{ "test",	cmd_test,	0 },
 	};
 	unsigned int i;
 	static const char ext[] = STRIP_EXTENSION;
-- 
cgit v1.2.3


From b701a47ba48b698976fb2fe05fb285b0edc1d26a Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 29 Apr 2010 16:03:57 -0700
Subject: x86: Fix LOCK_PREFIX_HERE for uniprocessor build

Checkin b3ac891b67bd4b1fc728d1c784cad1212dea433d:
x86: Add support for lock prefix in alternatives

... did not define LOCK_PREFIX_HERE in the case of a uniprocessor
build.  As a result, it would cause any of the usages of this macro to
fail on a uniprocessor build.  Fix this by defining LOCK_PREFIX_HERE
as a null string.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Luca Barbieri <luca@luca-barbieri.com>
LKML-Reference: <1267005265-27958-2-git-send-email-luca@luca-barbieri.com>
---
 arch/x86/include/asm/alternative.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 55fee12cea6..e29a6c9bba0 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -38,6 +38,7 @@
 #define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "
 
 #else /* ! CONFIG_SMP */
+#define LOCK_PREFIX_HERE ""
 #define LOCK_PREFIX ""
 #endif
 
-- 
cgit v1.2.3


From 8795d7717c467bea7b0a0649d44a258e09f34db2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 15 Apr 2010 23:10:42 +0200
Subject: lockdep: Fix redundant_hardirqs_on incremented with irqs enabled

When a path restore the flags while irqs are already enabled, we
update the per cpu var redundant_hardirqs_on in a racy fashion
and debug_atomic_inc() warns about this situation.

In this particular case, loosing a few hits in a stat is not a big
deal, so increment it without protection.

v2: Don't bother with disabling irq, we can miss one count in
    rare situations

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Miller <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 kernel/lockdep.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 78325f8f113..1b58a1bbcc8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2298,7 +2298,12 @@ void trace_hardirqs_on_caller(unsigned long ip)
 		return;
 
 	if (unlikely(curr->hardirqs_enabled)) {
-		debug_atomic_inc(redundant_hardirqs_on);
+		/*
+		 * Neither irq nor preemption are disabled here
+		 * so this is racy by nature but loosing one hit
+		 * in a stat is not a big deal.
+		 */
+		this_cpu_inc(lockdep_stats.redundant_hardirqs_on);
 		return;
 	}
 	/* we'll do an OFF -> ON transition: */
-- 
cgit v1.2.3


From 913769f24eadcd38a936ffae41d9b4895ec02e43 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 15 Apr 2010 23:10:43 +0200
Subject: lockdep: Simplify debug atomic ops

Simplify debug_atomic_inc/dec by using this_cpu_inc/dec() instead
of doing it through an indirect get_cpu_var() and a manual
incrementation.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/lockdep_internals.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 8d7d4b6c741..2b174762fa0 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -140,19 +140,13 @@ struct lockdep_stats {
 DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
 
 #define debug_atomic_inc(ptr)			{		\
-	struct lockdep_stats *__cpu_lockdep_stats;		\
-								\
 	WARN_ON_ONCE(!irqs_disabled());				\
-	__cpu_lockdep_stats = &__get_cpu_var(lockdep_stats);	\
-	__cpu_lockdep_stats->ptr++;				\
+	this_cpu_inc(lockdep_stats.ptr);			\
 }
 
 #define debug_atomic_dec(ptr)			{		\
-	struct lockdep_stats *__cpu_lockdep_stats;		\
-								\
 	WARN_ON_ONCE(!irqs_disabled());				\
-	__cpu_lockdep_stats = &__get_cpu_var(lockdep_stats);	\
-	__cpu_lockdep_stats->ptr--;				\
+	this_cpu_inc(lockdep_stats.ptr);			\
 }
 
 #define debug_atomic_read(ptr)		({				\
-- 
cgit v1.2.3


From e5a5f1f015cf435eb3d2f5712ba51ffdbb92cbef Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 30 Apr 2010 19:55:00 +0200
Subject: perf: Remove leftover useless options to record trace events from
 scripts

-f, -c 1, -R are now useless for trace events recording, moreover
-M is useless and event hurts.

Remove them from the documentation examples and from record scripts.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
---
 tools/perf/Documentation/perf-trace-perl.txt                |  6 ++----
 tools/perf/Documentation/perf-trace-python.txt              | 10 ++++------
 tools/perf/scripts/perl/bin/check-perf-trace-record         |  2 +-
 tools/perf/scripts/perl/bin/failed-syscalls-record          |  2 +-
 tools/perf/scripts/perl/bin/rw-by-file-record               |  2 +-
 tools/perf/scripts/perl/bin/rw-by-pid-record                |  2 +-
 tools/perf/scripts/perl/bin/rwtop-record                    |  2 +-
 tools/perf/scripts/perl/bin/wakeup-latency-record           |  2 +-
 tools/perf/scripts/perl/bin/workqueue-stats-record          |  2 +-
 tools/perf/scripts/python/bin/failed-syscalls-by-pid-record |  2 +-
 tools/perf/scripts/python/bin/sctop-record                  |  2 +-
 tools/perf/scripts/python/bin/syscall-counts-by-pid-record  |  2 +-
 tools/perf/scripts/python/bin/syscall-counts-record         |  2 +-
 13 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/tools/perf/Documentation/perf-trace-perl.txt b/tools/perf/Documentation/perf-trace-perl.txt
index d729cee8d98..ee6525ee6d6 100644
--- a/tools/perf/Documentation/perf-trace-perl.txt
+++ b/tools/perf/Documentation/perf-trace-perl.txt
@@ -49,12 +49,10 @@ available as calls back into the perf executable (see below).
 As an example, the following perf record command can be used to record
 all sched_wakeup events in the system:
 
- # perf record -c 1 -f -a -M -R -e sched:sched_wakeup
+ # perf record -a -e sched:sched_wakeup
 
 Traces meant to be processed using a script should be recorded with
-the above options: -c 1 says to sample every event, -a to enable
-system-wide collection, -M to multiplex the output, and -R to collect
-raw samples.
+the above option: -a to enable system-wide collection.
 
 The format file for the sched_wakep event defines the following fields
 (see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
diff --git a/tools/perf/Documentation/perf-trace-python.txt b/tools/perf/Documentation/perf-trace-python.txt
index a241aca7718..16a86500dcf 100644
--- a/tools/perf/Documentation/perf-trace-python.txt
+++ b/tools/perf/Documentation/perf-trace-python.txt
@@ -93,7 +93,7 @@ don't care how it exited, so we'll use 'perf record' to record only
 the sys_enter events:
 
 ----
-# perf record -c 1 -f -a -M -R -e raw_syscalls:sys_enter
+# perf record -a -e raw_syscalls:sys_enter
 
 ^C[ perf record: Woken up 1 times to write data ]
 [ perf record: Captured and wrote 56.545 MB perf.data (~2470503 samples) ]
@@ -359,7 +359,7 @@ your script:
 # cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-record
 
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_enter
+perf record -a -e raw_syscalls:sys_enter
 ----
 
 The 'report' script is also a shell script with the same base name as
@@ -449,12 +449,10 @@ available as calls back into the perf executable (see below).
 As an example, the following perf record command can be used to record
 all sched_wakeup events in the system:
 
- # perf record -c 1 -f -a -M -R -e sched:sched_wakeup
+ # perf record -a -e sched:sched_wakeup
 
 Traces meant to be processed using a script should be recorded with
-the above options: -c 1 says to sample every event, -a to enable
-system-wide collection, -M to multiplex the output, and -R to collect
-raw samples.
+the above option: -a to enable system-wide collection.
 
 The format file for the sched_wakep event defines the following fields
 (see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
diff --git a/tools/perf/scripts/perl/bin/check-perf-trace-record b/tools/perf/scripts/perl/bin/check-perf-trace-record
index e6cb1474f8e..423ad6aed05 100644
--- a/tools/perf/scripts/perl/bin/check-perf-trace-record
+++ b/tools/perf/scripts/perl/bin/check-perf-trace-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e kmem:kmalloc -e irq:softirq_entry -e kmem:kfree
+perf record -a -e kmem:kmalloc -e irq:softirq_entry -e kmem:kfree
diff --git a/tools/perf/scripts/perl/bin/failed-syscalls-record b/tools/perf/scripts/perl/bin/failed-syscalls-record
index 6ad9b8f5f00..eb5846bcb56 100644
--- a/tools/perf/scripts/perl/bin/failed-syscalls-record
+++ b/tools/perf/scripts/perl/bin/failed-syscalls-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_exit $@
+perf record -a -e raw_syscalls:sys_exit $@
diff --git a/tools/perf/scripts/perl/bin/rw-by-file-record b/tools/perf/scripts/perl/bin/rw-by-file-record
index a828679837a..5bfaae5a6cb 100644
--- a/tools/perf/scripts/perl/bin/rw-by-file-record
+++ b/tools/perf/scripts/perl/bin/rw-by-file-record
@@ -1,3 +1,3 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e syscalls:sys_enter_read -e syscalls:sys_enter_write $@
+perf record -a -e syscalls:sys_enter_read -e syscalls:sys_enter_write $@
 
diff --git a/tools/perf/scripts/perl/bin/rw-by-pid-record b/tools/perf/scripts/perl/bin/rw-by-pid-record
index 63976bf11e8..6e0b2f7755a 100644
--- a/tools/perf/scripts/perl/bin/rw-by-pid-record
+++ b/tools/perf/scripts/perl/bin/rw-by-pid-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e syscalls:sys_enter_read -e syscalls:sys_exit_read -e syscalls:sys_enter_write -e syscalls:sys_exit_write $@
+perf record -a -e syscalls:sys_enter_read -e syscalls:sys_exit_read -e syscalls:sys_enter_write -e syscalls:sys_exit_write $@
diff --git a/tools/perf/scripts/perl/bin/rwtop-record b/tools/perf/scripts/perl/bin/rwtop-record
index 63976bf11e8..6e0b2f7755a 100644
--- a/tools/perf/scripts/perl/bin/rwtop-record
+++ b/tools/perf/scripts/perl/bin/rwtop-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e syscalls:sys_enter_read -e syscalls:sys_exit_read -e syscalls:sys_enter_write -e syscalls:sys_exit_write $@
+perf record -a -e syscalls:sys_enter_read -e syscalls:sys_exit_read -e syscalls:sys_enter_write -e syscalls:sys_exit_write $@
diff --git a/tools/perf/scripts/perl/bin/wakeup-latency-record b/tools/perf/scripts/perl/bin/wakeup-latency-record
index 9c0cf588ff8..9f2acaaae9f 100644
--- a/tools/perf/scripts/perl/bin/wakeup-latency-record
+++ b/tools/perf/scripts/perl/bin/wakeup-latency-record
@@ -1,5 +1,5 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e sched:sched_switch -e sched:sched_wakeup $@
+perf record -a -e sched:sched_switch -e sched:sched_wakeup $@
 
 
diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-record b/tools/perf/scripts/perl/bin/workqueue-stats-record
index c2a1a942113..85301f2471f 100644
--- a/tools/perf/scripts/perl/bin/workqueue-stats-record
+++ b/tools/perf/scripts/perl/bin/workqueue-stats-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e workqueue:workqueue_creation -e workqueue:workqueue_destruction -e workqueue:workqueue_execution -e workqueue:workqueue_insertion $@
+perf record -a -e workqueue:workqueue_creation -e workqueue:workqueue_destruction -e workqueue:workqueue_execution -e workqueue:workqueue_insertion $@
diff --git a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-record b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-record
index 6ad9b8f5f00..eb5846bcb56 100644
--- a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-record
+++ b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_exit $@
+perf record -a -e raw_syscalls:sys_exit $@
diff --git a/tools/perf/scripts/python/bin/sctop-record b/tools/perf/scripts/python/bin/sctop-record
index 27ccffa26ab..1fc5998b721 100644
--- a/tools/perf/scripts/python/bin/sctop-record
+++ b/tools/perf/scripts/python/bin/sctop-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_enter $@
+perf record -a -e raw_syscalls:sys_enter $@
diff --git a/tools/perf/scripts/python/bin/syscall-counts-by-pid-record b/tools/perf/scripts/python/bin/syscall-counts-by-pid-record
index 27ccffa26ab..1fc5998b721 100644
--- a/tools/perf/scripts/python/bin/syscall-counts-by-pid-record
+++ b/tools/perf/scripts/python/bin/syscall-counts-by-pid-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_enter $@
+perf record -a -e raw_syscalls:sys_enter $@
diff --git a/tools/perf/scripts/python/bin/syscall-counts-record b/tools/perf/scripts/python/bin/syscall-counts-record
index 27ccffa26ab..1fc5998b721 100644
--- a/tools/perf/scripts/python/bin/syscall-counts-record
+++ b/tools/perf/scripts/python/bin/syscall-counts-record
@@ -1,2 +1,2 @@
 #!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_enter $@
+perf record -a -e raw_syscalls:sys_enter $@
-- 
cgit v1.2.3


From d00a47cce569a3e660a8c9de5d57af28d6a9f0f7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 1 May 2010 03:08:46 +0200
Subject: perf: Fix warning while reading ring buffer headers

commit e9e94e3bd862d31777335722e747e97d9821bc1d
"perf trace: Ignore "overwrite" field if present in
/events/header_page" makes perf trace launching spurious warnings
about unexpected tokens read:

	Warning: Error: expected type 6 but read 4

This change tries to handle the overcommit field in the header_page
file whenever this field is present or not.

The problem is that if this field is not present, we try to find it
and give up in the middle of the line when we realize we are actually
dealing with another field, which is the "data" one. And this failure
abandons the file pointer in the middle of the "data" description
line:

	field: u64 timestamp;	offset:0;	size:8;	signed:0;
	field: local_t commit;	offset:8;	size:8;	signed:1;
	field: char data;	offset:16;	size:4080;	signed:1;
                      ^^^
                      Here

What happens next is that we want to read this line to parse the data
field, but we fail because the pointer is not in the beginning of the
line.

We could probably fix that by rewinding the pointer. But in fact we
don't care much about these headers that only concern the ftrace
ring-buffer. We don't use them from perf.

Just skip this part of perf.data, but don't remove it from recording
to stay compatible with olders perf.data

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 tools/perf/util/trace-event-parse.c | 89 -------------------------------------
 tools/perf/util/trace-event-read.c  | 12 ++---
 tools/perf/util/trace-event.h       |  1 -
 3 files changed, 7 insertions(+), 95 deletions(-)

diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index d6ef414075a..069f261b225 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -691,11 +691,6 @@ static int __read_expected(enum event_type expect, const char *str,
 	return ret;
 }
 
-static int read_expected_warn(enum event_type expect, const char *str, bool warn)
-{
-	return __read_expected(expect, str, 1, warn);
-}
-
 static int read_expected(enum event_type expect, const char *str)
 {
 	return __read_expected(expect, str, 1, true);
@@ -3104,90 +3099,6 @@ static void print_args(struct print_arg *args)
 	}
 }
 
-static void parse_header_field(const char *field,
-			       int *offset, int *size, bool warn)
-{
-	char *token;
-	int type;
-
-	if (read_expected(EVENT_ITEM, "field") < 0)
-		return;
-	if (read_expected(EVENT_OP, ":") < 0)
-		return;
-
-	/* type */
-	if (read_expect_type(EVENT_ITEM, &token) < 0)
-		goto fail;
-	free_token(token);
-
-	if (read_expected_warn(EVENT_ITEM, field, warn) < 0)
-		return;
-	if (read_expected(EVENT_OP, ";") < 0)
-		return;
-	if (read_expected(EVENT_ITEM, "offset") < 0)
-		return;
-	if (read_expected(EVENT_OP, ":") < 0)
-		return;
-	if (read_expect_type(EVENT_ITEM, &token) < 0)
-		goto fail;
-	*offset = atoi(token);
-	free_token(token);
-	if (read_expected(EVENT_OP, ";") < 0)
-		return;
-	if (read_expected(EVENT_ITEM, "size") < 0)
-		return;
-	if (read_expected(EVENT_OP, ":") < 0)
-		return;
-	if (read_expect_type(EVENT_ITEM, &token) < 0)
-		goto fail;
-	*size = atoi(token);
-	free_token(token);
-	if (read_expected(EVENT_OP, ";") < 0)
-		return;
-	type = read_token(&token);
-	if (type != EVENT_NEWLINE) {
-		/* newer versions of the kernel have a "signed" type */
-		if (type != EVENT_ITEM)
-			goto fail;
-
-		if (strcmp(token, "signed") != 0)
-			goto fail;
-
-		free_token(token);
-
-		if (read_expected(EVENT_OP, ":") < 0)
-			return;
-
-		if (read_expect_type(EVENT_ITEM, &token))
-			goto fail;
-
-		free_token(token);
-		if (read_expected(EVENT_OP, ";") < 0)
-			return;
-
-		if (read_expect_type(EVENT_NEWLINE, &token))
-			goto fail;
-	}
- fail:
-	free_token(token);
-}
-
-int parse_header_page(char *buf, unsigned long size)
-{
-	init_input_buf(buf, size);
-
-	parse_header_field("timestamp", &header_page_ts_offset,
-			   &header_page_ts_size, true);
-	parse_header_field("commit", &header_page_size_offset,
-			   &header_page_size_size, true);
-	parse_header_field("overwrite", &header_page_overwrite_offset,
-			   &header_page_overwrite_size, false);
-	parse_header_field("data", &header_page_data_offset,
-			   &header_page_data_size, true);
-
-	return 0;
-}
-
 int parse_ftrace_file(char *buf, unsigned long size)
 {
 	struct format_field *field;
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index 44889c9b563..46066391288 100644
--- a/tools/perf/util/trace-event-read.c
+++ b/tools/perf/util/trace-event-read.c
@@ -52,6 +52,12 @@ static unsigned long	page_size;
 
 static ssize_t calc_data_size;
 
+/* If it fails, the next read will report it */
+static void skip(int size)
+{
+	lseek(input_fd, size, SEEK_CUR);
+}
+
 static int do_read(int fd, void *buf, int size)
 {
 	int rsize = size;
@@ -169,7 +175,6 @@ static void read_ftrace_printk(void)
 static void read_header_files(void)
 {
 	unsigned long long size;
-	char *header_page;
 	char *header_event;
 	char buf[BUFSIZ];
 
@@ -179,10 +184,7 @@ static void read_header_files(void)
 		die("did not read header page");
 
 	size = read8();
-	header_page = malloc_or_die(size);
-	read_or_die(header_page, size);
-	parse_header_page(header_page, size);
-	free(header_page);
+	skip(size);
 
 	/*
 	 * The size field in the page is of type long,
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 1f45d468fd9..4e1acc31eba 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -244,7 +244,6 @@ extern int header_page_data_size;
 
 extern bool latency_format;
 
-int parse_header_page(char *buf, unsigned long size);
 int trace_parse_common_type(void *data);
 int trace_parse_common_pid(void *data);
 int parse_common_pc(void *data);
-- 
cgit v1.2.3


From 73266fc1df2f94cf72b3beba3eee3b88ed0b0664 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 22 Apr 2010 05:05:45 +0200
Subject: hw-breakpoints: Tag ptrace breakpoint as exclude_kernel

Tag ptrace breakpoints with the exclude_kernel attribute set. This
will make it easier to set generic policies on breakpoints, when it
comes to ensure nobody unpriviliged try to breakpoint on the kernel.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: K. Prasad <prasad@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/sh/kernel/ptrace_32.c    | 2 +-
 arch/x86/kernel/ptrace.c      | 2 +-
 include/linux/hw_breakpoint.h | 6 ++++++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index 7759a9a9321..d4104ce9fe5 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -85,7 +85,7 @@ static int set_single_step(struct task_struct *tsk, unsigned long addr)
 
 	bp = thread->ptrace_bps[0];
 	if (!bp) {
-		hw_breakpoint_init(&attr);
+		ptrace_breakpoint_init(&attr);
 
 		attr.bp_addr = addr;
 		attr.bp_len = HW_BREAKPOINT_LEN_2;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 055be0afd33..70c4872cd8a 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -688,7 +688,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 	struct perf_event_attr attr;
 
 	if (!t->ptrace_bps[nr]) {
-		hw_breakpoint_init(&attr);
+		ptrace_breakpoint_init(&attr);
 		/*
 		 * Put stub len and type to register (reserve) an inactive but
 		 * correct bp
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index c70d27af03f..a0aa5a9cfb0 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -34,6 +34,12 @@ static inline void hw_breakpoint_init(struct perf_event_attr *attr)
 	attr->sample_period = 1;
 }
 
+static inline void ptrace_breakpoint_init(struct perf_event_attr *attr)
+{
+	hw_breakpoint_init(attr);
+	attr->exclude_kernel = 1;
+}
+
 static inline unsigned long hw_breakpoint_addr(struct perf_event *bp)
 {
 	return bp->attr.bp_addr;
-- 
cgit v1.2.3


From 87e9b2024659c614a876ce359a57e98a47b5ef37 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 17 Apr 2010 18:11:59 +0200
Subject: hw-breakpoints: Check disabled breakpoints again

We stopped checking disabled breakpoints because we weren't
allowing breakpoints on NULL addresses. And gdb tends to set
NULL addresses on inactive breakpoints.

But refusing NULL addresses was actually a regression that has
been fixed now. There is no reason anymore to not validate
inactive breakpoint settings.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: K. Prasad <prasad@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Ingo Molnar <mingo@elte.hu>
---
 kernel/hw_breakpoint.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 03808ed342a..9ed9ae3a48b 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -316,17 +316,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
 	if (ret)
 		return ret;
 
-	/*
-	 * Ptrace breakpoints can be temporary perf events only
-	 * meant to reserve a slot. In this case, it is created disabled and
-	 * we don't want to check the params right now (as we put a null addr)
-	 * But perf tools create events as disabled and we want to check
-	 * the params for them.
-	 * This is a quick hack that will be removed soon, once we remove
-	 * the tmp breakpoints from ptrace
-	 */
-	if (!bp->attr.disabled || !bp->overflow_handler)
-		ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+	ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
 
 	/* if arch_validate_hwbkpt_settings() fails then release bp slot */
 	if (ret)
-- 
cgit v1.2.3


From b2812d031dea86926e9c10f7714af33ac2f6b43d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 18 Apr 2010 18:11:53 +0200
Subject: hw-breakpoints: Change/Enforce some breakpoints policies

The current policies of breakpoints in x86 and SH are the following:

- task bound breakpoints can only break on userspace addresses
- cpu wide breakpoints can only break on kernel addresses

The former rule prevents ptrace breakpoints to be set to trigger on
kernel addresses, which is good. But as a side effect, we can't
breakpoint on kernel addresses for task bound breakpoints.

The latter rule simply makes no sense, there is no reason why we
can't set breakpoints on userspace while performing cpu bound
profiles.

We want the following new policies:

- task bound breakpoint can set userspace address breakpoints, with
no particular privilege required.
- task bound breakpoints can set kernelspace address breakpoints but
must be privileged to do that.
- cpu bound breakpoints can do what they want as they are privileged
already.

To implement these new policies, this patch checks if we are dealing
with a kernel address breakpoint, if so and if the exclude_kernel
parameter is set, we tell the user that the breakpoint is invalid,
which makes a good generic ptrace protection.
If we don't have exclude_kernel, ensure the user has the right
privileges as kernel breakpoints are quite sensitive (risk of
trap recursion attacks and global performance impacts).

[ Paul Mundt: keep addr space check for sh signal delivery and fix
  double function declaration]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: K. Prasad <prasad@linux.vnet.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/hw_breakpoint.h  |  5 ++---
 arch/sh/kernel/hw_breakpoint.c       | 34 ++++++------------------------
 arch/x86/include/asm/hw_breakpoint.h |  5 ++---
 arch/x86/kernel/hw_breakpoint.c      | 41 ++++++------------------------------
 kernel/hw_breakpoint.c               | 26 +++++++++++++++++++++--
 5 files changed, 41 insertions(+), 70 deletions(-)

diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h
index 965dd780d51..382bad937dc 100644
--- a/arch/sh/include/asm/hw_breakpoint.h
+++ b/arch/sh/include/asm/hw_breakpoint.h
@@ -47,9 +47,8 @@ struct pmu;
 #define HBP_NUM		2
 
 /* arch/sh/kernel/hw_breakpoint.c */
-extern int arch_check_va_in_userspace(unsigned long va, u16 hbp_len);
-extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
-					 struct task_struct *tsk);
+extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
 extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
 					   unsigned long val, void *data);
 
diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c
index 675eea7785d..1f2cf622986 100644
--- a/arch/sh/kernel/hw_breakpoint.c
+++ b/arch/sh/kernel/hw_breakpoint.c
@@ -119,26 +119,17 @@ static int get_hbp_len(u16 hbp_len)
 	return len_in_bytes;
 }
 
-/*
- * Check for virtual address in user space.
- */
-int arch_check_va_in_userspace(unsigned long va, u16 hbp_len)
-{
-	unsigned int len;
-
-	len = get_hbp_len(hbp_len);
-
-	return (va <= TASK_SIZE - len);
-}
-
 /*
  * Check for virtual address in kernel space.
  */
-static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+int arch_check_bp_in_kernelspace(struct perf_event *bp)
 {
 	unsigned int len;
+	unsigned long va;
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 
-	len = get_hbp_len(hbp_len);
+	va = info->address;
+	len = get_hbp_len(info->len);
 
 	return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
 }
@@ -226,8 +217,7 @@ static int arch_build_bp_info(struct perf_event *bp)
 /*
  * Validate the arch-specific HW Breakpoint register settings
  */
-int arch_validate_hwbkpt_settings(struct perf_event *bp,
-				  struct task_struct *tsk)
+int arch_validate_hwbkpt_settings(struct perf_event *bp)
 {
 	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 	unsigned int align;
@@ -270,15 +260,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
 	if (info->address & align)
 		return -EINVAL;
 
-	/* Check that the virtual address is in the proper range */
-	if (tsk) {
-		if (!arch_check_va_in_userspace(info->address, info->len))
-			return -EFAULT;
-	} else {
-		if (!arch_check_va_in_kernelspace(info->address, info->len))
-			return -EFAULT;
-	}
-
 	return 0;
 }
 
@@ -363,8 +344,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
 		perf_bp_event(bp, args->regs);
 
 		/* Deliver the signal to userspace */
-		if (arch_check_va_in_userspace(bp->attr.bp_addr,
-					       bp->attr.bp_len)) {
+		if (!arch_check_bp_in_kernelspace(bp)) {
 			siginfo_t info;
 
 			info.si_signo = args->signr;
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index 2a1bd8f4f23..c77a5a6fab9 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -44,9 +44,8 @@ struct arch_hw_breakpoint {
 struct perf_event;
 struct pmu;
 
-extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
-extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
-					 struct task_struct *tsk);
+extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
+extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
 extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
 					   unsigned long val, void *data);
 
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index d6cc065f519..a8f1b803d2f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -188,26 +188,17 @@ static int get_hbp_len(u8 hbp_len)
 	return len_in_bytes;
 }
 
-/*
- * Check for virtual address in user space.
- */
-int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
-{
-	unsigned int len;
-
-	len = get_hbp_len(hbp_len);
-
-	return (va <= TASK_SIZE - len);
-}
-
 /*
  * Check for virtual address in kernel space.
  */
-static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+int arch_check_bp_in_kernelspace(struct perf_event *bp)
 {
 	unsigned int len;
+	unsigned long va;
+	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 
-	len = get_hbp_len(hbp_len);
+	va = info->address;
+	len = get_hbp_len(info->len);
 
 	return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
 }
@@ -300,8 +291,7 @@ static int arch_build_bp_info(struct perf_event *bp)
 /*
  * Validate the arch-specific HW Breakpoint register settings
  */
-int arch_validate_hwbkpt_settings(struct perf_event *bp,
-				  struct task_struct *tsk)
+int arch_validate_hwbkpt_settings(struct perf_event *bp)
 {
 	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
 	unsigned int align;
@@ -314,16 +304,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
 
 	ret = -EINVAL;
 
-	if (info->type == X86_BREAKPOINT_EXECUTE)
-		/*
-		 * Ptrace-refactoring code
-		 * For now, we'll allow instruction breakpoint only for user-space
-		 * addresses
-		 */
-		if ((!arch_check_va_in_userspace(info->address, info->len)) &&
-			info->len != X86_BREAKPOINT_EXECUTE)
-			return ret;
-
 	switch (info->len) {
 	case X86_BREAKPOINT_LEN_1:
 		align = 0;
@@ -350,15 +330,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
 	if (info->address & align)
 		return -EINVAL;
 
-	/* Check that the virtual address is in the proper range */
-	if (tsk) {
-		if (!arch_check_va_in_userspace(info->address, info->len))
-			return -EFAULT;
-	} else {
-		if (!arch_check_va_in_kernelspace(info->address, info->len))
-			return -EFAULT;
-	}
-
 	return 0;
 }
 
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 9ed9ae3a48b..89e8a050c43 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -308,6 +308,28 @@ int dbg_release_bp_slot(struct perf_event *bp)
 	return 0;
 }
 
+static int validate_hw_breakpoint(struct perf_event *bp)
+{
+	int ret;
+
+	ret = arch_validate_hwbkpt_settings(bp);
+	if (ret)
+		return ret;
+
+	if (arch_check_bp_in_kernelspace(bp)) {
+		if (bp->attr.exclude_kernel)
+			return -EINVAL;
+		/*
+		 * Don't let unprivileged users set a breakpoint in the trap
+		 * path to avoid trap recursion attacks.
+		 */
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+	}
+
+	return 0;
+}
+
 int register_perf_hw_breakpoint(struct perf_event *bp)
 {
 	int ret;
@@ -316,7 +338,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
 	if (ret)
 		return ret;
 
-	ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+	ret = validate_hw_breakpoint(bp);
 
 	/* if arch_validate_hwbkpt_settings() fails then release bp slot */
 	if (ret)
@@ -363,7 +385,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
 	if (attr->disabled)
 		goto end;
 
-	err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
+	err = validate_hw_breakpoint(bp);
 	if (!err)
 		perf_event_enable(bp);
 
-- 
cgit v1.2.3


From 0102752e4c9e0655b39734550d4c35327954f7f9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 11 Apr 2010 18:55:56 +0200
Subject: hw-breakpoints: Separate constraint space for data and instruction
 breakpoints

There are two outstanding fashions for archs to implement hardware
breakpoints.

The first is to separate breakpoint address pattern definition
space between data and instruction breakpoints. We then have
typically distinct instruction address breakpoint registers
and data address breakpoint registers, delivered with
separate control registers for data and instruction breakpoints
as well. This is the case of PowerPc and ARM for example.

The second consists in having merged breakpoint address space
definition between data and instruction breakpoint. Address
registers can host either instruction or data address and
the access mode for the breakpoint is defined in a control
register. This is the case of x86 and Super H.

This patch adds a new CONFIG_HAVE_MIXED_BREAKPOINTS_REGS config
that archs can select if they belong to the second case. Those
will have their slot allocation merged for instructions and
data breakpoints.

The others will have a separate slot tracking between data and
instruction breakpoints.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: K. Prasad <prasad@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/Kconfig                  | 11 ++++++
 arch/sh/Kconfig               |  1 +
 arch/x86/Kconfig              |  1 +
 include/linux/hw_breakpoint.h |  9 +++--
 kernel/hw_breakpoint.c        | 86 +++++++++++++++++++++++++++++--------------
 5 files changed, 78 insertions(+), 30 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index f06010fb483..acda512da2e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -137,6 +137,17 @@ config HAVE_HW_BREAKPOINT
 	bool
 	depends on PERF_EVENTS
 
+config HAVE_MIXED_BREAKPOINTS_REGS
+	bool
+	depends on HAVE_HW_BREAKPOINT
+	help
+	  Depending on the arch implementation of hardware breakpoints,
+	  some of them have separate registers for data and instruction
+	  breakpoints addresses, others have mixed registers to store
+	  them but define the access type in a control register.
+	  Select this option if your arch implements breakpoints under the
+	  latter fashion.
+
 config HAVE_USER_RETURN_NOTIFIER
 	bool
 
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 8d90564c2bc..e6d8ab5cfa9 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH32
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_ARCH_KGDB
 	select HAVE_HW_BREAKPOINT
+	select HAVE_MIXED_BREAKPOINTS_REGS
 	select PERF_EVENTS if HAVE_HW_BREAKPOINT
 	select ARCH_HIBERNATION_POSSIBLE if MMU
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 97a95dfd118..01177dcbe26 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -53,6 +53,7 @@ config X86
 	select HAVE_KERNEL_LZMA
 	select HAVE_KERNEL_LZO
 	select HAVE_HW_BREAKPOINT
+	select HAVE_MIXED_BREAKPOINTS_REGS
 	select PERF_EVENTS
 	select ANON_INODES
 	select HAVE_ARCH_KMEMCHECK
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index a0aa5a9cfb0..7e889909309 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -9,9 +9,12 @@ enum {
 };
 
 enum {
-	HW_BREAKPOINT_R = 1,
-	HW_BREAKPOINT_W = 2,
-	HW_BREAKPOINT_X = 4,
+	HW_BREAKPOINT_EMPTY	= 0,
+	HW_BREAKPOINT_R		= 1,
+	HW_BREAKPOINT_W		= 2,
+	HW_BREAKPOINT_RW	= HW_BREAKPOINT_R | HW_BREAKPOINT_W,
+	HW_BREAKPOINT_X		= 4,
+	HW_BREAKPOINT_INVALID   = HW_BREAKPOINT_RW | HW_BREAKPOINT_X,
 };
 
 #ifdef __KERNEL__
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 89e8a050c43..8ead1345e33 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -45,18 +45,28 @@
 
 #include <linux/hw_breakpoint.h>
 
+enum bp_type_idx {
+	TYPE_INST 	= 0,
+#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS
+	TYPE_DATA	= 0,
+#else
+	TYPE_DATA	= 1,
+#endif
+	TYPE_MAX
+};
+
 /*
  * Constraints data
  */
 
 /* Number of pinned cpu breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
+static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
 
 /* Number of pinned task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
+static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[TYPE_MAX][HBP_NUM]);
 
 /* Number of non-pinned cpu/task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
+static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
 
 /* Gather the number of total pinned and un-pinned bp in a cpuset */
 struct bp_busy_slots {
@@ -67,14 +77,22 @@ struct bp_busy_slots {
 /* Serialize accesses to the above constraints */
 static DEFINE_MUTEX(nr_bp_mutex);
 
+static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
+{
+	if (bp->attr.bp_type & HW_BREAKPOINT_RW)
+		return TYPE_DATA;
+
+	return TYPE_INST;
+}
+
 /*
  * Report the maximum number of pinned breakpoints a task
  * have in this cpu
  */
-static unsigned int max_task_bp_pinned(int cpu)
+static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
 {
 	int i;
-	unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
+	unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
 
 	for (i = HBP_NUM -1; i >= 0; i--) {
 		if (tsk_pinned[i] > 0)
@@ -84,7 +102,7 @@ static unsigned int max_task_bp_pinned(int cpu)
 	return 0;
 }
 
-static int task_bp_pinned(struct task_struct *tsk)
+static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
 {
 	struct perf_event_context *ctx = tsk->perf_event_ctxp;
 	struct list_head *list;
@@ -105,7 +123,8 @@ static int task_bp_pinned(struct task_struct *tsk)
 	 */
 	list_for_each_entry(bp, list, event_entry) {
 		if (bp->attr.type == PERF_TYPE_BREAKPOINT)
-			count++;
+			if (find_slot_idx(bp) == type)
+				count++;
 	}
 
 	raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -118,18 +137,19 @@ static int task_bp_pinned(struct task_struct *tsk)
  * a given cpu (cpu > -1) or in all of them (cpu = -1).
  */
 static void
-fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
+fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
+		    enum bp_type_idx type)
 {
 	int cpu = bp->cpu;
 	struct task_struct *tsk = bp->ctx->task;
 
 	if (cpu >= 0) {
-		slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
+		slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
 		if (!tsk)
-			slots->pinned += max_task_bp_pinned(cpu);
+			slots->pinned += max_task_bp_pinned(cpu, type);
 		else
-			slots->pinned += task_bp_pinned(tsk);
-		slots->flexible = per_cpu(nr_bp_flexible, cpu);
+			slots->pinned += task_bp_pinned(tsk, type);
+		slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
 
 		return;
 	}
@@ -137,16 +157,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
 	for_each_online_cpu(cpu) {
 		unsigned int nr;
 
-		nr = per_cpu(nr_cpu_bp_pinned, cpu);
+		nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
 		if (!tsk)
-			nr += max_task_bp_pinned(cpu);
+			nr += max_task_bp_pinned(cpu, type);
 		else
-			nr += task_bp_pinned(tsk);
+			nr += task_bp_pinned(tsk, type);
 
 		if (nr > slots->pinned)
 			slots->pinned = nr;
 
-		nr = per_cpu(nr_bp_flexible, cpu);
+		nr = per_cpu(nr_bp_flexible[type], cpu);
 
 		if (nr > slots->flexible)
 			slots->flexible = nr;
@@ -156,14 +176,15 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
 /*
  * Add a pinned breakpoint for the given task in our constraint table
  */
-static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
+static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
+				enum bp_type_idx type)
 {
 	unsigned int *tsk_pinned;
 	int count = 0;
 
-	count = task_bp_pinned(tsk);
+	count = task_bp_pinned(tsk, type);
 
-	tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
+	tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
 	if (enable) {
 		tsk_pinned[count]++;
 		if (count > 0)
@@ -178,7 +199,8 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
 /*
  * Add/remove the given breakpoint in our constraint table
  */
-static void toggle_bp_slot(struct perf_event *bp, bool enable)
+static void
+toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type)
 {
 	int cpu = bp->cpu;
 	struct task_struct *tsk = bp->ctx->task;
@@ -186,20 +208,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
 	/* Pinned counter task profiling */
 	if (tsk) {
 		if (cpu >= 0) {
-			toggle_bp_task_slot(tsk, cpu, enable);
+			toggle_bp_task_slot(tsk, cpu, enable, type);
 			return;
 		}
 
 		for_each_online_cpu(cpu)
-			toggle_bp_task_slot(tsk, cpu, enable);
+			toggle_bp_task_slot(tsk, cpu, enable, type);
 		return;
 	}
 
 	/* Pinned counter cpu profiling */
 	if (enable)
-		per_cpu(nr_cpu_bp_pinned, bp->cpu)++;
+		per_cpu(nr_cpu_bp_pinned[type], bp->cpu)++;
 	else
-		per_cpu(nr_cpu_bp_pinned, bp->cpu)--;
+		per_cpu(nr_cpu_bp_pinned[type], bp->cpu)--;
 }
 
 /*
@@ -246,14 +268,21 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
 static int __reserve_bp_slot(struct perf_event *bp)
 {
 	struct bp_busy_slots slots = {0};
+	enum bp_type_idx type;
 
-	fetch_bp_busy_slots(&slots, bp);
+	/* Basic checks */
+	if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
+	    bp->attr.bp_type == HW_BREAKPOINT_INVALID)
+		return -EINVAL;
+
+	type = find_slot_idx(bp);
+	fetch_bp_busy_slots(&slots, bp, type);
 
 	/* Flexible counters need to keep at least one slot */
 	if (slots.pinned + (!!slots.flexible) == HBP_NUM)
 		return -ENOSPC;
 
-	toggle_bp_slot(bp, true);
+	toggle_bp_slot(bp, true, type);
 
 	return 0;
 }
@@ -273,7 +302,10 @@ int reserve_bp_slot(struct perf_event *bp)
 
 static void __release_bp_slot(struct perf_event *bp)
 {
-	toggle_bp_slot(bp, false);
+	enum bp_type_idx type;
+
+	type = find_slot_idx(bp);
+	toggle_bp_slot(bp, false, type);
 }
 
 void release_bp_slot(struct perf_event *bp)
-- 
cgit v1.2.3


From f93a20541134fa767e8dc4eb32e956d30b9f6b92 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 13 Apr 2010 00:32:30 +0200
Subject: hw-breakpoints: Handle breakpoint weight in allocation constraints

Depending on their nature and on what an arch supports, breakpoints
may consume more than one address register. For example a simple
absolute address match usually only requires one address register.
But an address range match may consume two registers.

Currently our slot allocation constraints, that tend to reflect the
limited arch's resources, always consider that a breakpoint consumes
one slot.

Then provide a way for archs to tell us the weight of a breakpoint
through a new hw_breakpoint_weight() helper. This weight will be
computed against the generic allocation constraints instead of
a constant value.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: K. Prasad <prasad@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
---
 kernel/hw_breakpoint.c | 63 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 45 insertions(+), 18 deletions(-)

diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 8ead1345e33..974498b858f 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -77,6 +77,11 @@ struct bp_busy_slots {
 /* Serialize accesses to the above constraints */
 static DEFINE_MUTEX(nr_bp_mutex);
 
+__weak int hw_breakpoint_weight(struct perf_event *bp)
+{
+	return 1;
+}
+
 static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
 {
 	if (bp->attr.bp_type & HW_BREAKPOINT_RW)
@@ -124,7 +129,7 @@ static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
 	list_for_each_entry(bp, list, event_entry) {
 		if (bp->attr.type == PERF_TYPE_BREAKPOINT)
 			if (find_slot_idx(bp) == type)
-				count++;
+				count += hw_breakpoint_weight(bp);
 	}
 
 	raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -173,26 +178,41 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 	}
 }
 
+/*
+ * For now, continue to consider flexible as pinned, until we can
+ * ensure no flexible event can ever be scheduled before a pinned event
+ * in a same cpu.
+ */
+static void
+fetch_this_slot(struct bp_busy_slots *slots, int weight)
+{
+	slots->pinned += weight;
+}
+
 /*
  * Add a pinned breakpoint for the given task in our constraint table
  */
 static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
-				enum bp_type_idx type)
+				enum bp_type_idx type, int weight)
 {
 	unsigned int *tsk_pinned;
-	int count = 0;
+	int old_count = 0;
+	int old_idx = 0;
+	int idx = 0;
 
-	count = task_bp_pinned(tsk, type);
+	old_count = task_bp_pinned(tsk, type);
+	old_idx = old_count - 1;
+	idx = old_idx + weight;
 
 	tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
 	if (enable) {
-		tsk_pinned[count]++;
-		if (count > 0)
-			tsk_pinned[count-1]--;
+		tsk_pinned[idx]++;
+		if (old_count > 0)
+			tsk_pinned[old_idx]--;
 	} else {
-		tsk_pinned[count]--;
-		if (count > 0)
-			tsk_pinned[count-1]++;
+		tsk_pinned[idx]--;
+		if (old_count > 0)
+			tsk_pinned[old_idx]++;
 	}
 }
 
@@ -200,7 +220,8 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
  * Add/remove the given breakpoint in our constraint table
  */
 static void
-toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type)
+toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
+	       int weight)
 {
 	int cpu = bp->cpu;
 	struct task_struct *tsk = bp->ctx->task;
@@ -208,20 +229,20 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type)
 	/* Pinned counter task profiling */
 	if (tsk) {
 		if (cpu >= 0) {
-			toggle_bp_task_slot(tsk, cpu, enable, type);
+			toggle_bp_task_slot(tsk, cpu, enable, type, weight);
 			return;
 		}
 
 		for_each_online_cpu(cpu)
-			toggle_bp_task_slot(tsk, cpu, enable, type);
+			toggle_bp_task_slot(tsk, cpu, enable, type, weight);
 		return;
 	}
 
 	/* Pinned counter cpu profiling */
 	if (enable)
-		per_cpu(nr_cpu_bp_pinned[type], bp->cpu)++;
+		per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
 	else
-		per_cpu(nr_cpu_bp_pinned[type], bp->cpu)--;
+		per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
 }
 
 /*
@@ -269,6 +290,7 @@ static int __reserve_bp_slot(struct perf_event *bp)
 {
 	struct bp_busy_slots slots = {0};
 	enum bp_type_idx type;
+	int weight;
 
 	/* Basic checks */
 	if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
@@ -276,13 +298,16 @@ static int __reserve_bp_slot(struct perf_event *bp)
 		return -EINVAL;
 
 	type = find_slot_idx(bp);
+	weight = hw_breakpoint_weight(bp);
+
 	fetch_bp_busy_slots(&slots, bp, type);
+	fetch_this_slot(&slots, weight);
 
 	/* Flexible counters need to keep at least one slot */
-	if (slots.pinned + (!!slots.flexible) == HBP_NUM)
+	if (slots.pinned + (!!slots.flexible) > HBP_NUM)
 		return -ENOSPC;
 
-	toggle_bp_slot(bp, true, type);
+	toggle_bp_slot(bp, true, type, weight);
 
 	return 0;
 }
@@ -303,9 +328,11 @@ int reserve_bp_slot(struct perf_event *bp)
 static void __release_bp_slot(struct perf_event *bp)
 {
 	enum bp_type_idx type;
+	int weight;
 
 	type = find_slot_idx(bp);
-	toggle_bp_slot(bp, false, type);
+	weight = hw_breakpoint_weight(bp);
+	toggle_bp_slot(bp, false, type, weight);
 }
 
 void release_bp_slot(struct perf_event *bp)
-- 
cgit v1.2.3


From feef47d0cb530e8419dfa0b48141b538b89b1b1a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 23 Apr 2010 05:59:55 +0200
Subject: hw-breakpoints: Get the number of available registers on boot
 dynamically

The breakpoint generic layer assumes that archs always know in advance
the static number of address registers available to host breakpoints
through the HBP_NUM macro.

However this is not true for every archs. For example Arm needs to get
this information dynamically to handle the compatiblity between
different versions.

To solve this, this patch proposes to drop the static HBP_NUM macro
and let the arch provide the number of available slots through a
new hw_breakpoint_slots() function. For archs that have
CONFIG_HAVE_MIXED_BREAKPOINTS_REGS selected, it will be called once
as the number of registers fits for instruction and data breakpoints
together.
For the others it will be called first to get the number of
instruction breakpoint registers and another time to get the
data breakpoint registers, the targeted type is given as a
parameter of hw_breakpoint_slots().

Reported-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: K. Prasad <prasad@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Ingo Molnar <mingo@elte.hu>
---
 arch/sh/include/asm/hw_breakpoint.h  |  5 ++++
 arch/x86/include/asm/hw_breakpoint.h |  5 ++++
 include/linux/hw_breakpoint.h        | 10 +++++++
 kernel/hw_breakpoint.c               | 53 ++++++++++++++++++++++++++++--------
 kernel/trace/trace_ksym.c            | 26 +++++-------------
 5 files changed, 68 insertions(+), 31 deletions(-)

diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h
index 382bad937dc..e14cad96798 100644
--- a/arch/sh/include/asm/hw_breakpoint.h
+++ b/arch/sh/include/asm/hw_breakpoint.h
@@ -46,6 +46,11 @@ struct pmu;
 /* Maximum number of UBC channels */
 #define HBP_NUM		2
 
+static inline int hw_breakpoint_slots(int type)
+{
+	return HBP_NUM;
+}
+
 /* arch/sh/kernel/hw_breakpoint.c */
 extern int arch_check_bp_in_kernelspace(struct perf_event *bp);
 extern int arch_validate_hwbkpt_settings(struct perf_event *bp);
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index c77a5a6fab9..942255310e6 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -41,6 +41,11 @@ struct arch_hw_breakpoint {
 /* Total number of available HW breakpoint registers */
 #define HBP_NUM 4
 
+static inline int hw_breakpoint_slots(int type)
+{
+	return HBP_NUM;
+}
+
 struct perf_event;
 struct pmu;
 
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 7e889909309..a2d6ea49ec5 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -17,6 +17,16 @@ enum {
 	HW_BREAKPOINT_INVALID   = HW_BREAKPOINT_RW | HW_BREAKPOINT_X,
 };
 
+enum bp_type_idx {
+	TYPE_INST 	= 0,
+#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS
+	TYPE_DATA	= 0,
+#else
+	TYPE_DATA	= 1,
+#endif
+	TYPE_MAX
+};
+
 #ifdef __KERNEL__
 
 #include <linux/perf_event.h>
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 974498b858f..684b710cbb9 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,20 +40,12 @@
 #include <linux/percpu.h>
 #include <linux/sched.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 
 #include <linux/hw_breakpoint.h>
 
-enum bp_type_idx {
-	TYPE_INST 	= 0,
-#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS
-	TYPE_DATA	= 0,
-#else
-	TYPE_DATA	= 1,
-#endif
-	TYPE_MAX
-};
 
 /*
  * Constraints data
@@ -63,11 +55,15 @@ enum bp_type_idx {
 static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
 
 /* Number of pinned task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[TYPE_MAX][HBP_NUM]);
+static DEFINE_PER_CPU(unsigned int, *nr_task_bp_pinned[TYPE_MAX]);
 
 /* Number of non-pinned cpu/task breakpoints in a cpu */
 static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
 
+static int nr_slots[TYPE_MAX];
+
+static int constraints_initialized;
+
 /* Gather the number of total pinned and un-pinned bp in a cpuset */
 struct bp_busy_slots {
 	unsigned int pinned;
@@ -99,7 +95,7 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
 	int i;
 	unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
 
-	for (i = HBP_NUM -1; i >= 0; i--) {
+	for (i = nr_slots[type] - 1; i >= 0; i--) {
 		if (tsk_pinned[i] > 0)
 			return i + 1;
 	}
@@ -292,6 +288,10 @@ static int __reserve_bp_slot(struct perf_event *bp)
 	enum bp_type_idx type;
 	int weight;
 
+	/* We couldn't initialize breakpoint constraints on boot */
+	if (!constraints_initialized)
+		return -ENOMEM;
+
 	/* Basic checks */
 	if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
 	    bp->attr.bp_type == HW_BREAKPOINT_INVALID)
@@ -304,7 +304,7 @@ static int __reserve_bp_slot(struct perf_event *bp)
 	fetch_this_slot(&slots, weight);
 
 	/* Flexible counters need to keep at least one slot */
-	if (slots.pinned + (!!slots.flexible) > HBP_NUM)
+	if (slots.pinned + (!!slots.flexible) > nr_slots[type])
 		return -ENOSPC;
 
 	toggle_bp_slot(bp, true, type, weight);
@@ -551,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
 
 static int __init init_hw_breakpoint(void)
 {
+	unsigned int **task_bp_pinned;
+	int cpu, err_cpu;
+	int i;
+
+	for (i = 0; i < TYPE_MAX; i++)
+		nr_slots[i] = hw_breakpoint_slots(i);
+
+	for_each_possible_cpu(cpu) {
+		for (i = 0; i < TYPE_MAX; i++) {
+			task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
+			*task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
+						  GFP_KERNEL);
+			if (!*task_bp_pinned)
+				goto err_alloc;
+		}
+	}
+
+	constraints_initialized = 1;
+
 	return register_die_notifier(&hw_breakpoint_exceptions_nb);
+
+ err_alloc:
+	for_each_possible_cpu(err_cpu) {
+		if (err_cpu == cpu)
+			break;
+		for (i = 0; i < TYPE_MAX; i++)
+			kfree(per_cpu(nr_task_bp_pinned[i], cpu));
+	}
+
+	return -ENOMEM;
 }
 core_initcall(init_hw_breakpoint);
 
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index d59cd687947..8eaf00749b6 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -34,12 +34,6 @@
 
 #include <asm/atomic.h>
 
-/*
- * For now, let us restrict the no. of symbols traced simultaneously to number
- * of available hardware breakpoint registers.
- */
-#define KSYM_TRACER_MAX HBP_NUM
-
 #define KSYM_TRACER_OP_LEN 3 /* rw- */
 
 struct trace_ksym {
@@ -53,7 +47,6 @@ struct trace_ksym {
 
 static struct trace_array *ksym_trace_array;
 
-static unsigned int ksym_filter_entry_count;
 static unsigned int ksym_tracing_enabled;
 
 static HLIST_HEAD(ksym_filter_head);
@@ -181,13 +174,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
 	struct trace_ksym *entry;
 	int ret = -ENOMEM;
 
-	if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
-		printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
-		" new requests for tracing can be accepted now.\n",
-			KSYM_TRACER_MAX);
-		return -ENOSPC;
-	}
-
 	entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
 	if (!entry)
 		return -ENOMEM;
@@ -203,13 +189,17 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
 
 	if (IS_ERR(entry->ksym_hbp)) {
 		ret = PTR_ERR(entry->ksym_hbp);
-		printk(KERN_INFO "ksym_tracer request failed. Try again"
-					" later!!\n");
+		if (ret == -ENOSPC) {
+			printk(KERN_ERR "ksym_tracer: Maximum limit reached."
+			" No new requests for tracing can be accepted now.\n");
+		} else {
+			printk(KERN_INFO "ksym_tracer request failed. Try again"
+					 " later!!\n");
+		}
 		goto err;
 	}
 
 	hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
-	ksym_filter_entry_count++;
 
 	return 0;
 
@@ -265,7 +255,6 @@ static void __ksym_trace_reset(void)
 	hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
 								ksym_hlist) {
 		unregister_wide_hw_breakpoint(entry->ksym_hbp);
-		ksym_filter_entry_count--;
 		hlist_del_rcu(&(entry->ksym_hlist));
 		synchronize_rcu();
 		kfree(entry);
@@ -338,7 +327,6 @@ static ssize_t ksym_trace_filter_write(struct file *file,
 				goto out_unlock;
 		}
 		/* Error or "symbol:---" case: drop it */
-		ksym_filter_entry_count--;
 		hlist_del_rcu(&(entry->ksym_hlist));
 		synchronize_rcu();
 		kfree(entry);
-- 
cgit v1.2.3


From ad342631f13d40aa787b9e5aaf4800f10d6c3647 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Tue, 27 Apr 2010 13:45:31 +0200
Subject: logfs: Return -EINVAL if filesystem image doesn't match

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 7fc4625c6f0..edd99487f93 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -413,7 +413,7 @@ static int __logfs_read_sb(struct super_block *sb)
 
 	page = find_super_block(sb);
 	if (!page)
-		return -EIO;
+		return -EINVAL;
 
 	ds = page_address(page);
 	super->s_size = be64_to_cpu(ds->ds_filesystem_size);
-- 
cgit v1.2.3


From bd2b3f29594c50d7c5bd864d9af05d440394ee82 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Sat, 1 May 2010 17:33:06 +0200
Subject: logfs: fix logfs_seek_hole()

logfs_seek_hole(inode, 0x200) would crap itself if the inode contained
just 0x1ff (or fewer) blocks.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/readwrite.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 8c663a55b72..e37cee3b100 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -892,6 +892,8 @@ u64 logfs_seek_hole(struct inode *inode, u64 bix)
 		return bix;
 	else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
 		bix = maxbix(li->li_height);
+	else if (bix >= maxbix(li->li_height))
+		return bix;
 	else {
 		bix = seek_holedata_loop(inode, bix, 0);
 		if (bix < maxbix(li->li_height))
-- 
cgit v1.2.3


From ccc0197b02178f7e1707e659cbc5242fc94b499a Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Sat, 1 May 2010 17:00:34 +0200
Subject: logfs: Close i_ino reuse race

logfs_seek_hole() may return the same offset it is passed as argument.
Found by Prasad Joshi <prasadjoshi124@gmail.com>

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 14ed27274da..45bf86f1595 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -326,7 +326,7 @@ static void logfs_set_ino_generation(struct super_block *sb,
 	u64 ino;
 
 	mutex_lock(&super->s_journal_mutex);
-	ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino);
+	ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1);
 	super->s_last_ino = ino;
 	super->s_inos_till_wrap--;
 	if (super->s_inos_till_wrap < 0) {
-- 
cgit v1.2.3


From fb72014d98afd51e85aab9c061344ef32d615606 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 30 Apr 2010 19:31:12 -0300
Subject: perf tools: Don't use code surrounded by __KERNEL__
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We need to refactor code to be explicitely shared by the kernel and at
least the tools/ userspace programs, so, till we do that, copy the bare
minimum bitmap/bitops code needed by tools/perf.

Reported-by: "H. Peter Anvin" <hpa@zytor.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile                    | 16 +-------------
 tools/perf/util/bitmap.c               | 21 +++++++++++++++++++
 tools/perf/util/hweight.c              | 31 +++++++++++++++++++++++++++
 tools/perf/util/include/asm/bitops.h   | 18 ----------------
 tools/perf/util/include/asm/hweight.h  |  8 +++++++
 tools/perf/util/include/linux/bitmap.h | 38 +++++++++++++++++++++++++++++++---
 tools/perf/util/include/linux/bitops.h | 20 ++++++++----------
 7 files changed, 105 insertions(+), 47 deletions(-)
 create mode 100644 tools/perf/util/bitmap.c
 create mode 100644 tools/perf/util/hweight.c
 delete mode 100644 tools/perf/util/include/asm/bitops.h
 create mode 100644 tools/perf/util/include/asm/hweight.h

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 739c4412b18..c5ac0a99156 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -377,9 +377,9 @@ LIB_H += util/include/linux/rbtree.h
 LIB_H += util/include/linux/string.h
 LIB_H += util/include/linux/types.h
 LIB_H += util/include/asm/asm-offsets.h
-LIB_H += util/include/asm/bitops.h
 LIB_H += util/include/asm/bug.h
 LIB_H += util/include/asm/byteorder.h
+LIB_H += util/include/asm/hweight.h
 LIB_H += util/include/asm/swab.h
 LIB_H += util/include/asm/system.h
 LIB_H += util/include/asm/uaccess.h
@@ -435,7 +435,6 @@ LIB_OBJS += $(OUTPUT)util/path.o
 LIB_OBJS += $(OUTPUT)util/rbtree.o
 LIB_OBJS += $(OUTPUT)util/bitmap.o
 LIB_OBJS += $(OUTPUT)util/hweight.o
-LIB_OBJS += $(OUTPUT)util/find_next_bit.o
 LIB_OBJS += $(OUTPUT)util/run-command.o
 LIB_OBJS += $(OUTPUT)util/quote.o
 LIB_OBJS += $(OUTPUT)util/strbuf.o
@@ -948,19 +947,6 @@ $(OUTPUT)util/config.o: util/config.c $(OUTPUT)PERF-CFLAGS
 $(OUTPUT)util/rbtree.o: ../../lib/rbtree.c $(OUTPUT)PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
 
-# some perf warning policies can't fit to lib/bitmap.c, eg: it warns about variable shadowing
-# from <string.h> that comes from kernel headers wrapping.
-KBITMAP_FLAGS=`echo $(ALL_CFLAGS) | sed s/-Wshadow// | sed s/-Wswitch-default// | sed s/-Wextra//`
-
-$(OUTPUT)util/bitmap.o: ../../lib/bitmap.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(KBITMAP_FLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
-
-$(OUTPUT)util/hweight.o: ../../lib/hweight.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
-
-$(OUTPUT)util/find_next_bit.o: ../../lib/find_next_bit.c $(OUTPUT)PERF-CFLAGS
-	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
-
 $(OUTPUT)util/scripting-engines/trace-event-perl.o: util/scripting-engines/trace-event-perl.c $(OUTPUT)PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow $<
 
diff --git a/tools/perf/util/bitmap.c b/tools/perf/util/bitmap.c
new file mode 100644
index 00000000000..5e230acae1e
--- /dev/null
+++ b/tools/perf/util/bitmap.c
@@ -0,0 +1,21 @@
+/*
+ * From lib/bitmap.c
+ * Helper functions for bitmap.h.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+#include <linux/bitmap.h>
+
+int __bitmap_weight(const unsigned long *bitmap, int bits)
+{
+	int k, w = 0, lim = bits/BITS_PER_LONG;
+
+	for (k = 0; k < lim; k++)
+		w += hweight_long(bitmap[k]);
+
+	if (bits % BITS_PER_LONG)
+		w += hweight_long(bitmap[k] & BITMAP_LAST_WORD_MASK(bits));
+
+	return w;
+}
diff --git a/tools/perf/util/hweight.c b/tools/perf/util/hweight.c
new file mode 100644
index 00000000000..5c1d0d099f0
--- /dev/null
+++ b/tools/perf/util/hweight.c
@@ -0,0 +1,31 @@
+#include <linux/bitops.h>
+
+/**
+ * hweightN - returns the hamming weight of a N-bit word
+ * @x: the word to weigh
+ *
+ * The Hamming Weight of a number is the total number of bits set in it.
+ */
+
+unsigned int hweight32(unsigned int w)
+{
+	unsigned int res = w - ((w >> 1) & 0x55555555);
+	res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
+	res = (res + (res >> 4)) & 0x0F0F0F0F;
+	res = res + (res >> 8);
+	return (res + (res >> 16)) & 0x000000FF;
+}
+
+unsigned long hweight64(__u64 w)
+{
+#if BITS_PER_LONG == 32
+	return hweight32((unsigned int)(w >> 32)) + hweight32((unsigned int)w);
+#elif BITS_PER_LONG == 64
+	__u64 res = w - ((w >> 1) & 0x5555555555555555ul);
+	res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul);
+	res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0Ful;
+	res = res + (res >> 8);
+	res = res + (res >> 16);
+	return (res + (res >> 32)) & 0x00000000000000FFul;
+#endif
+}
diff --git a/tools/perf/util/include/asm/bitops.h b/tools/perf/util/include/asm/bitops.h
deleted file mode 100644
index 58e9817ffae..00000000000
--- a/tools/perf/util/include/asm/bitops.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef _PERF_ASM_BITOPS_H_
-#define _PERF_ASM_BITOPS_H_
-
-#include <sys/types.h>
-#include "../../types.h"
-#include <linux/compiler.h>
-
-/* CHECKME: Not sure both always match */
-#define BITS_PER_LONG	__WORDSIZE
-
-#include "../../../../include/asm-generic/bitops/__fls.h"
-#include "../../../../include/asm-generic/bitops/fls.h"
-#include "../../../../include/asm-generic/bitops/fls64.h"
-#include "../../../../include/asm-generic/bitops/__ffs.h"
-#include "../../../../include/asm-generic/bitops/ffz.h"
-#include "../../../../include/asm-generic/bitops/hweight.h"
-
-#endif
diff --git a/tools/perf/util/include/asm/hweight.h b/tools/perf/util/include/asm/hweight.h
new file mode 100644
index 00000000000..36cf26d434a
--- /dev/null
+++ b/tools/perf/util/include/asm/hweight.h
@@ -0,0 +1,8 @@
+#ifndef PERF_HWEIGHT_H
+#define PERF_HWEIGHT_H
+
+#include <linux/types.h>
+unsigned int hweight32(unsigned int w);
+unsigned long hweight64(__u64 w);
+
+#endif /* PERF_HWEIGHT_H */
diff --git a/tools/perf/util/include/linux/bitmap.h b/tools/perf/util/include/linux/bitmap.h
index 94507639a8c..eda4416efa0 100644
--- a/tools/perf/util/include/linux/bitmap.h
+++ b/tools/perf/util/include/linux/bitmap.h
@@ -1,3 +1,35 @@
-#include "../../../../include/linux/bitmap.h"
-#include "../../../../include/asm-generic/bitops/find.h"
-#include <linux/errno.h>
+#ifndef _PERF_BITOPS_H
+#define _PERF_BITOPS_H
+
+#include <string.h>
+#include <linux/bitops.h>
+
+int __bitmap_weight(const unsigned long *bitmap, int bits);
+
+#define BITMAP_LAST_WORD_MASK(nbits)					\
+(									\
+	((nbits) % BITS_PER_LONG) ?					\
+		(1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL		\
+)
+
+#define small_const_nbits(nbits) \
+	(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
+
+static inline void bitmap_zero(unsigned long *dst, int nbits)
+{
+	if (small_const_nbits(nbits))
+		*dst = 0UL;
+	else {
+		int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
+		memset(dst, 0, len);
+	}
+}
+
+static inline int bitmap_weight(const unsigned long *src, int nbits)
+{
+	if (small_const_nbits(nbits))
+		return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
+	return __bitmap_weight(src, nbits);
+}
+
+#endif /* _PERF_BITOPS_H */
diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h
index 8d63116e943..bb4ac2e0538 100644
--- a/tools/perf/util/include/linux/bitops.h
+++ b/tools/perf/util/include/linux/bitops.h
@@ -1,13 +1,12 @@
 #ifndef _PERF_LINUX_BITOPS_H_
 #define _PERF_LINUX_BITOPS_H_
 
-#define __KERNEL__
+#include <linux/kernel.h>
+#include <asm/hweight.h>
 
-#define CONFIG_GENERIC_FIND_NEXT_BIT
-#define CONFIG_GENERIC_FIND_FIRST_BIT
-#include "../../../../include/linux/bitops.h"
-
-#undef __KERNEL__
+#define BITS_PER_LONG __WORDSIZE
+#define BITS_PER_BYTE           8
+#define BITS_TO_LONGS(nr)       DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
 
 static inline void set_bit(int nr, unsigned long *addr)
 {
@@ -20,10 +19,9 @@ static __always_inline int test_bit(unsigned int nr, const unsigned long *addr)
 		(((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
 }
 
-unsigned long generic_find_next_zero_le_bit(const unsigned long *addr, unsigned
-		long size, unsigned long offset);
-
-unsigned long generic_find_next_le_bit(const unsigned long *addr, unsigned
-		long size, unsigned long offset);
+static inline unsigned long hweight_long(unsigned long w)
+{
+	return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
+}
 
 #endif
-- 
cgit v1.2.3


From 789688faef5b3ba78065beaf2f3d6f1c839f74a3 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sat, 1 May 2010 01:41:19 -0500
Subject: perf/live: don't synthesize build ids at the end of a live mode trace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It doesn't really make sense to record the build ids at the end of a
live mode session - live mode samples need that information during the
trace rather than at the end.

Leave event__synthesize_build_id() in place, however; we'll still be
using that to synthesize build ids in a more timely fashion in a
future patch.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1272696080-16435-2-git-send-email-tzanussi@gmail.com>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c |  7 ------
 tools/perf/util/header.c    | 61 ---------------------------------------------
 tools/perf/util/header.h    |  2 --
 3 files changed, 70 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 83b308a035c..1a7379674c2 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -446,13 +446,6 @@ static void atexit_header(void)
 
 		process_buildids();
 		perf_header__write(&session->header, output, true);
-	} else {
-		int err;
-
-		err = event__synthesize_build_ids(process_synthesized_event,
-						  session);
-		if (err < 0)
-			pr_err("Couldn't synthesize build ids.\n");
 	}
 }
 
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 6227dc4cb2c..2d1d97e0746 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1136,67 +1136,6 @@ int event__synthesize_build_id(struct dso *pos, u16 misc,
 	return err;
 }
 
-static int __event_synthesize_build_ids(struct list_head *head, u16 misc,
-					event__handler_t process,
-					struct machine *machine,
-					struct perf_session *session)
-{
-	struct dso *pos;
-
-	dsos__for_each_with_build_id(pos, head) {
-		int err;
-		if (!pos->hit)
-			continue;
-
-		err = event__synthesize_build_id(pos, misc, process,
-						 machine, session);
-		if (err < 0)
-			return err;
-	}
-
-	return 0;
-}
-
-int event__synthesize_build_ids(event__handler_t process,
-				struct perf_session *session)
-{
-	int err = 0;
-	u16 kmisc, umisc;
-	struct machine *pos;
-	struct rb_node *nd;
-
-	if (!dsos__read_build_ids(&session->header, true))
-		return 0;
-
-	for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) {
-		pos = rb_entry(nd, struct machine, rb_node);
-		if (machine__is_host(pos)) {
-			kmisc = PERF_RECORD_MISC_KERNEL;
-			umisc = PERF_RECORD_MISC_USER;
-		} else {
-			kmisc = PERF_RECORD_MISC_GUEST_KERNEL;
-			umisc = PERF_RECORD_MISC_GUEST_USER;
-		}
-
-		err = __event_synthesize_build_ids(&pos->kernel_dsos, kmisc,
-						   process, pos, session);
-		if (err == 0)
-			err = __event_synthesize_build_ids(&pos->user_dsos, umisc,
-							   process, pos, session);
-		if (err)
-			break;
-	}
-
-	if (err < 0) {
-		pr_debug("failed to synthesize build ids\n");
-		return err;
-	}
-
-	dsos__cache_build_ids(&session->header);
-
-	return 0;
-}
-
 int event__process_build_id(event_t *self,
 			    struct perf_session *session)
 {
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index f39443db070..402ac2454cf 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -122,8 +122,6 @@ int event__synthesize_build_id(struct dso *pos, u16 misc,
 			       event__handler_t process,
 			       struct machine *machine,
 			       struct perf_session *session);
-int event__synthesize_build_ids(event__handler_t process,
-				struct perf_session *session);
 int event__process_build_id(event_t *self, struct perf_session *session);
 
 #endif /* __PERF_HEADER_H */
-- 
cgit v1.2.3


From 454c407ec17a0c63e4023ac0877d687945a7df4a Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sat, 1 May 2010 01:41:20 -0500
Subject: perf: add perf-inject builtin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, perf 'live mode' writes build-ids at the end of the
session, which isn't actually useful for processing live mode events.

What would be better would be to have the build-ids sent before any of
the samples that reference them, which can be done by processing the
event stream and retrieving the build-ids on the first hit.  Doing
that in perf-record itself, however, is off-limits.

This patch introduces perf-inject, which does the same job while
leaving perf-record untouched.  Normal mode perf still records the
build-ids at the end of the session as it should, but for live mode,
perf-inject can be injected in between the record and report steps
e.g.:

perf record -o - ./hackbench 10 | perf inject -v -b | perf report -v -i -

perf-inject reads a perf-record event stream and repipes it to stdout.
At any point the processing code can inject other events into the
event stream - in this case build-ids (-b option) are read and
injected as needed into the event stream.

Build-ids are just the first user of perf-inject - potentially
anything that needs userspace processing to augment the trace stream
with additional information could make use of this facility.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1272696080-16435-3-git-send-email-tzanussi@gmail.com>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile                |   1 +
 tools/perf/builtin-annotate.c      |   2 +-
 tools/perf/builtin-buildid-list.c  |   2 +-
 tools/perf/builtin-diff.c          |   4 +-
 tools/perf/builtin-inject.c        | 228 +++++++++++++++++++++++++++++++++++++
 tools/perf/builtin-kmem.c          |   2 +-
 tools/perf/builtin-lock.c          |   2 +-
 tools/perf/builtin-record.c        |   2 +-
 tools/perf/builtin-report.c        |   2 +-
 tools/perf/builtin-sched.c         |   2 +-
 tools/perf/builtin-timechart.c     |   2 +-
 tools/perf/builtin-top.c           |   2 +-
 tools/perf/builtin-trace.c         |   2 +-
 tools/perf/builtin.h               |   1 +
 tools/perf/perf.c                  |   1 +
 tools/perf/util/header.c           |  35 ++++--
 tools/perf/util/session.c          |   3 +-
 tools/perf/util/session.h          |   3 +-
 tools/perf/util/trace-event-read.c |  19 +++-
 tools/perf/util/trace-event.h      |   2 +-
 20 files changed, 293 insertions(+), 24 deletions(-)
 create mode 100644 tools/perf/builtin-inject.c

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index c5ac0a99156..0ef5cfe52f2 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -490,6 +490,7 @@ BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o
 BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
 BUILTIN_OBJS += $(OUTPUT)builtin-kvm.o
 BUILTIN_OBJS += $(OUTPUT)builtin-test.o
+BUILTIN_OBJS += $(OUTPUT)builtin-inject.o
 
 PERFLIBS = $(LIB_FILE)
 
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index b57dbcf62af..ee154b58772 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -554,7 +554,7 @@ static int __cmd_annotate(void)
 	int ret;
 	struct perf_session *session;
 
-	session = perf_session__new(input_name, O_RDONLY, force);
+	session = perf_session__new(input_name, O_RDONLY, force, false);
 	if (session == NULL)
 		return -ENOMEM;
 
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index 7dc3b2e7a5e..44a47e13bd6 100644
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -39,7 +39,7 @@ static int __cmd_buildid_list(void)
 	int err = -1;
 	struct perf_session *session;
 
-	session = perf_session__new(input_name, O_RDONLY, force);
+	session = perf_session__new(input_name, O_RDONLY, force, false);
 	if (session == NULL)
 		return -1;
 
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 207e860591e..4cce68f2368 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -156,8 +156,8 @@ static int __cmd_diff(void)
 	int ret, i;
 	struct perf_session *session[2];
 
-	session[0] = perf_session__new(input_old, O_RDONLY, force);
-	session[1] = perf_session__new(input_new, O_RDONLY, force);
+	session[0] = perf_session__new(input_old, O_RDONLY, force, false);
+	session[1] = perf_session__new(input_new, O_RDONLY, force, false);
 	if (session[0] == NULL || session[1] == NULL)
 		return -ENOMEM;
 
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
new file mode 100644
index 00000000000..a5902a3eadd
--- /dev/null
+++ b/tools/perf/builtin-inject.c
@@ -0,0 +1,228 @@
+/*
+ * builtin-inject.c
+ *
+ * Builtin inject command: Examine the live mode (stdin) event stream
+ * and repipe it to stdout while optionally injecting additional
+ * events into it.
+ */
+#include "builtin.h"
+
+#include "perf.h"
+#include "util/session.h"
+#include "util/debug.h"
+
+#include "util/parse-options.h"
+
+static char		const *input_name = "-";
+static bool		inject_build_ids;
+
+static int event__repipe(event_t *event __used,
+			 struct perf_session *session __used)
+{
+	uint32_t size;
+	void *buf = event;
+
+	size = event->header.size;
+
+	while (size) {
+		int ret = write(STDOUT_FILENO, buf, size);
+		if (ret < 0)
+			return -errno;
+
+		size -= ret;
+		buf += ret;
+	}
+
+	return 0;
+}
+
+static int event__repipe_mmap(event_t *self, struct perf_session *session)
+{
+	int err;
+
+	err = event__process_mmap(self, session);
+	event__repipe(self, session);
+
+	return err;
+}
+
+static int event__repipe_task(event_t *self, struct perf_session *session)
+{
+	int err;
+
+	err = event__process_task(self, session);
+	event__repipe(self, session);
+
+	return err;
+}
+
+static int event__repipe_tracing_data(event_t *self,
+				      struct perf_session *session)
+{
+	int err;
+
+	event__repipe(self, session);
+	err = event__process_tracing_data(self, session);
+
+	return err;
+}
+
+static int read_buildid(struct map *self, struct perf_session *session)
+{
+	const char *name = self->dso->long_name;
+	int err;
+
+	if (filename__read_build_id(self->dso->long_name, self->dso->build_id,
+				    sizeof(self->dso->build_id)) > 0) {
+		char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+
+		self->dso->has_build_id = true;
+
+		build_id__sprintf(self->dso->build_id,
+				  sizeof(self->dso->build_id),
+				  sbuild_id);
+		pr_debug("build id found for %s: %s\n", self->dso->long_name,
+			 sbuild_id);
+	}
+
+	if (self->dso->has_build_id) {
+		u16 misc = PERF_RECORD_MISC_USER;
+		struct machine *machine;
+
+		misc = self->dso->kernel ? PERF_RECORD_MISC_KERNEL : misc;
+
+		machine = perf_session__find_host_machine(session);
+		if (!machine) {
+			pr_err("Can't find machine for session\n");
+			return -1;
+		}
+
+		err = event__synthesize_build_id(self->dso, misc,
+						 event__repipe, machine,
+						 session);
+		if (err) {
+			pr_err("Can't synthesize build_id event for %s\n",
+			       name);
+			return -1;
+		}
+	} else {
+		pr_debug("no build_id found for %s\n", name);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int event__inject_buildid(event_t *event, struct perf_session *session)
+{
+	struct addr_location al;
+	struct thread *thread;
+	u8 cpumode;
+	int err = 0;
+
+	cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+
+	thread = perf_session__findnew(session, event->ip.pid);
+	if (thread == NULL) {
+		pr_err("problem processing %d event, skipping it.\n",
+		       event->header.type);
+		err = -1;
+		goto repipe;
+	}
+
+	thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION,
+			      event->ip.pid, event->ip.ip, &al);
+
+	if (al.map != NULL) {
+		if (!al.map->dso->hit) {
+			al.map->dso->hit = 1;
+			if (map__load(al.map, NULL) >= 0)
+				read_buildid(al.map, session);
+			else
+				pr_warning("no symbols found in %s, maybe "
+					   "install a debug package?\n",
+					   al.map->dso->long_name);
+		}
+	}
+
+repipe:
+	event__repipe(event, session);
+	return err;
+}
+
+struct perf_event_ops inject_ops = {
+	.sample		= event__repipe,
+	.mmap		= event__repipe,
+	.comm		= event__repipe,
+	.fork		= event__repipe,
+	.exit		= event__repipe,
+	.lost		= event__repipe,
+	.read		= event__repipe,
+	.throttle	= event__repipe,
+	.unthrottle	= event__repipe,
+	.attr		= event__repipe,
+	.event_type 	= event__repipe,
+	.tracing_data 	= event__repipe,
+	.build_id 	= event__repipe,
+};
+
+extern volatile int session_done;
+
+static void sig_handler(int sig __attribute__((__unused__)))
+{
+	session_done = 1;
+}
+
+static int __cmd_inject(void)
+{
+	struct perf_session *session;
+	int ret = -EINVAL;
+
+	signal(SIGINT, sig_handler);
+
+	if (inject_build_ids) {
+		inject_ops.sample	= event__inject_buildid;
+		inject_ops.mmap		= event__repipe_mmap;
+		inject_ops.fork		= event__repipe_task;
+		inject_ops.tracing_data	= event__repipe_tracing_data;
+	}
+
+	session = perf_session__new(input_name, O_RDONLY, false, true);
+	if (session == NULL)
+		return -ENOMEM;
+
+	ret = perf_session__process_events(session, &inject_ops);
+
+	perf_session__delete(session);
+
+	return ret;
+}
+
+static const char * const report_usage[] = {
+	"perf inject [<options>]",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_BOOLEAN('b', "inject build-ids", &inject_build_ids,
+		    "Inject build-ids into the output stream"),
+	OPT_INCR('v', "verbose", &verbose,
+		 "be more verbose (show build ids, etc)"),
+	OPT_END()
+};
+
+int cmd_inject(int argc, const char **argv, const char *prefix __used)
+{
+	argc = parse_options(argc, argv, options, report_usage, 0);
+
+	/*
+	 * Any (unrecognized) arguments left?
+	 */
+	if (argc)
+		usage_with_options(report_usage, options);
+
+	if (symbol__init() < 0)
+		return -1;
+
+	return __cmd_inject();
+}
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index ee05dba9609..31f60a2535e 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -492,7 +492,7 @@ static void sort_result(void)
 static int __cmd_kmem(void)
 {
 	int err = -EINVAL;
-	struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0);
+	struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0, false);
 	if (session == NULL)
 		return -ENOMEM;
 
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index ce276750b14..6605000ed73 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -818,7 +818,7 @@ static struct perf_event_ops eops = {
 
 static int read_events(void)
 {
-	session = perf_session__new(input_name, O_RDONLY, 0);
+	session = perf_session__new(input_name, O_RDONLY, 0, false);
 	if (!session)
 		die("Initializing perf session failed\n");
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 1a7379674c2..e382d93d369 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -548,7 +548,7 @@ static int __cmd_record(int argc, const char **argv)
 	}
 
 	session = perf_session__new(output_name, O_WRONLY,
-				    write_mode == WRITE_FORCE);
+				    write_mode == WRITE_FORCE, false);
 	if (session == NULL) {
 		pr_err("Not enough memory for reading perf file header\n");
 		return -1;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f1b46eb7ef9..0152b5412cc 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -289,7 +289,7 @@ static int __cmd_report(void)
 
 	signal(SIGINT, sig_handler);
 
-	session = perf_session__new(input_name, O_RDONLY, force);
+	session = perf_session__new(input_name, O_RDONLY, force, false);
 	if (session == NULL)
 		return -ENOMEM;
 
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 94453f1e4e0..aef6ed0e119 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1660,7 +1660,7 @@ static struct perf_event_ops event_ops = {
 static int read_events(void)
 {
 	int err = -EINVAL;
-	struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0);
+	struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0, false);
 	if (session == NULL)
 		return -ENOMEM;
 
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index c35aa44f82b..5a52ed9fc10 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -936,7 +936,7 @@ static struct perf_event_ops event_ops = {
 
 static int __cmd_timechart(void)
 {
-	struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0);
+	struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0, false);
 	int ret = -EINVAL;
 
 	if (session == NULL)
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index d95281f588d..3de397764cb 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1287,7 +1287,7 @@ static int __cmd_top(void)
 	 * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
 	 * mmap reading, etc is encapsulated in it. Use O_WRONLY for now.
 	 */
-	struct perf_session *session = perf_session__new(NULL, O_WRONLY, false);
+	struct perf_session *session = perf_session__new(NULL, O_WRONLY, false, false);
 	if (session == NULL)
 		return -ENOMEM;
 
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 77f556f7604..9c483e92e8d 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -661,7 +661,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used)
 	if (!script_name)
 		setup_pager();
 
-	session = perf_session__new(input_name, O_RDONLY, 0);
+	session = perf_session__new(input_name, O_RDONLY, 0, false);
 	if (session == NULL)
 		return -ENOMEM;
 
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 34a8a9ab961..921245b2858 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -34,5 +34,6 @@ extern int cmd_kmem(int argc, const char **argv, const char *prefix);
 extern int cmd_lock(int argc, const char **argv, const char *prefix);
 extern int cmd_kvm(int argc, const char **argv, const char *prefix);
 extern int cmd_test(int argc, const char **argv, const char *prefix);
+extern int cmd_inject(int argc, const char **argv, const char *prefix);
 
 #endif
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 5ff9b5b4697..08e0e5d2b50 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -309,6 +309,7 @@ static void handle_internal_command(int argc, const char **argv)
 		{ "lock",	cmd_lock,	0 },
 		{ "kvm",	cmd_kvm,	0 },
 		{ "test",	cmd_test,	0 },
+		{ "inject",	cmd_inject,	0 },
 	};
 	unsigned int i;
 	static const char ext[] = STRIP_EXTENSION;
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 2d1d97e0746..79da0e50ef8 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -713,10 +713,18 @@ static int __event_process_build_id(struct build_id_event *bev,
 
 	dso = __dsos__findnew(head, filename);
 	if (dso != NULL) {
+		char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+
 		dso__set_build_id(dso, &bev->build_id);
-			if (filename[0] == '[')
-				dso->kernel = dso_type;
-		}
+
+		if (filename[0] == '[')
+			dso->kernel = dso_type;
+
+		build_id__sprintf(dso->build_id, sizeof(dso->build_id),
+				  sbuild_id);
+		pr_debug("build id event received for %s: %s\n",
+			 dso->long_name, sbuild_id);
+	}
 
 	err = 0;
 out:
@@ -767,7 +775,7 @@ static int perf_file_section__process(struct perf_file_section *self,
 
 	switch (feat) {
 	case HEADER_TRACE_INFO:
-		trace_report(fd);
+		trace_report(fd, false);
 		break;
 
 	case HEADER_BUILD_ID:
@@ -782,12 +790,16 @@ static int perf_file_section__process(struct perf_file_section *self,
 }
 
 static int perf_file_header__read_pipe(struct perf_pipe_file_header *self,
-				       struct perf_header *ph, int fd)
+				       struct perf_header *ph, int fd,
+				       bool repipe)
 {
 	if (do_read(fd, self, sizeof(*self)) <= 0 ||
 	    memcmp(&self->magic, __perf_magic, sizeof(self->magic)))
 		return -1;
 
+	if (repipe && do_write(STDOUT_FILENO, self, sizeof(*self)) < 0)
+		return -1;
+
 	if (self->size != sizeof(*self)) {
 		u64 size = bswap_64(self->size);
 
@@ -805,7 +817,8 @@ static int perf_header__read_pipe(struct perf_session *session, int fd)
 	struct perf_header *self = &session->header;
 	struct perf_pipe_file_header f_header;
 
-	if (perf_file_header__read_pipe(&f_header, self, fd) < 0) {
+	if (perf_file_header__read_pipe(&f_header, self, fd,
+					session->repipe) < 0) {
 		pr_debug("incompatible file format\n");
 		return -EINVAL;
 	}
@@ -1096,12 +1109,17 @@ int event__process_tracing_data(event_t *self,
 	lseek(session->fd, offset + sizeof(struct tracing_data_event),
 	      SEEK_SET);
 
-	size_read = trace_report(session->fd);
+	size_read = trace_report(session->fd, session->repipe);
 
 	padding = ALIGN(size_read, sizeof(u64)) - size_read;
 
 	if (read(session->fd, buf, padding) < 0)
 		die("reading input file");
+	if (session->repipe) {
+		int retw = write(STDOUT_FILENO, buf, padding);
+		if (retw <= 0 || retw != padding)
+			die("repiping tracing data padding");
+	}
 
 	if (size_read + padding != size)
 		die("tracing data size mismatch");
@@ -1110,7 +1128,8 @@ int event__process_tracing_data(event_t *self,
 }
 
 int event__synthesize_build_id(struct dso *pos, u16 misc,
-			       event__handler_t process, struct machine *machine,
+			       event__handler_t process,
+			       struct machine *machine,
 			       struct perf_session *session)
 {
 	event_t ev;
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index a8dd73ed158..5d353e70fe2 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -77,7 +77,7 @@ int perf_session__create_kernel_maps(struct perf_session *self)
 	return ret;
 }
 
-struct perf_session *perf_session__new(const char *filename, int mode, bool force)
+struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe)
 {
 	size_t len = filename ? strlen(filename) + 1 : 0;
 	struct perf_session *self = zalloc(sizeof(*self) + len);
@@ -97,6 +97,7 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
 	self->cwdlen = 0;
 	self->unknown_events = 0;
 	self->machines = RB_ROOT;
+	self->repipe = repipe;
 	self->ordered_samples.flush_limit = ULLONG_MAX;
 	INIT_LIST_HEAD(&self->ordered_samples.samples_head);
 
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 61ca92e58ad..f2b2c6a3a49 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -34,6 +34,7 @@ struct perf_session {
 	u64			sample_type;
 	int			fd;
 	bool			fd_pipe;
+	bool			repipe;
 	int			cwdlen;
 	char			*cwd;
 	struct ordered_samples	ordered_samples;
@@ -59,7 +60,7 @@ struct perf_event_ops {
 	bool	ordered_samples;
 };
 
-struct perf_session *perf_session__new(const char *filename, int mode, bool force);
+struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe);
 void perf_session__delete(struct perf_session *self);
 
 void perf_event_header__bswap(struct perf_event_header *self);
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index 44889c9b563..43f19c1fed3 100644
--- a/tools/perf/util/trace-event-read.c
+++ b/tools/perf/util/trace-event-read.c
@@ -51,6 +51,7 @@ static int long_size;
 static unsigned long	page_size;
 
 static ssize_t calc_data_size;
+static bool repipe;
 
 static int do_read(int fd, void *buf, int size)
 {
@@ -62,6 +63,13 @@ static int do_read(int fd, void *buf, int size)
 		if (ret <= 0)
 			return -1;
 
+		if (repipe) {
+			int retw = write(STDOUT_FILENO, buf, ret);
+
+			if (retw <= 0 || retw != ret)
+				die("repiping input file");
+		}
+
 		size -= ret;
 		buf += ret;
 	}
@@ -116,6 +124,13 @@ static char *read_string(void)
 		if (!r)
 			die("no data");
 
+		if (repipe) {
+			int retw = write(STDOUT_FILENO, &c, 1);
+
+			if (retw <= 0 || retw != r)
+				die("repiping input file string");
+		}
+
 		buf[size++] = c;
 
 		if (!c)
@@ -454,7 +469,7 @@ struct record *trace_read_data(int cpu)
 	return data;
 }
 
-ssize_t trace_report(int fd)
+ssize_t trace_report(int fd, bool __repipe)
 {
 	char buf[BUFSIZ];
 	char test[] = { 23, 8, 68 };
@@ -465,6 +480,7 @@ ssize_t trace_report(int fd)
 	ssize_t size;
 
 	calc_data_size = 1;
+	repipe = __repipe;
 
 	input_fd = fd;
 
@@ -499,6 +515,7 @@ ssize_t trace_report(int fd)
 
 	size = calc_data_size - 1;
 	calc_data_size = 0;
+	repipe = false;
 
 	if (show_funcs) {
 		print_funcs();
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 1f45d468fd9..ebfee80e4a0 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -163,7 +163,7 @@ struct record *trace_read_data(int cpu);
 
 void parse_set_info(int nr_cpus, int long_sz);
 
-ssize_t trace_report(int fd);
+ssize_t trace_report(int fd, bool repipe);
 
 void *malloc_or_die(unsigned int size);
 
-- 
cgit v1.2.3


From 2c9faa060064343a4a0b16f5b77f3c61d1d17e23 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 2 May 2010 13:37:24 -0300
Subject: perf record: Don't exit in live mode when no tracepoints are enabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With this I was able to actually test Tom Zanussi's two previous patches
in my usual perf testing ways, i.e. without any tracepoints activated.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index e382d93d369..ac989e9ba8f 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -666,12 +666,15 @@ static int __cmd_record(int argc, const char **argv)
 						     nr_counters,
 						     process_synthesized_event,
 						     session);
-		if (err <= 0) {
-			pr_err("Couldn't record tracing data.\n");
-			return err;
-		}
-
-		advance_output(err);
+		/*
+		 * FIXME err <= 0 here actually means that there were no tracepoints
+		 * so its not really an error, just that we don't need to synthesize
+		 * anything.
+		 * We really have to return this more properly and also propagate
+		 * errors that now are calling die()
+		 */
+		if (err > 0)
+			advance_output(err);
 	}
 
 	machine = perf_session__find_host_machine(session);
-- 
cgit v1.2.3


From 090f7204dfdb5d7f18208ea81dfdba845897cedd Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 2 May 2010 19:46:36 -0300
Subject: perf inject: Refactor read_buildid function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Into two functions, one that actually reads the build_id for the dso if
it wasn't already read, and another taht will inject the event if the
build_id is available.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-inject.c | 76 ++++++++++++++++++++++-----------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index a5902a3eadd..59e981a8890 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -67,46 +67,44 @@ static int event__repipe_tracing_data(event_t *self,
 	return err;
 }
 
-static int read_buildid(struct map *self, struct perf_session *session)
+static int dso__read_build_id(struct dso *self)
 {
-	const char *name = self->dso->long_name;
-	int err;
+	if (self->has_build_id)
+		return 0;
 
-	if (filename__read_build_id(self->dso->long_name, self->dso->build_id,
-				    sizeof(self->dso->build_id)) > 0) {
-		char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+	if (filename__read_build_id(self->long_name, self->build_id,
+				    sizeof(self->build_id)) > 0) {
+		self->has_build_id = true;
+		return 0;
+	}
 
-		self->dso->has_build_id = true;
+	return -1;
+}
 
-		build_id__sprintf(self->dso->build_id,
-				  sizeof(self->dso->build_id),
-				  sbuild_id);
-		pr_debug("build id found for %s: %s\n", self->dso->long_name,
-			 sbuild_id);
-	}
+static int dso__inject_build_id(struct dso *self, struct perf_session *session)
+{
+	u16 misc = PERF_RECORD_MISC_USER;
+	struct machine *machine;
+	int err;
 
-	if (self->dso->has_build_id) {
-		u16 misc = PERF_RECORD_MISC_USER;
-		struct machine *machine;
+	if (dso__read_build_id(self) < 0) {
+		pr_debug("no build_id found for %s\n", self->long_name);
+		return -1;
+	}
 
-		misc = self->dso->kernel ? PERF_RECORD_MISC_KERNEL : misc;
+	machine = perf_session__find_host_machine(session);
+	if (machine == NULL) {
+		pr_err("Can't find machine for session\n");
+		return -1;
+	}
 
-		machine = perf_session__find_host_machine(session);
-		if (!machine) {
-			pr_err("Can't find machine for session\n");
-			return -1;
-		}
+	if (self->kernel)
+		misc = PERF_RECORD_MISC_KERNEL;
 
-		err = event__synthesize_build_id(self->dso, misc,
-						 event__repipe, machine,
-						 session);
-		if (err) {
-			pr_err("Can't synthesize build_id event for %s\n",
-			       name);
-			return -1;
-		}
-	} else {
-		pr_debug("no build_id found for %s\n", name);
+	err = event__synthesize_build_id(self, misc, event__repipe,
+					 machine, session);
+	if (err) {
+		pr_err("Can't synthesize build_id event for %s\n", self->long_name);
 		return -1;
 	}
 
@@ -118,7 +116,6 @@ static int event__inject_buildid(event_t *event, struct perf_session *session)
 	struct addr_location al;
 	struct thread *thread;
 	u8 cpumode;
-	int err = 0;
 
 	cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
 
@@ -126,7 +123,6 @@ static int event__inject_buildid(event_t *event, struct perf_session *session)
 	if (thread == NULL) {
 		pr_err("problem processing %d event, skipping it.\n",
 		       event->header.type);
-		err = -1;
 		goto repipe;
 	}
 
@@ -136,9 +132,13 @@ static int event__inject_buildid(event_t *event, struct perf_session *session)
 	if (al.map != NULL) {
 		if (!al.map->dso->hit) {
 			al.map->dso->hit = 1;
-			if (map__load(al.map, NULL) >= 0)
-				read_buildid(al.map, session);
-			else
+			if (map__load(al.map, NULL) >= 0) {
+				dso__inject_build_id(al.map->dso, session);
+				/*
+				 * If this fails, too bad, let the other side
+				 * account this as unresolved.
+				 */
+			} else
 				pr_warning("no symbols found in %s, maybe "
 					   "install a debug package?\n",
 					   al.map->dso->long_name);
@@ -147,7 +147,7 @@ static int event__inject_buildid(event_t *event, struct perf_session *session)
 
 repipe:
 	event__repipe(event, session);
-	return err;
+	return 0;
 }
 
 struct perf_event_ops inject_ops = {
-- 
cgit v1.2.3


From 63e0c7715aab6085faa487d498889f4361dc6542 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Mon, 3 May 2010 00:14:48 -0500
Subject: perf: record TRACE_INFO only if using tracepoints and SAMPLE_RAW
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current perf code implicitly assumes SAMPLE_RAW means tracepoints
are being used, but doesn't check for that.  It happily records the
TRACE_INFO even if SAMPLE_RAW is used without tracepoints, but when the
perf data is read it won't go any further when it finds TRACE_INFO but
no tracepoints, and displays misleading errors.

This adds a check for both in perf-record, and won't record TRACE_INFO
unless both are true.  This at least allows perf report -D to dump raw
events, and avoids triggering a misleading error condition in perf
trace.  It doesn't actually enable the non-tracepoint raw events to be
displayed in perf trace, since perf trace currently only deals with
tracepoint events.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1272865861.7932.16.camel@tropicana>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c        | 35 +++++++++++++++++++++--------------
 tools/perf/util/header.c           |  1 -
 tools/perf/util/parse-events.h     |  1 +
 tools/perf/util/trace-event-info.c |  5 +++++
 4 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index ac989e9ba8f..0ff67d1c475 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -560,11 +560,12 @@ static int __cmd_record(int argc, const char **argv)
 			return err;
 	}
 
-	if (raw_samples) {
+	if (raw_samples && have_tracepoints(attrs, nr_counters)) {
 		perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
 	} else {
 		for (i = 0; i < nr_counters; i++) {
-			if (attrs[i].sample_type & PERF_SAMPLE_RAW) {
+			if (attrs[i].sample_type & PERF_SAMPLE_RAW &&
+				attrs[i].type == PERF_TYPE_TRACEPOINT) {
 				perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
 				break;
 			}
@@ -662,19 +663,25 @@ static int __cmd_record(int argc, const char **argv)
 			return err;
 		}
 
-		err = event__synthesize_tracing_data(output, attrs,
-						     nr_counters,
-						     process_synthesized_event,
-						     session);
-		/*
-		 * FIXME err <= 0 here actually means that there were no tracepoints
-		 * so its not really an error, just that we don't need to synthesize
-		 * anything.
-		 * We really have to return this more properly and also propagate
-		 * errors that now are calling die()
-		 */
-		if (err > 0)
+		if (have_tracepoints(attrs, nr_counters)) {
+			/*
+			 * FIXME err <= 0 here actually means that
+			 * there were no tracepoints so its not really
+			 * an error, just that we don't need to
+			 * synthesize anything.  We really have to
+			 * return this more properly and also
+			 * propagate errors that now are calling die()
+			 */
+			err = event__synthesize_tracing_data(output, attrs,
+							     nr_counters,
+							     process_synthesized_event,
+							     session);
+			if (err <= 0) {
+				pr_err("Couldn't record tracing data.\n");
+				return err;
+			}
 			advance_output(err);
+		}
 	}
 
 	machine = perf_session__find_host_machine(session);
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 79da0e50ef8..2b9f898efea 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -436,7 +436,6 @@ static int perf_header__adds_write(struct perf_header *self, int fd)
 		trace_sec->size = lseek(fd, 0, SEEK_CUR) - trace_sec->offset;
 	}
 
-
 	if (perf_header__has_feat(self, HEADER_BUILD_ID)) {
 		struct perf_file_section *buildid_sec;
 
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index b8c1f64bc93..fc4ab3fe877 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -13,6 +13,7 @@ struct tracepoint_path {
 };
 
 extern struct tracepoint_path *tracepoint_id_to_path(u64 config);
+extern bool have_tracepoints(struct perf_event_attr *pattrs, int nb_events);
 
 extern int			nr_counters;
 
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index 30cd9b57595..0a1fb9d4f3b 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -487,6 +487,11 @@ get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events)
 	return nr_tracepoints > 0 ? path.next : NULL;
 }
 
+bool have_tracepoints(struct perf_event_attr *pattrs, int nb_events)
+{
+	return get_tracepoints_path(pattrs, nb_events) ? true : false;
+}
+
 int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events)
 {
 	char buf[BUFSIZ];
-- 
cgit v1.2.3


From 40d2e76315da38993129090dc5d56377e573c312 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 21 Mar 2010 09:00:43 -0400
Subject: x86-32: Rework cache flush denied handler

The cache flush denied error is an erratum on some AMD 486 clones.  If an invd
instruction is executed in userspace, the processor calls exception 19 (13 hex)
instead of #GP (13 decimal).  On cpus where XMM is not supported, redirect
exception 19 to do_general_protection().  Also, remove die_if_kernel(), since
this was the last user.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-2-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig.cpu       |  4 ++++
 arch/x86/kernel/entry_32.S | 19 +++++++++++++++++++
 arch/x86/kernel/traps.c    | 31 +++----------------------------
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index a19829374e6..6f6792c5601 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -338,6 +338,10 @@ config X86_F00F_BUG
 	def_bool y
 	depends on M586MMX || M586TSC || M586 || M486 || M386
 
+config X86_INVD_BUG
+	def_bool y
+	depends on M486 || M386
+
 config X86_WP_WORKS_OK
 	def_bool y
 	depends on !M386
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 44a8e0dc673..cd49141cf15 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -53,6 +53,7 @@
 #include <asm/processor-flags.h>
 #include <asm/ftrace.h>
 #include <asm/irq_vectors.h>
+#include <asm/cpufeature.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -905,7 +906,25 @@ ENTRY(simd_coprocessor_error)
 	RING0_INT_FRAME
 	pushl $0
 	CFI_ADJUST_CFA_OFFSET 4
+#ifdef CONFIG_X86_INVD_BUG
+	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
+661:	pushl $do_general_protection
+662:
+.section .altinstructions,"a"
+	.balign 4
+	.long 661b
+	.long 663f
+	.byte X86_FEATURE_XMM
+	.byte 662b-661b
+	.byte 664f-663f
+.previous
+.section .altinstr_replacement,"ax"
+663:	pushl $do_simd_coprocessor_error
+664:
+.previous
+#else
 	pushl $do_simd_coprocessor_error
+#endif
 	CFI_ADJUST_CFA_OFFSET 4
 	jmp error_code
 	CFI_ENDPROC
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e445418..a16c9dfe6b7 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,15 +108,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	dec_preempt_count();
 }
 
-#ifdef CONFIG_X86_32
-static inline void
-die_if_kernel(const char *str, struct pt_regs *regs, long err)
-{
-	if (!user_mode_vm(regs))
-		die(str, regs, err);
-}
-#endif
-
 static void __kprobes
 do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 	long error_code, siginfo_t *info)
@@ -729,30 +720,14 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 	conditional_sti(regs);
 
 #ifdef CONFIG_X86_32
-	if (cpu_has_xmm) {
-		/* Handle SIMD FPU exceptions on PIII+ processors. */
-		ignore_fpu_irq = 1;
-		simd_math_error((void __user *)regs->ip);
-		return;
-	}
-	/*
-	 * Handle strange cache flush from user space exception
-	 * in all other cases.  This is undocumented behaviour.
-	 */
-	if (regs->flags & X86_VM_MASK) {
-		handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
-		return;
-	}
-	current->thread.trap_no = 19;
-	current->thread.error_code = error_code;
-	die_if_kernel("cache flush denied", regs, error_code);
-	force_sig(SIGSEGV, current);
+	ignore_fpu_irq = 1;
 #else
 	if (!user_mode(regs) &&
 			kernel_math_error(regs, "kernel simd math error", 19))
 		return;
-	simd_math_error((void __user *)regs->ip);
 #endif
+
+	simd_math_error((void __user *)regs->ip);
 }
 
 dotraplinkage void
-- 
cgit v1.2.3


From 9b6dba9e0798325dab427b9d60c61630ccc39b28 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 21 Mar 2010 09:00:44 -0400
Subject: x86: Merge simd_math_error() into math_error()

The only difference between FPU and SIMD exceptions is where the
status bits are read from (cwd/swd vs. mxcsr).  This also fixes
the discrepency introduced by commit adf77bac, which fixed FPU
but not SIMD.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-3-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/traps.h |   2 +-
 arch/x86/kernel/irqinit.c    |   2 +-
 arch/x86/kernel/traps.c      | 100 ++++++++++++++-----------------------------
 3 files changed, 34 insertions(+), 70 deletions(-)

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 4da91ad69e0..f66cda56781 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -79,7 +79,7 @@ static inline int get_si_code(unsigned long condition)
 
 extern int panic_on_unrecovered_nmi;
 
-void math_error(void __user *);
+void math_error(struct pt_regs *, int, int);
 void math_emulate(struct math_emu_info *);
 #ifndef CONFIG_X86_32
 asmlinkage void smp_thermal_interrupt(void);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 0ed2d300cd4..990ae7cfc57 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -60,7 +60,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
 	outb(0, 0xF0);
 	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
 		return IRQ_NONE;
-	math_error((void __user *)get_irq_regs()->ip);
+	math_error(get_irq_regs(), 0, 16);
 	return IRQ_HANDLED;
 }
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a16c9dfe6b7..a472992cecd 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -595,36 +595,48 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
  * the correct behaviour even in the presence of the asynchronous
  * IRQ13 behaviour
  */
-void math_error(void __user *ip)
+void math_error(struct pt_regs *regs, int error_code, int trapnr)
 {
 	struct task_struct *task;
 	siginfo_t info;
-	unsigned short cwd, swd, err;
+	unsigned short err;
 
 	/*
 	 * Save the info for the exception handler and clear the error.
 	 */
 	task = current;
 	save_init_fpu(task);
-	task->thread.trap_no = 16;
-	task->thread.error_code = 0;
+	task->thread.trap_no = trapnr;
+	task->thread.error_code = error_code;
 	info.si_signo = SIGFPE;
 	info.si_errno = 0;
-	info.si_addr = ip;
-	/*
-	 * (~cwd & swd) will mask out exceptions that are not set to unmasked
-	 * status.  0x3f is the exception bits in these regs, 0x200 is the
-	 * C1 reg you need in case of a stack fault, 0x040 is the stack
-	 * fault bit.  We should only be taking one exception at a time,
-	 * so if this combination doesn't produce any single exception,
-	 * then we have a bad program that isn't synchronizing its FPU usage
-	 * and it will suffer the consequences since we won't be able to
-	 * fully reproduce the context of the exception
-	 */
-	cwd = get_fpu_cwd(task);
-	swd = get_fpu_swd(task);
+	info.si_addr = (void __user *)regs->ip;
+	if (trapnr == 16) {
+		unsigned short cwd, swd;
+		/*
+		 * (~cwd & swd) will mask out exceptions that are not set to unmasked
+		 * status.  0x3f is the exception bits in these regs, 0x200 is the
+		 * C1 reg you need in case of a stack fault, 0x040 is the stack
+		 * fault bit.  We should only be taking one exception at a time,
+		 * so if this combination doesn't produce any single exception,
+		 * then we have a bad program that isn't synchronizing its FPU usage
+		 * and it will suffer the consequences since we won't be able to
+		 * fully reproduce the context of the exception
+		 */
+		cwd = get_fpu_cwd(task);
+		swd = get_fpu_swd(task);
 
-	err = swd & ~cwd;
+		err = swd & ~cwd;
+	} else {
+		/*
+		 * The SIMD FPU exceptions are handled a little differently, as there
+		 * is only a single status/control register.  Thus, to determine which
+		 * unmasked exception was caught we must mask the exception mask bits
+		 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+		 */
+		unsigned short mxcsr = get_fpu_mxcsr(task);
+		err = ~(mxcsr >> 7) & mxcsr;
+	}
 
 	if (err & 0x001) {	/* Invalid op */
 		/*
@@ -663,55 +675,7 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 		return;
 #endif
 
-	math_error((void __user *)regs->ip);
-}
-
-static void simd_math_error(void __user *ip)
-{
-	struct task_struct *task;
-	siginfo_t info;
-	unsigned short mxcsr;
-
-	/*
-	 * Save the info for the exception handler and clear the error.
-	 */
-	task = current;
-	save_init_fpu(task);
-	task->thread.trap_no = 19;
-	task->thread.error_code = 0;
-	info.si_signo = SIGFPE;
-	info.si_errno = 0;
-	info.si_code = __SI_FAULT;
-	info.si_addr = ip;
-	/*
-	 * The SIMD FPU exceptions are handled a little differently, as there
-	 * is only a single status/control register.  Thus, to determine which
-	 * unmasked exception was caught we must mask the exception mask bits
-	 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
-	 */
-	mxcsr = get_fpu_mxcsr(task);
-	switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
-	case 0x000:
-	default:
-		break;
-	case 0x001: /* Invalid Op */
-		info.si_code = FPE_FLTINV;
-		break;
-	case 0x002: /* Denormalize */
-	case 0x010: /* Underflow */
-		info.si_code = FPE_FLTUND;
-		break;
-	case 0x004: /* Zero Divide */
-		info.si_code = FPE_FLTDIV;
-		break;
-	case 0x008: /* Overflow */
-		info.si_code = FPE_FLTOVF;
-		break;
-	case 0x020: /* Precision */
-		info.si_code = FPE_FLTRES;
-		break;
-	}
-	force_sig_info(SIGFPE, &info, task);
+	math_error(regs, error_code, 16);
 }
 
 dotraplinkage void
@@ -727,7 +691,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 		return;
 #endif
 
-	simd_math_error((void __user *)regs->ip);
+	math_error(regs, error_code, 19);
 }
 
 dotraplinkage void
-- 
cgit v1.2.3


From e2e75c915de045f0785387dc32f55e92fab0614c Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 21 Mar 2010 09:00:45 -0400
Subject: x86: Merge kernel_math_error() into math_error()

Clean up the kernel exception handling and make it more similar to
the other traps.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-4-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/traps.c | 44 ++++++++++++++++----------------------------
 1 file changed, 16 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a472992cecd..53ba86fc5dd 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -576,20 +576,6 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	return;
 }
 
-#ifdef CONFIG_X86_64
-static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
-{
-	if (fixup_exception(regs))
-		return 1;
-
-	notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
-	/* Illegal floating point operation in the kernel */
-	current->thread.trap_no = trapnr;
-	die(str, regs, 0);
-	return 0;
-}
-#endif
-
 /*
  * Note that we play around with the 'TS' bit in an attempt to get
  * the correct behaviour even in the presence of the asynchronous
@@ -597,14 +583,28 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
  */
 void math_error(struct pt_regs *regs, int error_code, int trapnr)
 {
-	struct task_struct *task;
+	struct task_struct *task = current;
 	siginfo_t info;
 	unsigned short err;
+	char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
+
+	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
+		return;
+	conditional_sti(regs);
+
+	if (!user_mode_vm(regs))
+	{
+		if (!fixup_exception(regs)) {
+			task->thread.error_code = error_code;
+			task->thread.trap_no = trapnr;
+			die(str, regs, error_code);
+		}
+		return;
+	}
 
 	/*
 	 * Save the info for the exception handler and clear the error.
 	 */
-	task = current;
 	save_init_fpu(task);
 	task->thread.trap_no = trapnr;
 	task->thread.error_code = error_code;
@@ -665,14 +665,8 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
 
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	conditional_sti(regs);
-
 #ifdef CONFIG_X86_32
 	ignore_fpu_irq = 1;
-#else
-	if (!user_mode(regs) &&
-	    kernel_math_error(regs, "kernel x87 math error", 16))
-		return;
 #endif
 
 	math_error(regs, error_code, 16);
@@ -681,14 +675,8 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	conditional_sti(regs);
-
 #ifdef CONFIG_X86_32
 	ignore_fpu_irq = 1;
-#else
-	if (!user_mode(regs) &&
-			kernel_math_error(regs, "kernel simd math error", 19))
-		return;
 #endif
 
 	math_error(regs, error_code, 19);
-- 
cgit v1.2.3


From 250825008f1f94887bc039e9227a8adfb5ba366e Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 21 Mar 2010 09:00:46 -0400
Subject: x86-32: Don't set ignore_fpu_irq in simd exception

Any processor that supports simd will have an internal fpu, and the
irq13 handler will not be enabled.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
LKML-Reference: <1269176446-2489-5-git-send-email-brgerst@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/traps.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 53ba86fc5dd..00516e1de55 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -675,10 +675,6 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-#ifdef CONFIG_X86_32
-	ignore_fpu_irq = 1;
-#endif
-
 	math_error(regs, error_code, 19);
 }
 
-- 
cgit v1.2.3


From 9414e99672271adcc661f3c160a30b374179b92f Mon Sep 17 00:00:00 2001
From: Phil Carmody <ext-phil.2.carmody@nokia.com>
Date: Wed, 28 Apr 2010 12:09:16 -0500
Subject: oprofile: protect from not being in an IRQ context

http://lkml.org/lkml/2010/4/27/285

Protect against dereferencing regs when it's NULL, and
force a magic number into pc to prevent too deep processing.
This approach permits the dropped samples to be tallied as
invalid Instruction Pointer events.

e.g. output from about 15mins at 10kHz sample rate:
Nr. samples received: 2565380
Nr. samples lost invalid pc: 4

Signed-off-by: Phil Carmody <ext-phil.2.carmody@nokia.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 drivers/oprofile/cpu_buffer.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c
index 0ac8b065ee0..219f79e2210 100644
--- a/drivers/oprofile/cpu_buffer.c
+++ b/drivers/oprofile/cpu_buffer.c
@@ -319,8 +319,16 @@ void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
 
 void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
 {
-	int is_kernel = !user_mode(regs);
-	unsigned long pc = profile_pc(regs);
+	int is_kernel;
+	unsigned long pc;
+
+	if (likely(regs)) {
+		is_kernel = !user_mode(regs);
+		pc = profile_pc(regs);
+	} else {
+		is_kernel = 0;    /* This value will not be used */
+		pc = ESCAPE_CODE; /* as this causes an early return. */
+	}
 
 	__oprofile_add_ext_sample(pc, regs, event, is_kernel);
 }
-- 
cgit v1.2.3


From 81c4a8a6733ad2ff49c0e077b51403367601b3e7 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Thu, 22 Apr 2010 19:14:49 +0200
Subject: oprofile: update file list in MAINTAINERS file

File list now catches:

 $ xargs | eval ls -d $(cat) | sort -u
 arch/*/include/asm/oprofile*.h
 arch/*/oprofile/
 drivers/oprofile/
 include/linux/oprofile.h

 arch/alpha/oprofile/
 arch/arm/oprofile/
 arch/avr32/oprofile/
 arch/blackfin/oprofile/
 arch/ia64/oprofile/
 arch/m32r/oprofile/
 arch/microblaze/oprofile/
 arch/mips/oprofile/
 arch/mn10300/oprofile/
 arch/parisc/oprofile/
 arch/powerpc/include/asm/oprofile_impl.h
 arch/powerpc/oprofile/
 arch/s390/oprofile/
 arch/sh/oprofile/
 arch/sparc/oprofile/
 arch/x86/oprofile/
 drivers/oprofile/
 include/linux/oprofile.h

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index a0e3c3a47a5..31e858617d0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4165,6 +4165,7 @@ OPROFILE
 M:	Robert Richter <robert.richter@amd.com>
 L:	oprofile-list@lists.sf.net
 S:	Maintained
+F:	arch/*/include/asm/oprofile*.h
 F:	arch/*/oprofile/
 F:	drivers/oprofile/
 F:	include/linux/oprofile.h
-- 
cgit v1.2.3


From d88d95eb1c2a72b6126a550debe0883ff723a948 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Sat, 24 Apr 2010 09:56:53 +0200
Subject: x86, k8: Fix build error when K8_NB is disabled

K8_NB depends on PCI and when the last is disabled (allnoconfig) we fail
at the final linking stage due to missing exported num_k8_northbridges.
Add a header stub for that.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20100503183036.GJ26107@aftab>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/k8.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index f70e60071fe..af00bd1d208 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -16,11 +16,16 @@ extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
 extern int k8_scan_nodes(void);
 
 #ifdef CONFIG_K8_NB
+extern int num_k8_northbridges;
+
 static inline struct pci_dev *node_to_k8_nb_misc(int node)
 {
 	return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
 }
+
 #else
+#define num_k8_northbridges 0
+
 static inline struct pci_dev *node_to_k8_nb_misc(int node)
 {
 	return NULL;
-- 
cgit v1.2.3


From 097c1bd5673edaf2a162724636858b71f658fdd2 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 3 May 2010 15:49:31 -0700
Subject: x86, cpu: Make APERF/MPERF a normal table-driven flag

APERF/MPERF can be handled via the table like all the other scattered
CPU flags.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Renninger <trenn@suse.de>
Cc: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <1270065406-1814-4-git-send-email-bp@amd64.org>
---
 arch/x86/kernel/cpu/addon_cpuid_features.c | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index fd1fc1902a4..10fa5684a66 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -30,13 +30,14 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	const struct cpuid_bit *cb;
 
 	static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
-		{ X86_FEATURE_IDA,   CR_EAX, 1, 0x00000006 },
-		{ X86_FEATURE_ARAT,  CR_EAX, 2, 0x00000006 },
-		{ X86_FEATURE_CPB,   CR_EDX, 9, 0x80000007 },
-		{ X86_FEATURE_NPT,   CR_EDX, 0, 0x8000000a },
-		{ X86_FEATURE_LBRV,  CR_EDX, 1, 0x8000000a },
-		{ X86_FEATURE_SVML,  CR_EDX, 2, 0x8000000a },
-		{ X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
+		{ X86_FEATURE_IDA,   		CR_EAX, 1, 0x00000006 },
+		{ X86_FEATURE_ARAT,  		CR_EAX, 2, 0x00000006 },
+		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006 },
+		{ X86_FEATURE_CPB,   		CR_EDX, 9, 0x80000007 },
+		{ X86_FEATURE_NPT,   		CR_EDX, 0, 0x8000000a },
+		{ X86_FEATURE_LBRV,  		CR_EDX, 1, 0x8000000a },
+		{ X86_FEATURE_SVML,  		CR_EDX, 2, 0x8000000a },
+		{ X86_FEATURE_NRIPS, 		CR_EDX, 3, 0x8000000a },
 		{ 0, 0, 0, 0 }
 	};
 
@@ -54,14 +55,6 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		if (regs[cb->reg] & (1 << cb->bit))
 			set_cpu_cap(c, cb->feature);
 	}
-
-	/*
-	 * common AMD/Intel features
-	 */
-	if (c->cpuid_level >= 6) {
-		if (cpuid_ecx(6) & 0x1)
-			set_cpu_cap(c, X86_FEATURE_APERFMPERF);
-	}
 }
 
 /* leaf 0xb SMT level */
-- 
cgit v1.2.3


From ba697f40dbb704956a4cf67a7845b538015a01ea Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 4 May 2010 04:47:25 +0200
Subject: lockdep: Provide off case for redundant_hardirqs_on increment

We forgot to provide a !CONFIG_DEBUG_LOCKDEP case for the
redundant_hardirqs_on stat handling.

Manage that in the headers with a new __debug_atomic_inc() helper.

Fixes:

	kernel/lockdep.c:2306: error: 'lockdep_stats' undeclared (first use in this function)
	kernel/lockdep.c:2306: error: (Each undeclared identifier is reported only once
	kernel/lockdep.c:2306: error: for each function it appears in.)

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/lockdep.c           | 2 +-
 kernel/lockdep_internals.h | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1b58a1bbcc8..9cf79858fd8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2303,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
 		 * so this is racy by nature but loosing one hit
 		 * in a stat is not a big deal.
 		 */
-		this_cpu_inc(lockdep_stats.redundant_hardirqs_on);
+		__debug_atomic_inc(redundant_hardirqs_on);
 		return;
 	}
 	/* we'll do an OFF -> ON transition: */
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 2b174762fa0..7de27a80f80 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -139,6 +139,9 @@ struct lockdep_stats {
 
 DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
 
+#define __debug_atomic_inc(ptr)					\
+	this_cpu_inc(lockdep_stats.ptr);
+
 #define debug_atomic_inc(ptr)			{		\
 	WARN_ON_ONCE(!irqs_disabled());				\
 	this_cpu_inc(lockdep_stats.ptr);			\
@@ -160,6 +163,7 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
 	__total;							\
 })
 #else
+# define __debug_atomic_inc(ptr)	do { } while (0)
 # define debug_atomic_inc(ptr)		do { } while (0)
 # define debug_atomic_dec(ptr)		do { } while (0)
 # define debug_atomic_read(ptr)		0
-- 
cgit v1.2.3


From fa9a97dec611c5356301645d576b523ce3919eba Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 4 May 2010 04:52:48 +0200
Subject: lockdep: Actually _dec_ in debug_atomic_dec

Fix a silly copy-paste mistake that was making debug_atomic_dec use
this_cpu_inc instead of this_cpu_dec.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/lockdep_internals.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 7de27a80f80..8d929c717d3 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -149,7 +149,7 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
 
 #define debug_atomic_dec(ptr)			{		\
 	WARN_ON_ONCE(!irqs_disabled());				\
-	this_cpu_inc(lockdep_stats.ptr);			\
+	this_cpu_dec(lockdep_stats.ptr);			\
 }
 
 #define debug_atomic_read(ptr)		({				\
-- 
cgit v1.2.3


From 54d47a2be5e7f928fb77b2f5a0761f6bd3c9dbff Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 4 May 2010 04:54:47 +0200
Subject: lockdep: No need to disable preemption in debug atomic ops

No need to disable preemption in the debug_atomic_* ops, as
we ensure interrupts are disabled already.

So let's use the __this_cpu_ops() rather than this_cpu_ops() that
enclose the ops in a preempt disabled section.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/lockdep_internals.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 8d929c717d3..4f560cfedc8 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -144,12 +144,12 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
 
 #define debug_atomic_inc(ptr)			{		\
 	WARN_ON_ONCE(!irqs_disabled());				\
-	this_cpu_inc(lockdep_stats.ptr);			\
+	__this_cpu_inc(lockdep_stats.ptr);			\
 }
 
 #define debug_atomic_dec(ptr)			{		\
 	WARN_ON_ONCE(!irqs_disabled());				\
-	this_cpu_dec(lockdep_stats.ptr);			\
+	__this_cpu_dec(lockdep_stats.ptr);			\
 }
 
 #define debug_atomic_read(ptr)		({				\
-- 
cgit v1.2.3


From b8b14c66765ccba884c5c4570bf8be361d211d95 Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@kernel.crashing.org>
Date: Mon, 3 May 2010 07:36:22 -0500
Subject: powerpc/swiotlb: Fix off by one in determining boundary of which ops
 to use

When we compare the devices DMA mask to the amount of memory we need to
make sure we treat the DMA mask as an address boundary.  For example if
the DMA_MASK(32) and we have 4G of memory we'd incorrectly set the dma
ops to swiotlb.  We need to add one to the dma mask when we convert it.

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/kernel/dma-swiotlb.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c
index 59c928564a0..4ff4da2c238 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -1,7 +1,8 @@
 /*
  * Contains routines needed to support swiotlb for ppc.
  *
- * Copyright (C) 2009 Becky Bruce, Freescale Semiconductor
+ * Copyright (C) 2009-2010 Freescale Semiconductor, Inc.
+ * Author: Becky Bruce
  *
  * This program is free software; you can redistribute  it and/or modify it
  * under  the terms of  the GNU General  Public License as published by the
@@ -70,7 +71,7 @@ static int ppc_swiotlb_bus_notify(struct notifier_block *nb,
 	sd->max_direct_dma_addr = 0;
 
 	/* May need to bounce if the device can't address all of DRAM */
-	if (dma_get_mask(dev) < lmb_end_of_DRAM())
+	if ((dma_get_mask(dev) + 1) < lmb_end_of_DRAM())
 		set_dma_ops(dev, &swiotlb_dma_ops);
 
 	return NOTIFY_DONE;
-- 
cgit v1.2.3


From 777d0411cd1e384115985dac5ccd42031e3eee2b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 3 May 2010 15:39:45 +0200
Subject: hw_breakpoints: Fix percpu build failure

Fix this build error:

   kernel/hw_breakpoint.c:58:1: error: pasting "__pcpu_scope_" and "*" does not give a valid preprocessing token

It happens if CONFIG_DEBUG_FORCE_WEAK_PER_CPU, because we concatenate
someting with the name and we have the "*" in the name.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: K. Prasad <prasad@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Jason Wessel <jason.wessel@windriver.com>
LKML-Reference: <20100503133942.GA5497@nowhere>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hw_breakpoint.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 684b710cbb9..7a56b22e060 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -55,7 +55,7 @@
 static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
 
 /* Number of pinned task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, *nr_task_bp_pinned[TYPE_MAX]);
+static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
 
 /* Number of non-pinned cpu/task breakpoints in a cpu */
 static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
-- 
cgit v1.2.3


From 8f5a2dd83a1f8e89fdc17eb0f2f07c2e713e635a Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 23 Mar 2010 19:09:51 +0100
Subject: oprofile/x86: rework error handler in nmi_setup()

This patch improves the error handler in nmi_setup(). Most parts of
the code are moved to allocate_msrs(). In case of an error
allocate_msrs() also frees already allocated memory. nmi_setup()
becomes easier and better extendable.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 2c505ee7101..c0c21f200fa 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -295,6 +295,7 @@ static void free_msrs(void)
 		kfree(per_cpu(cpu_msrs, i).controls);
 		per_cpu(cpu_msrs, i).controls = NULL;
 	}
+	nmi_shutdown_mux();
 }
 
 static int allocate_msrs(void)
@@ -307,14 +308,21 @@ static int allocate_msrs(void)
 		per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
 							GFP_KERNEL);
 		if (!per_cpu(cpu_msrs, i).counters)
-			return 0;
+			goto fail;
 		per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
 							GFP_KERNEL);
 		if (!per_cpu(cpu_msrs, i).controls)
-			return 0;
+			goto fail;
 	}
 
+	if (!nmi_setup_mux())
+		goto fail;
+
 	return 1;
+
+fail:
+	free_msrs();
+	return 0;
 }
 
 static void nmi_cpu_setup(void *dummy)
@@ -342,17 +350,7 @@ static int nmi_setup(void)
 	int cpu;
 
 	if (!allocate_msrs())
-		err = -ENOMEM;
-	else if (!nmi_setup_mux())
-		err = -ENOMEM;
-	else
-		err = register_die_notifier(&profile_exceptions_nb);
-
-	if (err) {
-		free_msrs();
-		nmi_shutdown_mux();
-		return err;
-	}
+		return -ENOMEM;
 
 	/* We need to serialize save and setup for HT because the subset
 	 * of msrs are distinct for save and setup operations
@@ -374,9 +372,17 @@ static int nmi_setup(void)
 
 		mux_clone(cpu);
 	}
+
+	err = register_die_notifier(&profile_exceptions_nb);
+	if (err)
+		goto fail;
+
 	on_each_cpu(nmi_cpu_setup, NULL, 1);
 	nmi_enabled = 1;
 	return 0;
+fail:
+	free_msrs();
+	return err;
 }
 
 static void nmi_cpu_restore_registers(struct op_msrs *msrs)
@@ -421,7 +427,6 @@ static void nmi_shutdown(void)
 	nmi_enabled = 0;
 	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
 	unregister_die_notifier(&profile_exceptions_nb);
-	nmi_shutdown_mux();
 	msrs = &get_cpu_var(cpu_msrs);
 	model->shutdown(msrs);
 	free_msrs();
-- 
cgit v1.2.3


From d0e4120fda6f87eead438eed4d49032e12060e58 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 23 Mar 2010 19:33:21 +0100
Subject: oprofile/x86: reserve counter msrs pairwise

For AMD's and Intel's P6 generic performance counters have pairwise
counter and control msrs. This patch changes the counter reservation
in a way that both msrs must be registered. It joins some counter
loops and also removes the unnecessary NUM_CONTROLS macro in the AMD
implementation.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/op_model_amd.c  | 43 +++++++++++++++++----------------------
 arch/x86/oprofile/op_model_ppro.c | 36 ++++++++++++++++----------------
 2 files changed, 36 insertions(+), 43 deletions(-)

diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 090cbbec7db..2e2bc902b86 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -30,13 +30,10 @@
 #include "op_counter.h"
 
 #define NUM_COUNTERS 4
-#define NUM_CONTROLS 4
 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
 #define NUM_VIRT_COUNTERS 32
-#define NUM_VIRT_CONTROLS 32
 #else
 #define NUM_VIRT_COUNTERS NUM_COUNTERS
-#define NUM_VIRT_CONTROLS NUM_CONTROLS
 #endif
 
 #define OP_EVENT_MASK			0x0FFF
@@ -134,13 +131,15 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
 	int i;
 
 	for (i = 0; i < NUM_COUNTERS; i++) {
-		if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
-			msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
-	}
-
-	for (i = 0; i < NUM_CONTROLS; i++) {
-		if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
-			msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+		if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
+			continue;
+		if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
+			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+			continue;
+		}
+		/* both registers must be reserved */
+		msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
+		msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
 	}
 }
 
@@ -160,7 +159,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
 	}
 
 	/* clear all counters */
-	for (i = 0; i < NUM_CONTROLS; ++i) {
+	for (i = 0; i < NUM_COUNTERS; ++i) {
 		if (unlikely(!msrs->controls[i].addr)) {
 			if (counter_config[i].enabled && !smp_processor_id())
 				/*
@@ -175,12 +174,10 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
 			op_x86_warn_in_use(i);
 		val &= model->reserved;
 		wrmsrl(msrs->controls[i].addr, val);
-	}
-
-	/* avoid a false detection of ctr overflows in NMI handler */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (unlikely(!msrs->counters[i].addr))
-			continue;
+		/*
+		 * avoid a false detection of ctr overflows in NMI
+		 * handler
+		 */
 		wrmsrl(msrs->counters[i].addr, -1LL);
 	}
 
@@ -430,12 +427,10 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
 	int i;
 
 	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (msrs->counters[i].addr)
-			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-	}
-	for (i = 0; i < NUM_CONTROLS; ++i) {
-		if (msrs->controls[i].addr)
-			release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
 	}
 }
 
@@ -583,7 +578,7 @@ static void op_amd_exit(void)
 
 struct op_x86_model_spec op_amd_spec = {
 	.num_counters		= NUM_COUNTERS,
-	.num_controls		= NUM_CONTROLS,
+	.num_controls		= NUM_COUNTERS,
 	.num_virt_counters	= NUM_VIRT_COUNTERS,
 	.reserved		= MSR_AMD_EVENTSEL_RESERVED,
 	.event_mask		= OP_EVENT_MASK,
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 2bf90fafa7b..f8e268e8e99 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -35,13 +35,15 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs)
 	int i;
 
 	for (i = 0; i < num_counters; i++) {
-		if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
-			msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
-	}
-
-	for (i = 0; i < num_counters; i++) {
-		if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
-			msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
+		if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
+			continue;
+		if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) {
+			release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
+			continue;
+		}
+		/* both registers must be reserved */
+		msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
+		msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
 	}
 }
 
@@ -92,12 +94,10 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
 			op_x86_warn_in_use(i);
 		val &= model->reserved;
 		wrmsrl(msrs->controls[i].addr, val);
-	}
-
-	/* avoid a false detection of ctr overflows in NMI handler */
-	for (i = 0; i < num_counters; ++i) {
-		if (unlikely(!msrs->counters[i].addr))
-			continue;
+		/*
+		 * avoid a false detection of ctr overflows in NMI *
+		 * handler
+		 */
 		wrmsrl(msrs->counters[i].addr, -1LL);
 	}
 
@@ -194,12 +194,10 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
 	int i;
 
 	for (i = 0; i < num_counters; ++i) {
-		if (msrs->counters[i].addr)
-			release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
-	}
-	for (i = 0; i < num_counters; ++i) {
-		if (msrs->controls[i].addr)
-			release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
 	}
 	if (reset_value) {
 		kfree(reset_value);
-- 
cgit v1.2.3


From 83300ce0df6b72e156b386457aa0f0902b8c0a98 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 23 Mar 2010 20:01:54 +0100
Subject: oprofile/x86: moving shutdown functions

Moving some code in preparation of the next patch.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/op_model_amd.c  | 24 ++++++++++++------------
 arch/x86/oprofile/op_model_p4.c   | 38 ++++++++++++++++++--------------------
 arch/x86/oprofile/op_model_ppro.c | 33 ++++++++++++++++-----------------
 3 files changed, 46 insertions(+), 49 deletions(-)

diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 2e2bc902b86..7e5886d54bd 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -126,6 +126,18 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
 
 /* functions for op_amd_spec */
 
+static void op_amd_shutdown(struct op_msrs const * const msrs)
+{
+	int i;
+
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+	}
+}
+
 static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
 {
 	int i;
@@ -422,18 +434,6 @@ static void op_amd_stop(struct op_msrs const * const msrs)
 	op_amd_stop_ibs();
 }
 
-static void op_amd_shutdown(struct op_msrs const * const msrs)
-{
-	int i;
-
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (!msrs->counters[i].addr)
-			continue;
-		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-	}
-}
-
 static u8 ibs_eilvt_off;
 
 static inline void apic_init_ibs_nmi_per_cpu(void *arg)
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index e6a160a4684..7cc80df330d 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -385,6 +385,24 @@ static unsigned int get_stagger(void)
 
 static unsigned long reset_value[NUM_COUNTERS_NON_HT];
 
+static void p4_shutdown(struct op_msrs const * const msrs)
+{
+	int i;
+
+	for (i = 0; i < num_counters; ++i) {
+		if (msrs->counters[i].addr)
+			release_perfctr_nmi(msrs->counters[i].addr);
+	}
+	/*
+	 * some of the control registers are specially reserved in
+	 * conjunction with the counter registers (hence the starting offset).
+	 * This saves a few bits.
+	 */
+	for (i = num_counters; i < num_controls; ++i) {
+		if (msrs->controls[i].addr)
+			release_evntsel_nmi(msrs->controls[i].addr);
+	}
+}
 
 static void p4_fill_in_addresses(struct op_msrs * const msrs)
 {
@@ -668,26 +686,6 @@ static void p4_stop(struct op_msrs const * const msrs)
 	}
 }
 
-static void p4_shutdown(struct op_msrs const * const msrs)
-{
-	int i;
-
-	for (i = 0; i < num_counters; ++i) {
-		if (msrs->counters[i].addr)
-			release_perfctr_nmi(msrs->counters[i].addr);
-	}
-	/*
-	 * some of the control registers are specially reserved in
-	 * conjunction with the counter registers (hence the starting offset).
-	 * This saves a few bits.
-	 */
-	for (i = num_counters; i < num_controls; ++i) {
-		if (msrs->controls[i].addr)
-			release_evntsel_nmi(msrs->controls[i].addr);
-	}
-}
-
-
 #ifdef CONFIG_SMP
 struct op_x86_model_spec op_p4_ht2_spec = {
 	.num_counters		= NUM_COUNTERS_HT2,
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index f8e268e8e99..b07d25a52f0 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -30,6 +30,22 @@ static int counter_width = 32;
 
 static u64 *reset_value;
 
+static void ppro_shutdown(struct op_msrs const * const msrs)
+{
+	int i;
+
+	for (i = 0; i < num_counters; ++i) {
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
+	}
+	if (reset_value) {
+		kfree(reset_value);
+		reset_value = NULL;
+	}
+}
+
 static void ppro_fill_in_addresses(struct op_msrs * const msrs)
 {
 	int i;
@@ -189,23 +205,6 @@ static void ppro_stop(struct op_msrs const * const msrs)
 	}
 }
 
-static void ppro_shutdown(struct op_msrs const * const msrs)
-{
-	int i;
-
-	for (i = 0; i < num_counters; ++i) {
-		if (!msrs->counters[i].addr)
-			continue;
-		release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
-		release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
-	}
-	if (reset_value) {
-		kfree(reset_value);
-		reset_value = NULL;
-	}
-}
-
-
 struct op_x86_model_spec op_ppro_spec = {
 	.num_counters		= 2,
 	.num_controls		= 2,
-- 
cgit v1.2.3


From 8617f98c001d00b176422d707e6a67b88bcd7e0d Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Fri, 26 Feb 2010 17:20:55 +0100
Subject: oprofile/x86: return -EBUSY if counters are already reserved

In case a counter is already reserved by the watchdog or perf_event
subsystem, oprofile ignored this counters silently. This case is
handled now and oprofile_setup() now reports an error.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c       |  5 ++++-
 arch/x86/oprofile/op_model_amd.c  | 24 +++++++++++++-----------
 arch/x86/oprofile/op_model_p4.c   | 14 +++++++++++++-
 arch/x86/oprofile/op_model_ppro.c | 24 +++++++++++++-----------
 arch/x86/oprofile/op_x86_model.h  |  2 +-
 5 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index c0c21f200fa..9f001d90459 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -357,7 +357,10 @@ static int nmi_setup(void)
 	 */
 
 	/* Assume saved/restored counters are the same on all CPUs */
-	model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
+	err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
+	if (err)
+		goto fail;
+
 	for_each_possible_cpu(cpu) {
 		if (!cpu)
 			continue;
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 7e5886d54bd..536d0b0b39a 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -138,21 +138,30 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
 	}
 }
 
-static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
+static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
 {
 	int i;
 
 	for (i = 0; i < NUM_COUNTERS; i++) {
 		if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
-			continue;
+			goto fail;
 		if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
 			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-			continue;
+			goto fail;
 		}
 		/* both registers must be reserved */
 		msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
 		msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+		continue;
+	fail:
+		if (!counter_config[i].enabled)
+			continue;
+		op_x86_warn_reserved(i);
+		op_amd_shutdown(msrs);
+		return -EBUSY;
 	}
+
+	return 0;
 }
 
 static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
@@ -172,15 +181,8 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
 
 	/* clear all counters */
 	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (unlikely(!msrs->controls[i].addr)) {
-			if (counter_config[i].enabled && !smp_processor_id())
-				/*
-				 * counter is reserved, this is on all
-				 * cpus, so report only for cpu #0
-				 */
-				op_x86_warn_reserved(i);
+		if (!msrs->controls[i].addr)
 			continue;
-		}
 		rdmsrl(msrs->controls[i].addr, val);
 		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
 			op_x86_warn_in_use(i);
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 7cc80df330d..182558dd551 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -404,7 +404,7 @@ static void p4_shutdown(struct op_msrs const * const msrs)
 	}
 }
 
-static void p4_fill_in_addresses(struct op_msrs * const msrs)
+static int p4_fill_in_addresses(struct op_msrs * const msrs)
 {
 	unsigned int i;
 	unsigned int addr, cccraddr, stag;
@@ -486,6 +486,18 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
 			msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
 		}
 	}
+
+	for (i = 0; i < num_counters; ++i) {
+		if (!counter_config[i].enabled)
+			continue;
+		if (msrs->controls[i].addr)
+			continue;
+		op_x86_warn_reserved(i);
+		p4_shutdown(msrs);
+		return -EBUSY;
+	}
+
+	return 0;
 }
 
 
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index b07d25a52f0..1fd17cfb956 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -46,21 +46,30 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
 	}
 }
 
-static void ppro_fill_in_addresses(struct op_msrs * const msrs)
+static int ppro_fill_in_addresses(struct op_msrs * const msrs)
 {
 	int i;
 
 	for (i = 0; i < num_counters; i++) {
 		if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
-			continue;
+			goto fail;
 		if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) {
 			release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
-			continue;
+			goto fail;
 		}
 		/* both registers must be reserved */
 		msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
 		msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
+		continue;
+	fail:
+		if (!counter_config[i].enabled)
+			continue;
+		op_x86_warn_reserved(i);
+		ppro_shutdown(msrs);
+		return -EBUSY;
 	}
+
+	return 0;
 }
 
 
@@ -96,15 +105,8 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
 
 	/* clear all counters */
 	for (i = 0; i < num_counters; ++i) {
-		if (unlikely(!msrs->controls[i].addr)) {
-			if (counter_config[i].enabled && !smp_processor_id())
-				/*
-				 * counter is reserved, this is on all
-				 * cpus, so report only for cpu #0
-				 */
-				op_x86_warn_reserved(i);
+		if (!msrs->controls[i].addr)
 			continue;
-		}
 		rdmsrl(msrs->controls[i].addr, val);
 		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
 			op_x86_warn_in_use(i);
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index ff82a755edd..551401398fb 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -41,7 +41,7 @@ struct op_x86_model_spec {
 	u16		event_mask;
 	int		(*init)(struct oprofile_operations *ops);
 	void		(*exit)(void);
-	void		(*fill_in_addresses)(struct op_msrs * const msrs);
+	int		(*fill_in_addresses)(struct op_msrs * const msrs);
 	void		(*setup_ctrs)(struct op_x86_model_spec const *model,
 				      struct op_msrs const * const msrs);
 	int		(*check_ctrs)(struct pt_regs * const regs,
-- 
cgit v1.2.3


From da759fe5be24ec3b236a76c007b460cf6caf2009 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Fri, 26 Feb 2010 10:54:56 +0100
Subject: oprofile/x86: move IBS code

Moving code to make future changes easier. This groups all IBS code
together.

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/op_model_amd.c | 220 +++++++++++++++++++--------------------
 1 file changed, 110 insertions(+), 110 deletions(-)

diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 536d0b0b39a..e159254fb7c 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -102,116 +102,6 @@ static u32 get_ibs_caps(void)
 	return ibs_caps;
 }
 
-#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-
-static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
-			       struct op_msrs const * const msrs)
-{
-	u64 val;
-	int i;
-
-	/* enable active counters */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		int virt = op_x86_phys_to_virt(i);
-		if (!reset_value[virt])
-			continue;
-		rdmsrl(msrs->controls[i].addr, val);
-		val &= model->reserved;
-		val |= op_x86_get_ctrl(model, &counter_config[virt]);
-		wrmsrl(msrs->controls[i].addr, val);
-	}
-}
-
-#endif
-
-/* functions for op_amd_spec */
-
-static void op_amd_shutdown(struct op_msrs const * const msrs)
-{
-	int i;
-
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (!msrs->counters[i].addr)
-			continue;
-		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
-	}
-}
-
-static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
-{
-	int i;
-
-	for (i = 0; i < NUM_COUNTERS; i++) {
-		if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
-			goto fail;
-		if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
-			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
-			goto fail;
-		}
-		/* both registers must be reserved */
-		msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
-		msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
-		continue;
-	fail:
-		if (!counter_config[i].enabled)
-			continue;
-		op_x86_warn_reserved(i);
-		op_amd_shutdown(msrs);
-		return -EBUSY;
-	}
-
-	return 0;
-}
-
-static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
-			      struct op_msrs const * const msrs)
-{
-	u64 val;
-	int i;
-
-	/* setup reset_value */
-	for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
-		if (counter_config[i].enabled
-		    && msrs->counters[op_x86_virt_to_phys(i)].addr)
-			reset_value[i] = counter_config[i].count;
-		else
-			reset_value[i] = 0;
-	}
-
-	/* clear all counters */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		if (!msrs->controls[i].addr)
-			continue;
-		rdmsrl(msrs->controls[i].addr, val);
-		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
-			op_x86_warn_in_use(i);
-		val &= model->reserved;
-		wrmsrl(msrs->controls[i].addr, val);
-		/*
-		 * avoid a false detection of ctr overflows in NMI
-		 * handler
-		 */
-		wrmsrl(msrs->counters[i].addr, -1LL);
-	}
-
-	/* enable active counters */
-	for (i = 0; i < NUM_COUNTERS; ++i) {
-		int virt = op_x86_phys_to_virt(i);
-		if (!reset_value[virt])
-			continue;
-
-		/* setup counter registers */
-		wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
-
-		/* setup control registers */
-		rdmsrl(msrs->controls[i].addr, val);
-		val &= model->reserved;
-		val |= op_x86_get_ctrl(model, &counter_config[virt]);
-		wrmsrl(msrs->controls[i].addr, val);
-	}
-}
-
 /*
  * 16-bit Linear Feedback Shift Register (LFSR)
  *
@@ -376,6 +266,116 @@ static void op_amd_stop_ibs(void)
 		wrmsrl(MSR_AMD64_IBSOPCTL, 0);
 }
 
+#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
+
+static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
+			       struct op_msrs const * const msrs)
+{
+	u64 val;
+	int i;
+
+	/* enable active counters */
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		int virt = op_x86_phys_to_virt(i);
+		if (!reset_value[virt])
+			continue;
+		rdmsrl(msrs->controls[i].addr, val);
+		val &= model->reserved;
+		val |= op_x86_get_ctrl(model, &counter_config[virt]);
+		wrmsrl(msrs->controls[i].addr, val);
+	}
+}
+
+#endif
+
+/* functions for op_amd_spec */
+
+static void op_amd_shutdown(struct op_msrs const * const msrs)
+{
+	int i;
+
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		if (!msrs->counters[i].addr)
+			continue;
+		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+	}
+}
+
+static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
+{
+	int i;
+
+	for (i = 0; i < NUM_COUNTERS; i++) {
+		if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
+			goto fail;
+		if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
+			release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+			goto fail;
+		}
+		/* both registers must be reserved */
+		msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
+		msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+		continue;
+	fail:
+		if (!counter_config[i].enabled)
+			continue;
+		op_x86_warn_reserved(i);
+		op_amd_shutdown(msrs);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
+			      struct op_msrs const * const msrs)
+{
+	u64 val;
+	int i;
+
+	/* setup reset_value */
+	for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
+		if (counter_config[i].enabled
+		    && msrs->counters[op_x86_virt_to_phys(i)].addr)
+			reset_value[i] = counter_config[i].count;
+		else
+			reset_value[i] = 0;
+	}
+
+	/* clear all counters */
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		if (!msrs->controls[i].addr)
+			continue;
+		rdmsrl(msrs->controls[i].addr, val);
+		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
+			op_x86_warn_in_use(i);
+		val &= model->reserved;
+		wrmsrl(msrs->controls[i].addr, val);
+		/*
+		 * avoid a false detection of ctr overflows in NMI
+		 * handler
+		 */
+		wrmsrl(msrs->counters[i].addr, -1LL);
+	}
+
+	/* enable active counters */
+	for (i = 0; i < NUM_COUNTERS; ++i) {
+		int virt = op_x86_phys_to_virt(i);
+		if (!reset_value[virt])
+			continue;
+
+		/* setup counter registers */
+		wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]);
+
+		/* setup control registers */
+		rdmsrl(msrs->controls[i].addr, val);
+		val &= model->reserved;
+		val |= op_x86_get_ctrl(model, &counter_config[virt]);
+		wrmsrl(msrs->controls[i].addr, val);
+	}
+}
+
 static int op_amd_check_ctrs(struct pt_regs * const regs,
 			     struct op_msrs const * const msrs)
 {
-- 
cgit v1.2.3


From 5bdb7934ca4115a12c7d585c5a45312b1c36909b Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 31 Mar 2010 11:58:36 +0200
Subject: oprofile/x86: remove duplicate IBS capability check

The check is already done in ibs_exit().

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/op_model_amd.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index e159254fb7c..384c5241048 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -490,8 +490,7 @@ static int init_ibs_nmi(void)
 /* uninitialize the APIC for the IBS interrupts if needed */
 static void clear_ibs_nmi(void)
 {
-	if (ibs_caps)
-		on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
+	on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
 }
 
 /* initialize the APIC for the IBS interrupts if available */
-- 
cgit v1.2.3


From 2623a1d55a6260c855e1f6d1895900b50b40a896 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 3 May 2010 19:44:32 +0200
Subject: oprofile/x86: fix uninitialized counter usage during cpu hotplug

This fixes a NULL pointer dereference that is triggered when taking a
cpu offline after oprofile was initialized, e.g.:

 $ opcontrol --init
 $ opcontrol --start-daemon
 $ opcontrol --shutdown
 $ opcontrol --deinit
 $ echo 0 > /sys/devices/system/cpu/cpu1/online

See the crash dump below. Though the counter has been disabled the cpu
notifier is still active and trying to use already freed counter data.

This fix is for linux-stable. To proper fix this, the hotplug code
must be rewritten. Thus I will leave a WARN_ON_ONCE() message with
this patch.

BUG: unable to handle kernel NULL pointer dereference at (null)
IP: [<ffffffff8132ad57>] op_amd_stop+0x2d/0x8e
PGD 0
Oops: 0000 [#1] SMP
last sysfs file: /sys/devices/system/cpu/cpu1/online
CPU 1
Modules linked in:

Pid: 0, comm: swapper Not tainted 2.6.34-rc5-oprofile-x86_64-standard-00210-g8c00f06 #16 Anaheim/Anaheim
RIP: 0010:[<ffffffff8132ad57>]  [<ffffffff8132ad57>] op_amd_stop+0x2d/0x8e
RSP: 0018:ffff880001843f28  EFLAGS: 00010006
RAX: 0000000000000000 RBX: 0000000000000000 RCX: dead000000200200
RDX: ffff880001843f68 RSI: dead000000100100 RDI: 0000000000000000
RBP: ffff880001843f48 R08: 0000000000000000 R09: ffff880001843f08
R10: ffffffff8102c9a5 R11: ffff88000184ea80 R12: 0000000000000000
R13: ffff88000184f6c0 R14: 0000000000000000 R15: 0000000000000000
FS:  00007fec6a92e6f0(0000) GS:ffff880001840000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000000 CR3: 000000000163b000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffff88042fcd8000, task ffff88042fcd51d0)
Stack:
 ffff880001843f48 0000000000000001 ffff88042e9f7d38 ffff880001843f68
<0> ffff880001843f58 ffffffff8132a602 ffff880001843f98 ffffffff810521b3
<0> ffff880001843f68 ffff880001843f68 ffff880001843f88 ffff88042fcd9fd8
Call Trace:
 <IRQ>
 [<ffffffff8132a602>] nmi_cpu_stop+0x21/0x23
 [<ffffffff810521b3>] generic_smp_call_function_single_interrupt+0xdf/0x11b
 [<ffffffff8101804f>] smp_call_function_single_interrupt+0x22/0x31
 [<ffffffff810029f3>] call_function_single_interrupt+0x13/0x20
 <EOI>
 [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff81008701>] ? default_idle+0x22/0x37
 [<ffffffff8100896d>] c1e_idle+0xdf/0xe6
 [<ffffffff813f1170>] ? atomic_notifier_call_chain+0x13/0x15
 [<ffffffff810012fb>] cpu_idle+0x4b/0x7e
 [<ffffffff813e8a4e>] start_secondary+0x1ae/0x1b2
Code: 89 e5 41 55 49 89 fd 41 54 45 31 e4 53 31 db 48 83 ec 08 89 df e8 be f8 ff ff 48 98 48 83 3c c5 10 67 7a 81 00 74 1f 49 8b 45 08 <42> 8b 0c 20 0f 32 48 c1 e2 20 25 ff ff bf ff 48 09 d0 48 89 c2
RIP  [<ffffffff8132ad57>] op_amd_stop+0x2d/0x8e
 RSP <ffff880001843f28>
CR2: 0000000000000000
---[ end trace 679ac372d674b757 ]---
Kernel panic - not syncing: Fatal exception in interrupt
Pid: 0, comm: swapper Tainted: G      D    2.6.34-rc5-oprofile-x86_64-standard-00210-g8c00f06 #16
Call Trace:
 <IRQ>  [<ffffffff813ebd6a>] panic+0x9e/0x10c
 [<ffffffff810474b0>] ? up+0x34/0x39
 [<ffffffff81031ccc>] ? kmsg_dump+0x112/0x12c
 [<ffffffff813eeff1>] oops_end+0x81/0x8e
 [<ffffffff8101efee>] no_context+0x1f3/0x202
 [<ffffffff8101f1b7>] __bad_area_nosemaphore+0x1ba/0x1e0
 [<ffffffff81028d24>] ? enqueue_task_fair+0x16d/0x17a
 [<ffffffff810264dc>] ? activate_task+0x42/0x53
 [<ffffffff8102c967>] ? try_to_wake_up+0x272/0x284
 [<ffffffff8101f1eb>] bad_area_nosemaphore+0xe/0x10
 [<ffffffff813f0f3f>] do_page_fault+0x1c8/0x37c
 [<ffffffff81028d24>] ? enqueue_task_fair+0x16d/0x17a
 [<ffffffff813ee55f>] page_fault+0x1f/0x30
 [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff8132ad57>] ? op_amd_stop+0x2d/0x8e
 [<ffffffff8132ad46>] ? op_amd_stop+0x1c/0x8e
 [<ffffffff8132a602>] nmi_cpu_stop+0x21/0x23
 [<ffffffff810521b3>] generic_smp_call_function_single_interrupt+0xdf/0x11b
 [<ffffffff8101804f>] smp_call_function_single_interrupt+0x22/0x31
 [<ffffffff810029f3>] call_function_single_interrupt+0x13/0x20
 <EOI>  [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff81008701>] ? default_idle+0x22/0x37
 [<ffffffff8100896d>] c1e_idle+0xdf/0xe6
 [<ffffffff813f1170>] ? atomic_notifier_call_chain+0x13/0x15
 [<ffffffff810012fb>] cpu_idle+0x4b/0x7e
 [<ffffffff813e8a4e>] start_secondary+0x1ae/0x1b2
------------[ cut here ]------------
WARNING: at /local/rrichter/.source/linux/arch/x86/kernel/smp.c:118 native_smp_send_reschedule+0x27/0x53()
Hardware name: Anaheim
Modules linked in:
Pid: 0, comm: swapper Tainted: G      D    2.6.34-rc5-oprofile-x86_64-standard-00210-g8c00f06 #16
Call Trace:
 <IRQ>  [<ffffffff81017f32>] ? native_smp_send_reschedule+0x27/0x53
 [<ffffffff81030ee2>] warn_slowpath_common+0x77/0xa4
 [<ffffffff81030f1e>] warn_slowpath_null+0xf/0x11
 [<ffffffff81017f32>] native_smp_send_reschedule+0x27/0x53
 [<ffffffff8102634b>] resched_task+0x60/0x62
 [<ffffffff8102653a>] check_preempt_curr_idle+0x10/0x12
 [<ffffffff8102c8ea>] try_to_wake_up+0x1f5/0x284
 [<ffffffff8102c986>] default_wake_function+0xd/0xf
 [<ffffffff810a110d>] pollwake+0x57/0x5a
 [<ffffffff8102c979>] ? default_wake_function+0x0/0xf
 [<ffffffff81026be5>] __wake_up_common+0x46/0x75
 [<ffffffff81026ed0>] __wake_up+0x38/0x50
 [<ffffffff81031694>] printk_tick+0x39/0x3b
 [<ffffffff8103ac37>] update_process_times+0x3f/0x5c
 [<ffffffff8104dc63>] tick_periodic+0x5d/0x69
 [<ffffffff8104dc90>] tick_handle_periodic+0x21/0x71
 [<ffffffff81018fd0>] smp_apic_timer_interrupt+0x82/0x95
 [<ffffffff81002853>] apic_timer_interrupt+0x13/0x20
 [<ffffffff81030cb5>] ? panic_blink_one_second+0x0/0x7b
 [<ffffffff813ebdd6>] ? panic+0x10a/0x10c
 [<ffffffff810474b0>] ? up+0x34/0x39
 [<ffffffff81031ccc>] ? kmsg_dump+0x112/0x12c
 [<ffffffff813eeff1>] ? oops_end+0x81/0x8e
 [<ffffffff8101efee>] ? no_context+0x1f3/0x202
 [<ffffffff8101f1b7>] ? __bad_area_nosemaphore+0x1ba/0x1e0
 [<ffffffff81028d24>] ? enqueue_task_fair+0x16d/0x17a
 [<ffffffff810264dc>] ? activate_task+0x42/0x53
 [<ffffffff8102c967>] ? try_to_wake_up+0x272/0x284
 [<ffffffff8101f1eb>] ? bad_area_nosemaphore+0xe/0x10
 [<ffffffff813f0f3f>] ? do_page_fault+0x1c8/0x37c
 [<ffffffff81028d24>] ? enqueue_task_fair+0x16d/0x17a
 [<ffffffff813ee55f>] ? page_fault+0x1f/0x30
 [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff8132ad57>] ? op_amd_stop+0x2d/0x8e
 [<ffffffff8132ad46>] ? op_amd_stop+0x1c/0x8e
 [<ffffffff8132a602>] ? nmi_cpu_stop+0x21/0x23
 [<ffffffff810521b3>] ? generic_smp_call_function_single_interrupt+0xdf/0x11b
 [<ffffffff8101804f>] ? smp_call_function_single_interrupt+0x22/0x31
 [<ffffffff810029f3>] ? call_function_single_interrupt+0x13/0x20
 <EOI>  [<ffffffff8102c9a5>] ? wake_up_process+0x10/0x12
 [<ffffffff81008701>] ? default_idle+0x22/0x37
 [<ffffffff8100896d>] ? c1e_idle+0xdf/0xe6
 [<ffffffff813f1170>] ? atomic_notifier_call_chain+0x13/0x15
 [<ffffffff810012fb>] ? cpu_idle+0x4b/0x7e
 [<ffffffff813e8a4e>] ? start_secondary+0x1ae/0x1b2
---[ end trace 679ac372d674b758 ]---

Cc: Andi Kleen <andi@firstfloor.org>
Cc: stable <stable@kernel.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 9f001d90459..24582040b71 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -95,7 +95,10 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs)
 static void nmi_cpu_start(void *dummy)
 {
 	struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
-	model->start(msrs);
+	if (!msrs->controls)
+		WARN_ON_ONCE(1);
+	else
+		model->start(msrs);
 }
 
 static int nmi_start(void)
@@ -107,7 +110,10 @@ static int nmi_start(void)
 static void nmi_cpu_stop(void *dummy)
 {
 	struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
-	model->stop(msrs);
+	if (!msrs->controls)
+		WARN_ON_ONCE(1);
+	else
+		model->stop(msrs);
 }
 
 static void nmi_stop(void)
-- 
cgit v1.2.3


From 216f3d9b4e5121feea4b13fae9d4c83e8d7e1c8a Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 3 May 2010 11:58:46 +0200
Subject: oprofile/x86: remove CONFIG_SMP macros

CPU notifier register functions also exist if CONFIG_SMP is
disabled. This change is part of hotplug code rework and also
necessary for later patches.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 24582040b71..c5df8ee76ee 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -471,7 +471,6 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
 	return 0;
 }
 
-#ifdef CONFIG_SMP
 static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
 				 void *data)
 {
@@ -491,7 +490,6 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
 static struct notifier_block oprofile_cpu_nb = {
 	.notifier_call = oprofile_cpu_notifier
 };
-#endif
 
 #ifdef CONFIG_PM
 
@@ -701,9 +699,8 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 		return -ENODEV;
 	}
 
-#ifdef CONFIG_SMP
 	register_cpu_notifier(&oprofile_cpu_nb);
-#endif
+
 	/* default values, can be overwritten by model */
 	ops->create_files	= nmi_create_files;
 	ops->setup		= nmi_setup;
@@ -732,9 +729,7 @@ void op_nmi_exit(void)
 {
 	if (using_nmi) {
 		exit_sysfs();
-#ifdef CONFIG_SMP
 		unregister_cpu_notifier(&oprofile_cpu_nb);
-#endif
 	}
 	if (model->exit)
 		model->exit();
-- 
cgit v1.2.3


From 6ae56b55bc364bc2f2342f599b46581627ba22da Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Thu, 29 Apr 2010 14:55:55 +0200
Subject: oprofile/x86: protect cpu hotplug sections

This patch reworks oprofile cpu hotplug code as follows:

Introduce ctr_running variable to check, if counters are running or
not. The state must be known for taking a cpu on or offline and when
switching counters during counter multiplexing.

Protect on_each_cpu() sections with get_online_cpus()/put_online_cpu()
functions. This is necessary if notifiers or states are
modified. Within these sections the cpu mask may not change.

Switch only between counters in nmi_cpu_switch(), if counters are
running. Otherwise the switch may restart a counter though they are
disabled.

Add nmi_cpu_setup() and nmi_cpu_shutdown() to cpu hotplug code. The
function must also be called to avoid uninitialzed counter usage.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 52 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index c5df8ee76ee..b56601eaf29 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -31,8 +31,9 @@ static struct op_x86_model_spec *model;
 static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
 static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
 
-/* 0 == registered but off, 1 == registered and on */
-static int nmi_enabled = 0;
+/* must be protected with get_online_cpus()/put_online_cpus(): */
+static int nmi_enabled;
+static int ctr_running;
 
 struct op_counter_config counter_config[OP_MAX_COUNTER];
 
@@ -103,7 +104,10 @@ static void nmi_cpu_start(void *dummy)
 
 static int nmi_start(void)
 {
+	get_online_cpus();
 	on_each_cpu(nmi_cpu_start, NULL, 1);
+	ctr_running = 1;
+	put_online_cpus();
 	return 0;
 }
 
@@ -118,7 +122,10 @@ static void nmi_cpu_stop(void *dummy)
 
 static void nmi_stop(void)
 {
+	get_online_cpus();
 	on_each_cpu(nmi_cpu_stop, NULL, 1);
+	ctr_running = 0;
+	put_online_cpus();
 }
 
 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
@@ -258,7 +265,10 @@ static int nmi_switch_event(void)
 	if (nmi_multiplex_on() < 0)
 		return -EINVAL;		/* not necessary */
 
-	on_each_cpu(nmi_cpu_switch, NULL, 1);
+	get_online_cpus();
+	if (ctr_running)
+		on_each_cpu(nmi_cpu_switch, NULL, 1);
+	put_online_cpus();
 
 	return 0;
 }
@@ -386,8 +396,11 @@ static int nmi_setup(void)
 	if (err)
 		goto fail;
 
+	get_online_cpus();
 	on_each_cpu(nmi_cpu_setup, NULL, 1);
 	nmi_enabled = 1;
+	put_online_cpus();
+
 	return 0;
 fail:
 	free_msrs();
@@ -433,8 +446,11 @@ static void nmi_shutdown(void)
 {
 	struct op_msrs *msrs;
 
-	nmi_enabled = 0;
+	get_online_cpus();
 	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
+	nmi_enabled = 0;
+	ctr_running = 0;
+	put_online_cpus();
 	unregister_die_notifier(&profile_exceptions_nb);
 	msrs = &get_cpu_var(cpu_msrs);
 	model->shutdown(msrs);
@@ -442,6 +458,22 @@ static void nmi_shutdown(void)
 	put_cpu_var(cpu_msrs);
 }
 
+static void nmi_cpu_up(void *dummy)
+{
+	if (nmi_enabled)
+		nmi_cpu_setup(dummy);
+	if (ctr_running)
+		nmi_cpu_start(dummy);
+}
+
+static void nmi_cpu_down(void *dummy)
+{
+	if (ctr_running)
+		nmi_cpu_stop(dummy);
+	if (nmi_enabled)
+		nmi_cpu_shutdown(dummy);
+}
+
 static int nmi_create_files(struct super_block *sb, struct dentry *root)
 {
 	unsigned int i;
@@ -478,10 +510,10 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
 	switch (action) {
 	case CPU_DOWN_FAILED:
 	case CPU_ONLINE:
-		smp_call_function_single(cpu, nmi_cpu_start, NULL, 0);
+		smp_call_function_single(cpu, nmi_cpu_up, NULL, 0);
 		break;
 	case CPU_DOWN_PREPARE:
-		smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1);
+		smp_call_function_single(cpu, nmi_cpu_down, NULL, 1);
 		break;
 	}
 	return NOTIFY_DONE;
@@ -699,7 +731,11 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 		return -ENODEV;
 	}
 
+	get_online_cpus();
 	register_cpu_notifier(&oprofile_cpu_nb);
+	nmi_enabled = 0;
+	ctr_running = 0;
+	put_online_cpus();
 
 	/* default values, can be overwritten by model */
 	ops->create_files	= nmi_create_files;
@@ -729,7 +765,11 @@ void op_nmi_exit(void)
 {
 	if (using_nmi) {
 		exit_sysfs();
+		get_online_cpus();
 		unregister_cpu_notifier(&oprofile_cpu_nb);
+		nmi_enabled = 0;
+		ctr_running = 0;
+		put_online_cpus();
 	}
 	if (model->exit)
 		model->exit();
-- 
cgit v1.2.3


From de654649737696ecf32873c341b305e30f3dc777 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 3 May 2010 14:41:22 +0200
Subject: oprofile/x86: stop disabled counters in nmi handler

This patch adds checks to the nmi handler. Now samples are only
generated and counters reenabled, if the counters are running.
Otherwise the counters are stopped, if oprofile is using the nmi. In
other cases it will ignore the nmi notification.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index b56601eaf29..94b5481bb6c 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -62,12 +62,16 @@ static int profile_exceptions_notify(struct notifier_block *self,
 {
 	struct die_args *args = (struct die_args *)data;
 	int ret = NOTIFY_DONE;
-	int cpu = smp_processor_id();
 
 	switch (val) {
 	case DIE_NMI:
 	case DIE_NMI_IPI:
-		model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
+		if (ctr_running)
+			model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
+		else if (!nmi_enabled)
+			break;
+		else
+			model->stop(&__get_cpu_var(cpu_msrs));
 		ret = NOTIFY_STOP;
 		break;
 	default:
@@ -392,6 +396,9 @@ static int nmi_setup(void)
 		mux_clone(cpu);
 	}
 
+	nmi_enabled = 0;
+	ctr_running = 0;
+	barrier();
 	err = register_die_notifier(&profile_exceptions_nb);
 	if (err)
 		goto fail;
@@ -451,6 +458,7 @@ static void nmi_shutdown(void)
 	nmi_enabled = 0;
 	ctr_running = 0;
 	put_online_cpus();
+	barrier();
 	unregister_die_notifier(&profile_exceptions_nb);
 	msrs = &get_cpu_var(cpu_msrs);
 	model->shutdown(msrs);
-- 
cgit v1.2.3


From d30d64c6da3ec7a0708bfffa7e05752d5b9a1093 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 3 May 2010 15:52:26 +0200
Subject: oprofile/x86: reordering some functions

Reordering some functions. Necessary for the next patch. No functional
changes.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 134 ++++++++++++++++++++++----------------------
 1 file changed, 67 insertions(+), 67 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 94b5481bb6c..7de0572b0a5 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -364,56 +364,6 @@ static struct notifier_block profile_exceptions_nb = {
 	.priority = 2
 };
 
-static int nmi_setup(void)
-{
-	int err = 0;
-	int cpu;
-
-	if (!allocate_msrs())
-		return -ENOMEM;
-
-	/* We need to serialize save and setup for HT because the subset
-	 * of msrs are distinct for save and setup operations
-	 */
-
-	/* Assume saved/restored counters are the same on all CPUs */
-	err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
-	if (err)
-		goto fail;
-
-	for_each_possible_cpu(cpu) {
-		if (!cpu)
-			continue;
-
-		memcpy(per_cpu(cpu_msrs, cpu).counters,
-		       per_cpu(cpu_msrs, 0).counters,
-		       sizeof(struct op_msr) * model->num_counters);
-
-		memcpy(per_cpu(cpu_msrs, cpu).controls,
-		       per_cpu(cpu_msrs, 0).controls,
-		       sizeof(struct op_msr) * model->num_controls);
-
-		mux_clone(cpu);
-	}
-
-	nmi_enabled = 0;
-	ctr_running = 0;
-	barrier();
-	err = register_die_notifier(&profile_exceptions_nb);
-	if (err)
-		goto fail;
-
-	get_online_cpus();
-	on_each_cpu(nmi_cpu_setup, NULL, 1);
-	nmi_enabled = 1;
-	put_online_cpus();
-
-	return 0;
-fail:
-	free_msrs();
-	return err;
-}
-
 static void nmi_cpu_restore_registers(struct op_msrs *msrs)
 {
 	struct op_msr *counters = msrs->counters;
@@ -449,23 +399,6 @@ static void nmi_cpu_shutdown(void *dummy)
 	nmi_cpu_restore_registers(msrs);
 }
 
-static void nmi_shutdown(void)
-{
-	struct op_msrs *msrs;
-
-	get_online_cpus();
-	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
-	nmi_enabled = 0;
-	ctr_running = 0;
-	put_online_cpus();
-	barrier();
-	unregister_die_notifier(&profile_exceptions_nb);
-	msrs = &get_cpu_var(cpu_msrs);
-	model->shutdown(msrs);
-	free_msrs();
-	put_cpu_var(cpu_msrs);
-}
-
 static void nmi_cpu_up(void *dummy)
 {
 	if (nmi_enabled)
@@ -531,6 +464,73 @@ static struct notifier_block oprofile_cpu_nb = {
 	.notifier_call = oprofile_cpu_notifier
 };
 
+static int nmi_setup(void)
+{
+	int err = 0;
+	int cpu;
+
+	if (!allocate_msrs())
+		return -ENOMEM;
+
+	/* We need to serialize save and setup for HT because the subset
+	 * of msrs are distinct for save and setup operations
+	 */
+
+	/* Assume saved/restored counters are the same on all CPUs */
+	err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
+	if (err)
+		goto fail;
+
+	for_each_possible_cpu(cpu) {
+		if (!cpu)
+			continue;
+
+		memcpy(per_cpu(cpu_msrs, cpu).counters,
+		       per_cpu(cpu_msrs, 0).counters,
+		       sizeof(struct op_msr) * model->num_counters);
+
+		memcpy(per_cpu(cpu_msrs, cpu).controls,
+		       per_cpu(cpu_msrs, 0).controls,
+		       sizeof(struct op_msr) * model->num_controls);
+
+		mux_clone(cpu);
+	}
+
+	nmi_enabled = 0;
+	ctr_running = 0;
+	barrier();
+	err = register_die_notifier(&profile_exceptions_nb);
+	if (err)
+		goto fail;
+
+	get_online_cpus();
+	on_each_cpu(nmi_cpu_setup, NULL, 1);
+	nmi_enabled = 1;
+	put_online_cpus();
+
+	return 0;
+fail:
+	free_msrs();
+	return err;
+}
+
+static void nmi_shutdown(void)
+{
+	struct op_msrs *msrs;
+
+	get_online_cpus();
+	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
+	nmi_enabled = 0;
+	ctr_running = 0;
+	put_online_cpus();
+	barrier();
+	unregister_die_notifier(&profile_exceptions_nb);
+	msrs = &get_cpu_var(cpu_msrs);
+	model->shutdown(msrs);
+	free_msrs();
+	put_cpu_var(cpu_msrs);
+}
+
 #ifdef CONFIG_PM
 
 static int nmi_suspend(struct sys_device *dev, pm_message_t state)
-- 
cgit v1.2.3


From 11d232ec285b07860670277c8ab3f6076f7bce1e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 4 May 2010 10:48:22 -0300
Subject: perf inject: Add missing bits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New commands need to have Documentation and be added to command-list.txt
so that they can appear when 'perf' is called withouth any subcommand:

[root@doppio linux-2.6-tip]# perf

 usage: perf [--version] [--help] COMMAND [ARGS]

 The most commonly used perf commands are:
   annotate        Read perf.data (created by perf record) and display annotated code
   archive         Create archive with object files with build-ids found in perf.data file
   bench           General framework for benchmark suites
   buildid-cache   Manage build-id cache.
   buildid-list    List the buildids in a perf.data file
   diff            Read two perf.data files and display the differential profile
   inject          Filter to augment the events stream with additional information
   kmem            Tool to trace/measure kernel memory(slab) properties
   kvm             Tool to trace/measure kvm guest os
   list            List all symbolic event types
   lock            Analyze lock events
   probe           Define new dynamic tracepoints
   record          Run a command and record its profile into perf.data
   report          Read perf.data (created by perf record) and display the profile
   sched           Tool to trace/measure scheduler properties (latencies)
   stat            Run a command and gather performance counter statistics
   test            Runs sanity tests.
   timechart       Tool to visualize total system behavior during a workload
   top             System profiling tool.
   trace           Read perf.data (created by perf record) and display trace output

 See 'perf help COMMAND' for more information on a specific command.

[root@doppio linux-2.6-tip]#

The new 'perf inject' command hadn't so it wasn't appearing on that list.

Also fix the long option, that should have no spaces in it, rename the faulty one
to be '--build-ids', instead of '--inject build-ids'.

Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-buildid-cache.txt |  4 +--
 tools/perf/Documentation/perf-inject.txt        | 35 +++++++++++++++++++++++++
 tools/perf/builtin-inject.c                     |  2 +-
 tools/perf/command-list.txt                     |  1 +
 4 files changed, 39 insertions(+), 3 deletions(-)
 create mode 100644 tools/perf/Documentation/perf-inject.txt

diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt
index 88bc3b51974..5d1a9500277 100644
--- a/tools/perf/Documentation/perf-buildid-cache.txt
+++ b/tools/perf/Documentation/perf-buildid-cache.txt
@@ -8,7 +8,7 @@ perf-buildid-cache - Manage build-id cache.
 SYNOPSIS
 --------
 [verse]
-'perf buildid-list <options>'
+'perf buildid-cache <options>'
 
 DESCRIPTION
 -----------
@@ -30,4 +30,4 @@ OPTIONS
 
 SEE ALSO
 --------
-linkperf:perf-record[1], linkperf:perf-report[1]
+linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-buildid-list[1]
diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
new file mode 100644
index 00000000000..025630d43cd
--- /dev/null
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -0,0 +1,35 @@
+perf-inject(1)
+==============
+
+NAME
+----
+perf-inject - Filter to augment the events stream with additional information
+
+SYNOPSIS
+--------
+[verse]
+'perf inject <options>'
+
+DESCRIPTION
+-----------
+perf-inject reads a perf-record event stream and repipes it to stdout.  At any
+point the processing code can inject other events into the event stream - in
+this case build-ids (-b option) are read and injected as needed into the event
+stream.
+
+Build-ids are just the first user of perf-inject - potentially anything that
+needs userspace processing to augment the events stream with additional
+information could make use of this facility.
+
+OPTIONS
+-------
+-b::
+--build-ids=::
+        Inject build-ids into the output stream
+-v::
+--verbose::
+	Be more verbose.
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1]
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 59e981a8890..8e3e47b064c 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -204,7 +204,7 @@ static const char * const report_usage[] = {
 };
 
 static const struct option options[] = {
-	OPT_BOOLEAN('b', "inject build-ids", &inject_build_ids,
+	OPT_BOOLEAN('b', "build-ids", &inject_build_ids,
 		    "Inject build-ids into the output stream"),
 	OPT_INCR('v', "verbose", &verbose,
 		 "be more verbose (show build ids, etc)"),
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index 80a1a446ce3..949d77fc0b9 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -8,6 +8,7 @@ perf-bench			mainporcelain common
 perf-buildid-cache		mainporcelain common
 perf-buildid-list		mainporcelain common
 perf-diff			mainporcelain common
+perf-inject			mainporcelain common
 perf-list			mainporcelain common
 perf-sched			mainporcelain common
 perf-record			mainporcelain common
-- 
cgit v1.2.3


From 02bf60aad7d5912dfcdbe0154f1bd67ea7a8301e Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 4 May 2010 21:19:15 +1000
Subject: perf: Fix performance issue with perf report

On a large machine we spend a lot of time in perf_header__find_attr when
running perf report.

If we are parsing a file without PERF_SAMPLE_ID then for each sample we call
perf_header__find_attr and loop through all counter IDs, never finding a match.
As the machine gets larger there are more per cpu counters and we spend an
awful lot of time in there.

The patch below initialises each sample id to -1ULL and checks for this in
perf_header__find_attr. We may need to do something more intelligent eventually
(eg a hash lookup from counter id to attr) but this at least fixes the most
common usage of perf report.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eric B Munson <ebmunson@us.ibm.com>
Acked-by: Eric B Munson <ebmunson@us.ibm.com>
LKML-Reference: <20100504111915.GB14636@kryten>
Signed-off-by: Anton Blanchard <anton@samba.org>
--
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/event.c  | 1 +
 tools/perf/util/header.c | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 1757b0ffeaa..2477270c1d3 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -713,6 +713,7 @@ int event__parse_sample(event_t *event, u64 type, struct sample_data *data)
 		array++;
 	}
 
+	data->id = -1ULL;
 	if (type & PERF_SAMPLE_ID) {
 		data->id = *array;
 		array++;
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 2b9f898efea..8847bec64c5 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -922,6 +922,14 @@ perf_header__find_attr(u64 id, struct perf_header *header)
 {
 	int i;
 
+	/*
+	 * We set id to -1 if the data file doesn't contain sample
+	 * ids. Check for this and avoid walking through the entire
+	 * list of ids which may be large.
+	 */
+	if (id == -1ULL)
+		return NULL;
+
 	for (i = 0; i < header->attrs; i++) {
 		struct perf_header_attr *attr = header->attr[i];
 		int j;
-- 
cgit v1.2.3


From 4dbf6bc239c169b032777616806ecc648058f6b2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 4 May 2010 11:24:01 -0400
Subject: tracing: Convert nop macros to static inlines

The ftrace.h file contains several functions as macros when the
functions are disabled due to config options. This patch converts
most of them to static inlines.

There are two exceptions:

  register_ftrace_function() and unregister_ftrace_function()

This is because their parameter "ops" must not be evaluated since
code using the function is allowed to #ifdef out the creation of
the parameter.

This also fixes an error caused by recent changes:

 kernel/trace/trace_irqsoff.c: In function 'start_irqsoff_tracer':
 kernel/trace/trace_irqsoff.c:571: error: expected expression before 'do'

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 8415a522f43..e0ae83bbd9c 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -82,9 +82,13 @@ void clear_ftrace_function(void);
 extern void ftrace_stub(unsigned long a0, unsigned long a1);
 
 #else /* !CONFIG_FUNCTION_TRACER */
-# define register_ftrace_function(ops) do { } while (0)
-# define unregister_ftrace_function(ops) do { } while (0)
-# define clear_ftrace_function(ops) do { } while (0)
+/*
+ * (un)register_ftrace_function must be a macro since the ops parameter
+ * must not be evaluated.
+ */
+#define register_ftrace_function(ops) ({ 0; })
+#define unregister_ftrace_function(ops) ({ 0; })
+static inline void clear_ftrace_function(void) { }
 static inline void ftrace_kill(void) { }
 static inline void ftrace_stop(void) { }
 static inline void ftrace_start(void) { }
@@ -237,11 +241,13 @@ extern int skip_trace(unsigned long ip);
 extern void ftrace_disable_daemon(void);
 extern void ftrace_enable_daemon(void);
 #else
-# define skip_trace(ip)				({ 0; })
-# define ftrace_force_update()			({ 0; })
-# define ftrace_set_filter(buf, len, reset)	do { } while (0)
-# define ftrace_disable_daemon()		do { } while (0)
-# define ftrace_enable_daemon()			do { } while (0)
+static inline int skip_trace(unsigned long ip) { return 0; }
+static inline int ftrace_force_update(void) { return 0; }
+static inline void ftrace_set_filter(unsigned char *buf, int len, int reset)
+{
+}
+static inline void ftrace_disable_daemon(void) { }
+static inline void ftrace_enable_daemon(void) { }
 static inline void ftrace_release_mod(struct module *mod) {}
 static inline int register_ftrace_command(struct ftrace_func_command *cmd)
 {
@@ -314,16 +320,16 @@ static inline void __ftrace_enabled_restore(int enabled)
   extern void time_hardirqs_on(unsigned long a0, unsigned long a1);
   extern void time_hardirqs_off(unsigned long a0, unsigned long a1);
 #else
-# define time_hardirqs_on(a0, a1)		do { } while (0)
-# define time_hardirqs_off(a0, a1)		do { } while (0)
+  static inline void time_hardirqs_on(unsigned long a0, unsigned long a1) { }
+  static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { }
 #endif
 
 #ifdef CONFIG_PREEMPT_TRACER
   extern void trace_preempt_on(unsigned long a0, unsigned long a1);
   extern void trace_preempt_off(unsigned long a0, unsigned long a1);
 #else
-# define trace_preempt_on(a0, a1)		do { } while (0)
-# define trace_preempt_off(a0, a1)		do { } while (0)
+  static inline void trace_preempt_on(unsigned long a0, unsigned long a1) { }
+  static inline void trace_preempt_off(unsigned long a0, unsigned long a1) { }
 #endif
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
-- 
cgit v1.2.3


From 956097912c40a03bf22603a3be73503fd9ea9e44 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@alien8.de>
Date: Sun, 2 May 2010 08:03:54 +0200
Subject: ring-buffer: Wrap open-coded WARN_ONCE

Wrap open-coded WARN_ONCE functionality into the equivalent macro.

Signed-off-by: Borislav Petkov <bp@alien8.de>
LKML-Reference: <20100502060354.GA5281@liondog.tnic>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2a090448ef6..7f6059c5aa9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2000,17 +2000,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  u64 *ts, u64 *delta)
 {
 	struct ring_buffer_event *event;
-	static int once;
 	int ret;
 
-	if (unlikely(*delta > (1ULL << 59) && !once++)) {
-		printk(KERN_WARNING "Delta way too big! %llu"
-		       " ts=%llu write stamp = %llu\n",
-		       (unsigned long long)*delta,
-		       (unsigned long long)*ts,
-		       (unsigned long long)cpu_buffer->write_stamp);
-		WARN_ON(1);
-	}
+	WARN_ONCE(*delta > (1ULL << 59),
+		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+		  (unsigned long long)*delta,
+		  (unsigned long long)*ts,
+		  (unsigned long long)cpu_buffer->write_stamp);
 
 	/*
 	 * The delta is too big, we to add a
-- 
cgit v1.2.3


From 4677d4a53e0d565742277e8913e91c821453e63e Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Mon, 3 May 2010 14:57:11 +0200
Subject: arch, hweight: Fix compilation errors

Fix function prototype visibility issues when compiling for non-x86
architectures. Tested with crosstool
(ftp://ftp.kernel.org/pub/tools/crosstool/) with alpha, ia64 and sparc
targets.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20100503130736.GD26107@aftab>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 include/asm-generic/bitops/arch_hweight.h | 8 ++++----
 include/linux/bitops.h                    | 5 +++++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/asm-generic/bitops/arch_hweight.h b/include/asm-generic/bitops/arch_hweight.h
index 9a81c1e9436..6a211f40665 100644
--- a/include/asm-generic/bitops/arch_hweight.h
+++ b/include/asm-generic/bitops/arch_hweight.h
@@ -3,22 +3,22 @@
 
 #include <asm/types.h>
 
-inline unsigned int __arch_hweight32(unsigned int w)
+static inline unsigned int __arch_hweight32(unsigned int w)
 {
 	return __sw_hweight32(w);
 }
 
-inline unsigned int __arch_hweight16(unsigned int w)
+static inline unsigned int __arch_hweight16(unsigned int w)
 {
 	return __sw_hweight16(w);
 }
 
-inline unsigned int __arch_hweight8(unsigned int w)
+static inline unsigned int __arch_hweight8(unsigned int w)
 {
 	return __sw_hweight8(w);
 }
 
-inline unsigned long __arch_hweight64(__u64 w)
+static inline unsigned long __arch_hweight64(__u64 w)
 {
 	return __sw_hweight64(w);
 }
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index c55d5bc4ee5..26caa608ccd 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -10,6 +10,11 @@
 #define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
 #endif
 
+extern unsigned int __sw_hweight8(unsigned int w);
+extern unsigned int __sw_hweight16(unsigned int w);
+extern unsigned int __sw_hweight32(unsigned int w);
+extern unsigned long __sw_hweight64(__u64 w);
+
 /*
  * Include this here because some architectures need generic_ffs/fls in
  * scope
-- 
cgit v1.2.3


From 20503664b008e17976bff1fdbc693c77ebd6f6c9 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Mon, 3 May 2010 20:54:34 +0200
Subject: logfs: survive logfs_buf_recover read errors

Refusing to mount beats a kernel crash.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/journal.c |  7 +++----
 fs/logfs/logfs.h   | 10 +++++-----
 fs/logfs/segment.c |  7 +++++--
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index fb0a613f885..4b0e0616b35 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -132,10 +132,9 @@ static int read_area(struct super_block *sb, struct logfs_je_area *a)
 
 	ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
 	if (super->s_writesize > 1)
-		logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
+		return logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
 	else
-		logfs_buf_recover(area, ofs, NULL, 0);
-	return 0;
+		return logfs_buf_recover(area, ofs, NULL, 0);
 }
 
 static void *unpack(void *from, void *to)
@@ -245,7 +244,7 @@ static int read_je(struct super_block *sb, u64 ofs)
 		read_erasecount(sb, unpack(jh, scratch));
 		break;
 	case JE_AREA:
-		read_area(sb, unpack(jh, scratch));
+		err = read_area(sb, unpack(jh, scratch));
 		break;
 	case JE_OBJ_ALIAS:
 		err = logfs_load_object_aliases(sb, unpack(jh, scratch),
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 0a3df1a0c93..32bf55616e5 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -598,19 +598,19 @@ void freeseg(struct super_block *sb, u32 segno);
 int logfs_init_areas(struct super_block *sb);
 void logfs_cleanup_areas(struct super_block *sb);
 int logfs_open_area(struct logfs_area *area, size_t bytes);
-void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		int use_filler);
 
-static inline void logfs_buf_write(struct logfs_area *area, u64 ofs,
+static inline int logfs_buf_write(struct logfs_area *area, u64 ofs,
 		void *buf, size_t len)
 {
-	__logfs_buf_write(area, ofs, buf, len, 0);
+	return __logfs_buf_write(area, ofs, buf, len, 0);
 }
 
-static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs,
+static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
 		void *buf, size_t len)
 {
-	__logfs_buf_write(area, ofs, buf, len, 1);
+	return __logfs_buf_write(area, ofs, buf, len, 1);
 }
 
 /* super.c */
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index f77ce2b470b..a9657afb70a 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -67,7 +67,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
 	return page;
 }
 
-void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
+int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		int use_filler)
 {
 	pgoff_t index = ofs >> PAGE_SHIFT;
@@ -81,8 +81,10 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		copylen = min((ulong)len, PAGE_SIZE - offset);
 
 		page = get_mapping_page(area->a_sb, index, use_filler);
-		SetPageUptodate(page);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
 		BUG_ON(!page); /* FIXME: reserve a pool */
+		SetPageUptodate(page);
 		memcpy(page_address(page) + offset, buf, copylen);
 		SetPagePrivate(page);
 		page_cache_release(page);
@@ -92,6 +94,7 @@ void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 		offset = 0;
 		index++;
 	} while (len);
+	return 0;
 }
 
 static void pad_partial_page(struct logfs_area *area)
-- 
cgit v1.2.3


From 05ebad852901cf9127a743df6ea10c0e8b1590c3 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Tue, 4 May 2010 19:41:09 +0200
Subject: logfs: commit reservations under space pressure

Ensures we only return -ENOSPC when there really is no space.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/file.c      | 12 +++++++++++-
 fs/logfs/logfs.h     |  1 +
 fs/logfs/readwrite.c | 15 ++++++++++++---
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 370f367a933..bf9b1cf953a 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -161,7 +161,17 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
 
 static void logfs_invalidatepage(struct page *page, unsigned long offset)
 {
-	move_page_to_btree(page);
+	struct logfs_block *block = logfs_block(page);
+
+	if (block->reserved_bytes) {
+		struct super_block *sb = page->mapping->host->i_sb;
+		struct logfs_super *super = logfs_super(sb);
+
+		super->s_dirty_pages -= block->reserved_bytes;
+		block->ops->free_block(sb, block);
+		BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR));
+	} else
+		move_page_to_btree(page);
 	BUG_ON(PagePrivate(page) || page->private);
 }
 
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 32bf55616e5..26a9458e6b1 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -394,6 +394,7 @@ struct logfs_super {
 	int	 s_lock_count;
 	mempool_t *s_block_pool;		/* struct logfs_block pool */
 	mempool_t *s_shadow_pool;		/* struct logfs_shadow pool */
+	struct list_head s_writeback_list;	/* writeback pages */
 	/*
 	 * Space accounting:
 	 * - s_used_bytes specifies space used to store valid data objects.
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index e37cee3b100..0718d112a1a 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1095,17 +1095,25 @@ static int logfs_reserve_bytes(struct inode *inode, int bytes)
 int get_page_reserve(struct inode *inode, struct page *page)
 {
 	struct logfs_super *super = logfs_super(inode->i_sb);
+	struct logfs_block *block = logfs_block(page);
 	int ret;
 
-	if (logfs_block(page) && logfs_block(page)->reserved_bytes)
+	if (block && block->reserved_bytes)
 		return 0;
 
 	logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
-	ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
+	while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) &&
+			!list_empty(&super->s_writeback_list)) {
+		block = list_entry(super->s_writeback_list.next,
+				struct logfs_block, alias_list);
+		block->ops->write_block(block);
+	}
 	if (!ret) {
 		alloc_data_block(inode, page);
-		logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
+		block = logfs_block(page);
+		block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
 		super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
+		list_move_tail(&block->alias_list, &super->s_writeback_list);
 	}
 	logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
 	return ret;
@@ -2251,6 +2259,7 @@ int logfs_init_rw(struct super_block *sb)
 	int min_fill = 3 * super->s_no_blocks;
 
 	INIT_LIST_HEAD(&super->s_object_alias);
+	INIT_LIST_HEAD(&super->s_writeback_list);
 	mutex_init(&super->s_write_mutex);
 	super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
 			sizeof(struct logfs_block));
-- 
cgit v1.2.3


From 24797535e18ae219be1fc2632959327075bef5da Mon Sep 17 00:00:00 2001
From: Prasad Joshi <prasadjoshi124@gmail.com>
Date: Tue, 4 May 2010 22:13:59 +0200
Subject: logfs: initialize li->li_refcount

li_refcount was not re-initialized in function logfs_init_inode(), small
patch that will fix the problem

Signed-off-by: Prasad Joshi <prasadjoshi124@gmail.com>
Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 45bf86f1595..af78b6e8289 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -193,6 +193,7 @@ static void logfs_init_inode(struct super_block *sb, struct inode *inode)
 	inode->i_ctime	= CURRENT_TIME;
 	inode->i_mtime	= CURRENT_TIME;
 	inode->i_nlink	= 1;
+	li->li_refcount = 1;
 	INIT_LIST_HEAD(&li->li_freeing_list);
 
 	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
-- 
cgit v1.2.3


From 03d646e62b06e9364e2dbb939d67934c6c9826cd Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 21 Dec 2009 14:27:24 +0800
Subject: tracing: Make the documentation clear on trace_event boot option

Make it clear that event-list is a comma separated list of events.

Reported-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4B2F154C.2060503@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 Documentation/trace/events.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index 02ac6ed38b2..778ddf38b82 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -90,7 +90,8 @@ In order to facilitate early boot debugging, use boot option:
 
 	trace_event=[event-list]
 
-The format of this boot option is the same as described in section 2.1.
+event-list is a comma separated list of events. See section 2.1 for event
+format.
 
 3. Defining an event-enabled tracepoint
 =======================================
-- 
cgit v1.2.3


From 2c2df8418ac7908eec4558407b83f16739006c54 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:02 -0700
Subject: x86, acpi/irq: Introduce apci_isa_irq_to_gsi

There are a number of cases where the current code makes the assumption
that isa irqs identity map to the first 16 acpi global system intereupts.
In most instances that assumption is correct as that is the required
behaviour in dual i8259 mode and the default behavior in ioapic mode.

However there are some systems out there that take advantage of acpis
interrupt remapping  for the isa irqs to have a completely different
mapping of isa_irq to gsi.

Introduce acpi_isa_irq_to_gsi to perform this mapping explicitly in the
code that needs it.  Initially this will be just the current assumed
identity mapping to ensure it's introduction does not cause regressions.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-1-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/ia64/kernel/acpi.c     | 8 ++++++++
 arch/x86/kernel/acpi/boot.c | 8 ++++++++
 include/linux/acpi.h        | 1 +
 3 files changed, 17 insertions(+)

diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 4d1a7e9314c..c6c90f39f4d 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -785,6 +785,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 	return 0;
 }
 
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+	if (isa_irq >= 16)
+		return -1;
+	*gsi = isa_irq;
+	return 0;
+}
+
 /*
  *  ACPI based hotplug CPU support
  */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index cd40aba6aa9..da718d67259 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -458,6 +458,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 	return 0;
 }
 
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+	if (isa_irq >= 16)
+		return -1;
+	*gsi = isa_irq;
+	return 0;
+}
+
 /*
  * success: return IRQ number (>=0)
  * failure: return < 0
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index b926afe8c03..7a937dabcc4 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -116,6 +116,7 @@ extern unsigned long acpi_realmode_flags;
 
 int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity);
 int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
+int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi);
 
 #ifdef CONFIG_X86_IO_APIC
 extern int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity);
-- 
cgit v1.2.3


From 9a0a91bb56d2915cdb8585717de38376ad20fef9 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:03 -0700
Subject: x86, acpi/irq: Teach acpi_get_override_irq to take a gsi not an
 isa_irq

In perverse acpi implementations the isa irqs are not identity mapped
to the first 16 gsi.  Furthermore at least the extended interrupt
resource capability may return gsi's and not isa irqs.  So since
what we get from acpi is a gsi teach acpi_get_overrride_irq to
operate on a gsi instead of an isa_irq.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-2-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 23 ++++++++++++++---------
 include/linux/acpi.h           |  4 ++--
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 127b8718abf..73ec92838d8 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -4082,22 +4082,27 @@ int __init io_apic_get_version(int ioapic)
 	return reg_01.bits.version;
 }
 
-int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
 {
-	int i;
+	int ioapic, pin, idx;
 
 	if (skip_ioapic_setup)
 		return -1;
 
-	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].irqtype == mp_INT &&
-		    mp_irqs[i].srcbusirq == bus_irq)
-			break;
-	if (i >= mp_irq_entries)
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)
+		return -1;
+
+	pin = mp_find_ioapic_pin(ioapic, gsi);
+	if (pin < 0)
+		return -1;
+
+	idx = find_irq_entry(ioapic, pin, mp_INT);
+	if (idx < 0)
 		return -1;
 
-	*trigger = irq_trigger(i);
-	*polarity = irq_polarity(i);
+	*trigger = irq_trigger(idx);
+	*polarity = irq_polarity(idx);
 	return 0;
 }
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 7a937dabcc4..3da73f5f0ae 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -119,9 +119,9 @@ int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
 int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi);
 
 #ifdef CONFIG_X86_IO_APIC
-extern int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity);
+extern int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
 #else
-#define acpi_get_override_irq(bus, trigger, polarity) (-1)
+#define acpi_get_override_irq(gsi, trigger, polarity) (-1)
 #endif
 /*
  * This function undoes the effect of one call to acpi_register_gsi().
-- 
cgit v1.2.3


From 414d3448dbcb40807a1265ace64b2576ef919fbe Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:04 -0700
Subject: x86, acpi/irq: pci device dev->irq is an isa irq not a gsi

Strictly speaking on x86 (where acpi is used) dev->irq must be
a dual i8259 irq input aka an isa irq.  Therefore we should translate
that isa irq into a gsi before passing it to a function that
takes a gsi.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-3-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 drivers/acpi/pci_irq.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c
index b0a71ecee68..e4804fb05e2 100644
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -401,11 +401,13 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
 	 * driver reported one, then use it. Exit in any case.
 	 */
 	if (gsi < 0) {
+		u32 dev_gsi;
 		dev_warn(&dev->dev, "PCI INT %c: no GSI", pin_name(pin));
 		/* Interrupt Line values above 0xF are forbidden */
-		if (dev->irq > 0 && (dev->irq <= 0xF)) {
-			printk(" - using IRQ %d\n", dev->irq);
-			acpi_register_gsi(&dev->dev, dev->irq,
+		if (dev->irq > 0 && (dev->irq <= 0xF) &&
+		    (acpi_isa_irq_to_gsi(dev->irq, &dev_gsi) == 0)) {
+			printk(" - using ISA IRQ %d\n", dev->irq);
+			acpi_register_gsi(&dev->dev, dev_gsi,
 					  ACPI_LEVEL_SENSITIVE,
 					  ACPI_ACTIVE_LOW);
 			return 0;
-- 
cgit v1.2.3


From 9d2062b879495649bb525cf7979126da2e45d288 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:05 -0700
Subject: x86, acpi/irq: Fix acpi_sci_ioapic_setup so it has both bus_irq and
 gsi

Currently acpi_sci_ioapic_setup calls mp_override_legacy_irq with
bus_irq == gsi, which is wrong if we are comming from an override
Instead pass the bus_irq into acpi_sci_ioapic_setup.

This fix was inspired by a similar fix from:
Yinghai Lu <yinghai@kernel.org>

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-4-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index da718d67259..0a036dc6f9f 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -313,7 +313,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 /*
  * Parse Interrupt Source Override for the ACPI SCI
  */
-static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
+static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
 {
 	if (trigger == 0)	/* compatible SCI trigger is level */
 		trigger = 3;
@@ -333,7 +333,7 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
 	 * If GSI is < 16, this will update its flags,
 	 * else it will create a new mp_irqs[] entry.
 	 */
-	mp_override_legacy_irq(gsi, polarity, trigger, gsi);
+	mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
 
 	/*
 	 * stash over-ride to indicate we've been here
@@ -357,9 +357,10 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
 	acpi_table_print_madt_entry(header);
 
 	if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
-		acpi_sci_ioapic_setup(intsrc->global_irq,
+		acpi_sci_ioapic_setup(intsrc->source_irq,
 				      intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
-				      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2);
+				      (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+				      intsrc->global_irq);
 		return 0;
 	}
 
@@ -1162,7 +1163,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 	 * pretend we got one so we can set the SCI flags.
 	 */
 	if (!acpi_sci_override_gsi)
-		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
+		acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
+				      acpi_gbl_FADT.sci_interrupt);
 
 	/* Fill in identity legacy mappings where no override */
 	mp_config_acpi_legacy_irqs();
-- 
cgit v1.2.3


From 0fd52670fb6400be0996ac492b5ed77f3d83d69a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:06 -0700
Subject: x86, acpi/irq: Generalize mp_config_acpi_legacy_irqs

Remove the assumption that there is not an override for isa irq 0.
Instead lookup the gsi and from that lookup the ioapic and pin of each
isa irq indivdually.

In general this should not have any behavioural affect but in
perverse cases this gets all of the details correct, instead of
doing something weird.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-5-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 0a036dc6f9f..3ee92f28a4b 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -961,8 +961,6 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 void __init mp_config_acpi_legacy_irqs(void)
 {
 	int i;
-	int ioapic;
-	unsigned int dstapic;
 	struct mpc_intsrc mp_irq;
 
 #if defined (CONFIG_MCA) || defined (CONFIG_EISA)
@@ -982,20 +980,28 @@ void __init mp_config_acpi_legacy_irqs(void)
 		return;
 #endif
 
-	/*
-	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
-	 */
-	ioapic = mp_find_ioapic(0);
-	if (ioapic < 0)
-		return;
-	dstapic = mp_ioapics[ioapic].apicid;
-
 	/*
 	 * Use the default configuration for the IRQs 0-15.  Unless
 	 * overridden by (MADT) interrupt source override entries.
 	 */
 	for (i = 0; i < 16; i++) {
+		int ioapic, pin;
+		unsigned int dstapic;
 		int idx;
+		u32 gsi;
+
+		/* Locate the gsi that irq i maps to. */
+		if (acpi_isa_irq_to_gsi(i, &gsi))
+			continue;
+
+		/*
+		 * Locate the IOAPIC that manages the ISA IRQ.
+		 */
+		ioapic = mp_find_ioapic(gsi);
+		if (ioapic < 0)
+			continue;
+		pin = mp_find_ioapic_pin(ioapic, gsi);
+		dstapic = mp_ioapics[ioapic].apicid;
 
 		for (idx = 0; idx < mp_irq_entries; idx++) {
 			struct mpc_intsrc *irq = mp_irqs + idx;
@@ -1005,7 +1011,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 				break;
 
 			/* Do we already have a mapping for this IOAPIC pin */
-			if (irq->dstapic == dstapic && irq->dstirq == i)
+			if (irq->dstapic == dstapic && irq->dstirq == pin)
 				break;
 		}
 
@@ -1020,7 +1026,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 		mp_irq.dstapic = dstapic;
 		mp_irq.irqtype = mp_INT;
 		mp_irq.srcbusirq = i; /* Identity mapped */
-		mp_irq.dstirq = i;
+		mp_irq.dstirq = pin;
 
 		save_mp_irq(&mp_irq);
 	}
-- 
cgit v1.2.3


From 9638fa521e42c9281c874c6b5a382b1ced4ee496 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:07 -0700
Subject: x86, ioapic: Only export mp_find_ioapic and mp_find_ioapic_pin in
 io_apic.h

Multiple declarations of the same function in different headers
is a pain to maintain.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-6-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mpspec.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index d8bf23a88d0..29994f06c7e 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -106,10 +106,6 @@ struct device;
 extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
 				 int active_high_low);
 extern int acpi_probe_gsi(void);
-#ifdef CONFIG_X86_IO_APIC
-extern int mp_find_ioapic(int gsi);
-extern int mp_find_ioapic_pin(int ioapic, int gsi);
-#endif
 #else /* !CONFIG_ACPI: */
 static inline int acpi_probe_gsi(void)
 {
-- 
cgit v1.2.3


From 4b6b19a1c7302477653d799a53d48063dd53d555 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:08 -0700
Subject: x86, ioapic: Fix io_apic_redir_entries to return the number of
 entries.

io_apic_redir_entries has a huge conceptual bug.  It returns the maximum
redirection entry not the number of redirection entries.  Which simply
does not match what the name of the function.  This just caught me
and it caught  Feng Tang, and  Len Brown when they wrote sfi_parse_ioapic.

Modify io_apic_redir_entries to actually return the number of redirection
entries, and fix the callers so that they properly handle receiving the
number of the number of redirection table entries, instead of the
number of redirection table entries less one.

While the usage in sfi.c does not show up in this patch it is fixed
by virtue of the fact that io_apic_redir_entries now has the semantics
sfi_parse_ioapic most reasonably expects.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-7-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 73ec92838d8..0a053e61b3e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3855,7 +3855,11 @@ int __init io_apic_get_redir_entries (int ioapic)
 	reg_01.raw = io_apic_read(ioapic, 1);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
-	return reg_01.bits.entries;
+	/* The register returns the maximum index redir index
+	 * supported, which is one less than the total number of redir
+	 * entries.
+	 */
+	return reg_01.bits.entries + 1;
 }
 
 void __init probe_nr_irqs_gsi(void)
@@ -3871,7 +3875,7 @@ void __init probe_nr_irqs_gsi(void)
 
 		nr = 0;
 		for (idx = 0; idx < nr_ioapics; idx++)
-			nr += io_apic_get_redir_entries(idx) + 1;
+			nr += io_apic_get_redir_entries(idx);
 
 		if (nr > nr_irqs_gsi)
 			nr_irqs_gsi = nr;
@@ -4306,7 +4310,7 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	 */
 	mp_gsi_routing[idx].gsi_base = gsi_base;
 	mp_gsi_routing[idx].gsi_end = gsi_base +
-	    io_apic_get_redir_entries(idx);
+	    io_apic_get_redir_entries(idx) - 1;
 
 	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
 	       "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
-- 
cgit v1.2.3


From eddb0c55a14074d6bac8c2ef169aefd7e2c6f139 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:09 -0700
Subject: x86, ioapic: Fix the types of gsi values

This patches fixes the types of gsi_base and gsi_end values in
struct mp_ioapic_gsi, and the gsi parameter of mp_find_ioapic
and mp_find_ioapic_pin

A gsi is cannonically a u32, not an int.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-8-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/io_apic.h | 10 +++++-----
 arch/x86/kernel/apic/io_apic.c |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 35832a03a51..feeaf0d9246 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -180,12 +180,12 @@ extern void ioapic_write_entry(int apic, int pin,
 extern void setup_ioapic_ids_from_mpc(void);
 
 struct mp_ioapic_gsi{
-	int gsi_base;
-	int gsi_end;
+	u32 gsi_base;
+	u32 gsi_end;
 };
 extern struct mp_ioapic_gsi  mp_gsi_routing[];
-int mp_find_ioapic(int gsi);
-int mp_find_ioapic_pin(int ioapic, int gsi);
+int mp_find_ioapic(u32 gsi);
+int mp_find_ioapic_pin(int ioapic, u32 gsi);
 void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
 extern void __init pre_init_apic_IRQ0(void);
 
@@ -197,7 +197,7 @@ static const int timer_through_8259 = 0;
 static inline void ioapic_init_mappings(void)	{ }
 static inline void ioapic_insert_resources(void) { }
 static inline void probe_nr_irqs_gsi(void)	{ }
-static inline int mp_find_ioapic(int gsi) { return 0; }
+static inline int mp_find_ioapic(u32 gsi) { return 0; }
 
 struct io_apic_irq_attr;
 static inline int io_apic_set_pci_routing(struct device *dev, int irq,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 0a053e61b3e..9ab97622b8e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -4247,7 +4247,7 @@ void __init ioapic_insert_resources(void)
 	}
 }
 
-int mp_find_ioapic(int gsi)
+int mp_find_ioapic(u32 gsi)
 {
 	int i = 0;
 
@@ -4262,7 +4262,7 @@ int mp_find_ioapic(int gsi)
 	return -1;
 }
 
-int mp_find_ioapic_pin(int ioapic, int gsi)
+int mp_find_ioapic_pin(int ioapic, u32 gsi)
 {
 	if (WARN_ON(ioapic == -1))
 		return -1;
-- 
cgit v1.2.3


From 5777372af5c929b8f3c706ed7b295b7279537c88 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:10 -0700
Subject: x86, ioapic: Teach mp_register_ioapic to compute a global gsi_end

Add the global variable gsi_end and teach mp_register_ioapic
to keep it uptodate as we add more ioapics into the system.

ioapics can only be added early in boot so the code that
runs later can treat gsi_end as a constant.

Remove the have hacks in sfi.c to second guess mp_register_ioapic
by keeping t's own running total of how many gsi's have been seen,
and instead use the gsi_end.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-9-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/io_apic.h | 1 +
 arch/x86/kernel/apic/io_apic.c | 6 ++++++
 arch/x86/kernel/sfi.c          | 4 +---
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index feeaf0d9246..37b0f2bb503 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -184,6 +184,7 @@ struct mp_ioapic_gsi{
 	u32 gsi_end;
 };
 extern struct mp_ioapic_gsi  mp_gsi_routing[];
+extern u32 gsi_end;
 int mp_find_ioapic(u32 gsi);
 int mp_find_ioapic_pin(int ioapic, u32 gsi);
 void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9ab97622b8e..f8072557157 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -89,6 +89,9 @@ int nr_ioapics;
 /* IO APIC gsi routing info */
 struct mp_ioapic_gsi  mp_gsi_routing[MAX_IO_APICS];
 
+/* The last gsi number used */
+u32 gsi_end;
+
 /* MP IRQ source entries */
 struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
@@ -4312,6 +4315,9 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	mp_gsi_routing[idx].gsi_end = gsi_base +
 	    io_apic_get_redir_entries(idx) - 1;
 
+	if (mp_gsi_routing[idx].gsi_end > gsi_end)
+		gsi_end = mp_gsi_routing[idx].gsi_end;
+
 	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
 	       "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
 	       mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
index 34e09938265..7ded57896c0 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/kernel/sfi.c
@@ -81,7 +81,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
 #endif /* CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_IO_APIC
-static u32 gsi_base;
 
 static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 {
@@ -94,8 +93,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 	pentry = (struct sfi_apic_table_entry *)sb->pentry;
 
 	for (i = 0; i < num; i++) {
-		mp_register_ioapic(i, pentry->phys_addr, gsi_base);
-		gsi_base += io_apic_get_redir_entries(i);
+		mp_register_ioapic(i, pentry->phys_addr, gsi_end + 1);
 		pentry++;
 	}
 
-- 
cgit v1.2.3


From cf7500c0ea133d66f8449d86392d83f840102632 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:11 -0700
Subject: x86, ioapic: In mpparse use mp_register_ioapic

Long ago MP_ioapic_info was the primary way of setting up our
ioapic data structures and mp_register_ioapic was a compatibility
shim for acpi code.  Now the situation is reversed and
and mp_register_ioapic is the primary way of setting up our
ioapic data structures.

Keep the setting up of ioapic data structures uniform by
having mp_register_ioapic call mp_register_ioapic.

This changes a few fields:

- type: is now hardset to MP_IOAPIC but type had to
  bey MP_IOAPIC or MP_ioapic_info would not have been called.

- flags: is now hard coded to MPC_APIC_USABLE.
  We require flags to contain at least MPC_APIC_USEBLE in
  MP_ioapic_info and we don't ever examine flags so dropping
  a few flags that might possibly exist that we have never
  used is harmless.

- apicaddr: Unchanged

- apicver: Read from the ioapic instead of using the cached
  hardware value in the MP table.  The real hardware value
  will be more accurate.

- apicid: Now verified to be unique and changed if it is not.
  If the BIOS got this right this is a noop.  If the BIOS did
  not fixing things appears to be the better solution.

This adds gsi_base and gsi_end values to our ioapics defined with
the mpatable, which will make our lives simpler later since
we can always assume gsi_base and gsi_end are valid.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-10-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/mpparse.c | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e81030f71a8..5ae5d2426ed 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -115,21 +115,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
 		printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
 }
 
-static int bad_ioapic(unsigned long address)
-{
-	if (nr_ioapics >= MAX_IO_APICS) {
-		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
-		       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
-		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
-	}
-	if (!address) {
-		printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
-		       " found in table, skipping!\n");
-		return 1;
-	}
-	return 0;
-}
-
 static void __init MP_ioapic_info(struct mpc_ioapic *m)
 {
 	if (!(m->flags & MPC_APIC_USABLE))
@@ -138,15 +123,7 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
 	printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
 	       m->apicid, m->apicver, m->apicaddr);
 
-	if (bad_ioapic(m->apicaddr))
-		return;
-
-	mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
-	mp_ioapics[nr_ioapics].apicid = m->apicid;
-	mp_ioapics[nr_ioapics].type = m->type;
-	mp_ioapics[nr_ioapics].apicver = m->apicver;
-	mp_ioapics[nr_ioapics].flags = m->flags;
-	nr_ioapics++;
+	mp_register_ioapic(m->apicid, m->apicaddr, gsi_end + 1);
 }
 
 static void print_MP_intsrc_info(struct mpc_intsrc *m)
-- 
cgit v1.2.3


From 7716a5c4ff5f1f3dc5e9edcab125cbf7fceef0af Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:12 -0700
Subject: x86, ioapic: Move nr_ioapic_registers calculation to
 mp_register_ioapic.

Now that all ioapic registration happens in mp_register_ioapic we can
move the calculation of nr_ioapic_registers there from enable_IO_APIC.
The number of ioapic registers is already calucated in mp_register_ioapic
so all that really needs to be done is to save the caluclated value
in nr_ioapic_registers.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-11-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index f8072557157..dae9240bd28 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1953,20 +1953,8 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 
 void __init enable_IO_APIC(void)
 {
-	union IO_APIC_reg_01 reg_01;
 	int i8259_apic, i8259_pin;
 	int apic;
-	unsigned long flags;
-
-	/*
-	 * The number of IO-APIC IRQ registers (== #pins):
-	 */
-	for (apic = 0; apic < nr_ioapics; apic++) {
-		raw_spin_lock_irqsave(&ioapic_lock, flags);
-		reg_01.raw = io_apic_read(apic, 1);
-		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-		nr_ioapic_registers[apic] = reg_01.bits.entries+1;
-	}
 
 	if (!legacy_pic->nr_legacy_irqs)
 		return;
@@ -4293,6 +4281,7 @@ static int bad_ioapic(unsigned long address)
 void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 {
 	int idx = 0;
+	int entries;
 
 	if (bad_ioapic(address))
 		return;
@@ -4311,9 +4300,14 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
 	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
 	 */
+	entries = io_apic_get_redir_entries(idx);
 	mp_gsi_routing[idx].gsi_base = gsi_base;
-	mp_gsi_routing[idx].gsi_end = gsi_base +
-	    io_apic_get_redir_entries(idx) - 1;
+	mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
+
+	/*
+	 * The number of IO-APIC IRQ registers (== #pins):
+	 */
+	nr_ioapic_registers[idx] = entries;
 
 	if (mp_gsi_routing[idx].gsi_end > gsi_end)
 		gsi_end = mp_gsi_routing[idx].gsi_end;
-- 
cgit v1.2.3


From d464207c4fdd70c2a0febd4f9c58206fa915bb36 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:13 -0700
Subject: x86, ioapic: Optimize pin_2_irq

Now that all ioapics have valid gsi_base values use this to
accellerate pin_2_irq.  In the case of acpi this also ensures
that pin_2_irq will compute the same irq value for an ioapic
pin as acpi will.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-12-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/io_apic.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index dae9240bd28..0d35f46929d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1019,7 +1019,7 @@ static inline int irq_trigger(int idx)
 int (*ioapic_renumber_irq)(int ioapic, int irq);
 static int pin_2_irq(int idx, int apic, int pin)
 {
-	int irq, i;
+	int irq;
 	int bus = mp_irqs[idx].srcbus;
 
 	/*
@@ -1031,18 +1031,13 @@ static int pin_2_irq(int idx, int apic, int pin)
 	if (test_bit(bus, mp_bus_not_pci)) {
 		irq = mp_irqs[idx].srcbusirq;
 	} else {
-		/*
-		 * PCI IRQs are mapped in order
-		 */
-		i = irq = 0;
-		while (i < apic)
-			irq += nr_ioapic_registers[i++];
-		irq += pin;
+		u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
 		/*
                  * For MPS mode, so far only needed by ES7000 platform
                  */
 		if (ioapic_renumber_irq)
-			irq = ioapic_renumber_irq(apic, irq);
+			gsi = ioapic_renumber_irq(apic, gsi);
+		irq = gsi;
 	}
 
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3


From 4afc51a835d3aeba11c35090f524e05c84586d27 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:14 -0700
Subject: x86, ioapic: Simplify probe_nr_irqs_gsi.

Use the global gsi_end value now that all ioapics have
valid gsi numbers instead of a combination of acpi_probe_gsi
and walking all of the ioapics and couting their number of
entries by hand if acpi_probe_gsi gave us an answer we did
not like.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-13-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/mpspec.h  |  6 ------
 arch/x86/kernel/acpi/boot.c    | 23 -----------------------
 arch/x86/kernel/apic/io_apic.c | 17 +++--------------
 3 files changed, 3 insertions(+), 43 deletions(-)

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 29994f06c7e..c82868e9f90 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -105,12 +105,6 @@ extern void mp_config_acpi_legacy_irqs(void);
 struct device;
 extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
 				 int active_high_low);
-extern int acpi_probe_gsi(void);
-#else /* !CONFIG_ACPI: */
-static inline int acpi_probe_gsi(void)
-{
-	return 0;
-}
 #endif /* CONFIG_ACPI */
 
 #define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3ee92f28a4b..07a63ce5811 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -876,29 +876,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
 extern int es7000_plat;
 #endif
 
-int __init acpi_probe_gsi(void)
-{
-	int idx;
-	int gsi;
-	int max_gsi = 0;
-
-	if (acpi_disabled)
-		return 0;
-
-	if (!acpi_ioapic)
-		return 0;
-
-	max_gsi = 0;
-	for (idx = 0; idx < nr_ioapics; idx++) {
-		gsi = mp_gsi_routing[idx].gsi_end;
-
-		if (gsi > max_gsi)
-			max_gsi = gsi;
-	}
-
-	return max_gsi + 1;
-}
-
 static void assign_to_mp_irq(struct mpc_intsrc *m,
 				    struct mpc_intsrc *mp_irq)
 {
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 0d35f46929d..9f3f6ca86da 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3850,22 +3850,11 @@ int __init io_apic_get_redir_entries (int ioapic)
 
 void __init probe_nr_irqs_gsi(void)
 {
-	int nr = 0;
+	int nr;
 
-	nr = acpi_probe_gsi();
-	if (nr > nr_irqs_gsi) {
+	nr = gsi_end + 1;
+	if (nr > nr_irqs_gsi)
 		nr_irqs_gsi = nr;
-	} else {
-		/* for acpi=off or acpi is not compiled in */
-		int idx;
-
-		nr = 0;
-		for (idx = 0; idx < nr_ioapics; idx++)
-			nr += io_apic_get_redir_entries(idx);
-
-		if (nr > nr_irqs_gsi)
-			nr_irqs_gsi = nr;
-	}
 
 	printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
 }
-- 
cgit v1.2.3


From 988856ee1623bd37e384105f7bb2b7fe44c009f6 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:15 -0700
Subject: x86, acpi/irq: Handle isa irqs that are not identity mapped to gsi's.

ACPI irq source overrides are allowed for the 16 isa irqs and are
allowed to map any gsi to any isa irq.  A few motherboards have been
seen to take advantage of this and put the isa irqs on the 2nd or
3rd ioapic.  This causes some problems, most notably the fact
that we can not use any gsi < 16.

To correct this move the gsis that are not isa irqs and have
a gsi number < 16 into the linux irq space just past gsi_end.
This is what the es7000 platform is doing today.  Moving only the
low 16 gsis above the rest of the gsi's only penalizes weird
platforms, leaving sane acpi implementations with a 1-1 mapping
of gsis and irqs.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-14-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/acpi/boot.c    | 57 +++++++++++++++++++++++++++++++++++++++---
 arch/x86/kernel/apic/io_apic.c |  8 ++++--
 2 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 07a63ce5811..325fbbab7f8 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -93,6 +93,53 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
 enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
 
 
+/*
+ * ISA irqs by default are the first 16 gsis but can be
+ * any gsi as specified by an interrupt source override.
+ */
+static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+static unsigned int gsi_to_irq(unsigned int gsi)
+{
+	unsigned int irq = gsi + NR_IRQS_LEGACY;
+	unsigned int i;
+
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
+		if (isa_irq_to_gsi[i] == gsi) {
+			return i;
+		}
+	}
+
+	/* Provide an identity mapping of gsi == irq
+	 * except on truly weird platforms that have
+	 * non isa irqs in the first 16 gsis.
+	 */
+	if (gsi >= NR_IRQS_LEGACY)
+		irq = gsi;
+	else
+		irq = gsi_end + 1 + gsi;
+
+	return irq;
+}
+
+static u32 irq_to_gsi(int irq)
+{
+	unsigned int gsi;
+
+	if (irq < NR_IRQS_LEGACY)
+		gsi = isa_irq_to_gsi[irq];
+	else if (irq <= gsi_end)
+		gsi = irq;
+	else if (irq <= (gsi_end + NR_IRQS_LEGACY))
+		gsi = irq - gsi_end;
+	else
+		gsi = 0xffffffff;
+
+	return gsi;
+}
+
 /*
  * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
  * to map the target physical address. The problem is that set_fixmap()
@@ -449,7 +496,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
 
 int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 {
-	*irq = gsi;
+	*irq = gsi_to_irq(gsi);
 
 #ifdef CONFIG_X86_IO_APIC
 	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
@@ -463,7 +510,7 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
 {
 	if (isa_irq >= 16)
 		return -1;
-	*gsi = isa_irq;
+	*gsi = irq_to_gsi(isa_irq);
 	return 0;
 }
 
@@ -491,7 +538,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 		plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
 	}
 #endif
-	irq = plat_gsi;
+	irq = gsi_to_irq(plat_gsi);
 
 	return irq;
 }
@@ -933,6 +980,8 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	mp_irq.dstirq = pin;	/* INTIN# */
 
 	save_mp_irq(&mp_irq);
+
+	isa_irq_to_gsi[bus_irq] = gsi;
 }
 
 void __init mp_config_acpi_legacy_irqs(void)
@@ -1086,7 +1135,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 	set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
 			     trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
 			     polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-	io_apic_set_pci_routing(dev, gsi, &irq_attr);
+	io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
 
 	return gsi;
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9f3f6ca86da..594827c3c61 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1037,7 +1037,11 @@ static int pin_2_irq(int idx, int apic, int pin)
                  */
 		if (ioapic_renumber_irq)
 			gsi = ioapic_renumber_irq(apic, gsi);
-		irq = gsi;
+
+		if (gsi >= NR_IRQS_LEGACY)
+			irq = gsi;
+		else
+			irq = gsi_end + 1 + gsi;
 	}
 
 #ifdef CONFIG_X86_32
@@ -3852,7 +3856,7 @@ void __init probe_nr_irqs_gsi(void)
 {
 	int nr;
 
-	nr = gsi_end + 1;
+	nr = gsi_end + 1 + NR_IRQS_LEGACY;
 	if (nr > nr_irqs_gsi)
 		nr_irqs_gsi = nr;
 
-- 
cgit v1.2.3


From 7b20bd5fb902088579af4e70f7f802b93181a628 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 30 Mar 2010 01:07:16 -0700
Subject: x86, irq: Kill io_apic_renumber_irq

Now that the generic irq layer is performing the exact same remapping as
io_apic_renumber_irq we can kill this weird  es7000 specific function.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <1269936436-7039-15-git-send-email-ebiederm@xmission.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/io_apic.h   |  1 -
 arch/x86/kernel/acpi/boot.c      |  5 -----
 arch/x86/kernel/apic/es7000_32.c | 19 -------------------
 arch/x86/kernel/apic/io_apic.c   |  6 ------
 4 files changed, 31 deletions(-)

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 37b0f2bb503..9da192a17f0 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -159,7 +159,6 @@ struct io_apic_irq_attr;
 extern int io_apic_set_pci_routing(struct device *dev, int irq,
 		 struct io_apic_irq_attr *irq_attr);
 void setup_IO_APIC_irq_extra(u32 gsi);
-extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
 extern void ioapic_insert_resources(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 325fbbab7f8..9a5ed58f09d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1117,11 +1117,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 
 	ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
 
-#ifdef CONFIG_X86_32
-	if (ioapic_renumber_irq)
-		gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
-
 	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
 		printk(KERN_ERR "Invalid reference to IOAPIC pin "
 		       "%d-%d\n", mp_ioapics[ioapic].apicid,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 03ba1b895f5..425e53a87fe 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -131,24 +131,6 @@ int					es7000_plat;
 
 static unsigned int			base;
 
-static int
-es7000_rename_gsi(int ioapic, int gsi)
-{
-	if (es7000_plat == ES7000_ZORRO)
-		return gsi;
-
-	if (!base) {
-		int i;
-		for (i = 0; i < nr_ioapics; i++)
-			base += nr_ioapic_registers[i];
-	}
-
-	if (!ioapic && (gsi < 16))
-		gsi += base;
-
-	return gsi;
-}
-
 static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
 {
 	unsigned long vect = 0, psaival = 0;
@@ -190,7 +172,6 @@ static void setup_unisys(void)
 		es7000_plat = ES7000_ZORRO;
 	else
 		es7000_plat = ES7000_CLASSIC;
-	ioapic_renumber_irq = es7000_rename_gsi;
 }
 
 /*
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 594827c3c61..d174d886654 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1016,7 +1016,6 @@ static inline int irq_trigger(int idx)
 	return MPBIOS_trigger(idx);
 }
 
-int (*ioapic_renumber_irq)(int ioapic, int irq);
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq;
@@ -1032,11 +1031,6 @@ static int pin_2_irq(int idx, int apic, int pin)
 		irq = mp_irqs[idx].srcbusirq;
 	} else {
 		u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
-		/*
-                 * For MPS mode, so far only needed by ES7000 platform
-                 */
-		if (ioapic_renumber_irq)
-			gsi = ioapic_renumber_irq(apic, gsi);
 
 		if (gsi >= NR_IRQS_LEGACY)
 			irq = gsi;
-- 
cgit v1.2.3


From 5dfc589a8467470226feccdc50f1b32713318e7b Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 4 May 2010 16:14:46 -0700
Subject: ceph: unregister bdi before kill_anon_super releases device name

Unregister and destroy the bdi in put_super, after mount is r/o, but before
put_anon_super releases the device name.

For symmetry, bdi_destroy in destroy_client (we bdi_init in create_client).

Only set s_bdi if bdi_register succeeds, since we use it to decide whether
to bdi_unregister.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/super.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f888cf487b7..110857ba926 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -47,10 +47,20 @@ const char *ceph_file_part(const char *s, int len)
  */
 static void ceph_put_super(struct super_block *s)
 {
-	struct ceph_client *cl = ceph_client(s);
+	struct ceph_client *client = ceph_sb_to_client(s);
 
 	dout("put_super\n");
-	ceph_mdsc_close_sessions(&cl->mdsc);
+	ceph_mdsc_close_sessions(&client->mdsc);
+
+	/*
+	 * ensure we release the bdi before put_anon_super releases
+	 * the device name.
+	 */
+	if (s->s_bdi == &client->backing_dev_info) {
+		bdi_unregister(&client->backing_dev_info);
+		s->s_bdi = NULL;
+	}
+
 	return;
 }
 
@@ -636,6 +646,8 @@ static void ceph_destroy_client(struct ceph_client *client)
 	destroy_workqueue(client->pg_inv_wq);
 	destroy_workqueue(client->trunc_wq);
 
+	bdi_destroy(&client->backing_dev_info);
+
 	if (client->msgr)
 		ceph_messenger_destroy(client->msgr);
 	mempool_destroy(client->wb_pagevec_pool);
@@ -876,14 +888,14 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
 {
 	int err;
 
-	sb->s_bdi = &client->backing_dev_info;
-
 	/* set ra_pages based on rsize mount option? */
 	if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
 		client->backing_dev_info.ra_pages =
 			(client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
 			>> PAGE_SHIFT;
 	err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
+	if (!err)
+		sb->s_bdi = &client->backing_dev_info;
 	return err;
 }
 
@@ -957,9 +969,6 @@ static void ceph_kill_sb(struct super_block *s)
 	dout("kill_sb %p\n", s);
 	ceph_mdsc_pre_umount(&client->mdsc);
 	kill_anon_super(s);    /* will call put_super after sb is r/o */
-	if (s->s_bdi == &client->backing_dev_info)
-		bdi_unregister(&client->backing_dev_info);
-	bdi_destroy(&client->backing_dev_info);
 	ceph_destroy_client(client);
 }
 
-- 
cgit v1.2.3


From 9890948d857c2120c234b0ca91a80416e8f747fb Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 4 May 2010 20:58:51 -0300
Subject: perf report: Make dso__calc_col_width agree with
 hist_entry__dso_snprintf
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The first was always using the ->long_name, while the later used
->short_name if verbose was not set, resulting in the dso column to be
much wider than needed most of the time.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/event.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 2477270c1d3..23d5dfd4ed7 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -626,8 +626,10 @@ static void dso__calc_col_width(struct dso *self)
 	if (!symbol_conf.col_width_list_str && !symbol_conf.field_sep &&
 	    (!symbol_conf.dso_list ||
 	     strlist__has_entry(symbol_conf.dso_list, self->name))) {
-		unsigned int slen = strlen(self->name);
-		if (slen > dsos__col_width)
+		u16 slen = self->short_name_len;
+		if (verbose)
+			slen = self->long_name_len;
+		if (dsos__col_width < slen)
 			dsos__col_width = slen;
 	}
 
-- 
cgit v1.2.3


From db620b1c2fb172346dc54eb62bba9b4a117d173b Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 4 May 2010 22:20:16 -0500
Subject: perf/record: simplify TRACE_INFO tracepoint check

Fix a couple of inefficiencies and redundancies related to
have_tracepoints() and its use when checking whether to write
TRACE_INFO.

First, there's no need to use get_tracepoints_path() in
have_tracepoints() - we really just want the part that checks whether
any attributes correspondo to tracepoints.

Second, we really don't care about raw_samples per se - tracepoints
are always raw_samples.  In any case, the have_tracepoints() check
should be sufficient to decide whether or not to write TRACE_INFO.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>,
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1273030770.6383.6.camel@tropicana>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c        | 11 +----------
 tools/perf/util/trace-event-info.c |  8 +++++++-
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 0ff67d1c475..d3981ac50e1 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -560,17 +560,8 @@ static int __cmd_record(int argc, const char **argv)
 			return err;
 	}
 
-	if (raw_samples && have_tracepoints(attrs, nr_counters)) {
+	if (have_tracepoints(attrs, nr_counters))
 		perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
-	} else {
-		for (i = 0; i < nr_counters; i++) {
-			if (attrs[i].sample_type & PERF_SAMPLE_RAW &&
-				attrs[i].type == PERF_TYPE_TRACEPOINT) {
-				perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
-				break;
-			}
-		}
-	}
 
 	atexit(atexit_header);
 
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index 0a1fb9d4f3b..b1572601286 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -489,7 +489,13 @@ get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events)
 
 bool have_tracepoints(struct perf_event_attr *pattrs, int nb_events)
 {
-	return get_tracepoints_path(pattrs, nb_events) ? true : false;
+	int i;
+
+	for (i = 0; i < nb_events; i++)
+		if (pattrs[i].type == PERF_TYPE_TRACEPOINT)
+			return true;
+
+	return false;
 }
 
 int read_tracing_data(int fd, struct perf_event_attr *pattrs, int nb_events)
-- 
cgit v1.2.3


From 9e32a3cb0661a6a30e0fd2b77ce85293805e6337 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 5 May 2010 11:20:05 -0300
Subject: perf list: Add explanation about raw hardware event descriptors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using explanation given by Ingo Molnar in the oprofile mailing list.

Suggested-by: Nick Black <dank@qemfd.net>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Nick Black <dank@qemfd.net>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-list.txt | 18 ++++++++++++++++++
 tools/perf/util/parse-events.c         |  2 +-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index 8290b942266..ad765e0b886 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -15,6 +15,24 @@ DESCRIPTION
 This command displays the symbolic event types which can be selected in the
 various perf commands with the -e option.
 
+RAW HARDWARE EVENT DESCRIPTOR
+-----------------------------
+Even when an event is not available in a symbolic form within perf right now,
+it can be encoded as <UMASK VALUE><EVENT NUM>, for instance, if the Intel docs
+describe an event as:
+
+  Event  Umask  Event Mask
+  Num.   Value  Mnemonic    Description                        Comment
+
+  A8H      01H  LSD.UOPS    Counts the number of micro-ops     Use cmask=1 and
+                            delivered by loop stream detector  invert to count
+                                                               cycles
+
+raw encoding of 0x1A8 can be used:
+
+ perf stat -e r1a8 -a sleep 1
+ perf record -e r1a8 ...
+
 OPTIONS
 -------
 None
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 600d3271425..bc8b7e61420 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -936,7 +936,7 @@ void print_events(void)
 
 	printf("\n");
 	printf("  %-42s [%s]\n",
-		"rNNN", event_type_descriptors[PERF_TYPE_RAW]);
+		"rNNN (NNN=<UMASK VALUE><EVENT NUM>)", event_type_descriptors[PERF_TYPE_RAW]);
 	printf("\n");
 
 	printf("  %-42s [%s]\n",
-- 
cgit v1.2.3


From 4778e0e8c64f683a71632dba1cff1f85f76f83c4 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 5 May 2010 11:23:27 -0300
Subject: perf tools: Fixup minor doc formatting issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-annotate.txt     | 2 +-
 tools/perf/Documentation/perf-bench.txt        | 2 +-
 tools/perf/Documentation/perf-diff.txt         | 2 +-
 tools/perf/Documentation/perf-kmem.txt         | 2 +-
 tools/perf/Documentation/perf-trace-python.txt | 2 +-
 tools/perf/Documentation/perf-trace.txt        | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index c9dcade0683..5164a655c39 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -1,5 +1,5 @@
 perf-annotate(1)
-==============
+================
 
 NAME
 ----
diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt
index 0181dddf6b6..a3dbadb26ef 100644
--- a/tools/perf/Documentation/perf-bench.txt
+++ b/tools/perf/Documentation/perf-bench.txt
@@ -1,5 +1,5 @@
 perf-bench(1)
-============
+=============
 
 NAME
 ----
diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt
index 8974e208cba..20d97d84ea1 100644
--- a/tools/perf/Documentation/perf-diff.txt
+++ b/tools/perf/Documentation/perf-diff.txt
@@ -1,5 +1,5 @@
 perf-diff(1)
-==============
+============
 
 NAME
 ----
diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt
index eac4d852e7c..a52fcde894c 100644
--- a/tools/perf/Documentation/perf-kmem.txt
+++ b/tools/perf/Documentation/perf-kmem.txt
@@ -1,5 +1,5 @@
 perf-kmem(1)
-==============
+============
 
 NAME
 ----
diff --git a/tools/perf/Documentation/perf-trace-python.txt b/tools/perf/Documentation/perf-trace-python.txt
index 16a86500dcf..864aac283a7 100644
--- a/tools/perf/Documentation/perf-trace-python.txt
+++ b/tools/perf/Documentation/perf-trace-python.txt
@@ -1,5 +1,5 @@
 perf-trace-python(1)
-==================
+====================
 
 NAME
 ----
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 8879299cd9d..122ec9dc485 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -1,5 +1,5 @@
 perf-trace(1)
-==============
+=============
 
 NAME
 ----
-- 
cgit v1.2.3


From 2e26ca7150a4f2ab3e69471dfc65f131e7dd7a05 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 5 May 2010 10:52:31 -0400
Subject: tracing: Fix tracepoint.h DECLARE_TRACE() to allow more than one
 header

When more than one header is included under CREATE_TRACE_POINTS
the DECLARE_TRACE() macro is not defined back to its original meaning
and the second include will fail to initialize the TRACE_EVENT()
and DECLARE_TRACE() correctly.

To fix this the tracepoint.h file moves the define of DECLARE_TRACE()
out of the #ifdef _LINUX_TRACEPOINT_H protection (just like the
define of the TRACE_EVENT()). This way the define_trace.h will undef
the DECLARE_TRACE() at the end and allow new headers to start
from scratch.

This patch also requires fixing the include/events/napi.h

It currently uses DECLARE_TRACE() and should be converted to a TRACE_EVENT()
format. But I'll leave that change to the authors of that file.
But since the napi.h file depends on using the CREATE_TRACE_POINTS
and does not define its own DEFINE_TRACE() it must use the define_trace.h
method instead.

Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/tracepoint.h   | 114 ++++++++++++++++++++++---------------------
 include/trace/define_trace.h |   5 ++
 include/trace/events/napi.h  |  10 +++-
 3 files changed, 72 insertions(+), 57 deletions(-)

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 78b4bd3be49..1d85f9a6a19 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -33,6 +33,65 @@ struct tracepoint {
 					 * Keep in sync with vmlinux.lds.h.
 					 */
 
+/*
+ * Connect a probe to a tracepoint.
+ * Internal API, should not be used directly.
+ */
+extern int tracepoint_probe_register(const char *name, void *probe);
+
+/*
+ * Disconnect a probe from a tracepoint.
+ * Internal API, should not be used directly.
+ */
+extern int tracepoint_probe_unregister(const char *name, void *probe);
+
+extern int tracepoint_probe_register_noupdate(const char *name, void *probe);
+extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe);
+extern void tracepoint_probe_update_all(void);
+
+struct tracepoint_iter {
+	struct module *module;
+	struct tracepoint *tracepoint;
+};
+
+extern void tracepoint_iter_start(struct tracepoint_iter *iter);
+extern void tracepoint_iter_next(struct tracepoint_iter *iter);
+extern void tracepoint_iter_stop(struct tracepoint_iter *iter);
+extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
+extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
+	struct tracepoint *begin, struct tracepoint *end);
+
+/*
+ * tracepoint_synchronize_unregister must be called between the last tracepoint
+ * probe unregistration and the end of module exit to make sure there is no
+ * caller executing a probe when it is freed.
+ */
+static inline void tracepoint_synchronize_unregister(void)
+{
+	synchronize_sched();
+}
+
+#define PARAMS(args...) args
+
+#ifdef CONFIG_TRACEPOINTS
+extern void tracepoint_update_probe_range(struct tracepoint *begin,
+	struct tracepoint *end);
+#else
+static inline void tracepoint_update_probe_range(struct tracepoint *begin,
+	struct tracepoint *end)
+{ }
+#endif /* CONFIG_TRACEPOINTS */
+
+#endif /* _LINUX_TRACEPOINT_H */
+
+/*
+ * Note: we keep the TRACE_EVENT and DECLARE_TRACE outside the include
+ *  file ifdef protection.
+ *  This is due to the way trace events work. If a file includes two
+ *  trace event headers under one "CREATE_TRACE_POINTS" the first include
+ *  will override the TRACE_EVENT and break the second include.
+ */
+
 #ifndef DECLARE_TRACE
 
 #define TP_PROTO(args...)	args
@@ -96,9 +155,6 @@ struct tracepoint {
 #define EXPORT_TRACEPOINT_SYMBOL(name)					\
 	EXPORT_SYMBOL(__tracepoint_##name)
 
-extern void tracepoint_update_probe_range(struct tracepoint *begin,
-	struct tracepoint *end);
-
 #else /* !CONFIG_TRACEPOINTS */
 #define DECLARE_TRACE(name, proto, args)				\
 	static inline void _do_trace_##name(struct tracepoint *tp, proto) \
@@ -119,61 +175,9 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin,
 #define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
 #define EXPORT_TRACEPOINT_SYMBOL(name)
 
-static inline void tracepoint_update_probe_range(struct tracepoint *begin,
-	struct tracepoint *end)
-{ }
 #endif /* CONFIG_TRACEPOINTS */
 #endif /* DECLARE_TRACE */
 
-/*
- * Connect a probe to a tracepoint.
- * Internal API, should not be used directly.
- */
-extern int tracepoint_probe_register(const char *name, void *probe);
-
-/*
- * Disconnect a probe from a tracepoint.
- * Internal API, should not be used directly.
- */
-extern int tracepoint_probe_unregister(const char *name, void *probe);
-
-extern int tracepoint_probe_register_noupdate(const char *name, void *probe);
-extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe);
-extern void tracepoint_probe_update_all(void);
-
-struct tracepoint_iter {
-	struct module *module;
-	struct tracepoint *tracepoint;
-};
-
-extern void tracepoint_iter_start(struct tracepoint_iter *iter);
-extern void tracepoint_iter_next(struct tracepoint_iter *iter);
-extern void tracepoint_iter_stop(struct tracepoint_iter *iter);
-extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
-extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
-	struct tracepoint *begin, struct tracepoint *end);
-
-/*
- * tracepoint_synchronize_unregister must be called between the last tracepoint
- * probe unregistration and the end of module exit to make sure there is no
- * caller executing a probe when it is freed.
- */
-static inline void tracepoint_synchronize_unregister(void)
-{
-	synchronize_sched();
-}
-
-#define PARAMS(args...) args
-
-#endif /* _LINUX_TRACEPOINT_H */
-
-/*
- * Note: we keep the TRACE_EVENT outside the include file ifdef protection.
- *  This is due to the way trace events work. If a file includes two
- *  trace event headers under one "CREATE_TRACE_POINTS" the first include
- *  will override the TRACE_EVENT and break the second include.
- */
-
 #ifndef TRACE_EVENT
 /*
  * For use with the TRACE_EVENT macro:
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 5acfb1eb4df..1dfab540151 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -65,6 +65,10 @@
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
+/* Make all open coded DECLARE_TRACE nops */
+#undef DECLARE_TRACE
+#define DECLARE_TRACE(name, proto, args)
+
 #ifdef CONFIG_EVENT_TRACING
 #include <trace/ftrace.h>
 #endif
@@ -75,6 +79,7 @@
 #undef DEFINE_EVENT
 #undef DEFINE_EVENT_PRINT
 #undef TRACE_HEADER_MULTI_READ
+#undef DECLARE_TRACE
 
 /* Only undef what we defined in this file */
 #ifdef UNDEF_TRACE_INCLUDE_FILE
diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h
index a8989c4547e..188deca2f3c 100644
--- a/include/trace/events/napi.h
+++ b/include/trace/events/napi.h
@@ -1,4 +1,7 @@
-#ifndef _TRACE_NAPI_H_
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM napi
+
+#if !defined(_TRACE_NAPI_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_NAPI_H_
 
 #include <linux/netdevice.h>
@@ -8,4 +11,7 @@ DECLARE_TRACE(napi_poll,
 	TP_PROTO(struct napi_struct *napi),
 	TP_ARGS(napi));
 
-#endif
+#endif /* _TRACE_NAPI_H_ */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From 668eb65f092902eb7dd526af73d4a7f025a94612 Mon Sep 17 00:00:00 2001
From: Thiago Farina <tfransosi@gmail.com>
Date: Sun, 24 Jan 2010 11:03:50 -0500
Subject: tracing: Fix "integer as NULL pointer" warning.

kernel/trace/trace_output.c:256:24: warning: Using plain integer as NULL pointer

Signed-off-by: Thiago Farina <tfransosi@gmail.com>
LKML-Reference: <1264349038-1766-3-git-send-email-tfransosi@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8e46b3323cd..2404c129a8c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -253,7 +253,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
 	void *ret;
 
 	if (s->full)
-		return 0;
+		return NULL;
 
 	if (len > ((PAGE_SIZE - 1) - s->len)) {
 		s->full = 1;
-- 
cgit v1.2.3


From bba0b5c2c27e6dadc93c476f8a4b49d108b66292 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Wed, 5 May 2010 22:32:52 +0200
Subject: logfs: fix compile failure

When CONFIG_BLOCK is not enabled:

fs/logfs/super.c:142: error: implicit declaration of function 'bdev_get_queue'
fs/logfs/super.c:142: error: invalid type argument of '->' (have 'int')

Found by Randy Dunlap <randy.dunlap@oracle.com>

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/super.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index edd99487f93..2b269385196 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -138,10 +138,14 @@ static int logfs_sb_set(struct super_block *sb, void *_super)
 	sb->s_fs_info = super;
 	sb->s_mtd = super->s_mtd;
 	sb->s_bdev = super->s_bdev;
+#ifdef CONFIG_BLOCK
 	if (sb->s_bdev)
 		sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
+#endif
+#ifdef CONFIG_MTD
 	if (sb->s_mtd)
 		sb->s_bdi = sb->s_mtd->backing_dev_info;
+#endif
 	return 0;
 }
 
-- 
cgit v1.2.3


From c0c79c31c9d5fcc19812c6c35f842baf50ee788a Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Wed, 5 May 2010 22:33:36 +0200
Subject: logfs: fix sync

Rather self-explanatory.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/file.c  | 3 +--
 fs/logfs/inode.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index bf9b1cf953a..1639e9235f5 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -224,8 +224,7 @@ int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	struct super_block *sb = dentry->d_inode->i_sb;
 	struct logfs_super *super = logfs_super(sb);
 
-	/* FIXME: write anchor */
-	super->s_devops->sync(sb);
+	logfs_write_anchor(sb);
 	return 0;
 }
 
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index af78b6e8289..755a92e8daa 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -387,8 +387,7 @@ static void logfs_init_once(void *_li)
 
 static int logfs_sync_fs(struct super_block *sb, int wait)
 {
-	/* FIXME: write anchor */
-	logfs_super(sb)->s_devops->sync(sb);
+	logfs_write_anchor(sb);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 198b5682781b97251afd9025dbf559a77969abdd Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 24 Apr 2010 07:57:48 -0400
Subject: cifs: break negotiate protocol calls out of cifs_setup_session

So that we can reasonably set up the secType based on both the
NegotiateProtocol response and the parsed mount options.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  4 ++-
 fs/cifs/cifssmb.c   |  3 ++-
 fs/cifs/connect.c   | 78 ++++++++++++++++++++++++++++++++---------------------
 3 files changed, 52 insertions(+), 33 deletions(-)

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8e9214275e4..6fa808ec7e3 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -138,7 +138,9 @@ extern void cifs_dfs_release_automount_timer(void);
 void cifs_proc_init(void);
 void cifs_proc_clean(void);
 
-extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
+extern int cifs_negotiate_protocol(unsigned int xid,
+				  struct cifsSesInfo *ses);
+extern int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
 			struct nls_table *nls_info);
 extern int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses);
 
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1372253a060..30742d8eef1 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -172,7 +172,8 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
 	 * reconnect the same SMB session
 	 */
 	mutex_lock(&ses->session_mutex);
-	if (ses->need_reconnect)
+	rc = cifs_negotiate_protocol(0, ses);
+	if (rc == 0 && ses->need_reconnect)
 		rc = cifs_setup_session(0, ses, nls_codepage);
 
 	/* do we need to reconnect tcon? */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 9123c23bd1d..2208f06e4c4 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1651,6 +1651,14 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 		cifs_put_tcp_session(server);
 
 		mutex_lock(&ses->session_mutex);
+		rc = cifs_negotiate_protocol(xid, ses);
+		if (rc) {
+			mutex_unlock(&ses->session_mutex);
+			/* problem -- put our ses reference */
+			cifs_put_smb_ses(ses);
+			FreeXid(xid);
+			return ERR_PTR(rc);
+		}
 		if (ses->need_reconnect) {
 			cFYI(1, "Session needs reconnect");
 			rc = cifs_setup_session(xid, ses,
@@ -1702,7 +1710,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 	ses->overrideSecFlg = volume_info->secFlg;
 
 	mutex_lock(&ses->session_mutex);
-	rc = cifs_setup_session(xid, ses, volume_info->local_nls);
+	rc = cifs_negotiate_protocol(xid, ses);
+	if (!rc)
+		rc = cifs_setup_session(xid, ses, volume_info->local_nls);
 	mutex_unlock(&ses->session_mutex);
 	if (rc)
 		goto get_ses_fail;
@@ -2874,55 +2884,61 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb)
 	return rc;
 }
 
-int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
-					   struct nls_table *nls_info)
+int cifs_negotiate_protocol(unsigned int xid, struct cifsSesInfo *ses)
 {
 	int rc = 0;
-	struct TCP_Server_Info *server = pSesInfo->server;
-
-	/* what if server changes its buffer size after dropping the session? */
-	if (server->maxBuf == 0) /* no need to send on reconnect */ {
-		rc = CIFSSMBNegotiate(xid, pSesInfo);
-		if (rc == -EAGAIN) {
-			/* retry only once on 1st time connection */
-			rc = CIFSSMBNegotiate(xid, pSesInfo);
-			if (rc == -EAGAIN)
-				rc = -EHOSTDOWN;
-		}
-		if (rc == 0) {
-			spin_lock(&GlobalMid_Lock);
-			if (server->tcpStatus != CifsExiting)
-				server->tcpStatus = CifsGood;
-			else
-				rc = -EHOSTDOWN;
-			spin_unlock(&GlobalMid_Lock);
+	struct TCP_Server_Info *server = ses->server;
 
-		}
+	/* only send once per connect */
+	if (server->maxBuf != 0)
+		return 0;
+
+	rc = CIFSSMBNegotiate(xid, ses);
+	if (rc == -EAGAIN) {
+		/* retry only once on 1st time connection */
+		rc = CIFSSMBNegotiate(xid, ses);
+		if (rc == -EAGAIN)
+			rc = -EHOSTDOWN;
 	}
+	if (rc == 0) {
+		spin_lock(&GlobalMid_Lock);
+		if (server->tcpStatus != CifsExiting)
+			server->tcpStatus = CifsGood;
+		else
+			rc = -EHOSTDOWN;
+		spin_unlock(&GlobalMid_Lock);
 
-	if (rc)
-		goto ss_err_exit;
+	}
+
+	return rc;
+}
+
+
+int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses,
+			struct nls_table *nls_info)
+{
+	int rc = 0;
+	struct TCP_Server_Info *server = ses->server;
 
-	pSesInfo->flags = 0;
-	pSesInfo->capabilities = server->capabilities;
+	ses->flags = 0;
+	ses->capabilities = server->capabilities;
 	if (linuxExtEnabled == 0)
-		pSesInfo->capabilities &= (~CAP_UNIX);
+		ses->capabilities &= (~CAP_UNIX);
 
 	cFYI(1, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
 		 server->secMode, server->capabilities, server->timeAdj);
 
-	rc = CIFS_SessSetup(xid, pSesInfo, nls_info);
+	rc = CIFS_SessSetup(xid, ses, nls_info);
 	if (rc) {
 		cERROR(1, "Send error in SessSetup = %d", rc);
 	} else {
 		cFYI(1, "CIFS Session Established successfully");
 		spin_lock(&GlobalMid_Lock);
-		pSesInfo->status = CifsGood;
-		pSesInfo->need_reconnect = false;
+		ses->status = CifsGood;
+		ses->need_reconnect = false;
 		spin_unlock(&GlobalMid_Lock);
 	}
 
-ss_err_exit:
 	return rc;
 }
 
-- 
cgit v1.2.3


From 26efa0bac9dc3587ee8892c06642735bcded59e5 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sat, 24 Apr 2010 07:57:49 -0400
Subject: cifs: have decode_negTokenInit set flags in server struct

...rather than the secType. This allows us to get rid of the MSKerberos
securityEnum. The client just makes a decision at upcall time.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/asn1.c        | 30 ++++++++----------------------
 fs/cifs/cifs_spnego.c |  4 ++--
 fs/cifs/cifsglob.h    |  6 +++++-
 fs/cifs/cifsproto.h   |  2 +-
 fs/cifs/cifssmb.c     | 12 +++++++++---
 fs/cifs/sess.c        |  2 +-
 6 files changed, 26 insertions(+), 30 deletions(-)

diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index 6d555c05dba..cfd1ce34e0b 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -492,17 +492,13 @@ compare_oid(unsigned long *oid1, unsigned int oid1len,
 
 int
 decode_negTokenInit(unsigned char *security_blob, int length,
-		    enum securityEnum *secType)
+		    struct TCP_Server_Info *server)
 {
 	struct asn1_ctx ctx;
 	unsigned char *end;
 	unsigned char *sequence_end;
 	unsigned long *oid = NULL;
 	unsigned int cls, con, tag, oidlen, rc;
-	bool use_ntlmssp = false;
-	bool use_kerberos = false;
-	bool use_kerberosu2u = false;
-	bool use_mskerberos = false;
 
 	/* cifs_dump_mem(" Received SecBlob ", security_blob, length); */
 
@@ -599,20 +595,17 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 					*(oid + 1), *(oid + 2), *(oid + 3));
 
 				if (compare_oid(oid, oidlen, MSKRB5_OID,
-						MSKRB5_OID_LEN) &&
-						!use_mskerberos)
-					use_mskerberos = true;
+						MSKRB5_OID_LEN))
+					server->sec_mskerberos = true;
 				else if (compare_oid(oid, oidlen, KRB5U2U_OID,
-						     KRB5U2U_OID_LEN) &&
-						     !use_kerberosu2u)
-					use_kerberosu2u = true;
+						     KRB5U2U_OID_LEN))
+					server->sec_kerberosu2u = true;
 				else if (compare_oid(oid, oidlen, KRB5_OID,
-						     KRB5_OID_LEN) &&
-						     !use_kerberos)
-					use_kerberos = true;
+						     KRB5_OID_LEN))
+					server->sec_kerberos = true;
 				else if (compare_oid(oid, oidlen, NTLMSSP_OID,
 						     NTLMSSP_OID_LEN))
-					use_ntlmssp = true;
+					server->sec_ntlmssp = true;
 
 				kfree(oid);
 			}
@@ -669,12 +662,5 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 	cFYI(1, "Need to call asn1_octets_decode() function for %s",
 		ctx.pointer);	/* is this UTF-8 or ASCII? */
 decode_negtoken_exit:
-	if (use_kerberos)
-		*secType = Kerberos;
-	else if (use_mskerberos)
-		*secType = MSKerberos;
-	else if (use_ntlmssp)
-		*secType = RawNTLMSSP;
-
 	return 1;
 }
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index c53587b8330..379bd7d9c05 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -133,9 +133,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	dp = description + strlen(description);
 
 	/* for now, only sec=krb5 and sec=mskrb5 are valid */
-	if (server->secType == Kerberos)
+	if (server->sec_kerberos)
 		sprintf(dp, ";sec=krb5");
-	else if (server->secType == MSKerberos)
+	else if (server->sec_mskerberos)
 		sprintf(dp, ";sec=mskrb5");
 	else
 		goto out;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c412568b4a1..4a99487400f 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -87,7 +87,6 @@ enum securityEnum {
 	RawNTLMSSP,		/* NTLMSSP without SPNEGO, NTLMv2 hash */
 /*	NTLMSSP, */ /* can use rawNTLMSSP instead of NTLMSSP via SPNEGO */
 	Kerberos,		/* Kerberos via SPNEGO */
-	MSKerberos,		/* MS Kerberos via SPNEGO */
 };
 
 enum protocolEnum {
@@ -186,6 +185,11 @@ struct TCP_Server_Info {
 	char ntlmv2_hash[16];
 	unsigned long lstrp; /* when we got last response from this server */
 	u16 dialect; /* dialect index that server chose */
+	/* extended security flavors that server supports */
+	bool	sec_kerberos;		/* supports plain Kerberos */
+	bool	sec_mskerberos;		/* supports legacy MS Kerberos */
+	bool	sec_kerberosu2u;	/* supports U2U Kerberos */
+	bool	sec_ntlmssp;		/* supports NTLMSSP */
 };
 
 /*
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 6fa808ec7e3..2e07da9a46f 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -85,7 +85,7 @@ extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *);
 extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
-			enum securityEnum *secType);
+			struct TCP_Server_Info *server);
 extern int cifs_convert_address(char *src, void *dst);
 extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
 extern void header_assemble(struct smb_hdr *, char /* command */ ,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 30742d8eef1..c65c3419dd3 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -597,13 +597,19 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			server->secType = RawNTLMSSP;
 		} else {
 			rc = decode_negTokenInit(pSMBr->u.extended_response.
-						 SecurityBlob,
-						 count - 16,
-						 &server->secType);
+						 SecurityBlob, count - 16,
+						 server);
 			if (rc == 1)
 				rc = 0;
 			else
 				rc = -EINVAL;
+
+			if (server->sec_kerberos || server->sec_mskerberos)
+				server->secType = Kerberos;
+			else if (server->sec_ntlmssp)
+				server->secType = RawNTLMSSP;
+			else
+				rc = -EOPNOTSUPP;
 		}
 	} else
 		server->capabilities &= ~CAP_EXTENDED_SECURITY;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 84b92dfaf84..7707389bdf2 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -751,7 +751,7 @@ ssetup_ntlmssp_authenticate:
 			unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
 		} else
 			ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
-	} else if (type == Kerberos || type == MSKerberos) {
+	} else if (type == Kerberos) {
 #ifdef CONFIG_CIFS_UPCALL
 		struct cifs_spnego_msg *msg;
 		spnego_key = cifs_get_spnego_key(ses);
-- 
cgit v1.2.3


From bdfae149c5b7430b9a26371f14b2d385fd3a4389 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 6 May 2010 00:38:16 +0000
Subject: [CIFS] Remove unused cifs_oplock_cachep

CC: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c    | 13 -------------
 fs/cifs/transport.c |  1 -
 2 files changed, 14 deletions(-)

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 80a93562b47..09842d3f7e1 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -86,8 +86,6 @@ extern mempool_t *cifs_sm_req_poolp;
 extern mempool_t *cifs_req_poolp;
 extern mempool_t *cifs_mid_poolp;
 
-extern struct kmem_cache *cifs_oplock_cachep;
-
 static int
 cifs_read_super(struct super_block *sb, void *data,
 		const char *devname, int silent)
@@ -289,7 +287,6 @@ static int cifs_permission(struct inode *inode, int mask)
 static struct kmem_cache *cifs_inode_cachep;
 static struct kmem_cache *cifs_req_cachep;
 static struct kmem_cache *cifs_mid_cachep;
-struct kmem_cache *cifs_oplock_cachep;
 static struct kmem_cache *cifs_sm_req_cachep;
 mempool_t *cifs_sm_req_poolp;
 mempool_t *cifs_req_poolp;
@@ -939,15 +936,6 @@ cifs_init_mids(void)
 		return -ENOMEM;
 	}
 
-	cifs_oplock_cachep = kmem_cache_create("cifs_oplock_structs",
-					sizeof(struct oplock_q_entry), 0,
-					SLAB_HWCACHE_ALIGN, NULL);
-	if (cifs_oplock_cachep == NULL) {
-		mempool_destroy(cifs_mid_poolp);
-		kmem_cache_destroy(cifs_mid_cachep);
-		return -ENOMEM;
-	}
-
 	return 0;
 }
 
@@ -956,7 +944,6 @@ cifs_destroy_mids(void)
 {
 	mempool_destroy(cifs_mid_poolp);
 	kmem_cache_destroy(cifs_mid_cachep);
-	kmem_cache_destroy(cifs_oplock_cachep);
 }
 
 static int __init
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 28f563cef5d..82f78c4d697 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -35,7 +35,6 @@
 #include "cifs_debug.h"
 
 extern mempool_t *cifs_mid_poolp;
-extern struct kmem_cache *cifs_oplock_cachep;
 
 static struct mid_q_entry *
 AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
-- 
cgit v1.2.3


From 54ad023ba8108d0163acc931ed4b5e4a8a3a7327 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 5 May 2010 21:30:35 -0700
Subject: ceph: don't use writeback_control in writepages completion

The ->writepages writeback_control is not still valid in the writepages
completion.  We were touching it solely to adjust pages_skipped when there
was a writeback error (EIO, ENOSPC, EPERM due to bad osd credentials),
causing an oops in the writeback code shortly thereafter.  Updating
pages_skipped on error isn't correct anyway, so let's just rip out this
(clearly broken) code to pass the wbc to the completion.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c       | 6 ------
 fs/ceph/osd_client.h | 1 -
 2 files changed, 7 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 4b42c2bb603..a9005d862ed 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -504,7 +504,6 @@ static void writepages_finish(struct ceph_osd_request *req,
 	int i;
 	struct ceph_snap_context *snapc = req->r_snapc;
 	struct address_space *mapping = inode->i_mapping;
-	struct writeback_control *wbc = req->r_wbc;
 	__s32 rc = -EIO;
 	u64 bytes = 0;
 	struct ceph_client *client = ceph_inode_to_client(inode);
@@ -546,10 +545,6 @@ static void writepages_finish(struct ceph_osd_request *req,
 			clear_bdi_congested(&client->backing_dev_info,
 					    BLK_RW_ASYNC);
 
-		if (i >= wrote) {
-			dout("inode %p skipping page %p\n", inode, page);
-			wbc->pages_skipped++;
-		}
 		ceph_put_snap_context((void *)page->private);
 		page->private = 0;
 		ClearPagePrivate(page);
@@ -799,7 +794,6 @@ get_more_pages:
 				alloc_page_vec(client, req);
 				req->r_callback = writepages_finish;
 				req->r_inode = inode;
-				req->r_wbc = wbc;
 			}
 
 			/* note position of first page in pvec */
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index b0759911e7c..c5191d62f24 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -66,7 +66,6 @@ struct ceph_osd_request {
 	struct list_head  r_unsafe_item;
 
 	struct inode *r_inode;         	      /* for use by callbacks */
-	struct writeback_control *r_wbc;      /* ditto */
 
 	char              r_oid[40];          /* object name */
 	int               r_oid_len;
-- 
cgit v1.2.3


From 4f47b4c9f0b711bf84adb8c27774ae80d346b628 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 5 May 2010 13:22:25 -0700
Subject: x86, acpi/irq: Define gsi_end when X86_IO_APIC is undefined

My recent changes introducing a global gsi_end variable
failed to take into account the case of using acpi on a system
not built to support IO_APICs, causing the build to fail.

Define gsi_end to 15 when CONFIG_X86_IO_APIC is not set to avoid
compile errors.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <m1tyqm14la.fsf_-_@fess.ebiederm.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/io_apic.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9da192a17f0..63cb4096c3d 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -197,6 +197,7 @@ static const int timer_through_8259 = 0;
 static inline void ioapic_init_mappings(void)	{ }
 static inline void ioapic_insert_resources(void) { }
 static inline void probe_nr_irqs_gsi(void)	{ }
+#define gsi_end (NR_IRQS_LEGACY - 1)
 static inline int mp_find_ioapic(u32 gsi) { return 0; }
 
 struct io_apic_irq_attr;
-- 
cgit v1.2.3


From b0c4d952a158a6a2547672cf4fc9d55e415410de Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 6 May 2010 02:24:34 -0700
Subject: x86: Fix fake apicid to node mapping for numa emulation

With NUMA emulation, it's possible for a single cpu to be bound
to multiple nodes since more than one may have affinity if
allocated on a physical node that is local to the cpu.

APIC ids must therefore be mapped to the lowest node ids to
maintain generic kernel use of functions such as cpu_to_node()
that determine device affinity.  For example, if a device has
proximity to physical node 1, for instance, and a cpu happens to
be mapped to a higher emulated node id 8, the proximity may not
be correctly determined by comparison in generic code even
though the cpu may be truly local and allocated on physical node 1.

When this happens, the true topology of the machine isn't
accurately represented in the emulated environment; although
this isn't critical to the system's uptime, any generic code
that is NUMA aware benefits from the physical topology being
accurately represented.

This can affect any system that maps multiple APIC ids to a
single node and is booted with numa=fake=N where N is greater
than the number of physical nodes.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <alpine.DEB.2.00.1005060224140.19473@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/srat_64.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 28c68762648..38512d0c474 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -461,7 +461,8 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 		 * node, it must now point to the fake node ID.
 		 */
 		for (j = 0; j < MAX_LOCAL_APIC; j++)
-			if (apicid_to_node[j] == nid)
+			if (apicid_to_node[j] == nid &&
+			    fake_apicid_to_node[j] == NUMA_NO_NODE)
 				fake_apicid_to_node[j] = i;
 	}
 	for (i = 0; i < num_nodes; i++)
-- 
cgit v1.2.3


From 3de668ee8d5b1e08da3200f926ff5a28aeb99bc2 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 3 May 2010 15:00:25 +0200
Subject: oprofile/x86: notify cpus only when daemon is running

This patch moves the cpu notifier registration from nmi_init() to
nmi_setup(). The corresponding unregistration function is now in
nmi_shutdown(). Thus, the hotplug code is only active, if the oprofile
daemon is running.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 7de0572b0a5..2a086726cad 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -504,6 +504,7 @@ static int nmi_setup(void)
 		goto fail;
 
 	get_online_cpus();
+	register_cpu_notifier(&oprofile_cpu_nb);
 	on_each_cpu(nmi_cpu_setup, NULL, 1);
 	nmi_enabled = 1;
 	put_online_cpus();
@@ -519,6 +520,7 @@ static void nmi_shutdown(void)
 	struct op_msrs *msrs;
 
 	get_online_cpus();
+	unregister_cpu_notifier(&oprofile_cpu_nb);
 	on_each_cpu(nmi_cpu_shutdown, NULL, 1);
 	nmi_enabled = 0;
 	ctr_running = 0;
@@ -739,12 +741,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 		return -ENODEV;
 	}
 
-	get_online_cpus();
-	register_cpu_notifier(&oprofile_cpu_nb);
-	nmi_enabled = 0;
-	ctr_running = 0;
-	put_online_cpus();
-
 	/* default values, can be overwritten by model */
 	ops->create_files	= nmi_create_files;
 	ops->setup		= nmi_setup;
@@ -771,14 +767,8 @@ int __init op_nmi_init(struct oprofile_operations *ops)
 
 void op_nmi_exit(void)
 {
-	if (using_nmi) {
+	if (using_nmi)
 		exit_sysfs();
-		get_online_cpus();
-		unregister_cpu_notifier(&oprofile_cpu_nb);
-		nmi_enabled = 0;
-		ctr_running = 0;
-		put_online_cpus();
-	}
 	if (model->exit)
 		model->exit();
 }
-- 
cgit v1.2.3


From bae663bc635e2726c7c5228dbf0f2051e16d1c81 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 5 May 2010 17:47:17 +0200
Subject: oprofile/x86: make AMD IBS hotplug capable

Current IBS code is not hotplug capable. An offline cpu might not be
initialized or deinitialized properly. This patch fixes this by
removing on_each_cpu() functions. The IBS init/deinit code is executed
in the per-cpu functions model->setup_ctrs() and model->cpu_down()
which are also called by hotplug notifiers. model->cpu_down() replaces
model->exit() that became obsolete.

Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c      |  4 +--
 arch/x86/oprofile/op_model_amd.c | 54 ++++++++++++----------------------------
 arch/x86/oprofile/op_x86_model.h |  2 +-
 3 files changed, 19 insertions(+), 41 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 2a086726cad..b28d2f1253b 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -397,6 +397,8 @@ static void nmi_cpu_shutdown(void *dummy)
 	apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
 	apic_write(APIC_LVTERR, v);
 	nmi_cpu_restore_registers(msrs);
+	if (model->cpu_down)
+		model->cpu_down();
 }
 
 static void nmi_cpu_up(void *dummy)
@@ -769,6 +771,4 @@ void op_nmi_exit(void)
 {
 	if (using_nmi)
 		exit_sysfs();
-	if (model->exit)
-		model->exit();
 }
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 384c5241048..b67a6b5aa8d 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -374,6 +374,15 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
 		val |= op_x86_get_ctrl(model, &counter_config[virt]);
 		wrmsrl(msrs->controls[i].addr, val);
 	}
+
+	if (ibs_caps)
+		setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
+}
+
+static void op_amd_cpu_shutdown(void)
+{
+	if (ibs_caps)
+		setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
 }
 
 static int op_amd_check_ctrs(struct pt_regs * const regs,
@@ -436,28 +445,16 @@ static void op_amd_stop(struct op_msrs const * const msrs)
 	op_amd_stop_ibs();
 }
 
-static u8 ibs_eilvt_off;
-
-static inline void apic_init_ibs_nmi_per_cpu(void *arg)
-{
-	ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
-}
-
-static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
-{
-	setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
-}
-
-static int init_ibs_nmi(void)
+static int __init_ibs_nmi(void)
 {
 #define IBSCTL_LVTOFFSETVAL		(1 << 8)
 #define IBSCTL				0x1cc
 	struct pci_dev *cpu_cfg;
 	int nodes;
 	u32 value = 0;
+	u8 ibs_eilvt_off;
 
-	/* per CPU setup */
-	on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1);
+	ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
 
 	nodes = 0;
 	cpu_cfg = NULL;
@@ -487,21 +484,15 @@ static int init_ibs_nmi(void)
 	return 0;
 }
 
-/* uninitialize the APIC for the IBS interrupts if needed */
-static void clear_ibs_nmi(void)
-{
-	on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
-}
-
 /* initialize the APIC for the IBS interrupts if available */
-static void ibs_init(void)
+static void init_ibs(void)
 {
 	ibs_caps = get_ibs_caps();
 
 	if (!ibs_caps)
 		return;
 
-	if (init_ibs_nmi()) {
+	if (__init_ibs_nmi()) {
 		ibs_caps = 0;
 		return;
 	}
@@ -510,14 +501,6 @@ static void ibs_init(void)
 	       (unsigned)ibs_caps);
 }
 
-static void ibs_exit(void)
-{
-	if (!ibs_caps)
-		return;
-
-	clear_ibs_nmi();
-}
-
 static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
 
 static int setup_ibs_files(struct super_block *sb, struct dentry *root)
@@ -566,17 +549,12 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
 
 static int op_amd_init(struct oprofile_operations *ops)
 {
-	ibs_init();
+	init_ibs();
 	create_arch_files = ops->create_files;
 	ops->create_files = setup_ibs_files;
 	return 0;
 }
 
-static void op_amd_exit(void)
-{
-	ibs_exit();
-}
-
 struct op_x86_model_spec op_amd_spec = {
 	.num_counters		= NUM_COUNTERS,
 	.num_controls		= NUM_COUNTERS,
@@ -584,9 +562,9 @@ struct op_x86_model_spec op_amd_spec = {
 	.reserved		= MSR_AMD_EVENTSEL_RESERVED,
 	.event_mask		= OP_EVENT_MASK,
 	.init			= op_amd_init,
-	.exit			= op_amd_exit,
 	.fill_in_addresses	= &op_amd_fill_in_addresses,
 	.setup_ctrs		= &op_amd_setup_ctrs,
+	.cpu_down		= &op_amd_cpu_shutdown,
 	.check_ctrs		= &op_amd_check_ctrs,
 	.start			= &op_amd_start,
 	.stop			= &op_amd_stop,
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 551401398fb..89017fa1fd6 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -40,10 +40,10 @@ struct op_x86_model_spec {
 	u64		reserved;
 	u16		event_mask;
 	int		(*init)(struct oprofile_operations *ops);
-	void		(*exit)(void);
 	int		(*fill_in_addresses)(struct op_msrs * const msrs);
 	void		(*setup_ctrs)(struct op_x86_model_spec const *model,
 				      struct op_msrs const * const msrs);
+	void		(*cpu_down)(void);
 	int		(*check_ctrs)(struct pt_regs * const regs,
 				      struct op_msrs const * const msrs);
 	void		(*start)(struct op_msrs const * const msrs);
-- 
cgit v1.2.3


From 1142d810298e694754498dbb4983fcb6cb7fd884 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 6 May 2010 18:49:20 +0200
Subject: cpu_stop: implement stop_cpu[s]()

Implement a simplistic per-cpu maximum priority cpu monopolization
mechanism.  A non-sleeping callback can be scheduled to run on one or
multiple cpus with maximum priority monopolozing those cpus.  This is
primarily to replace and unify RT workqueue usage in stop_machine and
scheduler migration_thread which currently is serving multiple
purposes.

Four functions are provided - stop_one_cpu(), stop_one_cpu_nowait(),
stop_cpus() and try_stop_cpus().

This is to allow clean sharing of resources among stop_cpu and all the
migration thread users.  One stopper thread per cpu is created which
is currently named "stopper/CPU".  This will eventually replace the
migration thread and take on its name.

* This facility was originally named cpuhog and lived in separate
  files but Peter Zijlstra nacked the name and thus got renamed to
  cpu_stop and moved into stop_machine.c.

* Better reporting of preemption leak as per Peter's suggestion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
---
 include/linux/stop_machine.h |  39 ++++-
 kernel/stop_machine.c        | 372 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 402 insertions(+), 9 deletions(-)

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index baba3a23a81..efcbd6c3794 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -1,15 +1,46 @@
 #ifndef _LINUX_STOP_MACHINE
 #define _LINUX_STOP_MACHINE
-/* "Bogolock": stop the entire machine, disable interrupts.  This is a
-   very heavy lock, which is equivalent to grabbing every spinlock
-   (and more).  So the "read" side to such a lock is anything which
-   disables preeempt. */
+
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
+#include <linux/list.h>
 #include <asm/system.h>
 
 #if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
 
+/*
+ * stop_cpu[s]() is simplistic per-cpu maximum priority cpu
+ * monopolization mechanism.  The caller can specify a non-sleeping
+ * function to be executed on a single or multiple cpus preempting all
+ * other processes and monopolizing those cpus until it finishes.
+ *
+ * Resources for this mechanism are preallocated when a cpu is brought
+ * up and requests are guaranteed to be served as long as the target
+ * cpus are online.
+ */
+
+typedef int (*cpu_stop_fn_t)(void *arg);
+
+struct cpu_stop_work {
+	struct list_head	list;		/* cpu_stopper->works */
+	cpu_stop_fn_t		fn;
+	void			*arg;
+	struct cpu_stop_done	*done;
+};
+
+int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg);
+void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+			 struct cpu_stop_work *work_buf);
+int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
+int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
+
+/*
+ * stop_machine "Bogolock": stop the entire machine, disable
+ * interrupts.  This is a very heavy lock, which is equivalent to
+ * grabbing every spinlock (and more).  So the "read" side to such a
+ * lock is anything which disables preeempt.
+ */
+
 /**
  * stop_machine: freeze the machine on all CPUs and run this function
  * @fn: the function to run
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bb9fb1bd79..7e3f9182aef 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,17 +1,379 @@
-/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
- * GPL v2 and any later version.
+/*
+ * kernel/stop_machine.c
+ *
+ * Copyright (C) 2008, 2005	IBM Corporation.
+ * Copyright (C) 2008, 2005	Rusty Russell rusty@rustcorp.com.au
+ * Copyright (C) 2010		SUSE Linux Products GmbH
+ * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2 and any later version.
  */
+#include <linux/completion.h>
 #include <linux/cpu.h>
-#include <linux/err.h>
+#include <linux/init.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
+#include <linux/percpu.h>
 #include <linux/sched.h>
 #include <linux/stop_machine.h>
-#include <linux/syscalls.h>
 #include <linux/interrupt.h>
+#include <linux/kallsyms.h>
 
 #include <asm/atomic.h>
-#include <asm/uaccess.h>
+
+/*
+ * Structure to determine completion condition and record errors.  May
+ * be shared by works on different cpus.
+ */
+struct cpu_stop_done {
+	atomic_t		nr_todo;	/* nr left to execute */
+	bool			executed;	/* actually executed? */
+	int			ret;		/* collected return value */
+	struct completion	completion;	/* fired if nr_todo reaches 0 */
+};
+
+/* the actual stopper, one per every possible cpu, enabled on online cpus */
+struct cpu_stopper {
+	spinlock_t		lock;
+	struct list_head	works;		/* list of pending works */
+	struct task_struct	*thread;	/* stopper thread */
+	bool			enabled;	/* is this stopper enabled? */
+};
+
+static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
+
+static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
+{
+	memset(done, 0, sizeof(*done));
+	atomic_set(&done->nr_todo, nr_todo);
+	init_completion(&done->completion);
+}
+
+/* signal completion unless @done is NULL */
+static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
+{
+	if (done) {
+		if (executed)
+			done->executed = true;
+		if (atomic_dec_and_test(&done->nr_todo))
+			complete(&done->completion);
+	}
+}
+
+/* queue @work to @stopper.  if offline, @work is completed immediately */
+static void cpu_stop_queue_work(struct cpu_stopper *stopper,
+				struct cpu_stop_work *work)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&stopper->lock, flags);
+
+	if (stopper->enabled) {
+		list_add_tail(&work->list, &stopper->works);
+		wake_up_process(stopper->thread);
+	} else
+		cpu_stop_signal_done(work->done, false);
+
+	spin_unlock_irqrestore(&stopper->lock, flags);
+}
+
+/**
+ * stop_one_cpu - stop a cpu
+ * @cpu: cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Execute @fn(@arg) on @cpu.  @fn is run in a process context with
+ * the highest priority preempting any task on the cpu and
+ * monopolizing it.  This function returns after the execution is
+ * complete.
+ *
+ * This function doesn't guarantee @cpu stays online till @fn
+ * completes.  If @cpu goes down in the middle, execution may happen
+ * partially or fully on different cpus.  @fn should either be ready
+ * for that or the caller should ensure that @cpu stays online until
+ * this function completes.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
+ * otherwise, the return value of @fn.
+ */
+int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
+{
+	struct cpu_stop_done done;
+	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
+
+	cpu_stop_init_done(&done, 1);
+	cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
+	wait_for_completion(&done.completion);
+	return done.executed ? done.ret : -ENOENT;
+}
+
+/**
+ * stop_one_cpu_nowait - stop a cpu but don't wait for completion
+ * @cpu: cpu to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Similar to stop_one_cpu() but doesn't wait for completion.  The
+ * caller is responsible for ensuring @work_buf is currently unused
+ * and will remain untouched until stopper starts executing @fn.
+ *
+ * CONTEXT:
+ * Don't care.
+ */
+void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
+			struct cpu_stop_work *work_buf)
+{
+	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
+	cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
+}
+
+/* static data for stop_cpus */
+static DEFINE_MUTEX(stop_cpus_mutex);
+static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
+
+int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+	struct cpu_stop_work *work;
+	struct cpu_stop_done done;
+	unsigned int cpu;
+
+	/* initialize works and done */
+	for_each_cpu(cpu, cpumask) {
+		work = &per_cpu(stop_cpus_work, cpu);
+		work->fn = fn;
+		work->arg = arg;
+		work->done = &done;
+	}
+	cpu_stop_init_done(&done, cpumask_weight(cpumask));
+
+	/*
+	 * Disable preemption while queueing to avoid getting
+	 * preempted by a stopper which might wait for other stoppers
+	 * to enter @fn which can lead to deadlock.
+	 */
+	preempt_disable();
+	for_each_cpu(cpu, cpumask)
+		cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
+				    &per_cpu(stop_cpus_work, cpu));
+	preempt_enable();
+
+	wait_for_completion(&done.completion);
+	return done.executed ? done.ret : -ENOENT;
+}
+
+/**
+ * stop_cpus - stop multiple cpus
+ * @cpumask: cpus to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Execute @fn(@arg) on online cpus in @cpumask.  On each target cpu,
+ * @fn is run in a process context with the highest priority
+ * preempting any task on the cpu and monopolizing it.  This function
+ * returns after all executions are complete.
+ *
+ * This function doesn't guarantee the cpus in @cpumask stay online
+ * till @fn completes.  If some cpus go down in the middle, execution
+ * on the cpu may happen partially or fully on different cpus.  @fn
+ * should either be ready for that or the caller should ensure that
+ * the cpus stay online until this function completes.
+ *
+ * All stop_cpus() calls are serialized making it safe for @fn to wait
+ * for all cpus to start executing it.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -ENOENT if @fn(@arg) was not executed at all because all cpus in
+ * @cpumask were offline; otherwise, 0 if all executions of @fn
+ * returned 0, any non zero return value if any returned non zero.
+ */
+int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+	int ret;
+
+	/* static works are used, process one request at a time */
+	mutex_lock(&stop_cpus_mutex);
+	ret = __stop_cpus(cpumask, fn, arg);
+	mutex_unlock(&stop_cpus_mutex);
+	return ret;
+}
+
+/**
+ * try_stop_cpus - try to stop multiple cpus
+ * @cpumask: cpus to stop
+ * @fn: function to execute
+ * @arg: argument to @fn
+ *
+ * Identical to stop_cpus() except that it fails with -EAGAIN if
+ * someone else is already using the facility.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * -EAGAIN if someone else is already stopping cpus, -ENOENT if
+ * @fn(@arg) was not executed at all because all cpus in @cpumask were
+ * offline; otherwise, 0 if all executions of @fn returned 0, any non
+ * zero return value if any returned non zero.
+ */
+int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
+{
+	int ret;
+
+	/* static works are used, process one request at a time */
+	if (!mutex_trylock(&stop_cpus_mutex))
+		return -EAGAIN;
+	ret = __stop_cpus(cpumask, fn, arg);
+	mutex_unlock(&stop_cpus_mutex);
+	return ret;
+}
+
+static int cpu_stopper_thread(void *data)
+{
+	struct cpu_stopper *stopper = data;
+	struct cpu_stop_work *work;
+	int ret;
+
+repeat:
+	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
+
+	if (kthread_should_stop()) {
+		__set_current_state(TASK_RUNNING);
+		return 0;
+	}
+
+	work = NULL;
+	spin_lock_irq(&stopper->lock);
+	if (!list_empty(&stopper->works)) {
+		work = list_first_entry(&stopper->works,
+					struct cpu_stop_work, list);
+		list_del_init(&work->list);
+	}
+	spin_unlock_irq(&stopper->lock);
+
+	if (work) {
+		cpu_stop_fn_t fn = work->fn;
+		void *arg = work->arg;
+		struct cpu_stop_done *done = work->done;
+		char ksym_buf[KSYM_NAME_LEN];
+
+		__set_current_state(TASK_RUNNING);
+
+		/* cpu stop callbacks are not allowed to sleep */
+		preempt_disable();
+
+		ret = fn(arg);
+		if (ret)
+			done->ret = ret;
+
+		/* restore preemption and check it's still balanced */
+		preempt_enable();
+		WARN_ONCE(preempt_count(),
+			  "cpu_stop: %s(%p) leaked preempt count\n",
+			  kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
+					  ksym_buf), arg);
+
+		cpu_stop_signal_done(done, true);
+	} else
+		schedule();
+
+	goto repeat;
+}
+
+/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
+static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
+					   unsigned long action, void *hcpu)
+{
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+	unsigned int cpu = (unsigned long)hcpu;
+	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+	struct cpu_stop_work *work;
+	struct task_struct *p;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+		BUG_ON(stopper->thread || stopper->enabled ||
+		       !list_empty(&stopper->works));
+		p = kthread_create(cpu_stopper_thread, stopper, "stopper/%d",
+				   cpu);
+		if (IS_ERR(p))
+			return NOTIFY_BAD;
+		sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
+		get_task_struct(p);
+		stopper->thread = p;
+		break;
+
+	case CPU_ONLINE:
+		kthread_bind(stopper->thread, cpu);
+		/* strictly unnecessary, as first user will wake it */
+		wake_up_process(stopper->thread);
+		/* mark enabled */
+		spin_lock_irq(&stopper->lock);
+		stopper->enabled = true;
+		spin_unlock_irq(&stopper->lock);
+		break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		/* kill the stopper */
+		kthread_stop(stopper->thread);
+		/* drain remaining works */
+		spin_lock_irq(&stopper->lock);
+		list_for_each_entry(work, &stopper->works, list)
+			cpu_stop_signal_done(work->done, false);
+		stopper->enabled = false;
+		spin_unlock_irq(&stopper->lock);
+		/* release the stopper */
+		put_task_struct(stopper->thread);
+		stopper->thread = NULL;
+		break;
+#endif
+	}
+
+	return NOTIFY_OK;
+}
+
+/*
+ * Give it a higher priority so that cpu stopper is available to other
+ * cpu notifiers.  It currently shares the same priority as sched
+ * migration_notifier.
+ */
+static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
+	.notifier_call	= cpu_stop_cpu_callback,
+	.priority	= 10,
+};
+
+static int __init cpu_stop_init(void)
+{
+	void *bcpu = (void *)(long)smp_processor_id();
+	unsigned int cpu;
+	int err;
+
+	for_each_possible_cpu(cpu) {
+		struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+
+		spin_lock_init(&stopper->lock);
+		INIT_LIST_HEAD(&stopper->works);
+	}
+
+	/* start one for the boot cpu */
+	err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
+				    bcpu);
+	BUG_ON(err == NOTIFY_BAD);
+	cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
+	register_cpu_notifier(&cpu_stop_cpu_notifier);
+
+	return 0;
+}
+early_initcall(cpu_stop_init);
 
 /* This controls the threads on each CPU. */
 enum stopmachine_state {
-- 
cgit v1.2.3


From 3fc1f1e27a5b807791d72e5d992aa33b668a6626 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 6 May 2010 18:49:20 +0200
Subject: stop_machine: reimplement using cpu_stop

Reimplement stop_machine using cpu_stop.  As cpu stoppers are
guaranteed to be available for all online cpus,
stop_machine_create/destroy() are no longer necessary and removed.

With resource management and synchronization handled by cpu_stop, the
new implementation is much simpler.  Asking the cpu_stop to execute
the stop_cpu() state machine on all online cpus with cpu hotplug
disabled is enough.

stop_machine itself doesn't need to manage any global resources
anymore, so all per-instance information is rolled into struct
stop_machine_data and the mutex and all static data variables are
removed.

The previous implementation created and destroyed RT workqueues as
necessary which made stop_machine() calls highly expensive on very
large machines.  According to Dimitri Sivanich, preventing the dynamic
creation/destruction makes booting faster more than twice on very
large machines.  cpu_stop resources are preallocated for all online
cpus and should have the same effect.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
---
 arch/s390/kernel/time.c      |   1 -
 drivers/xen/manage.c         |  14 +---
 include/linux/stop_machine.h |  20 ------
 kernel/cpu.c                 |   8 ---
 kernel/module.c              |  14 +---
 kernel/stop_machine.c        | 158 +++++++++++--------------------------------
 6 files changed, 42 insertions(+), 173 deletions(-)

diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index fba6dec156b..03d96569f18 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -390,7 +390,6 @@ static void __init time_init_wq(void)
 	if (time_sync_wq)
 		return;
 	time_sync_wq = create_singlethread_workqueue("timesync");
-	stop_machine_create();
 }
 
 /*
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 2ac4440e7b0..8943b8ccee1 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -80,12 +80,6 @@ static void do_suspend(void)
 
 	shutting_down = SHUTDOWN_SUSPEND;
 
-	err = stop_machine_create();
-	if (err) {
-		printk(KERN_ERR "xen suspend: failed to setup stop_machine %d\n", err);
-		goto out;
-	}
-
 #ifdef CONFIG_PREEMPT
 	/* If the kernel is preemptible, we need to freeze all the processes
 	   to prevent them from being in the middle of a pagetable update
@@ -93,7 +87,7 @@ static void do_suspend(void)
 	err = freeze_processes();
 	if (err) {
 		printk(KERN_ERR "xen suspend: freeze failed %d\n", err);
-		goto out_destroy_sm;
+		goto out;
 	}
 #endif
 
@@ -136,12 +130,8 @@ out_resume:
 out_thaw:
 #ifdef CONFIG_PREEMPT
 	thaw_processes();
-
-out_destroy_sm:
-#endif
-	stop_machine_destroy();
-
 out:
+#endif
 	shutting_down = SHUTDOWN_INVALID;
 }
 #endif	/* CONFIG_PM_SLEEP */
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index efcbd6c3794..0e552e72a4c 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -67,23 +67,6 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
  */
 int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
 
-/**
- * stop_machine_create: create all stop_machine threads
- *
- * Description: This causes all stop_machine threads to be created before
- * stop_machine actually gets called. This can be used by subsystems that
- * need a non failing stop_machine infrastructure.
- */
-int stop_machine_create(void);
-
-/**
- * stop_machine_destroy: destroy all stop_machine threads
- *
- * Description: This causes all stop_machine threads which were created with
- * stop_machine_create to be destroyed again.
- */
-void stop_machine_destroy(void);
-
 #else
 
 static inline int stop_machine(int (*fn)(void *), void *data,
@@ -96,8 +79,5 @@ static inline int stop_machine(int (*fn)(void *), void *data,
 	return ret;
 }
 
-static inline int stop_machine_create(void) { return 0; }
-static inline void stop_machine_destroy(void) { }
-
 #endif /* CONFIG_SMP */
 #endif /* _LINUX_STOP_MACHINE */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 914aedcde84..54577757477 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -266,9 +266,6 @@ int __ref cpu_down(unsigned int cpu)
 {
 	int err;
 
-	err = stop_machine_create();
-	if (err)
-		return err;
 	cpu_maps_update_begin();
 
 	if (cpu_hotplug_disabled) {
@@ -280,7 +277,6 @@ int __ref cpu_down(unsigned int cpu)
 
 out:
 	cpu_maps_update_done();
-	stop_machine_destroy();
 	return err;
 }
 EXPORT_SYMBOL(cpu_down);
@@ -361,9 +357,6 @@ int disable_nonboot_cpus(void)
 {
 	int cpu, first_cpu, error;
 
-	error = stop_machine_create();
-	if (error)
-		return error;
 	cpu_maps_update_begin();
 	first_cpu = cpumask_first(cpu_online_mask);
 	/*
@@ -394,7 +387,6 @@ int disable_nonboot_cpus(void)
 		printk(KERN_ERR "Non-boot CPUs are not disabled\n");
 	}
 	cpu_maps_update_done();
-	stop_machine_destroy();
 	return error;
 }
 
diff --git a/kernel/module.c b/kernel/module.c
index 1016b75b026..0838246d8c9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -723,16 +723,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 		return -EFAULT;
 	name[MODULE_NAME_LEN-1] = '\0';
 
-	/* Create stop_machine threads since free_module relies on
-	 * a non-failing stop_machine call. */
-	ret = stop_machine_create();
-	if (ret)
-		return ret;
-
-	if (mutex_lock_interruptible(&module_mutex) != 0) {
-		ret = -EINTR;
-		goto out_stop;
-	}
+	if (mutex_lock_interruptible(&module_mutex) != 0)
+		return -EINTR;
 
 	mod = find_module(name);
 	if (!mod) {
@@ -792,8 +784,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 
  out:
 	mutex_unlock(&module_mutex);
-out_stop:
-	stop_machine_destroy();
 	return ret;
 }
 
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 7e3f9182aef..884c7a1afee 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -388,174 +388,92 @@ enum stopmachine_state {
 	/* Exit */
 	STOPMACHINE_EXIT,
 };
-static enum stopmachine_state state;
 
 struct stop_machine_data {
-	int (*fn)(void *);
-	void *data;
-	int fnret;
+	int			(*fn)(void *);
+	void			*data;
+	/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
+	unsigned int		num_threads;
+	const struct cpumask	*active_cpus;
+
+	enum stopmachine_state	state;
+	atomic_t		thread_ack;
 };
 
-/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
-static unsigned int num_threads;
-static atomic_t thread_ack;
-static DEFINE_MUTEX(lock);
-/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
-static DEFINE_MUTEX(setup_lock);
-/* Users of stop_machine. */
-static int refcount;
-static struct workqueue_struct *stop_machine_wq;
-static struct stop_machine_data active, idle;
-static const struct cpumask *active_cpus;
-static void __percpu *stop_machine_work;
-
-static void set_state(enum stopmachine_state newstate)
+static void set_state(struct stop_machine_data *smdata,
+		      enum stopmachine_state newstate)
 {
 	/* Reset ack counter. */
-	atomic_set(&thread_ack, num_threads);
+	atomic_set(&smdata->thread_ack, smdata->num_threads);
 	smp_wmb();
-	state = newstate;
+	smdata->state = newstate;
 }
 
 /* Last one to ack a state moves to the next state. */
-static void ack_state(void)
+static void ack_state(struct stop_machine_data *smdata)
 {
-	if (atomic_dec_and_test(&thread_ack))
-		set_state(state + 1);
+	if (atomic_dec_and_test(&smdata->thread_ack))
+		set_state(smdata, smdata->state + 1);
 }
 
-/* This is the actual function which stops the CPU. It runs
- * in the context of a dedicated stopmachine workqueue. */
-static void stop_cpu(struct work_struct *unused)
+/* This is the cpu_stop function which stops the CPU. */
+static int stop_machine_cpu_stop(void *data)
 {
+	struct stop_machine_data *smdata = data;
 	enum stopmachine_state curstate = STOPMACHINE_NONE;
-	struct stop_machine_data *smdata = &idle;
-	int cpu = smp_processor_id();
-	int err;
+	int cpu = smp_processor_id(), err = 0;
+	bool is_active;
+
+	if (!smdata->active_cpus)
+		is_active = cpu == cpumask_first(cpu_online_mask);
+	else
+		is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
 
-	if (!active_cpus) {
-		if (cpu == cpumask_first(cpu_online_mask))
-			smdata = &active;
-	} else {
-		if (cpumask_test_cpu(cpu, active_cpus))
-			smdata = &active;
-	}
 	/* Simple state machine */
 	do {
 		/* Chill out and ensure we re-read stopmachine_state. */
 		cpu_relax();
-		if (state != curstate) {
-			curstate = state;
+		if (smdata->state != curstate) {
+			curstate = smdata->state;
 			switch (curstate) {
 			case STOPMACHINE_DISABLE_IRQ:
 				local_irq_disable();
 				hard_irq_disable();
 				break;
 			case STOPMACHINE_RUN:
-				/* On multiple CPUs only a single error code
-				 * is needed to tell that something failed. */
-				err = smdata->fn(smdata->data);
-				if (err)
-					smdata->fnret = err;
+				if (is_active)
+					err = smdata->fn(smdata->data);
 				break;
 			default:
 				break;
 			}
-			ack_state();
+			ack_state(smdata);
 		}
 	} while (curstate != STOPMACHINE_EXIT);
 
 	local_irq_enable();
+	return err;
 }
 
-/* Callback for CPUs which aren't supposed to do anything. */
-static int chill(void *unused)
-{
-	return 0;
-}
-
-int stop_machine_create(void)
-{
-	mutex_lock(&setup_lock);
-	if (refcount)
-		goto done;
-	stop_machine_wq = create_rt_workqueue("kstop");
-	if (!stop_machine_wq)
-		goto err_out;
-	stop_machine_work = alloc_percpu(struct work_struct);
-	if (!stop_machine_work)
-		goto err_out;
-done:
-	refcount++;
-	mutex_unlock(&setup_lock);
-	return 0;
-
-err_out:
-	if (stop_machine_wq)
-		destroy_workqueue(stop_machine_wq);
-	mutex_unlock(&setup_lock);
-	return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(stop_machine_create);
-
-void stop_machine_destroy(void)
-{
-	mutex_lock(&setup_lock);
-	refcount--;
-	if (refcount)
-		goto done;
-	destroy_workqueue(stop_machine_wq);
-	free_percpu(stop_machine_work);
-done:
-	mutex_unlock(&setup_lock);
-}
-EXPORT_SYMBOL_GPL(stop_machine_destroy);
-
 int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
-	struct work_struct *sm_work;
-	int i, ret;
-
-	/* Set up initial state. */
-	mutex_lock(&lock);
-	num_threads = num_online_cpus();
-	active_cpus = cpus;
-	active.fn = fn;
-	active.data = data;
-	active.fnret = 0;
-	idle.fn = chill;
-	idle.data = NULL;
-
-	set_state(STOPMACHINE_PREPARE);
-
-	/* Schedule the stop_cpu work on all cpus: hold this CPU so one
-	 * doesn't hit this CPU until we're ready. */
-	get_cpu();
-	for_each_online_cpu(i) {
-		sm_work = per_cpu_ptr(stop_machine_work, i);
-		INIT_WORK(sm_work, stop_cpu);
-		queue_work_on(i, stop_machine_wq, sm_work);
-	}
-	/* This will release the thread on our CPU. */
-	put_cpu();
-	flush_workqueue(stop_machine_wq);
-	ret = active.fnret;
-	mutex_unlock(&lock);
-	return ret;
+	struct stop_machine_data smdata = { .fn = fn, .data = data,
+					    .num_threads = num_online_cpus(),
+					    .active_cpus = cpus };
+
+	/* Set the initial state and stop all online cpus. */
+	set_state(&smdata, STOPMACHINE_PREPARE);
+	return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
 }
 
 int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 {
 	int ret;
 
-	ret = stop_machine_create();
-	if (ret)
-		return ret;
 	/* No CPUs can come up or down during this. */
 	get_online_cpus();
 	ret = __stop_machine(fn, data, cpus);
 	put_online_cpus();
-	stop_machine_destroy();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(stop_machine);
-- 
cgit v1.2.3


From 969c79215a35b06e5e3efe69b9412f858df7856c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 6 May 2010 18:49:21 +0200
Subject: sched: replace migration_thread with cpu_stop

Currently migration_thread is serving three purposes - migration
pusher, context to execute active_load_balance() and forced context
switcher for expedited RCU synchronize_sched.  All three roles are
hardcoded into migration_thread() and determining which job is
scheduled is slightly messy.

This patch kills migration_thread and replaces all three uses with
cpu_stop.  The three different roles of migration_thread() are
splitted into three separate cpu_stop callbacks -
migration_cpu_stop(), active_load_balance_cpu_stop() and
synchronize_sched_expedited_cpu_stop() - and each use case now simply
asks cpu_stop to execute the callback as necessary.

synchronize_sched_expedited() was implemented with private
preallocated resources and custom multi-cpu queueing and waiting
logic, both of which are provided by cpu_stop.
synchronize_sched_expedited_count is made atomic and all other shared
resources along with the mutex are dropped.

synchronize_sched_expedited() also implemented a check to detect cases
where not all the callback got executed on their assigned cpus and
fall back to synchronize_sched().  If called with cpu hotplug blocked,
cpu_stop already guarantees that and the condition cannot happen;
otherwise, stop_machine() would break.  However, this patch preserves
the paranoid check using a cpumask to record on which cpus the stopper
ran so that it can serve as a bisection point if something actually
goes wrong theree.

Because the internal execution state is no longer visible,
rcu_expedited_torture_stats() is removed.

This patch also renames cpu_stop threads to from "stopper/%d" to
"migration/%d".  The names of these threads ultimately don't matter
and there's no reason to make unnecessary userland visible changes.

With this patch applied, stop_machine() and sched now share the same
resources.  stop_machine() is faster without wasting any resources and
sched migration users are much cleaner.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Josh Triplett <josh@freedesktop.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
---
 Documentation/RCU/torture.txt |  10 --
 include/linux/rcutiny.h       |   2 -
 include/linux/rcutree.h       |   1 -
 kernel/rcutorture.c           |   2 +-
 kernel/sched.c                | 315 ++++++++++++------------------------------
 kernel/sched_fair.c           |  48 +++++--
 kernel/stop_machine.c         |   2 +-
 7 files changed, 127 insertions(+), 253 deletions(-)

diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 0e50bc2aa1e..5d9016795fd 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -182,16 +182,6 @@ Similarly, sched_expedited RCU provides the following:
 	sched_expedited-torture: Reader Pipe:  12660320201 95875 0 0 0 0 0 0 0 0 0
 	sched_expedited-torture: Reader Batch:  12660424885 0 0 0 0 0 0 0 0 0 0
 	sched_expedited-torture: Free-Block Circulation:  1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0
-	state: -1 / 0:0 3:0 4:0
-
-As before, the first four lines are similar to those for RCU.
-The last line shows the task-migration state.  The first number is
--1 if synchronize_sched_expedited() is idle, -2 if in the process of
-posting wakeups to the migration kthreads, and N when waiting on CPU N.
-Each of the colon-separated fields following the "/" is a CPU:state pair.
-Valid states are "0" for idle, "1" for waiting for quiescent state,
-"2" for passed through quiescent state, and "3" when a race with a
-CPU-hotplug event forces use of the synchronize_sched() primitive.
 
 
 USAGE
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index a5195875480..0006b2df00e 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -60,8 +60,6 @@ static inline long rcu_batches_completed_bh(void)
 	return 0;
 }
 
-extern int rcu_expedited_torture_stats(char *page);
-
 static inline void rcu_force_quiescent_state(void)
 {
 }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 42cc3a04779..24e467e526b 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -35,7 +35,6 @@ struct notifier_block;
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
 extern int rcu_needs_cpu(int cpu);
-extern int rcu_expedited_torture_stats(char *page);
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 58df55bf83e..2b676f3a0f2 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -669,7 +669,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
 	.sync		= synchronize_sched_expedited,
 	.cb_barrier	= NULL,
 	.fqs		= rcu_sched_force_quiescent_state,
-	.stats		= rcu_expedited_torture_stats,
+	.stats		= NULL,
 	.irq_capable	= 1,
 	.name		= "sched_expedited"
 };
diff --git a/kernel/sched.c b/kernel/sched.c
index 4956ed09283..f1d577a0a8a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/percpu.h>
-#include <linux/kthread.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/stop_machine.h>
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
@@ -539,15 +539,13 @@ struct rq {
 	int post_schedule;
 	int active_balance;
 	int push_cpu;
+	struct cpu_stop_work active_balance_work;
 	/* cpu of this runqueue: */
 	int cpu;
 	int online;
 
 	unsigned long avg_load_per_task;
 
-	struct task_struct *migration_thread;
-	struct list_head migration_queue;
-
 	u64 rt_avg;
 	u64 age_stamp;
 	u64 idle_stamp;
@@ -2037,21 +2035,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	__set_task_cpu(p, new_cpu);
 }
 
-struct migration_req {
-	struct list_head list;
-
+struct migration_arg {
 	struct task_struct *task;
 	int dest_cpu;
-
-	struct completion done;
 };
 
+static int migration_cpu_stop(void *data);
+
 /*
  * The task's runqueue lock must be held.
  * Returns true if you have to wait for migration thread.
  */
-static int
-migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
+static bool migrate_task(struct task_struct *p, int dest_cpu)
 {
 	struct rq *rq = task_rq(p);
 
@@ -2059,15 +2054,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 	 * If the task is not on a runqueue (and not running), then
 	 * the next wake-up will properly place the task.
 	 */
-	if (!p->se.on_rq && !task_running(rq, p))
-		return 0;
-
-	init_completion(&req->done);
-	req->task = p;
-	req->dest_cpu = dest_cpu;
-	list_add(&req->list, &rq->migration_queue);
-
-	return 1;
+	return p->se.on_rq || task_running(rq, p);
 }
 
 /*
@@ -3110,7 +3097,6 @@ static void update_cpu_load(struct rq *this_rq)
 void sched_exec(void)
 {
 	struct task_struct *p = current;
-	struct migration_req req;
 	unsigned long flags;
 	struct rq *rq;
 	int dest_cpu;
@@ -3124,17 +3110,11 @@ void sched_exec(void)
 	 * select_task_rq() can race against ->cpus_allowed
 	 */
 	if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-	    likely(cpu_active(dest_cpu)) &&
-	    migrate_task(p, dest_cpu, &req)) {
-		/* Need to wait for migration thread (might exit: take ref). */
-		struct task_struct *mt = rq->migration_thread;
+	    likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+		struct migration_arg arg = { p, dest_cpu };
 
-		get_task_struct(mt);
 		task_rq_unlock(rq, &flags);
-		wake_up_process(mt);
-		put_task_struct(mt);
-		wait_for_completion(&req.done);
-
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 		return;
 	}
 unlock:
@@ -5290,17 +5270,15 @@ static inline void sched_init_granularity(void)
 /*
  * This is how migration works:
  *
- * 1) we queue a struct migration_req structure in the source CPU's
- *    runqueue and wake up that CPU's migration thread.
- * 2) we down() the locked semaphore => thread blocks.
- * 3) migration thread wakes up (implicitly it forces the migrated
- *    thread off the CPU)
- * 4) it gets the migration request and checks whether the migrated
- *    task is still in the wrong runqueue.
- * 5) if it's in the wrong runqueue then the migration thread removes
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ *    stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ *    off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
  *    it and puts it into the right queue.
- * 6) migration thread up()s the semaphore.
- * 7) we wake up and the migration is done.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ *    is done.
  */
 
 /*
@@ -5314,9 +5292,9 @@ static inline void sched_init_granularity(void)
  */
 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
-	struct migration_req req;
 	unsigned long flags;
 	struct rq *rq;
+	unsigned int dest_cpu;
 	int ret = 0;
 
 	/*
@@ -5354,15 +5332,12 @@ again:
 	if (cpumask_test_cpu(task_cpu(p), new_mask))
 		goto out;
 
-	if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
+	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+	if (migrate_task(p, dest_cpu)) {
+		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
-		struct task_struct *mt = rq->migration_thread;
-
-		get_task_struct(mt);
 		task_rq_unlock(rq, &flags);
-		wake_up_process(mt);
-		put_task_struct(mt);
-		wait_for_completion(&req.done);
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 		tlb_migrate_finish(p->mm);
 		return 0;
 	}
@@ -5420,70 +5395,22 @@ fail:
 	return ret;
 }
 
-#define RCU_MIGRATION_IDLE	0
-#define RCU_MIGRATION_NEED_QS	1
-#define RCU_MIGRATION_GOT_QS	2
-#define RCU_MIGRATION_MUST_SYNC	3
-
 /*
- * migration_thread - this is a highprio system thread that performs
- * thread migration by bumping thread off CPU then 'pushing' onto
- * another runqueue.
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
  */
-static int migration_thread(void *data)
+static int migration_cpu_stop(void *data)
 {
-	int badcpu;
-	int cpu = (long)data;
-	struct rq *rq;
-
-	rq = cpu_rq(cpu);
-	BUG_ON(rq->migration_thread != current);
-
-	set_current_state(TASK_INTERRUPTIBLE);
-	while (!kthread_should_stop()) {
-		struct migration_req *req;
-		struct list_head *head;
-
-		raw_spin_lock_irq(&rq->lock);
-
-		if (cpu_is_offline(cpu)) {
-			raw_spin_unlock_irq(&rq->lock);
-			break;
-		}
-
-		if (rq->active_balance) {
-			active_load_balance(rq, cpu);
-			rq->active_balance = 0;
-		}
-
-		head = &rq->migration_queue;
-
-		if (list_empty(head)) {
-			raw_spin_unlock_irq(&rq->lock);
-			schedule();
-			set_current_state(TASK_INTERRUPTIBLE);
-			continue;
-		}
-		req = list_entry(head->next, struct migration_req, list);
-		list_del_init(head->next);
-
-		if (req->task != NULL) {
-			raw_spin_unlock(&rq->lock);
-			__migrate_task(req->task, cpu, req->dest_cpu);
-		} else if (likely(cpu == (badcpu = smp_processor_id()))) {
-			req->dest_cpu = RCU_MIGRATION_GOT_QS;
-			raw_spin_unlock(&rq->lock);
-		} else {
-			req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
-			raw_spin_unlock(&rq->lock);
-			WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
-		}
-		local_irq_enable();
-
-		complete(&req->done);
-	}
-	__set_current_state(TASK_RUNNING);
+	struct migration_arg *arg = data;
 
+	/*
+	 * The original target cpu might have gone down and we might
+	 * be on another cpu but it doesn't matter.
+	 */
+	local_irq_disable();
+	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+	local_irq_enable();
 	return 0;
 }
 
@@ -5850,35 +5777,20 @@ static void set_rq_offline(struct rq *rq)
 static int __cpuinit
 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
-	struct task_struct *p;
 	int cpu = (long)hcpu;
 	unsigned long flags;
-	struct rq *rq;
+	struct rq *rq = cpu_rq(cpu);
 
 	switch (action) {
 
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
-		if (IS_ERR(p))
-			return NOTIFY_BAD;
-		kthread_bind(p, cpu);
-		/* Must be high prio: stop_machine expects to yield to it. */
-		rq = task_rq_lock(p, &flags);
-		__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-		task_rq_unlock(rq, &flags);
-		get_task_struct(p);
-		cpu_rq(cpu)->migration_thread = p;
 		rq->calc_load_update = calc_load_update;
 		break;
 
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		/* Strictly unnecessary, as first user will wake it. */
-		wake_up_process(cpu_rq(cpu)->migration_thread);
-
 		/* Update our root-domain */
-		rq = cpu_rq(cpu);
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5889,25 +5801,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 
 #ifdef CONFIG_HOTPLUG_CPU
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-		if (!cpu_rq(cpu)->migration_thread)
-			break;
-		/* Unbind it from offline cpu so it can run. Fall thru. */
-		kthread_bind(cpu_rq(cpu)->migration_thread,
-			     cpumask_any(cpu_online_mask));
-		kthread_stop(cpu_rq(cpu)->migration_thread);
-		put_task_struct(cpu_rq(cpu)->migration_thread);
-		cpu_rq(cpu)->migration_thread = NULL;
-		break;
-
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		migrate_live_tasks(cpu);
-		rq = cpu_rq(cpu);
-		kthread_stop(rq->migration_thread);
-		put_task_struct(rq->migration_thread);
-		rq->migration_thread = NULL;
 		/* Idle task back to normal (off runqueue, low prio) */
 		raw_spin_lock_irq(&rq->lock);
 		deactivate_task(rq, rq->idle, 0);
@@ -5918,29 +5814,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
 		calc_global_load_remove(rq);
-		/*
-		 * No need to migrate the tasks: it was best-effort if
-		 * they didn't take sched_hotcpu_mutex. Just wake up
-		 * the requestors.
-		 */
-		raw_spin_lock_irq(&rq->lock);
-		while (!list_empty(&rq->migration_queue)) {
-			struct migration_req *req;
-
-			req = list_entry(rq->migration_queue.next,
-					 struct migration_req, list);
-			list_del_init(&req->list);
-			raw_spin_unlock_irq(&rq->lock);
-			complete(&req->done);
-			raw_spin_lock_irq(&rq->lock);
-		}
-		raw_spin_unlock_irq(&rq->lock);
 		break;
 
 	case CPU_DYING:
 	case CPU_DYING_FROZEN:
 		/* Update our root-domain */
-		rq = cpu_rq(cpu);
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -7757,10 +7635,8 @@ void __init sched_init(void)
 		rq->push_cpu = 0;
 		rq->cpu = i;
 		rq->online = 0;
-		rq->migration_thread = NULL;
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
-		INIT_LIST_HEAD(&rq->migration_queue);
 		rq_attach_root(rq, &def_root_domain);
 #endif
 		init_rq_hrtick(rq);
@@ -9054,43 +8930,39 @@ struct cgroup_subsys cpuacct_subsys = {
 
 #ifndef CONFIG_SMP
 
-int rcu_expedited_torture_stats(char *page)
-{
-	return 0;
-}
-EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
-
 void synchronize_sched_expedited(void)
 {
+	/*
+	 * There must be a full memory barrier on each affected CPU
+	 * between the time that try_stop_cpus() is called and the
+	 * time that it returns.
+	 *
+	 * In the current initial implementation of cpu_stop, the
+	 * above condition is already met when the control reaches
+	 * this point and the following smp_mb() is not strictly
+	 * necessary.  Do smp_mb() anyway for documentation and
+	 * robustness against future implementation changes.
+	 */
+	smp_mb();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
 #else /* #ifndef CONFIG_SMP */
 
-static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
-static DEFINE_MUTEX(rcu_sched_expedited_mutex);
-
-#define RCU_EXPEDITED_STATE_POST -2
-#define RCU_EXPEDITED_STATE_IDLE -1
+static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
 
-static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
-
-int rcu_expedited_torture_stats(char *page)
+static int synchronize_sched_expedited_cpu_stop(void *data)
 {
-	int cnt = 0;
-	int cpu;
+	static DEFINE_SPINLOCK(done_mask_lock);
+	struct cpumask *done_mask = data;
 
-	cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
-	for_each_online_cpu(cpu) {
-		 cnt += sprintf(&page[cnt], " %d:%d",
-				cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
+	if (done_mask) {
+		spin_lock(&done_mask_lock);
+		cpumask_set_cpu(smp_processor_id(), done_mask);
+		spin_unlock(&done_mask_lock);
 	}
-	cnt += sprintf(&page[cnt], "\n");
-	return cnt;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
-
-static long synchronize_sched_expedited_count;
 
 /*
  * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9104,60 +8976,55 @@ static long synchronize_sched_expedited_count;
  */
 void synchronize_sched_expedited(void)
 {
-	int cpu;
-	unsigned long flags;
-	bool need_full_sync = 0;
-	struct rq *rq;
-	struct migration_req *req;
-	long snap;
-	int trycount = 0;
+	cpumask_var_t done_mask_var;
+	struct cpumask *done_mask = NULL;
+	int snap, trycount = 0;
+
+	/*
+	 * done_mask is used to check that all cpus actually have
+	 * finished running the stopper, which is guaranteed by
+	 * stop_cpus() if it's called with cpu hotplug blocked.  Keep
+	 * the paranoia for now but it's best effort if cpumask is off
+	 * stack.
+	 */
+	if (zalloc_cpumask_var(&done_mask_var, GFP_ATOMIC))
+		done_mask = done_mask_var;
 
 	smp_mb();  /* ensure prior mod happens before capturing snap. */
-	snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
+	snap = atomic_read(&synchronize_sched_expedited_count) + 1;
 	get_online_cpus();
-	while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
+	while (try_stop_cpus(cpu_online_mask,
+			     synchronize_sched_expedited_cpu_stop,
+			     done_mask) == -EAGAIN) {
 		put_online_cpus();
 		if (trycount++ < 10)
 			udelay(trycount * num_online_cpus());
 		else {
 			synchronize_sched();
-			return;
+			goto free_out;
 		}
-		if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
+		if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
 			smp_mb(); /* ensure test happens before caller kfree */
-			return;
+			goto free_out;
 		}
 		get_online_cpus();
 	}
-	rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
-	for_each_online_cpu(cpu) {
-		rq = cpu_rq(cpu);
-		req = &per_cpu(rcu_migration_req, cpu);
-		init_completion(&req->done);
-		req->task = NULL;
-		req->dest_cpu = RCU_MIGRATION_NEED_QS;
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		list_add(&req->list, &rq->migration_queue);
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
-		wake_up_process(rq->migration_thread);
-	}
-	for_each_online_cpu(cpu) {
-		rcu_expedited_state = cpu;
-		req = &per_cpu(rcu_migration_req, cpu);
-		rq = cpu_rq(cpu);
-		wait_for_completion(&req->done);
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
-			need_full_sync = 1;
-		req->dest_cpu = RCU_MIGRATION_IDLE;
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
-	}
-	rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
-	synchronize_sched_expedited_count++;
-	mutex_unlock(&rcu_sched_expedited_mutex);
+	atomic_inc(&synchronize_sched_expedited_count);
+	if (done_mask)
+		cpumask_xor(done_mask, done_mask, cpu_online_mask);
 	put_online_cpus();
-	if (need_full_sync)
+
+	/* paranoia - this can't happen */
+	if (done_mask && cpumask_weight(done_mask)) {
+		char buf[80];
+
+		cpulist_scnprintf(buf, sizeof(buf), done_mask);
+		WARN_ONCE(1, "synchronize_sched_expedited: cpu online and done masks disagree on %d cpus: %s\n",
+			  cpumask_weight(done_mask), buf);
 		synchronize_sched();
+	}
+free_out:
+	free_cpumask_var(done_mask_var);
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index cbd8b8a296d..217e4a9393e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2798,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
+static int active_load_balance_cpu_stop(void *data);
+
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
@@ -2887,8 +2889,9 @@ redo:
 		if (need_active_balance(sd, sd_idle, idle)) {
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
-			/* don't kick the migration_thread, if the curr
-			 * task on busiest cpu can't be moved to this_cpu
+			/* don't kick the active_load_balance_cpu_stop,
+			 * if the curr task on busiest cpu can't be
+			 * moved to this_cpu
 			 */
 			if (!cpumask_test_cpu(this_cpu,
 					      &busiest->curr->cpus_allowed)) {
@@ -2898,14 +2901,22 @@ redo:
 				goto out_one_pinned;
 			}
 
+			/*
+			 * ->active_balance synchronizes accesses to
+			 * ->active_balance_work.  Once set, it's cleared
+			 * only after active load balance is finished.
+			 */
 			if (!busiest->active_balance) {
 				busiest->active_balance = 1;
 				busiest->push_cpu = this_cpu;
 				active_balance = 1;
 			}
 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
+
 			if (active_balance)
-				wake_up_process(busiest->migration_thread);
+				stop_one_cpu_nowait(cpu_of(busiest),
+					active_load_balance_cpu_stop, busiest,
+					&busiest->active_balance_work);
 
 			/*
 			 * We've kicked active balancing, reset the failure
@@ -3012,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 }
 
 /*
- * active_load_balance is run by migration threads. It pushes running tasks
- * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
- * running on each physical CPU where possible, and avoids physical /
- * logical imbalances.
- *
- * Called with busiest_rq locked.
+ * active_load_balance_cpu_stop is run by cpu stopper. It pushes
+ * running tasks off the busiest CPU onto idle CPUs. It requires at
+ * least 1 task to be running on each physical CPU where possible, and
+ * avoids physical / logical imbalances.
  */
-static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
+static int active_load_balance_cpu_stop(void *data)
 {
+	struct rq *busiest_rq = data;
+	int busiest_cpu = cpu_of(busiest_rq);
 	int target_cpu = busiest_rq->push_cpu;
+	struct rq *target_rq = cpu_rq(target_cpu);
 	struct sched_domain *sd;
-	struct rq *target_rq;
+
+	raw_spin_lock_irq(&busiest_rq->lock);
+
+	/* make sure the requested cpu hasn't gone down in the meantime */
+	if (unlikely(busiest_cpu != smp_processor_id() ||
+		     !busiest_rq->active_balance))
+		goto out_unlock;
 
 	/* Is there any task to move? */
 	if (busiest_rq->nr_running <= 1)
-		return;
-
-	target_rq = cpu_rq(target_cpu);
+		goto out_unlock;
 
 	/*
 	 * This condition is "impossible", if it occurs
@@ -3058,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 			schedstat_inc(sd, alb_failed);
 	}
 	double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+	busiest_rq->active_balance = 0;
+	raw_spin_unlock_irq(&busiest_rq->lock);
+	return 0;
 }
 
 #ifdef CONFIG_NO_HZ
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 884c7a1afee..5b20141a5ec 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -301,7 +301,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
 	case CPU_UP_PREPARE:
 		BUG_ON(stopper->thread || stopper->enabled ||
 		       !list_empty(&stopper->works));
-		p = kthread_create(cpu_stopper_thread, stopper, "stopper/%d",
+		p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
 				   cpu);
 		if (IS_ERR(p))
 			return NOTIFY_BAD;
-- 
cgit v1.2.3


From 94458d5ecb3da844823cc191e73e5c5ead98a464 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 6 May 2010 18:49:21 +0200
Subject: sched: kill paranoia check in synchronize_sched_expedited()

The paranoid check which verifies that the cpu_stop callback is
actually called on all online cpus is completely superflous.  It's
guaranteed by cpu_stop facility and if it didn't work as advertised
other things would go horribly wrong and trying to recover using
synchronize_sched() wouldn't be very meaningful.

Kill the paranoid check.  Removal of this feature is done as a
separate step so that it can serve as a bisection point if something
actually goes wrong.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Josh Triplett <josh@freedesktop.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Dimitri Sivanich <sivanich@sgi.com>
---
 kernel/sched.c | 40 +++-------------------------------------
 1 file changed, 3 insertions(+), 37 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index f1d577a0a8a..e9c6d798831 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8953,14 +8953,6 @@ static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
 
 static int synchronize_sched_expedited_cpu_stop(void *data)
 {
-	static DEFINE_SPINLOCK(done_mask_lock);
-	struct cpumask *done_mask = data;
-
-	if (done_mask) {
-		spin_lock(&done_mask_lock);
-		cpumask_set_cpu(smp_processor_id(), done_mask);
-		spin_unlock(&done_mask_lock);
-	}
 	return 0;
 }
 
@@ -8976,55 +8968,29 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
  */
 void synchronize_sched_expedited(void)
 {
-	cpumask_var_t done_mask_var;
-	struct cpumask *done_mask = NULL;
 	int snap, trycount = 0;
 
-	/*
-	 * done_mask is used to check that all cpus actually have
-	 * finished running the stopper, which is guaranteed by
-	 * stop_cpus() if it's called with cpu hotplug blocked.  Keep
-	 * the paranoia for now but it's best effort if cpumask is off
-	 * stack.
-	 */
-	if (zalloc_cpumask_var(&done_mask_var, GFP_ATOMIC))
-		done_mask = done_mask_var;
-
 	smp_mb();  /* ensure prior mod happens before capturing snap. */
 	snap = atomic_read(&synchronize_sched_expedited_count) + 1;
 	get_online_cpus();
 	while (try_stop_cpus(cpu_online_mask,
 			     synchronize_sched_expedited_cpu_stop,
-			     done_mask) == -EAGAIN) {
+			     NULL) == -EAGAIN) {
 		put_online_cpus();
 		if (trycount++ < 10)
 			udelay(trycount * num_online_cpus());
 		else {
 			synchronize_sched();
-			goto free_out;
+			return;
 		}
 		if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
 			smp_mb(); /* ensure test happens before caller kfree */
-			goto free_out;
+			return;
 		}
 		get_online_cpus();
 	}
 	atomic_inc(&synchronize_sched_expedited_count);
-	if (done_mask)
-		cpumask_xor(done_mask, done_mask, cpu_online_mask);
 	put_online_cpus();
-
-	/* paranoia - this can't happen */
-	if (done_mask && cpumask_weight(done_mask)) {
-		char buf[80];
-
-		cpulist_scnprintf(buf, sizeof(buf), done_mask);
-		WARN_ONCE(1, "synchronize_sched_expedited: cpu online and done masks disagree on %d cpus: %s\n",
-			  cpumask_weight(done_mask), buf);
-		synchronize_sched();
-	}
-free_out:
-	free_cpumask_var(done_mask_var);
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
-- 
cgit v1.2.3


From cc631fb732b8ccd6a0cc45557475ea09b0c21a68 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 6 May 2010 18:49:21 +0200
Subject: sched: correctly place paranioa memory barriers in
 synchronize_sched_expedited()

The memory barriers must be in the SMP case, not in the !SMP case.
Also add a barrier after the atomic_inc() in order to ensure that
other CPUs see post-synchronize_sched_expedited() actions as following
the expedited grace period.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index e9c6d798831..155a16d5214 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8931,6 +8931,15 @@ struct cgroup_subsys cpuacct_subsys = {
 #ifndef CONFIG_SMP
 
 void synchronize_sched_expedited(void)
+{
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
+
+static int synchronize_sched_expedited_cpu_stop(void *data)
 {
 	/*
 	 * There must be a full memory barrier on each affected CPU
@@ -8943,16 +8952,7 @@ void synchronize_sched_expedited(void)
 	 * necessary.  Do smp_mb() anyway for documentation and
 	 * robustness against future implementation changes.
 	 */
-	smp_mb();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
+	smp_mb(); /* See above comment block. */
 	return 0;
 }
 
@@ -8990,6 +8990,7 @@ void synchronize_sched_expedited(void)
 		get_online_cpus();
 	}
 	atomic_inc(&synchronize_sched_expedited_count);
+	smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
 	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-- 
cgit v1.2.3


From d9f599e1e6d019968b35d2dc63074b9e8964fa69 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 20 Mar 2010 17:39:11 +0300
Subject: perf: Fix check at end of event search

The original code doesn't work because "call" is never NULL there.

Signed-off-by: Dan Carpenter <error27@gmail.com>
LKML-Reference: <20100320143911.GF5331@bicker>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 88c0b6dbd7f..58092d844a1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1398,7 +1398,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 	}
 
 	err = -EINVAL;
-	if (!call)
+	if (&call->list == &ftrace_events)
 		goto out_unlock;
 
 	err = -EEXIST;
-- 
cgit v1.2.3


From a2a47c6c3d1a7c01da4464b3b7be93b924f874c1 Mon Sep 17 00:00:00 2001
From: Ky Srinivasan <ksrinivasan@novell.com>
Date: Thu, 6 May 2010 12:08:41 -0700
Subject: x86: Detect running on a Microsoft HyperV system

This patch integrates HyperV detection within the framework currently
used by VmWare. With this patch, we can avoid having to replicate the
HyperV detection code in each of the Microsoft HyperV drivers.

Reworked and tweaked by Greg K-H to build properly.

Signed-off-by: K. Y. Srinivasan <ksrinivasan@novell.com>
LKML-Reference: <20100506190841.GA1605@kroah.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Vadim Rozenfeld <vrozenfe@redhat.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "K.Prasad" <prasad@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/hyperv.h    |  6 ++++
 arch/x86/include/asm/mshyperv.h  |  7 +++++
 arch/x86/include/asm/processor.h |  3 ++
 arch/x86/kernel/cpu/Makefile     |  2 +-
 arch/x86/kernel/cpu/hypervisor.c | 10 ++++--
 arch/x86/kernel/cpu/mshyperv.c   | 67 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 91 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/include/asm/mshyperv.h
 create mode 100644 arch/x86/kernel/cpu/mshyperv.c

diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
index e153a2b3889..46040473e12 100644
--- a/arch/x86/include/asm/hyperv.h
+++ b/arch/x86/include/asm/hyperv.h
@@ -14,6 +14,9 @@
 #define HYPERV_CPUID_ENLIGHTMENT_INFO		0x40000004
 #define HYPERV_CPUID_IMPLEMENT_LIMITS		0x40000005
 
+#define HYPERV_HYPERVISOR_PRESENT_BIT		0x80000000
+#define HYPERV_CPUID_MIN			0x40000005
+
 /*
  * Feature identification. EAX indicates which features are available
  * to the partition based upon the current partition privileges.
@@ -129,6 +132,9 @@
 /* MSR used to provide vcpu index */
 #define HV_X64_MSR_VP_INDEX			0x40000002
 
+/* MSR used to read the per-partition time reference counter */
+#define HV_X64_MSR_TIME_REF_COUNT		0x40000020
+
 /* Define the virtual APIC registers */
 #define HV_X64_MSR_EOI				0x40000070
 #define HV_X64_MSR_ICR				0x40000071
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
new file mode 100644
index 00000000000..6cd8101d134
--- /dev/null
+++ b/arch/x86/include/asm/mshyperv.h
@@ -0,0 +1,7 @@
+#ifndef ASM_X86__MSHYPER_H
+#define ASM_X86__MSHYPER_H
+
+int ms_hyperv_platform(void);
+void __cpuinit ms_hyperv_set_feature_bits(struct cpuinfo_x86 *c);
+
+#endif
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b753ea59703..597c041bd12 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -114,6 +114,8 @@ struct cpuinfo_x86 {
 	u16			cpu_index;
 #endif
 	unsigned int		x86_hyper_vendor;
+	/* The layout of this field is hypervisor specific */
+	unsigned int		x86_hyper_features;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define X86_VENDOR_INTEL	0
@@ -129,6 +131,7 @@ struct cpuinfo_x86 {
 
 #define X86_HYPER_VENDOR_NONE  0
 #define X86_HYPER_VENDOR_VMWARE 1
+#define X86_HYPER_VENDOR_MSFT	2
 
 /*
  * capabilities of CPUs
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c202b62f367..3a785da34b6 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o		:= $(nostackp)
 
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
-obj-y			+= vmware.o hypervisor.o sched.o
+obj-y			+= vmware.o hypervisor.o sched.o mshyperv.o
 
 obj-$(CONFIG_X86_32)	+= bugs.o cmpxchg.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 08be922de33..de3f4e0ce8e 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -23,6 +23,7 @@
 
 #include <asm/processor.h>
 #include <asm/vmware.h>
+#include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
 
 static inline void __cpuinit
@@ -30,6 +31,8 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c)
 {
 	if (vmware_platform())
 		c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
+	else if (ms_hyperv_platform())
+		c->x86_hyper_vendor = X86_HYPER_VENDOR_MSFT;
 	else
 		c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
 }
@@ -37,10 +40,11 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c)
 static inline void __cpuinit
 hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
 {
-	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) {
+	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
 		vmware_set_feature_bits(c);
-		return;
-	}
+	else if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_MSFT)
+		ms_hyperv_set_feature_bits(c);
+	return;
 }
 
 void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 00000000000..2443b61cdb1
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,67 @@
+/*
+ * HyperV  Detection code.
+ *
+ * Copyright (C) 2010, Novell, Inc.
+ * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/types.h>
+#include <asm/processor.h>
+#include <asm/hyperv.h>
+#include <asm/mshyperv.h>
+
+
+int ms_hyperv_platform(void)
+{
+	u32 eax, ebx, ecx, edx;
+	char hyp_signature[13];
+
+	cpuid(1, &eax, &ebx, &ecx, &edx);
+	if (!(ecx & HYPERV_HYPERVISOR_PRESENT_BIT))
+		return 0;
+
+	cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &ebx, &ecx, &edx);
+	*(u32 *)(hyp_signature + 0) = ebx;
+	*(u32 *)(hyp_signature + 4) = ecx;
+	*(u32 *)(hyp_signature + 8) = edx;
+
+	if ((eax < HYPERV_CPUID_MIN) || (memcmp("Microsoft Hv", hyp_signature, 12)))
+		return 0;
+	return 1;
+}
+
+void __cpuinit ms_hyperv_set_feature_bits(struct cpuinfo_x86 *c)
+{
+	u32 eax, ebx, ecx, edx;
+
+	c->x86_hyper_features = 0;
+	/*
+	 * Extract the features, recommendations etc.
+	 * The first 9 bits will be used to track hypervisor features.
+	 * The next 6 bits will be used to track the hypervisor
+	 * recommendations.
+	 */
+	cpuid(HYPERV_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
+	c->x86_hyper_features |= (eax & 0x1ff);
+
+	cpuid(HYPERV_CPUID_ENLIGHTMENT_INFO, &eax, &ebx, &ecx, &edx);
+	c->x86_hyper_features |= ((eax & 0x3f) << 9);
+	printk(KERN_INFO "Detected HyperV with features: %x\n",
+		c->x86_hyper_features);
+}
-- 
cgit v1.2.3


From fc390cde362309f6892bb719194f242c466a978b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 6 May 2010 11:42:52 -0700
Subject: rcu: need barrier() in UP synchronize_sched_expedited()

If synchronize_sched_expedited() is ever to be called from within
kernel/sched.c in a !SMP PREEMPT kernel, the !SMP implementation needs
a barrier().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched.c b/kernel/sched.c
index 155a16d5214..fbaf3128d01 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8932,6 +8932,7 @@ struct cgroup_subsys cpuacct_subsys = {
 
 void synchronize_sched_expedited(void)
 {
+	barrier();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
-- 
cgit v1.2.3


From 4726f2a617ebd868a4fdeb5679613b897e5f1676 Mon Sep 17 00:00:00 2001
From: Yong Zhang <yong.zhang@windriver.com>
Date: Tue, 4 May 2010 14:16:48 +0800
Subject: lockdep: Reduce stack_trace usage

When calling check_prevs_add(), if all validations passed
add_lock_to_list() will add new lock to dependency tree and
alloc stack_trace for each list_entry.

But at this time, we are always on the same stack, so stack_trace
for each list_entry has the same value. This is redundant and eats
up lots of memory which could lead to warning on low
MAX_STACK_TRACE_ENTRIES.

Use one copy of stack_trace instead.

V2: As suggested by Peter Zijlstra, move save_trace() from
    check_prevs_add() to check_prev_add().
    Add tracking for trylock dependence which is also redundant.

Signed-off-by: Yong Zhang <yong.zhang0@windriver.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100504065711.GC10784@windriver.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9cf79858fd8..51080807dc8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -805,7 +805,8 @@ static struct lock_list *alloc_list_entry(void)
  * Add a new dependency to the head of the list:
  */
 static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
-			    struct list_head *head, unsigned long ip, int distance)
+			    struct list_head *head, unsigned long ip,
+			    int distance, struct stack_trace *trace)
 {
 	struct lock_list *entry;
 	/*
@@ -816,11 +817,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 	if (!entry)
 		return 0;
 
-	if (!save_trace(&entry->trace))
-		return 0;
-
 	entry->class = this;
 	entry->distance = distance;
+	entry->trace = *trace;
 	/*
 	 * Since we never remove from the dependency list, the list can
 	 * be walked lockless by other CPUs, it's only allocation
@@ -1622,12 +1621,20 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
  */
 static int
 check_prev_add(struct task_struct *curr, struct held_lock *prev,
-	       struct held_lock *next, int distance)
+	       struct held_lock *next, int distance, int trylock_loop)
 {
 	struct lock_list *entry;
 	int ret;
 	struct lock_list this;
 	struct lock_list *uninitialized_var(target_entry);
+	/*
+	 * Static variable, serialized by the graph_lock().
+	 *
+	 * We use this static variable to save the stack trace in case
+	 * we call into this function multiple times due to encountering
+	 * trylocks in the held lock stack.
+	 */
+	static struct stack_trace trace;
 
 	/*
 	 * Prove that the new <prev> -> <next> dependency would not
@@ -1675,20 +1682,23 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 		}
 	}
 
+	if (!trylock_loop && !save_trace(&trace))
+		return 0;
+
 	/*
 	 * Ok, all validations passed, add the new lock
 	 * to the previous lock's dependency list:
 	 */
 	ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
 			       &hlock_class(prev)->locks_after,
-			       next->acquire_ip, distance);
+			       next->acquire_ip, distance, &trace);
 
 	if (!ret)
 		return 0;
 
 	ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
 			       &hlock_class(next)->locks_before,
-			       next->acquire_ip, distance);
+			       next->acquire_ip, distance, &trace);
 	if (!ret)
 		return 0;
 
@@ -1718,6 +1728,7 @@ static int
 check_prevs_add(struct task_struct *curr, struct held_lock *next)
 {
 	int depth = curr->lockdep_depth;
+	int trylock_loop = 0;
 	struct held_lock *hlock;
 
 	/*
@@ -1743,7 +1754,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		 * added:
 		 */
 		if (hlock->read != 2) {
-			if (!check_prev_add(curr, hlock, next, distance))
+			if (!check_prev_add(curr, hlock, next,
+						distance, trylock_loop))
 				return 0;
 			/*
 			 * Stop after the first non-trylock entry,
@@ -1766,6 +1778,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		if (curr->held_locks[depth].irq_context !=
 				curr->held_locks[depth-1].irq_context)
 			break;
+		trylock_loop = 1;
 	}
 	return 1;
 out_bug:
-- 
cgit v1.2.3


From 27a9da6538ee18046d7bff8e36a9f783542c54c3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 4 May 2010 20:36:56 +0200
Subject: sched: Remove rq argument to the tracepoints

struct rq isn't visible outside of sched.o so its near useless to
expose the pointer, also there are no users of it, so remove it.

Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1272997616.1642.207.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/events/sched.h      | 32 ++++++++++----------------------
 kernel/sched.c                    |  8 ++++----
 kernel/trace/ftrace.c             |  3 +--
 kernel/trace/trace_sched_switch.c |  5 ++---
 kernel/trace/trace_sched_wakeup.c |  5 ++---
 5 files changed, 19 insertions(+), 34 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index cfceb0b73e2..4f733ecea46 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -51,15 +51,12 @@ TRACE_EVENT(sched_kthread_stop_ret,
 
 /*
  * Tracepoint for waiting on task to unschedule:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
  */
 TRACE_EVENT(sched_wait_task,
 
-	TP_PROTO(struct rq *rq, struct task_struct *p),
+	TP_PROTO(struct task_struct *p),
 
-	TP_ARGS(rq, p),
+	TP_ARGS(p),
 
 	TP_STRUCT__entry(
 		__array(	char,	comm,	TASK_COMM_LEN	)
@@ -79,15 +76,12 @@ TRACE_EVENT(sched_wait_task,
 
 /*
  * Tracepoint for waking up a task:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
  */
 DECLARE_EVENT_CLASS(sched_wakeup_template,
 
-	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+	TP_PROTO(struct task_struct *p, int success),
 
-	TP_ARGS(rq, p, success),
+	TP_ARGS(p, success),
 
 	TP_STRUCT__entry(
 		__array(	char,	comm,	TASK_COMM_LEN	)
@@ -111,31 +105,25 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
 );
 
 DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
-	     TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-	     TP_ARGS(rq, p, success));
+	     TP_PROTO(struct task_struct *p, int success),
+	     TP_ARGS(p, success));
 
 /*
  * Tracepoint for waking up a new task:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
  */
 DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
-	     TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-	     TP_ARGS(rq, p, success));
+	     TP_PROTO(struct task_struct *p, int success),
+	     TP_ARGS(p, success));
 
 /*
  * Tracepoint for task switches, performed by the scheduler:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
  */
 TRACE_EVENT(sched_switch,
 
-	TP_PROTO(struct rq *rq, struct task_struct *prev,
+	TP_PROTO(struct task_struct *prev,
 		 struct task_struct *next),
 
-	TP_ARGS(rq, prev, next),
+	TP_ARGS(prev, next),
 
 	TP_STRUCT__entry(
 		__array(	char,	prev_comm,	TASK_COMM_LEN	)
diff --git a/kernel/sched.c b/kernel/sched.c
index 4956ed09283..11ac0eb0bce 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2168,7 +2168,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		 * just go back and repeat.
 		 */
 		rq = task_rq_lock(p, &flags);
-		trace_sched_wait_task(rq, p);
+		trace_sched_wait_task(p);
 		running = task_running(rq, p);
 		on_rq = p->se.on_rq;
 		ncsw = 0;
@@ -2439,7 +2439,7 @@ out_activate:
 	success = 1;
 
 out_running:
-	trace_sched_wakeup(rq, p, success);
+	trace_sched_wakeup(p, success);
 	check_preempt_curr(rq, p, wake_flags);
 
 	p->state = TASK_RUNNING;
@@ -2613,7 +2613,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 
 	rq = task_rq_lock(p, &flags);
 	activate_task(rq, p, 0);
-	trace_sched_wakeup_new(rq, p, 1);
+	trace_sched_wakeup_new(p, 1);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken)
@@ -2833,7 +2833,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
-	trace_sched_switch(rq, prev, next);
+	trace_sched_switch(prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2404b59b309..aa3a92b511e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3212,8 +3212,7 @@ free:
 }
 
 static void
-ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev,
-				struct task_struct *next)
+ftrace_graph_probe_sched_switch(struct task_struct *prev, struct task_struct *next)
 {
 	unsigned long long timestamp;
 	int index;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5fca0f51fde..a55fccfede5 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 }
 
 static void
-probe_sched_switch(struct rq *__rq, struct task_struct *prev,
-			struct task_struct *next)
+probe_sched_switch(struct task_struct *prev, struct task_struct *next)
 {
 	struct trace_array_cpu *data;
 	unsigned long flags;
@@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 }
 
 static void
-probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
+probe_sched_wakeup(struct task_struct *wakee, int success)
 {
 	struct trace_array_cpu *data;
 	unsigned long flags;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0271742abb8..8052446ceea 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -107,8 +107,7 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
 }
 
 static void notrace
-probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
-	struct task_struct *next)
+probe_wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
 {
 	struct trace_array_cpu *data;
 	cycle_t T0, T1, delta;
@@ -200,7 +199,7 @@ static void wakeup_reset(struct trace_array *tr)
 }
 
 static void
-probe_wakeup(struct rq *rq, struct task_struct *p, int success)
+probe_wakeup(struct task_struct *p, int success)
 {
 	struct trace_array_cpu *data;
 	int cpu = smp_processor_id();
-- 
cgit v1.2.3


From 4fd38e4595e2f6c9d27732c042a0e16b2753049c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 6 May 2010 17:31:38 +0200
Subject: perf: Fix exit() vs PERF_FORMAT_GROUP

Both Stephane and Corey reported that PERF_FORMAT_GROUP didn't work
as expected if the task the counters were attached to quit before
the read() call.

The cause is that we unconditionally destroy the grouping when we
remove counters from their context. Fix this by only doing this when
we free the counter itself.

Reported-by: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Reported-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1273160566.5605.404.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 1 +
 kernel/perf_event.c        | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c8e37544040..bf8f3c00329 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -522,6 +522,7 @@ struct pmu {
  * enum perf_event_active_state - the states of a event
  */
 enum perf_event_active_state {
+	PERF_EVENT_STATE_FREE		= -3,
 	PERF_EVENT_STATE_ERROR		= -2,
 	PERF_EVENT_STATE_OFF		= -1,
 	PERF_EVENT_STATE_INACTIVE	=  0,
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3d1552d3c12..f13c3db765f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -341,6 +341,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (event->state > PERF_EVENT_STATE_OFF)
 		event->state = PERF_EVENT_STATE_OFF;
 
+	if (event->state > PERF_EVENT_STATE_FREE)
+		return;
+
 	/*
 	 * If this was a group event with sibling events then
 	 * upgrade the siblings to singleton events by adding them
@@ -1856,6 +1859,8 @@ int perf_event_release_kernel(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 
+	event->state = PERF_EVENT_STATE_FREE;
+
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_event_remove_from_context(event);
-- 
cgit v1.2.3


From a0507c84bf47dfd204299774f45fd16da33f0619 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 6 May 2010 15:42:53 +0200
Subject: perf: Annotate perf_event_read_group() vs perf_event_release_kernel()

Stephane reported a lockdep warning while using PERF_FORMAT_GROUP.

The issue is that perf_event_read_group() takes faults while holding
the ctx->mutex, while perf_event_release_kernel() can be called from
munmap(). Which makes for an AB-BA deadlock.

Except we can never establish the deadlock because we'll only ever
call perf_event_release_kernel() after all file descriptors are dead
so there is no concurrency possible.

Reported-by: Stephane Eranian <eranian@google.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 49d8be5a45e..34659d4085c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1867,7 +1867,19 @@ int perf_event_release_kernel(struct perf_event *event)
 	event->state = PERF_EVENT_STATE_FREE;
 
 	WARN_ON_ONCE(ctx->parent_ctx);
-	mutex_lock(&ctx->mutex);
+	/*
+	 * There are two ways this annotation is useful:
+	 *
+	 *  1) there is a lock recursion from perf_event_exit_task
+	 *     see the comment there.
+	 *
+	 *  2) there is a lock-inversion with mmap_sem through
+	 *     perf_event_read_group(), which takes faults while
+	 *     holding ctx->mutex, however this is called after
+	 *     the last filedesc died, so there is no possibility
+	 *     to trigger the AB-BA case.
+	 */
+	mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
 	perf_event_remove_from_context(event);
 	mutex_unlock(&ctx->mutex);
 
@@ -5305,7 +5317,7 @@ void perf_event_exit_task(struct task_struct *child)
 	 *
 	 * But since its the parent context it won't be the same instance.
 	 */
-	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
+	mutex_lock(&child_ctx->mutex);
 
 again:
 	list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
-- 
cgit v1.2.3


From 4261e0e0efd9e04b6c69e0773c3cf4d6f337c416 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 13 Apr 2010 22:23:10 +0200
Subject: perf, x86: Move perfctr init code to x86_setup_perfctr()

Split __hw_perf_event_init() to configure pmu events other than
perfctrs. Perfctr code is moved to a separate function
x86_setup_perfctr(). This and the following patches refactor the code.

Split in multiple patches for better review.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-2-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 7de70613e6c..801441a5424 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -426,6 +426,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 	return 0;
 }
 
+static int x86_setup_perfctr(struct perf_event *event);
+
 static int x86_pmu_hw_config(struct perf_event *event)
 {
 	/*
@@ -453,9 +455,6 @@ static int x86_pmu_hw_config(struct perf_event *event)
  */
 static int __hw_perf_event_init(struct perf_event *event)
 {
-	struct perf_event_attr *attr = &event->attr;
-	struct hw_perf_event *hwc = &event->hw;
-	u64 config;
 	int err;
 
 	if (!x86_pmu_initialized())
@@ -482,15 +481,24 @@ static int __hw_perf_event_init(struct perf_event *event)
 
 	event->destroy = hw_perf_event_destroy;
 
-	hwc->idx = -1;
-	hwc->last_cpu = -1;
-	hwc->last_tag = ~0ULL;
+	event->hw.idx = -1;
+	event->hw.last_cpu = -1;
+	event->hw.last_tag = ~0ULL;
 
 	/* Processor specifics */
 	err = x86_pmu.hw_config(event);
 	if (err)
 		return err;
 
+	return x86_setup_perfctr(event);
+}
+
+static int x86_setup_perfctr(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	struct hw_perf_event *hwc = &event->hw;
+	u64 config;
+
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
 		hwc->last_period = hwc->sample_period;
-- 
cgit v1.2.3


From c1726f343b3bfc2ee037e191907c632a31903021 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 13 Apr 2010 22:23:11 +0200
Subject: perf, x86: Move x86_setup_perfctr()

Move x86_setup_perfctr(), no other changes made.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-3-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 120 +++++++++++++++++++--------------------
 1 file changed, 59 insertions(+), 61 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 801441a5424..3d3bceb9e83 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -426,7 +426,65 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
 	return 0;
 }
 
-static int x86_setup_perfctr(struct perf_event *event);
+static int x86_setup_perfctr(struct perf_event *event)
+{
+	struct perf_event_attr *attr = &event->attr;
+	struct hw_perf_event *hwc = &event->hw;
+	u64 config;
+
+	if (!hwc->sample_period) {
+		hwc->sample_period = x86_pmu.max_period;
+		hwc->last_period = hwc->sample_period;
+		atomic64_set(&hwc->period_left, hwc->sample_period);
+	} else {
+		/*
+		 * If we have a PMU initialized but no APIC
+		 * interrupts, we cannot sample hardware
+		 * events (user-space has to fall back and
+		 * sample via a hrtimer based software event):
+		 */
+		if (!x86_pmu.apic)
+			return -EOPNOTSUPP;
+	}
+
+	if (attr->type == PERF_TYPE_RAW)
+		return 0;
+
+	if (attr->type == PERF_TYPE_HW_CACHE)
+		return set_ext_hw_attr(hwc, attr);
+
+	if (attr->config >= x86_pmu.max_events)
+		return -EINVAL;
+
+	/*
+	 * The generic map:
+	 */
+	config = x86_pmu.event_map(attr->config);
+
+	if (config == 0)
+		return -ENOENT;
+
+	if (config == -1LL)
+		return -EINVAL;
+
+	/*
+	 * Branch tracing:
+	 */
+	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
+	    (hwc->sample_period == 1)) {
+		/* BTS is not supported by this architecture. */
+		if (!x86_pmu.bts)
+			return -EOPNOTSUPP;
+
+		/* BTS is currently only allowed for user-mode. */
+		if (!attr->exclude_kernel)
+			return -EOPNOTSUPP;
+	}
+
+	hwc->config |= config;
+
+	return 0;
+}
 
 static int x86_pmu_hw_config(struct perf_event *event)
 {
@@ -493,66 +551,6 @@ static int __hw_perf_event_init(struct perf_event *event)
 	return x86_setup_perfctr(event);
 }
 
-static int x86_setup_perfctr(struct perf_event *event)
-{
-	struct perf_event_attr *attr = &event->attr;
-	struct hw_perf_event *hwc = &event->hw;
-	u64 config;
-
-	if (!hwc->sample_period) {
-		hwc->sample_period = x86_pmu.max_period;
-		hwc->last_period = hwc->sample_period;
-		atomic64_set(&hwc->period_left, hwc->sample_period);
-	} else {
-		/*
-		 * If we have a PMU initialized but no APIC
-		 * interrupts, we cannot sample hardware
-		 * events (user-space has to fall back and
-		 * sample via a hrtimer based software event):
-		 */
-		if (!x86_pmu.apic)
-			return -EOPNOTSUPP;
-	}
-
-	if (attr->type == PERF_TYPE_RAW)
-		return 0;
-
-	if (attr->type == PERF_TYPE_HW_CACHE)
-		return set_ext_hw_attr(hwc, attr);
-
-	if (attr->config >= x86_pmu.max_events)
-		return -EINVAL;
-
-	/*
-	 * The generic map:
-	 */
-	config = x86_pmu.event_map(attr->config);
-
-	if (config == 0)
-		return -ENOENT;
-
-	if (config == -1LL)
-		return -EINVAL;
-
-	/*
-	 * Branch tracing:
-	 */
-	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
-	    (hwc->sample_period == 1)) {
-		/* BTS is not supported by this architecture. */
-		if (!x86_pmu.bts)
-			return -EOPNOTSUPP;
-
-		/* BTS is currently only allowed for user-mode. */
-		if (!attr->exclude_kernel)
-			return -EOPNOTSUPP;
-	}
-
-	hwc->config |= config;
-
-	return 0;
-}
-
 static void x86_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-- 
cgit v1.2.3


From 9d0fcba67e47ff398a6fa86476d4884d472dc98a Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 13 Apr 2010 22:23:12 +0200
Subject: perf, x86: Call x86_setup_perfctr() from .hw_config()

The perfctr setup calls are in the corresponding .hw_config()
functions now. This makes it possible to introduce config functions
for other pmu events that are not perfctr specific.

Also, all of a sudden the code looks much nicer.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-4-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c    | 9 ++-------
 arch/x86/kernel/cpu/perf_event_p4.c | 2 +-
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3d3bceb9e83..c2c1e10f7b0 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -505,7 +505,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
 	if (event->attr.type == PERF_TYPE_RAW)
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
-	return 0;
+	return x86_setup_perfctr(event);
 }
 
 /*
@@ -543,12 +543,7 @@ static int __hw_perf_event_init(struct perf_event *event)
 	event->hw.last_cpu = -1;
 	event->hw.last_tag = ~0ULL;
 
-	/* Processor specifics */
-	err = x86_pmu.hw_config(event);
-	if (err)
-		return err;
-
-	return x86_setup_perfctr(event);
+	return x86_pmu.hw_config(event);
 }
 
 static void x86_pmu_disable_all(void)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 15367cce66b..9e002054cb5 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -455,7 +455,7 @@ static int p4_hw_config(struct perf_event *event)
 		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
 		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
 
-	return 0;
+	return x86_setup_perfctr(event);
 }
 
 static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
-- 
cgit v1.2.3


From 31fa58af57c41d2912debf62d47d5811062411f1 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 13 Apr 2010 22:23:14 +0200
Subject: perf, x86: Pass enable bit mask to __x86_pmu_enable_event()

To reuse this function for events with different enable bit masks,
this mask is part of the function's argument list now.

The function will be used later to control ibs events too.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-6-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c       | 9 +++++----
 arch/x86/kernel/cpu/perf_event_intel.c | 5 +++--
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c2c1e10f7b0..4e218d7ac85 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -844,10 +844,10 @@ void hw_perf_enable(void)
 	x86_pmu.enable_all(added);
 }
 
-static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
+					  u64 enable_mask)
 {
-	wrmsrl(hwc->config_base + hwc->idx,
-			      hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
+	wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
 }
 
 static inline void x86_pmu_disable_event(struct perf_event *event)
@@ -919,7 +919,8 @@ static void x86_pmu_enable_event(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	if (cpuc->enabled)
-		__x86_pmu_enable_event(&event->hw);
+		__x86_pmu_enable_event(&event->hw,
+				       ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a099df96f91..a4b56ac425c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -513,7 +513,8 @@ static void intel_pmu_nhm_enable_all(int added)
 			if (!event)
 				continue;
 
-			__x86_pmu_enable_event(&event->hw);
+			__x86_pmu_enable_event(&event->hw,
+					       ARCH_PERFMON_EVENTSEL_ENABLE);
 		}
 	}
 	intel_pmu_enable_all(added);
@@ -617,7 +618,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
 	if (unlikely(event->attr.precise))
 		intel_pmu_pebs_enable(event);
 
-	__x86_pmu_enable_event(hwc);
+	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 
 /*
-- 
cgit v1.2.3


From a1f2b70a942b8d858a0ab02567da3999b60a99b2 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Tue, 13 Apr 2010 22:23:15 +0200
Subject: perf, x86: Use weight instead of cmask in for_each_event_constraint()

There may exist constraints with a cmask set to zero. In this case
for_each_event_constraint() will not work properly. Now weight is used
instead of the cmask for loop exit detection. Weight is always a value
other than zero since the default contains the HWEIGHT from the
counter mask and in other cases a value of zero does not fit too.

This is in preparation of ibs event constraints that wont have a
cmask.

Signed-off-by: Robert Richter <robert.richter@amd.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1271190201-25705-7-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4e218d7ac85..4a3f1f2b9b9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -170,7 +170,7 @@ struct cpu_hw_events {
 	EVENT_CONSTRAINT(0, 0, 0)
 
 #define for_each_event_constraint(e, c)	\
-	for ((e) = (c); (e)->cmask; (e)++)
+	for ((e) = (c); (e)->weight; (e)++)
 
 union perf_capabilities {
 	struct {
-- 
cgit v1.2.3


From 1e9a6d8d44cb6dcd2799b36ceb23007e6a423bfe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 4 May 2010 16:30:21 +0200
Subject: perf, x86: Remove PEBS SAMPLE_RAW support

Its broken, we really should get PERF_SAMPLE_REGS sorted.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ec8b2e12e10..080b9b065bd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -459,7 +459,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	struct perf_event *event = cpuc->events[0]; /* PMC0 only */
 	struct pebs_record_core *at, *top;
 	struct perf_sample_data data;
-	struct perf_raw_record raw;
 	struct pt_regs regs;
 	int n;
 
@@ -499,12 +498,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 
-	if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-		raw.size = x86_pmu.pebs_record_size;
-		raw.data = at;
-		data.raw = &raw;
-	}
-
 	/*
 	 * We use the interrupt regs as a base because the PEBS record
 	 * does not contain a full regs set, specifically it seems to
@@ -536,7 +529,6 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	struct pebs_record_nhm *at, *top;
 	struct perf_sample_data data;
 	struct perf_event *event = NULL;
-	struct perf_raw_record raw;
 	struct pt_regs regs;
 	u64 status = 0;
 	int bit, n;
@@ -585,12 +577,6 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		perf_sample_data_init(&data, 0);
 		data.period = event->hw.last_period;
 
-		if (event->attr.sample_type & PERF_SAMPLE_RAW) {
-			raw.size = x86_pmu.pebs_record_size;
-			raw.data = at;
-			data.raw = &raw;
-		}
-
 		/*
 		 * See the comment in intel_pmu_drain_pebs_core()
 		 */
-- 
cgit v1.2.3


From 2b0b5c6fe9b383f3cf35a0a6371c9d577bd523ff Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 8 Apr 2010 23:03:20 +0200
Subject: perf, x86: Consolidate some code repetition

Remove some duplicated logic.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 97 ++++++++++++++-----------------
 1 file changed, 44 insertions(+), 53 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 080b9b065bd..35056f715e9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -452,14 +452,54 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 
 static int intel_pmu_save_and_restart(struct perf_event *event);
 
+static void __intel_pmu_pebs_event(struct perf_event *event,
+				   struct pt_regs *iregs, void *__pebs)
+{
+	/*
+	 * We cast to pebs_record_core since that is a subset of
+	 * both formats and we don't use the other fields in this
+	 * routine.
+	 */
+	struct pebs_record_core *pebs = __pebs;
+	struct perf_sample_data data;
+	struct pt_regs regs;
+
+	if (!intel_pmu_save_and_restart(event))
+		return;
+
+	perf_sample_data_init(&data, 0);
+	data.period = event->hw.last_period;
+
+	/*
+	 * We use the interrupt regs as a base because the PEBS record
+	 * does not contain a full regs set, specifically it seems to
+	 * lack segment descriptors, which get used by things like
+	 * user_mode().
+	 *
+	 * In the simple case fix up only the IP and BP,SP regs, for
+	 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
+	 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
+	 */
+	regs = *iregs;
+	regs.ip = pebs->ip;
+	regs.bp = pebs->bp;
+	regs.sp = pebs->sp;
+
+	if (intel_pmu_pebs_fixup_ip(regs))
+		regs.flags |= PERF_EFLAGS_EXACT;
+	else
+		regs.flags &= ~PERF_EFLAGS_EXACT;
+
+	if (perf_event_overflow(event, 1, &data, &regs))
+		x86_pmu_stop(event);
+}
+
 static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
 	struct perf_event *event = cpuc->events[0]; /* PMC0 only */
 	struct pebs_record_core *at, *top;
-	struct perf_sample_data data;
-	struct pt_regs regs;
 	int n;
 
 	if (!ds || !x86_pmu.pebs)
@@ -485,9 +525,6 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	if (n <= 0)
 		return;
 
-	if (!intel_pmu_save_and_restart(event))
-		return;
-
 	/*
 	 * Should not happen, we program the threshold at 1 and do not
 	 * set a reset value.
@@ -495,31 +532,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	WARN_ON_ONCE(n > 1);
 	at += n - 1;
 
-	perf_sample_data_init(&data, 0);
-	data.period = event->hw.last_period;
-
-	/*
-	 * We use the interrupt regs as a base because the PEBS record
-	 * does not contain a full regs set, specifically it seems to
-	 * lack segment descriptors, which get used by things like
-	 * user_mode().
-	 *
-	 * In the simple case fix up only the IP and BP,SP regs, for
-	 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
-	 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
-	 */
-	regs = *iregs;
-	regs.ip = at->ip;
-	regs.bp = at->bp;
-	regs.sp = at->sp;
-
-	if (intel_pmu_pebs_fixup_ip(&regs))
-		regs.flags |= PERF_EFLAGS_EXACT;
-	else
-		regs.flags &= ~PERF_EFLAGS_EXACT;
-
-	if (perf_event_overflow(event, 1, &data, &regs))
-		x86_pmu_stop(event);
+	__intel_pmu_pebs_event(event, iregs, at);
 }
 
 static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
@@ -527,9 +540,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
 	struct pebs_record_nhm *at, *top;
-	struct perf_sample_data data;
 	struct perf_event *event = NULL;
-	struct pt_regs regs;
 	u64 status = 0;
 	int bit, n;
 
@@ -571,27 +582,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		if (!event || bit >= MAX_PEBS_EVENTS)
 			continue;
 
-		if (!intel_pmu_save_and_restart(event))
-			continue;
-
-		perf_sample_data_init(&data, 0);
-		data.period = event->hw.last_period;
-
-		/*
-		 * See the comment in intel_pmu_drain_pebs_core()
-		 */
-		regs = *iregs;
-		regs.ip = at->ip;
-		regs.bp = at->bp;
-		regs.sp = at->sp;
-
-		if (intel_pmu_pebs_fixup_ip(&regs))
-			regs.flags |= PERF_EFLAGS_EXACT;
-		else
-			regs.flags &= ~PERF_EFLAGS_EXACT;
-
-		if (perf_event_overflow(event, 1, &data, &regs))
-			x86_pmu_stop(event);
+		__intel_pmu_pebs_event(event, iregs, at);
 	}
 }
 
-- 
cgit v1.2.3


From ab608344bcbde4f55ec4cd911b686b0ce3eae076 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 8 Apr 2010 23:03:20 +0200
Subject: perf, x86: Improve the PEBS ABI

Rename perf_event_attr::precise to perf_event_attr::precise_ip and
widen it to 2 bits. This new field describes the required precision of
the PERF_SAMPLE_IP field:

  0 - SAMPLE_IP can have arbitrary skid
  1 - SAMPLE_IP must have constant skid
  2 - SAMPLE_IP requested to have 0 skid
  3 - SAMPLE_IP must have 0 skid

And modify the Intel PEBS code accordingly. The PEBS implementation
now supports up to precise_ip == 2, where we perform the IP fixup.

Also s/PERF_RECORD_MISC_EXACT/&_IP/ to clarify its meaning, this bit
should be set for each PERF_SAMPLE_IP field known to match the actual
instruction triggering the event.

This new scheme allows for a PEBS mode that uses the buffer for more
than a single event.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Stephane Eranian <eranian@google.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c          | 17 ++++++++++++++++-
 arch/x86/kernel/cpu/perf_event_intel.c    |  4 ++--
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 12 ++++++------
 include/linux/perf_event.h                | 23 +++++++++++++++++++----
 tools/perf/builtin-top.c                  |  2 +-
 tools/perf/util/parse-events.c            | 25 ++++++++++++++++---------
 6 files changed, 60 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4a3f1f2b9b9..27fa9eeed02 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -488,6 +488,21 @@ static int x86_setup_perfctr(struct perf_event *event)
 
 static int x86_pmu_hw_config(struct perf_event *event)
 {
+	if (event->attr.precise_ip) {
+		int precise = 0;
+
+		/* Support for constant skid */
+		if (x86_pmu.pebs)
+			precise++;
+
+		/* Support for IP fixup */
+		if (x86_pmu.lbr_nr)
+			precise++;
+
+		if (event->attr.precise_ip > precise)
+			return -EOPNOTSUPP;
+	}
+
 	/*
 	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
@@ -1780,7 +1795,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
 	}
 
 	if (regs->flags & PERF_EFLAGS_EXACT)
-		misc |= PERF_RECORD_MISC_EXACT;
+		misc |= PERF_RECORD_MISC_EXACT_IP;
 
 	return misc;
 }
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a4b56ac425c..fdbc652d3fe 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -563,7 +563,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
 
 	x86_pmu_disable_event(event);
 
-	if (unlikely(event->attr.precise))
+	if (unlikely(event->attr.precise_ip))
 		intel_pmu_pebs_disable(event);
 }
 
@@ -615,7 +615,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
 		return;
 	}
 
-	if (unlikely(event->attr.precise))
+	if (unlikely(event->attr.precise_ip))
 		intel_pmu_pebs_enable(event);
 
 	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 35056f715e9..18018d1311c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -307,7 +307,7 @@ intel_pebs_constraints(struct perf_event *event)
 {
 	struct event_constraint *c;
 
-	if (!event->attr.precise)
+	if (!event->attr.precise_ip)
 		return NULL;
 
 	if (x86_pmu.pebs_constraints) {
@@ -330,7 +330,7 @@ static void intel_pmu_pebs_enable(struct perf_event *event)
 	cpuc->pebs_enabled |= 1ULL << hwc->idx;
 	WARN_ON_ONCE(cpuc->enabled);
 
-	if (x86_pmu.intel_cap.pebs_trap)
+	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
 		intel_pmu_lbr_enable(event);
 }
 
@@ -345,7 +345,7 @@ static void intel_pmu_pebs_disable(struct perf_event *event)
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
 
-	if (x86_pmu.intel_cap.pebs_trap)
+	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
 		intel_pmu_lbr_disable(event);
 }
 
@@ -485,7 +485,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	regs.bp = pebs->bp;
 	regs.sp = pebs->sp;
 
-	if (intel_pmu_pebs_fixup_ip(regs))
+	if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
 		regs.flags |= PERF_EFLAGS_EXACT;
 	else
 		regs.flags &= ~PERF_EFLAGS_EXACT;
@@ -518,7 +518,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 
 	WARN_ON_ONCE(!event);
 
-	if (!event->attr.precise)
+	if (!event->attr.precise_ip)
 		return;
 
 	n = top - at;
@@ -570,7 +570,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 
 			WARN_ON_ONCE(!event);
 
-			if (!event->attr.precise)
+			if (!event->attr.precise_ip)
 				continue;
 
 			if (__test_and_set_bit(bit, (unsigned long *)&status))
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 6be4a0f9137..23cd0057a68 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -203,9 +203,19 @@ struct perf_event_attr {
 				enable_on_exec :  1, /* next exec enables     */
 				task           :  1, /* trace fork/exit       */
 				watermark      :  1, /* wakeup_watermark      */
-				precise        :  1, /* OoO invariant counter */
-
-				__reserved_1   : 48;
+				/*
+				 * precise_ip:
+				 *
+				 *  0 - SAMPLE_IP can have arbitrary skid
+				 *  1 - SAMPLE_IP must have constant skid
+				 *  2 - SAMPLE_IP requested to have 0 skid
+				 *  3 - SAMPLE_IP must have 0 skid
+				 *
+				 *  See also PERF_RECORD_MISC_EXACT_IP
+				 */
+				precise_ip     :  2, /* skid constraint       */
+
+				__reserved_1   : 47;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -296,7 +306,12 @@ struct perf_event_mmap_page {
 #define PERF_RECORD_MISC_GUEST_KERNEL		(4 << 0)
 #define PERF_RECORD_MISC_GUEST_USER		(5 << 0)
 
-#define PERF_RECORD_MISC_EXACT			(1 << 14)
+/*
+ * Indicates that the content of PERF_SAMPLE_IP points to
+ * the actual instruction that triggered the event. See also
+ * perf_event_attr::precise_ip.
+ */
+#define PERF_RECORD_MISC_EXACT_IP		(1 << 14)
 /*
  * Reserve the last bit to indicate some extended misc field
  */
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 3de397764cb..ed9b5b6905f 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1021,7 +1021,7 @@ static void event__process_sample(const event_t *self,
 		return;
 	}
 
-	if (self->header.misc & PERF_RECORD_MISC_EXACT)
+	if (self->header.misc & PERF_RECORD_MISC_EXACT_IP)
 		exact_samples++;
 
 	if (event__preprocess_sample(self, session, &al, symbol_filter) < 0 ||
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index bc8b7e61420..ae7f5917935 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -654,10 +654,6 @@ parse_raw_event(const char **strp, struct perf_event_attr *attr)
 		return EVT_FAILED;
 	n = hex2u64(str + 1, &config);
 	if (n > 0) {
-		if (str[n+1] == 'p') {
-			attr->precise = 1;
-			n++;
-		}
 		*strp = str + n + 1;
 		attr->type = PERF_TYPE_RAW;
 		attr->config = config;
@@ -692,19 +688,29 @@ static enum event_result
 parse_event_modifier(const char **strp, struct perf_event_attr *attr)
 {
 	const char *str = *strp;
-	int eu = 1, ek = 1, eh = 1;
+	int exclude = 0;
+	int eu = 0, ek = 0, eh = 0, precise = 0;
 
 	if (*str++ != ':')
 		return 0;
 	while (*str) {
-		if (*str == 'u')
+		if (*str == 'u') {
+			if (!exclude)
+				exclude = eu = ek = eh = 1;
 			eu = 0;
-		else if (*str == 'k')
+		} else if (*str == 'k') {
+			if (!exclude)
+				exclude = eu = ek = eh = 1;
 			ek = 0;
-		else if (*str == 'h')
+		} else if (*str == 'h') {
+			if (!exclude)
+				exclude = eu = ek = eh = 1;
 			eh = 0;
-		else
+		} else if (*str == 'p') {
+			precise++;
+		} else
 			break;
+
 		++str;
 	}
 	if (str >= *strp + 2) {
@@ -712,6 +718,7 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr)
 		attr->exclude_user   = eu;
 		attr->exclude_kernel = ek;
 		attr->exclude_hv     = eh;
+		attr->precise_ip     = precise;
 		return 1;
 	}
 	return 0;
-- 
cgit v1.2.3


From 6bde9b6ce0127e2a56228a2071536d422be31336 Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Fri, 23 Apr 2010 13:56:00 +0800
Subject: perf: Add group scheduling transactional APIs

Add group scheduling transactional APIs to struct pmu.
These APIs will be implemented in arch code, based on Peter's idea as
below.

> the idea behind hw_perf_group_sched_in() is to not perform
> schedulability tests on each event in the group, but to add the group
> as a whole and then perform one test.
>
> Of course, when that test fails, you'll have to roll-back the whole
> group again.
>
> So start_txn (or a better name) would simply toggle a flag in the pmu
> implementation that will make pmu::enable() not perform the
> schedulablilty test.
>
> Then commit_txn() will perform the schedulability test (so note the
> method has to have a !void return value.
>
> This will allow us to use the regular
> kernel/perf_event.c::group_sched_in() and all the rollback code.
> Currently each hw_perf_group_sched_in() implementation duplicates all
> the rolllback code (with various bugs).

->start_txn:
Start group events scheduling transaction, set a flag to make
pmu::enable() not perform the schedulability test, it will be performed
at commit time.

->commit_txn:
Commit group events scheduling transaction, perform the group
schedulability as a whole

->cancel_txn:
Stop group events scheduling transaction, clear the flag so
pmu::enable() will perform the schedulability test.

Reviewed-by: Stephane Eranian <eranian@google.com>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Cc: David Miller <davem@davemloft.net>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1272002160.5707.60.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 15 ++++++++++++---
 kernel/perf_event.c        | 33 ++++++++++++++++++++-------------
 2 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 23cd0057a68..4924c96d7e2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -547,6 +547,8 @@ struct hw_perf_event {
 
 struct perf_event;
 
+#define PERF_EVENT_TXN_STARTED 1
+
 /**
  * struct pmu - generic performance monitoring unit
  */
@@ -557,6 +559,16 @@ struct pmu {
 	void (*stop)			(struct perf_event *event);
 	void (*read)			(struct perf_event *event);
 	void (*unthrottle)		(struct perf_event *event);
+
+	/*
+	 * group events scheduling is treated as a transaction,
+	 * add group events as a whole and perform one schedulability test.
+	 * If test fails, roll back the whole group
+	 */
+
+	void (*start_txn)	(const struct pmu *pmu);
+	void (*cancel_txn)	(const struct pmu *pmu);
+	int  (*commit_txn)	(const struct pmu *pmu);
 };
 
 /**
@@ -823,9 +835,6 @@ extern void perf_disable(void);
 extern void perf_enable(void);
 extern int perf_event_task_disable(void);
 extern int perf_event_task_enable(void);
-extern int hw_perf_group_sched_in(struct perf_event *group_leader,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx);
 extern void perf_event_update_userpage(struct perf_event *event);
 extern int perf_event_release_kernel(struct perf_event *event);
 extern struct perf_event *
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 34659d4085c..bb06382f98e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -83,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
 void __weak hw_perf_disable(void)		{ barrier(); }
 void __weak hw_perf_enable(void)		{ barrier(); }
 
-int __weak
-hw_perf_group_sched_in(struct perf_event *group_leader,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx)
-{
-	return 0;
-}
-
 void __weak perf_event_print_debug(void)	{ }
 
 static DEFINE_PER_CPU(int, perf_disable_count);
@@ -644,15 +636,20 @@ group_sched_in(struct perf_event *group_event,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_event_context *ctx)
 {
-	struct perf_event *event, *partial_group;
+	struct perf_event *event, *partial_group = NULL;
+	const struct pmu *pmu = group_event->pmu;
+	bool txn = false;
 	int ret;
 
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
 
-	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
-	if (ret)
-		return ret < 0 ? ret : 0;
+	/* Check if group transaction availabe */
+	if (pmu->start_txn)
+		txn = true;
+
+	if (txn)
+		pmu->start_txn(pmu);
 
 	if (event_sched_in(group_event, cpuctx, ctx))
 		return -EAGAIN;
@@ -667,9 +664,19 @@ group_sched_in(struct perf_event *group_event,
 		}
 	}
 
-	return 0;
+	if (txn) {
+		ret = pmu->commit_txn(pmu);
+		if (!ret) {
+			pmu->cancel_txn(pmu);
+
+			return 0;
+		}
+	}
 
 group_error:
+	if (txn)
+		pmu->cancel_txn(pmu);
+
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
-- 
cgit v1.2.3


From 4d1c52b02d977d884abb21d0bbaba6b5d6bc8374 Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Fri, 23 Apr 2010 13:56:12 +0800
Subject: perf, x86: implement group scheduling transactional APIs

Convert to the transactional PMU API and remove the duplication of
group_sched_in().

Reviewed-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: David Miller <davem@davemloft.net>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1272002172.5707.61.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 180 +++++++++++++++------------------------
 1 file changed, 67 insertions(+), 113 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 27fa9eeed02..fd4db0db370 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -110,6 +110,8 @@ struct cpu_hw_events {
 	u64			tags[X86_PMC_IDX_MAX];
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
 
+	unsigned int		group_flag;
+
 	/*
 	 * Intel DebugStore bits
 	 */
@@ -961,6 +963,14 @@ static int x86_pmu_enable(struct perf_event *event)
 	if (n < 0)
 		return n;
 
+	/*
+	 * If group events scheduling transaction was started,
+	 * skip the schedulability test here, it will be peformed
+	 * at commit time(->commit_txn) as a whole
+	 */
+	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+		goto out;
+
 	ret = x86_pmu.schedule_events(cpuc, n, assign);
 	if (ret)
 		return ret;
@@ -970,6 +980,7 @@ static int x86_pmu_enable(struct perf_event *event)
 	 */
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
+out:
 	cpuc->n_events = n;
 	cpuc->n_added += n - n0;
 
@@ -1227,119 +1238,6 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	return &unconstrained;
 }
 
-static int x86_event_sched_in(struct perf_event *event,
-			  struct perf_cpu_context *cpuctx)
-{
-	int ret = 0;
-
-	event->state = PERF_EVENT_STATE_ACTIVE;
-	event->oncpu = smp_processor_id();
-	event->tstamp_running += event->ctx->time - event->tstamp_stopped;
-
-	if (!is_x86_event(event))
-		ret = event->pmu->enable(event);
-
-	if (!ret && !is_software_event(event))
-		cpuctx->active_oncpu++;
-
-	if (!ret && event->attr.exclusive)
-		cpuctx->exclusive = 1;
-
-	return ret;
-}
-
-static void x86_event_sched_out(struct perf_event *event,
-			    struct perf_cpu_context *cpuctx)
-{
-	event->state = PERF_EVENT_STATE_INACTIVE;
-	event->oncpu = -1;
-
-	if (!is_x86_event(event))
-		event->pmu->disable(event);
-
-	event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
-
-	if (!is_software_event(event))
-		cpuctx->active_oncpu--;
-
-	if (event->attr.exclusive || !cpuctx->active_oncpu)
-		cpuctx->exclusive = 0;
-}
-
-/*
- * Called to enable a whole group of events.
- * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
- * Assumes the caller has disabled interrupts and has
- * frozen the PMU with hw_perf_save_disable.
- *
- * called with PMU disabled. If successful and return value 1,
- * then guaranteed to call perf_enable() and hw_perf_enable()
- */
-int hw_perf_group_sched_in(struct perf_event *leader,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx)
-{
-	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	struct perf_event *sub;
-	int assign[X86_PMC_IDX_MAX];
-	int n0, n1, ret;
-
-	if (!x86_pmu_initialized())
-		return 0;
-
-	/* n0 = total number of events */
-	n0 = collect_events(cpuc, leader, true);
-	if (n0 < 0)
-		return n0;
-
-	ret = x86_pmu.schedule_events(cpuc, n0, assign);
-	if (ret)
-		return ret;
-
-	ret = x86_event_sched_in(leader, cpuctx);
-	if (ret)
-		return ret;
-
-	n1 = 1;
-	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-		if (sub->state > PERF_EVENT_STATE_OFF) {
-			ret = x86_event_sched_in(sub, cpuctx);
-			if (ret)
-				goto undo;
-			++n1;
-		}
-	}
-	/*
-	 * copy new assignment, now we know it is possible
-	 * will be used by hw_perf_enable()
-	 */
-	memcpy(cpuc->assign, assign, n0*sizeof(int));
-
-	cpuc->n_events  = n0;
-	cpuc->n_added  += n1;
-	ctx->nr_active += n1;
-
-	/*
-	 * 1 means successful and events are active
-	 * This is not quite true because we defer
-	 * actual activation until hw_perf_enable() but
-	 * this way we* ensure caller won't try to enable
-	 * individual events
-	 */
-	return 1;
-undo:
-	x86_event_sched_out(leader, cpuctx);
-	n0  = 1;
-	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
-			x86_event_sched_out(sub, cpuctx);
-			if (++n0 == n1)
-				break;
-		}
-	}
-	return ret;
-}
-
 #include "perf_event_amd.c"
 #include "perf_event_p6.c"
 #include "perf_event_p4.c"
@@ -1471,6 +1369,59 @@ static inline void x86_pmu_read(struct perf_event *event)
 	x86_perf_event_update(event);
 }
 
+/*
+ * Start group events scheduling transaction
+ * Set the flag to make pmu::enable() not perform the
+ * schedulability test, it will be performed at commit time
+ */
+static void x86_pmu_start_txn(const struct pmu *pmu)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+}
+
+/*
+ * Stop group events scheduling transaction
+ * Clear the flag and pmu::enable() will perform the
+ * schedulability test.
+ */
+static void x86_pmu_cancel_txn(const struct pmu *pmu)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+}
+
+/*
+ * Commit group events scheduling transaction
+ * Perform the group schedulability test as a whole
+ * Return 0 if success
+ */
+static int x86_pmu_commit_txn(const struct pmu *pmu)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int assign[X86_PMC_IDX_MAX];
+	int n, ret;
+
+	n = cpuc->n_events;
+
+	if (!x86_pmu_initialized())
+		return -EAGAIN;
+
+	ret = x86_pmu.schedule_events(cpuc, n, assign);
+	if (ret)
+		return ret;
+
+	/*
+	 * copy new assignment, now we know it is possible
+	 * will be used by hw_perf_enable()
+	 */
+	memcpy(cpuc->assign, assign, n*sizeof(int));
+
+	return 0;
+}
+
 static const struct pmu pmu = {
 	.enable		= x86_pmu_enable,
 	.disable	= x86_pmu_disable,
@@ -1478,6 +1429,9 @@ static const struct pmu pmu = {
 	.stop		= x86_pmu_stop,
 	.read		= x86_pmu_read,
 	.unthrottle	= x86_pmu_unthrottle,
+	.start_txn	= x86_pmu_start_txn,
+	.cancel_txn	= x86_pmu_cancel_txn,
+	.commit_txn	= x86_pmu_commit_txn,
 };
 
 /*
-- 
cgit v1.2.3


From 58e323cf5e4ed621a6e88263aca40c3d9c3d9bfd Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Fri, 7 May 2010 14:15:04 +0200
Subject: logfs: remove unused variable

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/file.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 1639e9235f5..0de52407187 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -222,7 +222,6 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct super_block *sb = dentry->d_inode->i_sb;
-	struct logfs_super *super = logfs_super(sb);
 
 	logfs_write_anchor(sb);
 	return 0;
-- 
cgit v1.2.3


From ccf31c10f125ab5233c8517f91d4b3bd0bd60936 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Fri, 7 May 2010 10:59:06 +0200
Subject: logfs: handle errors from get_mtd_device()

The get_mtd_device() function returns error pointers on failure and if we
don't handle it, it leads to a crash.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/dev_mtd.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index cafb6ef2e05..b02a4020241 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -250,5 +250,7 @@ int logfs_get_sb_mtd(struct file_system_type *type, int flags,
 	const struct logfs_device_ops *devops = &mtd_devops;
 
 	mtd = get_mtd_device(NULL, mtdnr);
+	if (IS_ERR(mtd))
+		return PTR_ERR(mtd);
 	return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
 }
-- 
cgit v1.2.3


From 1cf4a0632c24ea61162ed819bde358bc94c55510 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 7 May 2010 14:07:05 -0300
Subject: perf list: Improve the raw hw event descriptor documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It was x86 specific and imcomplete at that, improve the situation by
making it clear where the example provided applies and by adding the
URLs for the Intel and AMD manuals where this is discussed in depth.

Acked-by: Robert Richter <robert.richter@amd.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Robert Richter <robert.richter@amd.com>
Reported-by: Robert Richter <robert.richter@amd.com
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-list.txt | 19 ++++++++++++++++---
 tools/perf/util/parse-events.c         |  3 ++-
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index ad765e0b886..43e3dd284b9 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -18,8 +18,16 @@ various perf commands with the -e option.
 RAW HARDWARE EVENT DESCRIPTOR
 -----------------------------
 Even when an event is not available in a symbolic form within perf right now,
-it can be encoded as <UMASK VALUE><EVENT NUM>, for instance, if the Intel docs
-describe an event as:
+it can be encoded in a per processor specific way.
+
+For instance For x86 CPUs NNN represents the raw register encoding with the
+layout of IA32_PERFEVTSELx MSRs (see [Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide] Figure 30-1 Layout
+of IA32_PERFEVTSELx MSRs) or AMD's PerfEvtSeln (see [AMD64 Architecture Programmer’s Manual Volume 2: System Programming], Page 344,
+Figure 13-7 Performance Event-Select Register (PerfEvtSeln)).
+
+Example:
+
+If the Intel docs for a QM720 Core i7 describe an event as:
 
   Event  Umask  Event Mask
   Num.   Value  Mnemonic    Description                        Comment
@@ -33,6 +41,9 @@ raw encoding of 0x1A8 can be used:
  perf stat -e r1a8 -a sleep 1
  perf record -e r1a8 ...
 
+You should refer to the processor specific documentation for getting these
+details. Some of them are referenced in the SEE ALSO section below.
+
 OPTIONS
 -------
 None
@@ -40,4 +51,6 @@ None
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-top[1],
-linkperf:perf-record[1]
+linkperf:perf-record[1],
+http://www.intel.com/Assets/PDF/manual/253669.pdf[Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide],
+http://support.amd.com/us/Processor_TechDocs/24593.pdf[AMD64 Architecture Programmer’s Manual Volume 2: System Programming]
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index bc8b7e61420..7b24b5dee8b 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -936,7 +936,8 @@ void print_events(void)
 
 	printf("\n");
 	printf("  %-42s [%s]\n",
-		"rNNN (NNN=<UMASK VALUE><EVENT NUM>)", event_type_descriptors[PERF_TYPE_RAW]);
+		"rNNN (see 'perf list --help' on how to encode it)",
+	       event_type_descriptors[PERF_TYPE_RAW]);
 	printf("\n");
 
 	printf("  %-42s [%s]\n",
-- 
cgit v1.2.3


From 6f485b41875dbf5160c1990322469c1f65f77b28 Mon Sep 17 00:00:00 2001
From: Joern Engel <joern@logfs.org>
Date: Fri, 7 May 2010 19:38:40 +0200
Subject: logfs: handle powerfail on NAND flash

The write buffer may not have been written and may no longer be written
due to an interrupted write in the affected page.

Signed-off-by: Joern Engel <joern@logfs.org>
---
 fs/logfs/dev_bdev.c |  6 ++++++
 fs/logfs/dev_mtd.c  | 24 +++++++++++++++++++++++-
 fs/logfs/gc.c       | 49 +++++++++++++++++++++----------------------------
 fs/logfs/logfs.h    |  2 ++
 4 files changed, 52 insertions(+), 29 deletions(-)

diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 243c00071f7..9bd2ce2a304 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -303,6 +303,11 @@ static void bdev_put_device(struct super_block *sb)
 	close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
 }
 
+static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
+{
+	return 0;
+}
+
 static const struct logfs_device_ops bd_devops = {
 	.find_first_sb	= bdev_find_first_sb,
 	.find_last_sb	= bdev_find_last_sb,
@@ -310,6 +315,7 @@ static const struct logfs_device_ops bd_devops = {
 	.readpage	= bdev_readpage,
 	.writeseg	= bdev_writeseg,
 	.erase		= bdev_erase,
+	.can_write_buf	= bdev_can_write_buf,
 	.sync		= bdev_sync,
 	.put_device	= bdev_put_device,
 };
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index b02a4020241..a85d47d13e4 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -9,6 +9,7 @@
 #include <linux/completion.h>
 #include <linux/mount.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
 
@@ -126,7 +127,8 @@ static int mtd_readpage(void *_sb, struct page *page)
 
 	err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
 			page_address(page));
-	if (err == -EUCLEAN) {
+	if (err == -EUCLEAN || err == -EBADMSG) {
+		/* -EBADMSG happens regularly on power failures */
 		err = 0;
 		/* FIXME: force GC this segment */
 	}
@@ -233,12 +235,32 @@ static void mtd_put_device(struct super_block *sb)
 	put_mtd_device(logfs_super(sb)->s_mtd);
 }
 
+static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
+{
+	struct logfs_super *super = logfs_super(sb);
+	void *buf;
+	int err;
+
+	buf = kmalloc(super->s_writesize, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	err = mtd_read(sb, ofs, super->s_writesize, buf);
+	if (err)
+		goto out;
+	if (memchr_inv(buf, 0xff, super->s_writesize))
+		err = -EIO;
+	kfree(buf);
+out:
+	return err;
+}
+
 static const struct logfs_device_ops mtd_devops = {
 	.find_first_sb	= mtd_find_first_sb,
 	.find_last_sb	= mtd_find_last_sb,
 	.readpage	= mtd_readpage,
 	.writeseg	= mtd_writeseg,
 	.erase		= mtd_erase,
+	.can_write_buf	= mtd_can_write_buf,
 	.sync		= mtd_sync,
 	.put_device	= mtd_put_device,
 };
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index 76c242fbe1b..caa4419285d 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -122,7 +122,7 @@ static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
 	logfs_safe_iput(inode, cookie);
 }
 
-static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
+static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct logfs_segment_header sh;
@@ -401,7 +401,7 @@ static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
 			segno, (u64)segno << super->s_segshift,
 			dist, no_free_segments(sb), valid,
 			super->s_free_bytes);
-	cleaned = logfs_gc_segment(sb, segno, dist);
+	cleaned = logfs_gc_segment(sb, segno);
 	log_gc("GC segment #%02x complete - now %x valid\n", segno,
 			valid - cleaned);
 	BUG_ON(cleaned != valid);
@@ -632,38 +632,31 @@ static int check_area(struct super_block *sb, int i)
 {
 	struct logfs_super *super = logfs_super(sb);
 	struct logfs_area *area = super->s_area[i];
-	struct logfs_object_header oh;
+	gc_level_t gc_level;
+	u32 cleaned, valid, ec;
 	u32 segno = area->a_segno;
-	u32 ofs = area->a_used_bytes;
-	__be32 crc;
-	int err;
+	u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
 
 	if (!area->a_is_open)
 		return 0;
 
-	for (ofs = area->a_used_bytes;
-	     ofs <= super->s_segsize - sizeof(oh);
-	     ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
-		err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
-		if (err)
-			return err;
-
-		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
-			break;
+	if (super->s_devops->can_write_buf(sb, ofs) == 0)
+		return 0;
 
-		crc = logfs_crc32(&oh, sizeof(oh) - 4, 4);
-		if (crc != oh.crc) {
-			printk(KERN_INFO "interrupted header at %llx\n",
-					dev_ofs(sb, segno, ofs));
-			return 0;
-		}
-	}
-	if (ofs != area->a_used_bytes) {
-		printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
-				ofs - area->a_used_bytes,
-				dev_ofs(sb, segno, area->a_used_bytes));
-		area->a_used_bytes = ofs;
-	}
+	printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
+	/*
+	 * The device cannot write back the write buffer.  Most likely the
+	 * wbuf was already written out and the system crashed at some point
+	 * before the journal commit happened.  In that case we wouldn't have
+	 * to do anything.  But if the crash happened before the wbuf was
+	 * written out correctly, we must GC this segment.  So assume the
+	 * worst and always do the GC run.
+	 */
+	area->a_is_open = 0;
+	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
+	cleaned = logfs_gc_segment(sb, segno);
+	if (cleaned != valid)
+		return -EIO;
 	return 0;
 }
 
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 26a9458e6b1..93b55f33724 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -144,6 +144,7 @@ struct logfs_area_ops {
  * @erase:			erase one segment
  * @read:			read from the device
  * @erase:			erase part of the device
+ * @can_write_buf:		decide whether wbuf can be written to ofs
  */
 struct logfs_device_ops {
 	struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
@@ -153,6 +154,7 @@ struct logfs_device_ops {
 	void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
 	int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
 			int ensure_write);
+	int (*can_write_buf)(struct super_block *sb, u64 ofs);
 	void (*sync)(struct super_block *sb);
 	void (*put_device)(struct super_block *sb);
 };
-- 
cgit v1.2.3


From 2b107d93635616db0c3f893c8cc2e6d5cd8d77b2 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Fri, 7 May 2010 14:59:45 -0700
Subject: x86: Avoid check hlt for newer cpus

Check hlt instruction was targeted for some older CPUs. It is an expensive
operation in that it takes 4 ticks to break out the check.  We can avoid
such check completely for newer x86 cpus (family >= 5).

[ hpa: corrected family > 5 to family >= 5 ]

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
LKML-Reference: <1273269585-14346-1-git-send-email-jacob.jun.pan@linux.intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/bugs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 01a26521239..c39576cb301 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
 
 static void __init check_hlt(void)
 {
-	if (paravirt_enabled())
+	if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
 		return;
 
 	printk(KERN_INFO "Checking 'hlt' instruction... ");
-- 
cgit v1.2.3


From 9fa02317429449e8176c9bb6da3ac00eb14d52d3 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Fri, 7 May 2010 16:55:41 -0700
Subject: x86, HyperV: fix up the license to mshyperv.c

This should have been GPLv2 only, we cut and pasted from the wrong file
originally, sorry.

Also removed some unneeded boilerplate license code, we all know where
to find the GPLv2, and that there's no warranty as that is implicit from
the license.

Cc: Ky Srinivasan <ksrinivasan@novell.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
LKML-Reference: <20100507235541.GA15448@kroah.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mshyperv.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 2443b61cdb1..a58d8e64fc7 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -6,18 +6,7 @@
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ * the Free Software Foundation; version 2 of the License.
  *
  */
 
-- 
cgit v1.2.3


From e08cae4181af9483b04ecfac48f01c8e5a5f27bf Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 7 May 2010 16:57:28 -0700
Subject: x86: Clean up the hypervisor layer

Clean up the hypervisor layer and the hypervisor drivers, using an ops
structure instead of an enumeration with if statements.

The identity of the hypervisor, if needed, can be tested by testing
the pointer value in x86_hyper.

The MS-HyperV private state is moved into a normal global variable
(it's per-system state, not per-CPU state).  Being a normal bss
variable, it will be left at all zero on non-HyperV platforms, and so
can generally be tested for HyperV-specific features without
additional qualification.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Greg KH <greg@kroah.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Ky Srinivasan <ksrinivasan@novell.com>
LKML-Reference: <4BE49778.6060800@zytor.com>
---
 arch/x86/include/asm/hyperv.h     |  5 ++--
 arch/x86/include/asm/hypervisor.h | 27 +++++++++++++++++--
 arch/x86/include/asm/mshyperv.h   | 15 ++++++++---
 arch/x86/include/asm/processor.h  |  7 -----
 arch/x86/include/asm/vmware.h     | 27 -------------------
 arch/x86/kernel/cpu/hypervisor.c  | 56 ++++++++++++++++++++++++---------------
 arch/x86/kernel/cpu/mshyperv.c    | 51 +++++++++++++++++------------------
 arch/x86/kernel/cpu/vmware.c      | 36 ++++++++++++++-----------
 8 files changed, 117 insertions(+), 107 deletions(-)
 delete mode 100644 arch/x86/include/asm/vmware.h

diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
index 46040473e12..5df477ac3af 100644
--- a/arch/x86/include/asm/hyperv.h
+++ b/arch/x86/include/asm/hyperv.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_X86_KVM_HYPERV_H
-#define _ASM_X86_KVM_HYPERV_H
+#ifndef _ASM_X86_HYPERV_H
+#define _ASM_X86_HYPERV_H
 
 #include <linux/types.h>
 
@@ -16,6 +16,7 @@
 
 #define HYPERV_HYPERVISOR_PRESENT_BIT		0x80000000
 #define HYPERV_CPUID_MIN			0x40000005
+#define HYPERV_CPUID_MAX			0x4000ffff
 
 /*
  * Feature identification. EAX indicates which features are available
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index b78c0941e42..70abda7058c 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -17,10 +17,33 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  *
  */
-#ifndef ASM_X86__HYPERVISOR_H
-#define ASM_X86__HYPERVISOR_H
+#ifndef _ASM_X86_HYPERVISOR_H
+#define _ASM_X86_HYPERVISOR_H
 
 extern void init_hypervisor(struct cpuinfo_x86 *c);
 extern void init_hypervisor_platform(void);
 
+/*
+ * x86 hypervisor information
+ */
+struct hypervisor_x86 {
+	/* Hypervisor name */
+	const char	*name;
+
+	/* Detection routine */
+	bool		(*detect)(void);
+
+	/* Adjust CPU feature bits (run once per CPU) */
+	void		(*set_cpu_features)(struct cpuinfo_x86 *);
+
+	/* Platform setup (run once per boot) */
+	void		(*init_platform)(void);
+};
+
+extern const struct hypervisor_x86 *x86_hyper;
+
+/* Recognized hypervisors */
+extern const struct hypervisor_x86 x86_hyper_vmware;
+extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+
 #endif
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 6cd8101d134..79ce5685ab6 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -1,7 +1,14 @@
-#ifndef ASM_X86__MSHYPER_H
-#define ASM_X86__MSHYPER_H
+#ifndef _ASM_X86_MSHYPER_H
+#define _ASM_X86_MSHYPER_H
 
-int ms_hyperv_platform(void);
-void __cpuinit ms_hyperv_set_feature_bits(struct cpuinfo_x86 *c);
+#include <linux/types.h>
+#include <asm/hyperv.h>
+
+struct ms_hyperv_info {
+	u32 features;
+	u32 hints;
+};
+
+extern struct ms_hyperv_info ms_hyperv;
 
 #endif
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 597c041bd12..e4f1dfb2d05 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -113,9 +113,6 @@ struct cpuinfo_x86 {
 	/* Index into per_cpu list: */
 	u16			cpu_index;
 #endif
-	unsigned int		x86_hyper_vendor;
-	/* The layout of this field is hypervisor specific */
-	unsigned int		x86_hyper_features;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define X86_VENDOR_INTEL	0
@@ -129,10 +126,6 @@ struct cpuinfo_x86 {
 
 #define X86_VENDOR_UNKNOWN	0xff
 
-#define X86_HYPER_VENDOR_NONE  0
-#define X86_HYPER_VENDOR_VMWARE 1
-#define X86_HYPER_VENDOR_MSFT	2
-
 /*
  * capabilities of CPUs
  */
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
deleted file mode 100644
index e49ed6d2fd4..00000000000
--- a/arch/x86/include/asm/vmware.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (C) 2008, VMware, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT.  See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
- *
- */
-#ifndef ASM_X86__VMWARE_H
-#define ASM_X86__VMWARE_H
-
-extern void vmware_platform_setup(void);
-extern int vmware_platform(void);
-extern void vmware_set_feature_bits(struct cpuinfo_x86 *c);
-
-#endif
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index de3f4e0ce8e..87381759d3c 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -22,40 +22,52 @@
  */
 
 #include <asm/processor.h>
-#include <asm/vmware.h>
-#include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
 
-static inline void __cpuinit
-detect_hypervisor_vendor(struct cpuinfo_x86 *c)
+/*
+ * Hypervisor detect order.  This is specified explicitly here because
+ * some hypervisors might implement compatibility modes for other
+ * hypervisors and therefore need to be detected in specific sequence.
+ */
+static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
-	if (vmware_platform())
-		c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
-	else if (ms_hyperv_platform())
-		c->x86_hyper_vendor = X86_HYPER_VENDOR_MSFT;
-	else
-		c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
-}
+	&x86_hyper_vmware,
+	&x86_hyper_ms_hyperv,
+};
 
-static inline void __cpuinit
-hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
+const struct hypervisor_x86 *x86_hyper;
+
+static inline void __init
+detect_hypervisor_vendor(void)
 {
-	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
-		vmware_set_feature_bits(c);
-	else if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_MSFT)
-		ms_hyperv_set_feature_bits(c);
-	return;
+	const struct hypervisor_x86 *h, * const *p;
+
+	for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
+		h = *p;
+		if (h->detect()) {
+			x86_hyper = h;
+			printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
+			break;
+		}
+	}
 }
 
 void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
 {
-	detect_hypervisor_vendor(c);
-	hypervisor_set_feature_bits(c);
+	if (x86_hyper && x86_hyper->set_cpu_features)
+		x86_hyper->set_cpu_features(c);
 }
 
 void __init init_hypervisor_platform(void)
 {
+
+	detect_hypervisor_vendor();
+
+	if (!x86_hyper)
+		return;
+
 	init_hypervisor(&boot_cpu_data);
-	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
-		vmware_platform_setup();
+
+	if (x86_hyper->init_platform)
+		x86_hyper->init_platform();
 }
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index a58d8e64fc7..5969c3ee318 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -12,45 +12,42 @@
 
 #include <linux/types.h>
 #include <asm/processor.h>
+#include <asm/hypervisor.h>
 #include <asm/hyperv.h>
 #include <asm/mshyperv.h>
 
+struct ms_hyperv_info ms_hyperv;
 
-int ms_hyperv_platform(void)
+static bool __init ms_hyperv_platform(void)
 {
-	u32 eax, ebx, ecx, edx;
-	char hyp_signature[13];
+	u32 eax;
+	u32 hyp_signature[3];
 
-	cpuid(1, &eax, &ebx, &ecx, &edx);
-	if (!(ecx & HYPERV_HYPERVISOR_PRESENT_BIT))
-		return 0;
+	if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+		return false;
 
-	cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &ebx, &ecx, &edx);
-	*(u32 *)(hyp_signature + 0) = ebx;
-	*(u32 *)(hyp_signature + 4) = ecx;
-	*(u32 *)(hyp_signature + 8) = edx;
+	cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
+	      &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
 
-	if ((eax < HYPERV_CPUID_MIN) || (memcmp("Microsoft Hv", hyp_signature, 12)))
-		return 0;
-	return 1;
+	return eax >= HYPERV_CPUID_MIN &&
+		eax <= HYPERV_CPUID_MAX &&
+		!memcmp("Microsoft Hv", hyp_signature, 12);
 }
 
-void __cpuinit ms_hyperv_set_feature_bits(struct cpuinfo_x86 *c)
+static void __init ms_hyperv_init_platform(void)
 {
-	u32 eax, ebx, ecx, edx;
-
-	c->x86_hyper_features = 0;
 	/*
-	 * Extract the features, recommendations etc.
-	 * The first 9 bits will be used to track hypervisor features.
-	 * The next 6 bits will be used to track the hypervisor
-	 * recommendations.
+	 * Extract the features and hints
 	 */
-	cpuid(HYPERV_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);
-	c->x86_hyper_features |= (eax & 0x1ff);
+	ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+	ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
 
-	cpuid(HYPERV_CPUID_ENLIGHTMENT_INFO, &eax, &ebx, &ecx, &edx);
-	c->x86_hyper_features |= ((eax & 0x3f) << 9);
-	printk(KERN_INFO "Detected HyperV with features: %x\n",
-		c->x86_hyper_features);
+	printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
+	       ms_hyperv.features, ms_hyperv.hints);
 }
+
+const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+	.name			= "Microsoft HyperV",
+	.detect			= ms_hyperv_platform,
+	.init_platform		= ms_hyperv_init_platform,
+};
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97b59c..46a5b5d3ba5 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -23,8 +23,8 @@
 
 #include <linux/dmi.h>
 #include <asm/div64.h>
-#include <asm/vmware.h>
 #include <asm/x86_init.h>
+#include <asm/hypervisor.h>
 
 #define CPUID_VMWARE_INFO_LEAF	0x40000000
 #define VMWARE_HYPERVISOR_MAGIC	0x564D5868
@@ -64,7 +64,7 @@ static unsigned long vmware_get_tsc_khz(void)
 	return tsc_hz;
 }
 
-void __init vmware_platform_setup(void)
+static void __init vmware_platform_setup(void)
 {
 	uint32_t eax, ebx, ecx, edx;
 
@@ -82,24 +82,21 @@ void __init vmware_platform_setup(void)
  * serial key should be enough, as this will always have a VMware
  * specific string when running under VMware hypervisor.
  */
-int vmware_platform(void)
+static bool __init vmware_platform(void)
 {
 	if (cpu_has_hypervisor) {
-		unsigned int eax, ebx, ecx, edx;
-		char hyper_vendor_id[13];
-
-		cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
-		memcpy(hyper_vendor_id + 0, &ebx, 4);
-		memcpy(hyper_vendor_id + 4, &ecx, 4);
-		memcpy(hyper_vendor_id + 8, &edx, 4);
-		hyper_vendor_id[12] = '\0';
-		if (!strcmp(hyper_vendor_id, "VMwareVMware"))
-			return 1;
+		unsigned int eax;
+		unsigned int hyper_vendor_id[3];
+
+		cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
+		      &hyper_vendor_id[1], &hyper_vendor_id[2]);
+		if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
+			return true;
 	} else if (dmi_available && dmi_name_in_serial("VMware") &&
 		   __vmware_platform())
-		return 1;
+		return true;
 
-	return 0;
+	return false;
 }
 
 /*
@@ -114,8 +111,15 @@ int vmware_platform(void)
  * so that the kernel could just trust the hypervisor with providing a
  * reliable virtual TSC that is suitable for timekeeping.
  */
-void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
+static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
 {
 	set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 	set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
 }
+
+const __refconst struct hypervisor_x86 x86_hyper_vmware = {
+	.name			= "VMware",
+	.detect			= vmware_platform,
+	.set_cpu_features	= vmware_set_cpu_features,
+	.init_platform		= vmware_platform_setup,
+};
-- 
cgit v1.2.3


From b1a9ceb2e003aab7b96e30d990c1092453a0ea44 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 6 May 2010 15:14:09 +0100
Subject: ARM: 6105/1: Fix the __arm_ioremap_caller() definition in nommu.c

Commit 31aa8fd6 introduced the __arm_ioremap_caller() function but the
nommu.c version did not have the _caller suffix.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/nommu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c
index 9bfeb6b9509..f8791eed759 100644
--- a/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c
@@ -87,8 +87,8 @@ void __iomem *__arm_ioremap(unsigned long phys_addr, size_t size,
 }
 EXPORT_SYMBOL(__arm_ioremap);
 
-void __iomem *__arm_ioremap(unsigned long phys_addr, size_t size,
-			    unsigned int mtype, void *caller)
+void __iomem *__arm_ioremap_caller(unsigned long phys_addr, size_t size,
+				   unsigned int mtype, void *caller)
 {
 	return __arm_ioremap(phys_addr, size, mtype);
 }
-- 
cgit v1.2.3


From b5a07faadeb4e0cfd6dcee359e501d4755cab875 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 6 May 2010 15:15:28 +0100
Subject: ARM: 6106/1: Implement copy_to_user_page() for noMMU

Commit 7959722 introduced calls to copy_(to|from)_user_page() from
access_process_vm() in mm/nommu.c. The copy_to_user_page() was not
implemented on noMMU ARM.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/nommu.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c
index f8791eed759..33b327379f0 100644
--- a/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c
@@ -65,6 +65,15 @@ void flush_dcache_page(struct page *page)
 }
 EXPORT_SYMBOL(flush_dcache_page);
 
+void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
+		       unsigned long uaddr, void *dst, const void *src,
+		       unsigned long len)
+{
+	memcpy(dst, src, len);
+	if (vma->vm_flags & VM_EXEC)
+		__cpuc_coherent_user_range(uaddr, uaddr + len);
+}
+
 void __iomem *__arm_ioremap_pfn(unsigned long pfn, unsigned long offset,
 				size_t size, unsigned int mtype)
 {
-- 
cgit v1.2.3


From f4d6477f7f073b99220386d62f5bf54bec3482cc Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Fri, 7 May 2010 16:26:24 +0100
Subject: ARM: 6111/1: Implement read/write for ownership in the ARMv6 DMA
 cache ops

The Snoop Control Unit on the ARM11MPCore hardware does not detect the
cache operations and the dma_cache_maint*() functions may leave stale
cache entries on other CPUs. The solution implemented in this patch
performs a Read or Write For Ownership in the ARMv6 DMA cache
maintenance functions. These LDR/STR instructions change the cache line
state to shared or exclusive so that the cache maintenance operation has
the desired effect.

Tested-by: George G. Davis <gdavis@mvista.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/cache-v6.S | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/arch/arm/mm/cache-v6.S b/arch/arm/mm/cache-v6.S
index 9d89c67a1cc..e46ecd84713 100644
--- a/arch/arm/mm/cache-v6.S
+++ b/arch/arm/mm/cache-v6.S
@@ -211,6 +211,9 @@ v6_dma_inv_range:
 	mcrne	p15, 0, r1, c7, c15, 1		@ clean & invalidate unified line
 #endif
 1:
+#ifdef CONFIG_SMP
+	str	r0, [r0]			@ write for ownership
+#endif
 #ifdef HARVARD_CACHE
 	mcr	p15, 0, r0, c7, c6, 1		@ invalidate D line
 #else
@@ -231,6 +234,9 @@ v6_dma_inv_range:
 v6_dma_clean_range:
 	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
 1:
+#ifdef CONFIG_SMP
+	ldr	r2, [r0]			@ read for ownership
+#endif
 #ifdef HARVARD_CACHE
 	mcr	p15, 0, r0, c7, c10, 1		@ clean D line
 #else
@@ -251,6 +257,10 @@ v6_dma_clean_range:
 ENTRY(v6_dma_flush_range)
 	bic	r0, r0, #D_CACHE_LINE_SIZE - 1
 1:
+#ifdef CONFIG_SMP
+	ldr	r2, [r0]			@ read for ownership
+	str	r2, [r0]			@ write for ownership
+#endif
 #ifdef HARVARD_CACHE
 	mcr	p15, 0, r0, c7, c14, 1		@ clean & invalidate D line
 #else
@@ -273,7 +283,9 @@ ENTRY(v6_dma_map_area)
 	add	r1, r1, r0
 	teq	r2, #DMA_FROM_DEVICE
 	beq	v6_dma_inv_range
-	b	v6_dma_clean_range
+	teq	r2, #DMA_TO_DEVICE
+	beq	v6_dma_clean_range
+	b	v6_dma_flush_range
 ENDPROC(v6_dma_map_area)
 
 /*
@@ -283,9 +295,6 @@ ENDPROC(v6_dma_map_area)
  *	- dir	- DMA direction
  */
 ENTRY(v6_dma_unmap_area)
-	add	r1, r1, r0
-	teq	r2, #DMA_TO_DEVICE
-	bne	v6_dma_inv_range
 	mov	pc, lr
 ENDPROC(v6_dma_unmap_area)
 
-- 
cgit v1.2.3


From b8349b569aae661dea9d59d7d2ee587ccea3336c Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Fri, 7 May 2010 18:03:05 +0100
Subject: ARM: 6112/1: Use the Inner Shareable I-cache and BTB ops on ARMv7 SMP

The standard I-cache Invalidate All (ICIALLU) and Branch Predication
Invalidate All (BPIALL) operations are not automatically broadcast to
the other CPUs in an ARMv7 MP system. The patch adds the Inner Shareable
variants, ICIALLUIS and BPIALLIS, if ARMv7 and SMP.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/cacheflush.h |  4 ++++
 arch/arm/include/asm/tlbflush.h   | 29 ++++++++++++++++++++++++++++-
 arch/arm/mm/cache-v7.S            |  4 ++++
 arch/arm/mm/tlb-v7.S              |  8 ++++++++
 4 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 0d08d4170b6..4656a24058d 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -371,6 +371,10 @@ static inline void __flush_icache_all(void)
 #ifdef CONFIG_ARM_ERRATA_411920
 	extern void v6_icache_inval_all(void);
 	v6_icache_inval_all();
+#elif defined(CONFIG_SMP) && __LINUX_ARM_ARCH__ >= 7
+	asm("mcr	p15, 0, %0, c7, c1, 0	@ invalidate I-cache inner shareable\n"
+	    :
+	    : "r" (0));
 #else
 	asm("mcr	p15, 0, %0, c7, c5, 0	@ invalidate I-cache\n"
 	    :
diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index e085e2c545e..bd863d8608c 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -46,6 +46,9 @@
 #define TLB_V7_UIS_FULL (1 << 20)
 #define TLB_V7_UIS_ASID (1 << 21)
 
+/* Inner Shareable BTB operation (ARMv7 MP extensions) */
+#define TLB_V7_IS_BTB	(1 << 22)
+
 #define TLB_L2CLEAN_FR	(1 << 29)		/* Feroceon */
 #define TLB_DCLEAN	(1 << 30)
 #define TLB_WB		(1 << 31)
@@ -183,7 +186,7 @@
 #endif
 
 #ifdef CONFIG_SMP
-#define v7wbi_tlb_flags (TLB_WB | TLB_DCLEAN | TLB_BTB | \
+#define v7wbi_tlb_flags (TLB_WB | TLB_DCLEAN | TLB_V7_IS_BTB | \
 			 TLB_V7_UIS_FULL | TLB_V7_UIS_PAGE | TLB_V7_UIS_ASID)
 #else
 #define v7wbi_tlb_flags (TLB_WB | TLB_DCLEAN | TLB_BTB | \
@@ -339,6 +342,12 @@ static inline void local_flush_tlb_all(void)
 		dsb();
 		isb();
 	}
+	if (tlb_flag(TLB_V7_IS_BTB)) {
+		/* flush the branch target cache */
+		asm("mcr p15, 0, %0, c7, c1, 6" : : "r" (zero) : "cc");
+		dsb();
+		isb();
+	}
 }
 
 static inline void local_flush_tlb_mm(struct mm_struct *mm)
@@ -376,6 +385,12 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
 		asm("mcr p15, 0, %0, c7, c5, 6" : : "r" (zero) : "cc");
 		dsb();
 	}
+	if (tlb_flag(TLB_V7_IS_BTB)) {
+		/* flush the branch target cache */
+		asm("mcr p15, 0, %0, c7, c1, 6" : : "r" (zero) : "cc");
+		dsb();
+		isb();
+	}
 }
 
 static inline void
@@ -416,6 +431,12 @@ local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
 		asm("mcr p15, 0, %0, c7, c5, 6" : : "r" (zero) : "cc");
 		dsb();
 	}
+	if (tlb_flag(TLB_V7_IS_BTB)) {
+		/* flush the branch target cache */
+		asm("mcr p15, 0, %0, c7, c1, 6" : : "r" (zero) : "cc");
+		dsb();
+		isb();
+	}
 }
 
 static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
@@ -454,6 +475,12 @@ static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
 		dsb();
 		isb();
 	}
+	if (tlb_flag(TLB_V7_IS_BTB)) {
+		/* flush the branch target cache */
+		asm("mcr p15, 0, %0, c7, c1, 6" : : "r" (zero) : "cc");
+		dsb();
+		isb();
+	}
 }
 
 /*
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index bcd64f26587..06a90dcfc60 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -167,7 +167,11 @@ ENTRY(v7_coherent_user_range)
 	cmp	r0, r1
 	blo	1b
 	mov	r0, #0
+#ifdef CONFIG_SMP
+	mcr	p15, 0, r0, c7, c1, 6		@ invalidate BTB Inner Shareable
+#else
 	mcr	p15, 0, r0, c7, c5, 6		@ invalidate BTB
+#endif
 	dsb
 	isb
 	mov	pc, lr
diff --git a/arch/arm/mm/tlb-v7.S b/arch/arm/mm/tlb-v7.S
index 0cb1848bd87..f3f288a9546 100644
--- a/arch/arm/mm/tlb-v7.S
+++ b/arch/arm/mm/tlb-v7.S
@@ -50,7 +50,11 @@ ENTRY(v7wbi_flush_user_tlb_range)
 	cmp	r0, r1
 	blo	1b
 	mov	ip, #0
+#ifdef CONFIG_SMP
+	mcr	p15, 0, ip, c7, c1, 6		@ flush BTAC/BTB Inner Shareable
+#else
 	mcr	p15, 0, ip, c7, c5, 6		@ flush BTAC/BTB
+#endif
 	dsb
 	mov	pc, lr
 ENDPROC(v7wbi_flush_user_tlb_range)
@@ -79,7 +83,11 @@ ENTRY(v7wbi_flush_kern_tlb_range)
 	cmp	r0, r1
 	blo	1b
 	mov	r2, #0
+#ifdef CONFIG_SMP
+	mcr	p15, 0, r2, c7, c1, 6		@ flush BTAC/BTB Inner Shareable
+#else
 	mcr	p15, 0, r2, c7, c5, 6		@ flush BTAC/BTB
+#endif
 	dsb
 	isb
 	mov	pc, lr
-- 
cgit v1.2.3


From e814d826dc1821dbce3aa1ddb7f2f739f420ffd8 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Fri, 7 May 2010 10:52:32 +0100
Subject: ARM: 6110/1: Fix Thumb-2 kernel builds when UACCESS_WITH_MEMCPY is
 enabled

The patch adds the ENDPROC declarations for the __copy_to_user_std and
__clear_user_std functions. Without these, the compiler generates BXL to
ARM when compiling the kernel in Thumb-2 mode.

Reported-by: Kyungmin Park <kmpark@infradead.org>
Tested-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Nicolas Pitre <nico@fluxnic.net>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/lib/clear_user.S   | 1 +
 arch/arm/lib/copy_to_user.S | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/arm/lib/clear_user.S b/arch/arm/lib/clear_user.S
index 5e3f99620c0..14a0d988c82 100644
--- a/arch/arm/lib/clear_user.S
+++ b/arch/arm/lib/clear_user.S
@@ -45,6 +45,7 @@ USER(		strnebt	r2, [r0])
 		mov	r0, #0
 		ldmfd	sp!, {r1, pc}
 ENDPROC(__clear_user)
+ENDPROC(__clear_user_std)
 
 		.pushsection .fixup,"ax"
 		.align	0
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
index 027b69bdbad..d066df686e1 100644
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -93,6 +93,7 @@ WEAK(__copy_to_user)
 #include "copy_template.S"
 
 ENDPROC(__copy_to_user)
+ENDPROC(__copy_to_user_std)
 
 	.pushsection .fixup,"ax"
 	.align 0
-- 
cgit v1.2.3


From 6e85158cf5a2385264316870256fb6ad681156a0 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Sat, 8 May 2010 20:58:00 +1000
Subject: perf_event: Make software events work again

Commit 6bde9b6ce0127e2a56228a2071536d422be31336 ("perf: Add
group scheduling transactional APIs") added code to allow a
group to be scheduled in a single transaction.  However, it
introduced a bug in handling events whose pmu does not implement
transactions -- at the end of scheduling in the events in the
group, in the non-transactional case the code now falls through
to the group_error label, and proceeds to unschedule all the
events in the group and return failure.

This fixes it by returning 0 (success) in the non-transactional
case.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: eranian@gmail.com
LKML-Reference: <20100508105800.GB10650@brick.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index bb06382f98e..180151ff837 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -664,13 +664,13 @@ group_sched_in(struct perf_event *group_event,
 		}
 	}
 
-	if (txn) {
-		ret = pmu->commit_txn(pmu);
-		if (!ret) {
-			pmu->cancel_txn(pmu);
+	if (!txn)
+		return 0;
 
-			return 0;
-		}
+	ret = pmu->commit_txn(pmu);
+	if (!ret) {
+		pmu->cancel_txn(pmu);
+		return 0;
 	}
 
 group_error:
-- 
cgit v1.2.3


From de902d967feb96f2dfddfbe9dbd69dc22fd5ebcb Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sat, 8 May 2010 15:39:52 +0400
Subject: x86, perf: P4 PMU -- configure predefined events

If an event is not RAW we should not exit p4_hw_config
early but call x86_setup_perfctr as well.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Robert Richter <robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 9e002054cb5..b1f532d1d36 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -439,21 +439,20 @@ static int p4_hw_config(struct perf_event *event)
 	if (p4_ht_active() && p4_ht_thread(cpu))
 		event->hw.config = p4_set_ht_bit(event->hw.config);
 
-	if (event->attr.type != PERF_TYPE_RAW)
-		return 0;
-
-	/*
-	 * We don't control raw events so it's up to the caller
-	 * to pass sane values (and we don't count the thread number
-	 * on HT machine but allow HT-compatible specifics to be
-	 * passed on)
-	 *
-	 * XXX: HT wide things should check perf_paranoid_cpu() &&
-	 *      CAP_SYS_ADMIN
-	 */
-	event->hw.config |= event->attr.config &
-		(p4_config_pack_escr(P4_ESCR_MASK_HT) |
-		 p4_config_pack_cccr(P4_CCCR_MASK_HT));
+	if (event->attr.type == PERF_TYPE_RAW) {
+		/*
+		 * We don't control raw events so it's up to the caller
+		 * to pass sane values (and we don't count the thread number
+		 * on HT machine but allow HT-compatible specifics to be
+		 * passed on)
+		 *
+		 * XXX: HT wide things should check perf_paranoid_cpu() &&
+		 *      CAP_SYS_ADMIN
+		 */
+		event->hw.config |= event->attr.config &
+			(p4_config_pack_escr(P4_ESCR_MASK_HT) |
+			 p4_config_pack_cccr(P4_CCCR_MASK_HT));
+	}
 
 	return x86_setup_perfctr(event);
 }
-- 
cgit v1.2.3


From 137351e0feeb9f25d99488ee1afc1c79f5499a9a Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sat, 8 May 2010 15:25:52 +0400
Subject: x86, perf: P4 PMU -- protect sensible procedures from preemption

Steven reported:

|
| I'm getting:
|
| Pid: 3477, comm: perf Not tainted 2.6.34-rc6 #2727
| Call Trace:
|  [<ffffffff811c7565>] debug_smp_processor_id+0xd5/0xf0
|  [<ffffffff81019874>] p4_hw_config+0x2b/0x15c
|  [<ffffffff8107acbc>] ? trace_hardirqs_on_caller+0x12b/0x14f
|  [<ffffffff81019143>] hw_perf_event_init+0x468/0x7be
|  [<ffffffff810782fd>] ? debug_mutex_init+0x31/0x3c
|  [<ffffffff810c68b2>] T.850+0x273/0x42e
|  [<ffffffff810c6cab>] sys_perf_event_open+0x23e/0x3f1
|  [<ffffffff81009e6a>] ? sysret_check+0x2e/0x69
|  [<ffffffff81009e32>] system_call_fastpath+0x16/0x1b
|
| When running perf record in latest tip/perf/core
|

Due to the fact that p4 counters are shared between HT threads
we synthetically divide the whole set of counters into two
non-intersected subsets. And while we're "borrowing" counters
from these subsets we should not be preempted (well, strictly
speaking in p4_hw_config we just pre-set reference to the
subset which allow to save some cycles in schedule routine
if it happens on the same cpu). So use get_cpu/put_cpu pair.

Also p4_pmu_schedule_events should use smp_processor_id rather
than raw_ version. This allow us to catch up preemption issue
(if there will ever be).

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Tested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100508112716.963478928@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index b1f532d1d36..ca40180c41d 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -421,7 +421,8 @@ static u64 p4_pmu_event_map(int hw_event)
 
 static int p4_hw_config(struct perf_event *event)
 {
-	int cpu = raw_smp_processor_id();
+	int cpu = get_cpu();
+	int rc = 0;
 	u32 escr, cccr;
 
 	/*
@@ -454,7 +455,10 @@ static int p4_hw_config(struct perf_event *event)
 			 p4_config_pack_cccr(P4_CCCR_MASK_HT));
 	}
 
-	return x86_setup_perfctr(event);
+	rc = x86_setup_perfctr(event);
+	put_cpu();
+
+	return rc;
 }
 
 static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
-- 
cgit v1.2.3


From 3f51b7119d052827dbb0e40c966acdf2bdc6f47f Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sat, 8 May 2010 15:25:53 +0400
Subject: x86, perf: P4 PMU -- Get rid of redundant check for array index

The caller already has done such a check.
And it was wrong anyway, it had to be '>=' rather than '>'

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100508112717.130386882@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ca40180c41d..b8c2d379eea 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -406,11 +406,6 @@ static u64 p4_pmu_event_map(int hw_event)
 	unsigned int esel;
 	u64 config;
 
-	if (hw_event > ARRAY_SIZE(p4_general_events)) {
-		printk_once(KERN_ERR "P4 PMU: Bad index: %i\n", hw_event);
-		return 0;
-	}
-
 	config = p4_general_events[hw_event];
 	bind = p4_config_get_bind(config);
 	esel = P4_OPCODE_ESEL(bind->opcode);
-- 
cgit v1.2.3


From c7993165ef0c1d636ca05f4787739f8414584e6d Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sat, 8 May 2010 15:25:54 +0400
Subject: x86, perf: P4 PMU -- check for proper event index in RAW events

RAW events are special and we should be ready for user passing
in insane event index values.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100508112717.315897547@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index b8c2d379eea..a603930271f 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -418,6 +418,7 @@ static int p4_hw_config(struct perf_event *event)
 {
 	int cpu = get_cpu();
 	int rc = 0;
+	unsigned int evnt;
 	u32 escr, cccr;
 
 	/*
@@ -436,6 +437,14 @@ static int p4_hw_config(struct perf_event *event)
 		event->hw.config = p4_set_ht_bit(event->hw.config);
 
 	if (event->attr.type == PERF_TYPE_RAW) {
+
+		/* user data may have out-of-bound event index */
+		evnt = p4_config_unpack_event(event->attr.config);
+		if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
+			rc = -EINVAL;
+			goto out;
+		}
+
 		/*
 		 * We don't control raw events so it's up to the caller
 		 * to pass sane values (and we don't count the thread number
@@ -451,8 +460,8 @@ static int p4_hw_config(struct perf_event *event)
 	}
 
 	rc = x86_setup_perfctr(event);
+out:
 	put_cpu();
-
 	return rc;
 }
 
-- 
cgit v1.2.3


From bbf1bb3eee86f2eef2baa14e600be454d09109ee Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 8 May 2010 16:20:53 +0200
Subject: cpu_stop: add dummy implementation for UP

When !CONFIG_SMP, cpu_stop functions weren't defined at all which
could lead to build failures if UP code uses cpu_stop facility.  Add
dummy cpu_stop implementation for UP.  The waiting variants execute
the work function directly with preempt disabled and
stop_one_cpu_nowait() schedules a workqueue work.

Makefile and ifdefs around stop_machine implementation are updated to
accomodate CONFIG_SMP && !CONFIG_STOP_MACHINE case.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/stop_machine.h | 69 ++++++++++++++++++++++++++++++++++++++++----
 kernel/Makefile              |  2 +-
 kernel/stop_machine.c        |  4 +++
 3 files changed, 68 insertions(+), 7 deletions(-)

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 0e552e72a4c..6b524a0d02e 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -6,8 +6,6 @@
 #include <linux/list.h>
 #include <asm/system.h>
 
-#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
-
 /*
  * stop_cpu[s]() is simplistic per-cpu maximum priority cpu
  * monopolization mechanism.  The caller can specify a non-sleeping
@@ -18,9 +16,10 @@
  * up and requests are guaranteed to be served as long as the target
  * cpus are online.
  */
-
 typedef int (*cpu_stop_fn_t)(void *arg);
 
+#ifdef CONFIG_SMP
+
 struct cpu_stop_work {
 	struct list_head	list;		/* cpu_stopper->works */
 	cpu_stop_fn_t		fn;
@@ -34,12 +33,70 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
 int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
 
+#else	/* CONFIG_SMP */
+
+#include <linux/workqueue.h>
+
+struct cpu_stop_work {
+	struct work_struct	work;
+	cpu_stop_fn_t		fn;
+	void			*arg;
+};
+
+static inline int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
+{
+	int ret = -ENOENT;
+	preempt_disable();
+	if (cpu == smp_processor_id())
+		ret = fn(arg);
+	preempt_enable();
+	return ret;
+}
+
+static void stop_one_cpu_nowait_workfn(struct work_struct *work)
+{
+	struct cpu_stop_work *stwork =
+		container_of(work, struct cpu_stop_work, work);
+	preempt_disable();
+	stwork->fn(stwork->arg);
+	preempt_enable();
+}
+
+static inline void stop_one_cpu_nowait(unsigned int cpu,
+				       cpu_stop_fn_t fn, void *arg,
+				       struct cpu_stop_work *work_buf)
+{
+	if (cpu == smp_processor_id()) {
+		INIT_WORK(&work_buf->work, stop_one_cpu_nowait_workfn);
+		work_buf->fn = fn;
+		work_buf->arg = arg;
+		schedule_work(&work_buf->work);
+	}
+}
+
+static inline int stop_cpus(const struct cpumask *cpumask,
+			    cpu_stop_fn_t fn, void *arg)
+{
+	if (cpumask_test_cpu(raw_smp_processor_id(), cpumask))
+		return stop_one_cpu(raw_smp_processor_id(), fn, arg);
+	return -ENOENT;
+}
+
+static inline int try_stop_cpus(const struct cpumask *cpumask,
+				cpu_stop_fn_t fn, void *arg)
+{
+	return stop_cpus(cpumask, fn, arg);
+}
+
+#endif	/* CONFIG_SMP */
+
 /*
  * stop_machine "Bogolock": stop the entire machine, disable
  * interrupts.  This is a very heavy lock, which is equivalent to
  * grabbing every spinlock (and more).  So the "read" side to such a
  * lock is anything which disables preeempt.
  */
+#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP)
 
 /**
  * stop_machine: freeze the machine on all CPUs and run this function
@@ -67,7 +124,7 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
  */
 int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
 
-#else
+#else	 /* CONFIG_STOP_MACHINE && CONFIG_SMP */
 
 static inline int stop_machine(int (*fn)(void *), void *data,
 			       const struct cpumask *cpus)
@@ -79,5 +136,5 @@ static inline int stop_machine(int (*fn)(void *), void *data,
 	return ret;
 }
 
-#endif /* CONFIG_SMP */
-#endif /* _LINUX_STOP_MACHINE */
+#endif	/* CONFIG_STOP_MACHINE && CONFIG_SMP */
+#endif	/* _LINUX_STOP_MACHINE */
diff --git a/kernel/Makefile b/kernel/Makefile
index a987aa1676b..149e18ef1ab 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,7 +68,7 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
-obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 5b20141a5ec..ef51d1fcf5e 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -375,6 +375,8 @@ static int __init cpu_stop_init(void)
 }
 early_initcall(cpu_stop_init);
 
+#ifdef CONFIG_STOP_MACHINE
+
 /* This controls the threads on each CPU. */
 enum stopmachine_state {
 	/* Dummy starting state for thread. */
@@ -477,3 +479,5 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 	return ret;
 }
 EXPORT_SYMBOL_GPL(stop_machine);
+
+#endif	/* CONFIG_STOP_MACHINE */
-- 
cgit v1.2.3


From c0614829c16ab9d31f1b7d40516decfbf3d32102 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 27 Apr 2010 18:33:12 -0400
Subject: kprobes: Move enable/disable_kprobe() out from debugfs code

Move enable/disable_kprobe() API out from debugfs related code,
because these interfaces are not related to debugfs interface.

This fixes a compiler warning.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by:  Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Cc: systemtap <systemtap@sources.redhat.com>
Cc: DLE <dle-develop@lists.sourceforge.net>
LKML-Reference: <20100427223312.2322.60512.stgit@localhost6.localdomain6>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kprobes.c | 132 +++++++++++++++++++++++++++----------------------------
 1 file changed, 66 insertions(+), 66 deletions(-)

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0ed46f3e51e..282035f3ae9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1588,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p)
 	arch_remove_kprobe(p);
 }
 
+/* Disable one kprobe */
+int __kprobes disable_kprobe(struct kprobe *kp)
+{
+	int ret = 0;
+	struct kprobe *p;
+
+	mutex_lock(&kprobe_mutex);
+
+	/* Check whether specified probe is valid. */
+	p = __get_valid_kprobe(kp);
+	if (unlikely(p == NULL)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* If the probe is already disabled (or gone), just return */
+	if (kprobe_disabled(kp))
+		goto out;
+
+	kp->flags |= KPROBE_FLAG_DISABLED;
+	if (p != kp)
+		/* When kp != p, p is always enabled. */
+		try_to_disable_aggr_kprobe(p);
+
+	if (!kprobes_all_disarmed && kprobe_disabled(p))
+		disarm_kprobe(p);
+out:
+	mutex_unlock(&kprobe_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(disable_kprobe);
+
+/* Enable one kprobe */
+int __kprobes enable_kprobe(struct kprobe *kp)
+{
+	int ret = 0;
+	struct kprobe *p;
+
+	mutex_lock(&kprobe_mutex);
+
+	/* Check whether specified probe is valid. */
+	p = __get_valid_kprobe(kp);
+	if (unlikely(p == NULL)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (kprobe_gone(kp)) {
+		/* This kprobe has gone, we couldn't enable it. */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (p != kp)
+		kp->flags &= ~KPROBE_FLAG_DISABLED;
+
+	if (!kprobes_all_disarmed && kprobe_disabled(p)) {
+		p->flags &= ~KPROBE_FLAG_DISABLED;
+		arm_kprobe(p);
+	}
+out:
+	mutex_unlock(&kprobe_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(enable_kprobe);
+
 void __kprobes dump_kprobe(struct kprobe *kp)
 {
 	printk(KERN_WARNING "Dumping kprobe:\n");
@@ -1805,72 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = {
 	.release        = seq_release,
 };
 
-/* Disable one kprobe */
-int __kprobes disable_kprobe(struct kprobe *kp)
-{
-	int ret = 0;
-	struct kprobe *p;
-
-	mutex_lock(&kprobe_mutex);
-
-	/* Check whether specified probe is valid. */
-	p = __get_valid_kprobe(kp);
-	if (unlikely(p == NULL)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/* If the probe is already disabled (or gone), just return */
-	if (kprobe_disabled(kp))
-		goto out;
-
-	kp->flags |= KPROBE_FLAG_DISABLED;
-	if (p != kp)
-		/* When kp != p, p is always enabled. */
-		try_to_disable_aggr_kprobe(p);
-
-	if (!kprobes_all_disarmed && kprobe_disabled(p))
-		disarm_kprobe(p);
-out:
-	mutex_unlock(&kprobe_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(disable_kprobe);
-
-/* Enable one kprobe */
-int __kprobes enable_kprobe(struct kprobe *kp)
-{
-	int ret = 0;
-	struct kprobe *p;
-
-	mutex_lock(&kprobe_mutex);
-
-	/* Check whether specified probe is valid. */
-	p = __get_valid_kprobe(kp);
-	if (unlikely(p == NULL)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (kprobe_gone(kp)) {
-		/* This kprobe has gone, we couldn't enable it. */
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (p != kp)
-		kp->flags &= ~KPROBE_FLAG_DISABLED;
-
-	if (!kprobes_all_disarmed && kprobe_disabled(p)) {
-		p->flags &= ~KPROBE_FLAG_DISABLED;
-		arm_kprobe(p);
-	}
-out:
-	mutex_unlock(&kprobe_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(enable_kprobe);
-
 static void __kprobes arm_all_kprobes(void)
 {
 	struct hlist_head *head;
-- 
cgit v1.2.3


From e157eb8341e7885ff2d9f1620155e3da6e0c8f56 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sat, 8 May 2010 18:33:03 +0300
Subject: perf report: Document '--call-graph' better for usage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch improves 'perf report -h' output for the
'--call-graph' command line option by enumerating the
different output types.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1273332783-4268-1-git-send-email-penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 0152b5412cc..196473b5125 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -457,7 +457,7 @@ static const struct option options[] = {
 	OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other,
 		    "Only display entries with parent-match"),
 	OPT_CALLBACK_DEFAULT('g', "call-graph", NULL, "output_type,min_percent",
-		     "Display callchains using output_type and min percent threshold. "
+		     "Display callchains using output_type (graph, flat, fractal, or none) and min percent threshold. "
 		     "Default: fractal,0.5", &parse_callchain_opt, callchain_default_opt),
 	OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
 		   "only consider symbols in these dsos"),
-- 
cgit v1.2.3


From 96f6e775b58687d85ee33004d414419b5ec34106 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 9 May 2010 01:10:34 -0700
Subject: x86, hypervisor: Export the x86_hyper* symbols

Export x86_hyper and the related specific structures, allowing for
hypervisor identification by modules.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Greg KH <greg@kroah.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Ky Srinivasan <ksrinivasan@novell.com>
Cc: Dmitry Torokhov <dtor@vmware.com>
LKML-Reference: <4BE49778.6060800@zytor.com>
---
 arch/x86/kernel/cpu/hypervisor.c | 1 +
 arch/x86/kernel/cpu/mshyperv.c   | 1 +
 arch/x86/kernel/cpu/vmware.c     | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 87381759d3c..4afb5a2130e 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -36,6 +36,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
 };
 
 const struct hypervisor_x86 *x86_hyper;
+EXPORT_SYMBOL(x86_hyper);
 
 static inline void __init
 detect_hypervisor_vendor(void)
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 5969c3ee318..0f1371724c8 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -51,3 +51,4 @@ const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
 	.detect			= ms_hyperv_platform,
 	.init_platform		= ms_hyperv_init_platform,
 };
+EXPORT_SYMBOL(x86_hyper_ms_hyperv);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 265b432f6e6..b9d1ff58844 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -99,7 +99,6 @@ static bool __init vmware_platform(void)
 
 	return false;
 }
-EXPORT_SYMBOL(vmware_platform);
 
 /*
  * VMware hypervisor takes care of exporting a reliable TSC to the guest.
@@ -125,3 +124,4 @@ const __refconst struct hypervisor_x86 x86_hyper_vmware = {
 	.set_cpu_features	= vmware_set_cpu_features,
 	.init_platform		= vmware_platform_setup,
 };
+EXPORT_SYMBOL(x86_hyper_vmware);
-- 
cgit v1.2.3


From a10a569806e43b9be5fce60b21f836b50b1010e4 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 9 May 2010 01:13:42 -0700
Subject: Modify the VMware balloon driver for the new x86_hyper API

Modify the VMware balloon driver to match the new x86_hyper API.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Greg KH <greg@kroah.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Ky Srinivasan <ksrinivasan@novell.com>
Cc: Dmitry Torokhov <dtor@vmware.com>
LKML-Reference: <4BE49778.6060800@zytor.com>
---
 drivers/misc/vmware_balloon.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/vmware_balloon.c b/drivers/misc/vmware_balloon.c
index e7161c4e379..db9cd0240c6 100644
--- a/drivers/misc/vmware_balloon.c
+++ b/drivers/misc/vmware_balloon.c
@@ -41,7 +41,7 @@
 #include <linux/workqueue.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
-#include <asm/vmware.h>
+#include <asm/hypervisor.h>
 
 MODULE_AUTHOR("VMware, Inc.");
 MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
@@ -767,7 +767,7 @@ static int __init vmballoon_init(void)
 	 * Check if we are running on VMware's hypervisor and bail out
 	 * if we are not.
 	 */
-	if (!vmware_platform())
+	if (x86_hyper != &x86_hyper_vmware)
 		return -ENODEV;
 
 	vmballoon_wq = create_freezeable_workqueue("vmmemctl");
-- 
cgit v1.2.3


From 984028075794c00cbf4fb1e94bb6233e8be08875 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 2 May 2010 22:05:29 +0200
Subject: perf: Introduce a new "round of buffers read" pseudo event

In order to provide a more rubust and deterministic reordering
algorithm, we need to know when we reach a point where we just
did a pass through over every counter buffers to read every thing
they had.

This patch introduces a new PERF_RECORD_FINISHED_ROUND pseudo event
that only consist in an event header and doesn't need to contain
anything.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
---
 tools/perf/builtin-record.c | 34 ++++++++++++++++++++++++----------
 tools/perf/util/event.h     |  3 ++-
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index d3981ac50e1..6b77b285fe1 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -494,6 +494,29 @@ static void event__synthesize_guest_os(struct machine *machine, void *data)
 		       " relocation symbol.\n", machine->pid);
 }
 
+static struct perf_event_header finished_round_event = {
+	.size = sizeof(struct perf_event_header),
+	.type = PERF_RECORD_FINISHED_ROUND,
+};
+
+static void mmap_read_all(void)
+{
+	int i, counter, thread;
+
+	for (i = 0; i < nr_cpu; i++) {
+		for (counter = 0; counter < nr_counters; counter++) {
+			for (thread = 0; thread < thread_num; thread++) {
+				if (mmap_array[i][counter][thread].base)
+					mmap_read(&mmap_array[i][counter][thread]);
+			}
+
+		}
+	}
+
+	if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO))
+		write_output(&finished_round_event, sizeof(finished_round_event));
+}
+
 static int __cmd_record(int argc, const char **argv)
 {
 	int i, counter;
@@ -726,16 +749,7 @@ static int __cmd_record(int argc, const char **argv)
 		int hits = samples;
 		int thread;
 
-		for (i = 0; i < nr_cpu; i++) {
-			for (counter = 0; counter < nr_counters; counter++) {
-				for (thread = 0;
-					thread < thread_num; thread++) {
-					if (mmap_array[i][counter][thread].base)
-						mmap_read(&mmap_array[i][counter][thread]);
-				}
-
-			}
-		}
+		mmap_read_all();
 
 		if (hits == samples) {
 			if (done)
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index b364da5b0cb..6cc1b1dced5 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -84,11 +84,12 @@ struct build_id_event {
 	char			 filename[];
 };
 
-enum perf_header_event_type { /* above any possible kernel type */
+enum perf_user_event_type { /* above any possible kernel type */
 	PERF_RECORD_HEADER_ATTR			= 64,
 	PERF_RECORD_HEADER_EVENT_TYPE		= 65,
 	PERF_RECORD_HEADER_TRACING_DATA		= 66,
 	PERF_RECORD_HEADER_BUILD_ID		= 67,
+	PERF_RECORD_FINISHED_ROUND		= 68,
 	PERF_RECORD_HEADER_MAX
 };
 
-- 
cgit v1.2.3


From d6b17bebd79dae2e3577f2ea27a832af4991a5e6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 3 May 2010 15:14:33 +0200
Subject: perf: Provide a new deterministic events reordering algorithm

The current events reordering algorithm is based on a heuristic that
gets broken once we deal with a very fast flow of events.

Indeed the time period based flushing is not suitable anymore
in the following case, assuming we have a flush period of two
seconds.

    CPU 0           |        CPU 1
                    |
  cnt1 timestamps   |      cnt1 timestamps
                    |
    0               |         0
    1               |         1
    2               |         2
    3               |         3
    [...]           |        [...]
    4 seconds later

If we spend too much time to read the buffers (case of a lot of
events to record in each buffers or when we have a lot of CPU buffers
to read), in the next pass the CPU 0 buffer could contain a slice
of several seconds of events. We'll read them all and notice we've
reached the period to flush. In the above example we flush the first
half of the CPU 0 buffer, then we read the CPU 1 buffer where we
have events that were on the flush slice and then the reordering
fails.

It's simple to reproduce with:

	perf lock record perf bench sched messaging

To solve this, we use a new solution that doesn't rely on an
heuristical time slice period anymore but on a deterministic basis
based on how perf record does its job.

perf record saves the buffers through passes. A pass is a tour
on every buffers from every CPUs. This is made in order: for
each CPU we read the buffers of every counters. So the more
buffers we visit, the later will be the timstamps of their events.

When perf record finishes a pass it records a
PERF_RECORD_FINISHED_ROUND pseudo event.
We record the max timestamp t found in the pass n. Assuming these
timestamps are monotonic across cpus, we know that if a buffer
still has events with timestamps below t, they will be all available
and then read in the pass n + 1.
Hence when we start to read the pass n + 2, we can safely flush every
events with timestamps below t.

      ============ PASS n =================
         CPU 0         |   CPU 1
                       |
      cnt1 timestamps  |   cnt2 timestamps
            1          |         2
            2          |         3
            -          |         4  <--- max recorded

      ============ PASS n + 1 ==============
         CPU 0         |   CPU 1
                       |
      cnt1 timestamps  |   cnt2 timestamps
            3          |         5
            4          |         6
            5          |         7 <---- max recorded

        Flush every events below timestamp 4

      ============ PASS n + 2 ==============
         CPU 0         |   CPU 1
                       |
      cnt1 timestamps  |   cnt2 timestamps
            6          |         8
            7          |         9
            -          |         10

        Flush every events below timestamp 7
        etc...

It also works on perf.data versions that don't have
PERF_RECORD_FINISHED_ROUND pseudo events. The difference is that
the events will be only flushed in the end of the perf.data
processing. It will then consume more memory and scale less with
large perf.data files.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
---
 tools/perf/util/session.c | 106 +++++++++++++++++++++++++++++++++-------------
 tools/perf/util/session.h |  36 +++++++++-------
 2 files changed, 97 insertions(+), 45 deletions(-)

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 5d353e70fe2..9401909fa28 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -98,7 +98,6 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
 	self->unknown_events = 0;
 	self->machines = RB_ROOT;
 	self->repipe = repipe;
-	self->ordered_samples.flush_limit = ULLONG_MAX;
 	INIT_LIST_HEAD(&self->ordered_samples.samples_head);
 
 	if (mode == O_RDONLY) {
@@ -194,6 +193,18 @@ static int process_event_stub(event_t *event __used,
 	return 0;
 }
 
+static int process_finished_round_stub(event_t *event __used,
+				       struct perf_session *session __used,
+				       struct perf_event_ops *ops __used)
+{
+	dump_printf(": unhandled!\n");
+	return 0;
+}
+
+static int process_finished_round(event_t *event,
+				  struct perf_session *session,
+				  struct perf_event_ops *ops);
+
 static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
 {
 	if (handler->sample == NULL)
@@ -222,6 +233,12 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
 		handler->tracing_data = process_event_stub;
 	if (handler->build_id == NULL)
 		handler->build_id = process_event_stub;
+	if (handler->finished_round == NULL) {
+		if (handler->ordered_samples)
+			handler->finished_round = process_finished_round;
+		else
+			handler->finished_round = process_finished_round_stub;
+	}
 }
 
 static const char *event__name[] = {
@@ -359,16 +376,14 @@ struct sample_queue {
 	struct list_head	list;
 };
 
-#define FLUSH_PERIOD	(2 * NSEC_PER_SEC)
-
 static void flush_sample_queue(struct perf_session *s,
 			       struct perf_event_ops *ops)
 {
 	struct list_head *head = &s->ordered_samples.samples_head;
-	u64 limit = s->ordered_samples.flush_limit;
+	u64 limit = s->ordered_samples.next_flush;
 	struct sample_queue *tmp, *iter;
 
-	if (!ops->ordered_samples)
+	if (!ops->ordered_samples || !limit)
 		return;
 
 	list_for_each_entry_safe(iter, tmp, head, list) {
@@ -387,6 +402,55 @@ static void flush_sample_queue(struct perf_session *s,
 	}
 }
 
+/*
+ * When perf record finishes a pass on every buffers, it records this pseudo
+ * event.
+ * We record the max timestamp t found in the pass n.
+ * Assuming these timestamps are monotonic across cpus, we know that if
+ * a buffer still has events with timestamps below t, they will be all
+ * available and then read in the pass n + 1.
+ * Hence when we start to read the pass n + 2, we can safely flush every
+ * events with timestamps below t.
+ *
+ *    ============ PASS n =================
+ *       CPU 0         |   CPU 1
+ *                     |
+ *    cnt1 timestamps  |   cnt2 timestamps
+ *          1          |         2
+ *          2          |         3
+ *          -          |         4  <--- max recorded
+ *
+ *    ============ PASS n + 1 ==============
+ *       CPU 0         |   CPU 1
+ *                     |
+ *    cnt1 timestamps  |   cnt2 timestamps
+ *          3          |         5
+ *          4          |         6
+ *          5          |         7 <---- max recorded
+ *
+ *      Flush every events below timestamp 4
+ *
+ *    ============ PASS n + 2 ==============
+ *       CPU 0         |   CPU 1
+ *                     |
+ *    cnt1 timestamps  |   cnt2 timestamps
+ *          6          |         8
+ *          7          |         9
+ *          -          |         10
+ *
+ *      Flush every events below timestamp 7
+ *      etc...
+ */
+static int process_finished_round(event_t *event __used,
+				  struct perf_session *session,
+				  struct perf_event_ops *ops)
+{
+	flush_sample_queue(session, ops);
+	session->ordered_samples.next_flush = session->ordered_samples.max_timestamp;
+
+	return 0;
+}
+
 static void __queue_sample_end(struct sample_queue *new, struct list_head *head)
 {
 	struct sample_queue *iter;
@@ -455,16 +519,11 @@ static void __queue_sample_event(struct sample_queue *new,
 }
 
 static int queue_sample_event(event_t *event, struct sample_data *data,
-			      struct perf_session *s,
-			      struct perf_event_ops *ops)
+			      struct perf_session *s)
 {
 	u64 timestamp = data->time;
 	struct sample_queue *new;
-	u64 flush_limit;
-
 
-	if (s->ordered_samples.flush_limit == ULLONG_MAX)
-		s->ordered_samples.flush_limit = timestamp + FLUSH_PERIOD;
 
 	if (timestamp < s->ordered_samples.last_flush) {
 		printf("Warning: Timestamp below last timeslice flush\n");
@@ -488,23 +547,8 @@ static int queue_sample_event(event_t *event, struct sample_data *data,
 	__queue_sample_event(new, s);
 	s->ordered_samples.last_inserted = new;
 
-	/*
-	 * We want to have a slice of events covering 2 * FLUSH_PERIOD
-	 * If FLUSH_PERIOD is big enough, it ensures every events that occured
-	 * in the first half of the timeslice have all been buffered and there
-	 * are none remaining (we need that because of the weakly ordered
-	 * event recording we have). Then once we reach the 2 * FLUSH_PERIOD
-	 * timeslice, we flush the first half to be gentle with the memory
-	 * (the second half can still get new events in the middle, so wait
-	 * another period to flush it)
-	 */
-	flush_limit = s->ordered_samples.flush_limit;
-
-	if (new->timestamp > flush_limit &&
-		new->timestamp - flush_limit > FLUSH_PERIOD) {
-		s->ordered_samples.flush_limit += FLUSH_PERIOD;
-		flush_sample_queue(s, ops);
-	}
+	if (new->timestamp > s->ordered_samples.max_timestamp)
+		s->ordered_samples.max_timestamp = new->timestamp;
 
 	return 0;
 }
@@ -520,7 +564,7 @@ static int perf_session__process_sample(event_t *event, struct perf_session *s,
 	bzero(&data, sizeof(struct sample_data));
 	event__parse_sample(event, s->sample_type, &data);
 
-	queue_sample_event(event, &data, s, ops);
+	queue_sample_event(event, &data, s);
 
 	return 0;
 }
@@ -572,6 +616,8 @@ static int perf_session__process_event(struct perf_session *self,
 		return ops->tracing_data(event, self);
 	case PERF_RECORD_HEADER_BUILD_ID:
 		return ops->build_id(event, self);
+	case PERF_RECORD_FINISHED_ROUND:
+		return ops->finished_round(event, self, ops);
 	default:
 		self->unknown_events++;
 		return -1;
@@ -786,7 +832,7 @@ more:
 done:
 	err = 0;
 	/* do the final flush for ordered samples */
-	self->ordered_samples.flush_limit = ULLONG_MAX;
+	self->ordered_samples.next_flush = ULLONG_MAX;
 	flush_sample_queue(self, ops);
 out_err:
 	ui_progress__delete(progress);
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index f2b2c6a3a49..568fd08a478 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -14,7 +14,8 @@ struct thread;
 
 struct ordered_samples {
 	u64			last_flush;
-	u64			flush_limit;
+	u64			next_flush;
+	u64			max_timestamp;
 	struct list_head	samples_head;
 	struct sample_queue	*last_inserted;
 };
@@ -41,23 +42,28 @@ struct perf_session {
 	char filename[0];
 };
 
+struct perf_event_ops;
+
 typedef int (*event_op)(event_t *self, struct perf_session *session);
+typedef int (*event_op2)(event_t *self, struct perf_session *session,
+			 struct perf_event_ops *ops);
 
 struct perf_event_ops {
-	event_op sample,
-		 mmap,
-		 comm,
-		 fork,
-		 exit,
-		 lost,
-		 read,
-		 throttle,
-		 unthrottle,
-		 attr,
-		 event_type,
-		 tracing_data,
-		 build_id;
-	bool	ordered_samples;
+	event_op	sample,
+			mmap,
+			comm,
+			fork,
+			exit,
+			lost,
+			read,
+			throttle,
+			unthrottle,
+			attr,
+			event_type,
+			tracing_data,
+			build_id;
+	event_op2	finished_round;
+	bool		ordered_samples;
 };
 
 struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe);
-- 
cgit v1.2.3


From 26242d859c9be9eea61f7f19514e9d272ae8ce26 Mon Sep 17 00:00:00 2001
From: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Date: Mon, 3 May 2010 14:12:00 +0900
Subject: perf lock: Add "info" subcommand for dumping misc information

This adds the "info" subcommand to perf lock which can be used
to dump metadata like threads or addresses of lock instances.
"map" was removed because info should do the work for it.

This will be useful not only for debugging but also for ordinary
analyzing.

v2: adding example of usage
% sudo ./perf lock info -t
 | Thread ID: comm
 | 	 0: swapper
 |         1: init
 |        18: migration/5
 |        29: events/2
 |        32: events/5
 |        33: events/6
...

% sudo ./perf lock info -m
| Address of instance: name of class
|  0xffff8800b95adae0: &(&sighand->siglock)->rlock
|  0xffff8800bbb41ae0: &(&sighand->siglock)->rlock
|  0xffff8800bf165ae0: &(&sighand->siglock)->rlock
|  0xffff8800b9576a98: &p->cred_guard_mutex
|  0xffff8800bb890a08: &(&p->alloc_lock)->rlock
|  0xffff8800b9522a08: &(&p->alloc_lock)->rlock
|  0xffff8800bb8aaa08: &(&p->alloc_lock)->rlock
|  0xffff8800bba72a08: &(&p->alloc_lock)->rlock
|  0xffff8800bf18ea08: &(&p->alloc_lock)->rlock
|  0xffff8800b8a0d8a0: &(&ip->i_lock)->mr_lock
|  0xffff88009bf818a0: &(&ip->i_lock)->mr_lock
|  0xffff88004c66b8a0: &(&ip->i_lock)->mr_lock
|  0xffff8800bb6478a0: &(shost->host_lock)->rlock

v3: fixed some problems Frederic pointed out
 * better rbtree tracking in dump_threads()
 * removed printf() and used pr_info() and pr_debug()

Signed-off-by: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
LKML-Reference: <1272863520-16179-1-git-send-email-mitake@dcl.info.waseda.ac.jp>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 tools/perf/builtin-lock.c | 96 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 73 insertions(+), 23 deletions(-)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 6605000ed73..c4eb854ff7e 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -720,15 +720,15 @@ static void print_result(void)
 	char cut_name[20];
 	int bad, total;
 
-	printf("%20s ", "Name");
-	printf("%10s ", "acquired");
-	printf("%10s ", "contended");
+	pr_info("%20s ", "Name");
+	pr_info("%10s ", "acquired");
+	pr_info("%10s ", "contended");
 
-	printf("%15s ", "total wait (ns)");
-	printf("%15s ", "max wait (ns)");
-	printf("%15s ", "min wait (ns)");
+	pr_info("%15s ", "total wait (ns)");
+	pr_info("%15s ", "max wait (ns)");
+	pr_info("%15s ", "min wait (ns)");
 
-	printf("\n\n");
+	pr_info("\n\n");
 
 	bad = total = 0;
 	while ((st = pop_from_result())) {
@@ -741,7 +741,7 @@ static void print_result(void)
 
 		if (strlen(st->name) < 16) {
 			/* output raw name */
-			printf("%20s ", st->name);
+			pr_info("%20s ", st->name);
 		} else {
 			strncpy(cut_name, st->name, 16);
 			cut_name[16] = '.';
@@ -749,17 +749,17 @@ static void print_result(void)
 			cut_name[18] = '.';
 			cut_name[19] = '\0';
 			/* cut off name for saving output style */
-			printf("%20s ", cut_name);
+			pr_info("%20s ", cut_name);
 		}
 
-		printf("%10u ", st->nr_acquired);
-		printf("%10u ", st->nr_contended);
+		pr_info("%10u ", st->nr_acquired);
+		pr_info("%10u ", st->nr_contended);
 
-		printf("%15llu ", st->wait_time_total);
-		printf("%15llu ", st->wait_time_max);
-		printf("%15llu ", st->wait_time_min == ULLONG_MAX ?
+		pr_info("%15llu ", st->wait_time_total);
+		pr_info("%15llu ", st->wait_time_max);
+		pr_info("%15llu ", st->wait_time_min == ULLONG_MAX ?
 		       0 : st->wait_time_min);
-		printf("\n");
+		pr_info("\n");
 	}
 
 	{
@@ -768,28 +768,59 @@ static void print_result(void)
 		const char *name[4] =
 			{ "acquire", "acquired", "contended", "release" };
 
-		printf("\n=== output for debug===\n\n");
-		printf("bad:%d, total:%d\n", bad, total);
-		printf("bad rate:%f\n", (double)(bad / total));
+		pr_debug("\n=== output for debug===\n\n");
+		pr_debug("bad:%d, total:%d\n", bad, total);
+		pr_debug("bad rate:%f\n", (double)(bad / total));
 
-		printf("histogram of events caused bad sequence\n");
+		pr_debug("histogram of events caused bad sequence\n");
 		for (i = 0; i < 4; i++)
-			printf(" %10s: %d\n", name[i], bad_hist[i]);
+			pr_debug(" %10s: %d\n", name[i], bad_hist[i]);
 	}
 }
 
+static int			info_threads;
+static int			info_map;
+
+static void dump_threads(void)
+{
+	struct thread_stat *st;
+	struct rb_node *node;
+	struct thread *t;
+
+	pr_info("%10s: comm\n", "Thread ID");
+
+	node = rb_first(&thread_stats);
+	while (node) {
+		st = container_of(node, struct thread_stat, rb);
+		t = perf_session__findnew(session, st->tid);
+		pr_info("%10d: %s\n", st->tid, t->comm);
+		node = rb_next(node);
+	};
+}
+
 static void dump_map(void)
 {
 	unsigned int i;
 	struct lock_stat *st;
 
+	pr_info("Address of instance: name of class\n");
 	for (i = 0; i < LOCKHASH_SIZE; i++) {
 		list_for_each_entry(st, &lockhash_table[i], hash_entry) {
-			printf("%p: %s\n", st->addr, st->name);
+			pr_info(" %p: %s\n", st->addr, st->name);
 		}
 	}
 }
 
+static void dump_info(void)
+{
+	if (info_threads)
+		dump_threads();
+	else if (info_map)
+		dump_map();
+	else
+		die("Unknown type of information\n");
+}
+
 static int process_sample_event(event_t *self, struct perf_session *s)
 {
 	struct sample_data data;
@@ -858,6 +889,19 @@ static const struct option report_options[] = {
 	OPT_END()
 };
 
+static const char * const info_usage[] = {
+	"perf lock info [<options>]",
+	NULL
+};
+
+static const struct option info_options[] = {
+	OPT_BOOLEAN('t', "threads", &info_threads,
+		    "dump thread list in perf.data"),
+	OPT_BOOLEAN('m', "map", &info_map,
+		    "map of lock instances (name:address table)"),
+	OPT_END()
+};
+
 static const char * const lock_usage[] = {
 	"perf lock [<options>] {record|trace|report}",
 	NULL
@@ -929,12 +973,18 @@ int cmd_lock(int argc, const char **argv, const char *prefix __used)
 	} else if (!strcmp(argv[0], "trace")) {
 		/* Aliased to 'perf trace' */
 		return cmd_trace(argc, argv, prefix);
-	} else if (!strcmp(argv[0], "map")) {
+	} else if (!strcmp(argv[0], "info")) {
+		if (argc) {
+			argc = parse_options(argc, argv,
+					     info_options, info_usage, 0);
+			if (argc)
+				usage_with_options(info_usage, info_options);
+		}
 		/* recycling report_lock_ops */
 		trace_handler = &report_lock_ops;
 		setup_pager();
 		read_events();
-		dump_map();
+		dump_info();
 	} else {
 		usage_with_options(lock_usage, lock_options);
 	}
-- 
cgit v1.2.3


From 10350ec362b48f79f3df8447c25813790075e27c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 5 May 2010 23:47:28 +0200
Subject: perf: Cleanup perf lock broken states

Use enum to get a human view of bad_hist indexes and
put bad histogram output in its own function.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
---
 tools/perf/builtin-lock.c | 49 ++++++++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index c4eb854ff7e..1e93179c2d3 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -387,7 +387,15 @@ static struct lock_seq_stat *get_seq(struct thread_stat *ts, void *addr)
 	return seq;
 }
 
-static int bad_hist[4];
+enum broken_state {
+	BROKEN_ACQUIRE,
+	BROKEN_ACQUIRED,
+	BROKEN_CONTENDED,
+	BROKEN_RELEASE,
+	BROKEN_MAX,
+};
+
+static int bad_hist[BROKEN_MAX];
 
 static void
 report_lock_acquire_event(struct trace_acquire_event *acquire_event,
@@ -437,7 +445,7 @@ report_lock_acquire_event(struct trace_acquire_event *acquire_event,
 broken:
 		/* broken lock sequence, discard it */
 		ls->discard = 1;
-		bad_hist[0]++;
+		bad_hist[BROKEN_ACQUIRE]++;
 		list_del(&seq->list);
 		free(seq);
 		goto end;
@@ -481,7 +489,6 @@ report_lock_acquired_event(struct trace_acquired_event *acquired_event,
 	case SEQ_STATE_CONTENDED:
 		contended_term = timestamp - seq->prev_event_time;
 		ls->wait_time_total += contended_term;
-
 		if (contended_term < ls->wait_time_min)
 			ls->wait_time_min = contended_term;
 		else if (ls->wait_time_max < contended_term)
@@ -492,7 +499,7 @@ report_lock_acquired_event(struct trace_acquired_event *acquired_event,
 	case SEQ_STATE_READ_ACQUIRED:
 		/* broken lock sequence, discard it */
 		ls->discard = 1;
-		bad_hist[1]++;
+		bad_hist[BROKEN_ACQUIRED]++;
 		list_del(&seq->list);
 		free(seq);
 		goto end;
@@ -540,7 +547,7 @@ report_lock_contended_event(struct trace_contended_event *contended_event,
 	case SEQ_STATE_CONTENDED:
 		/* broken lock sequence, discard it */
 		ls->discard = 1;
-		bad_hist[2]++;
+		bad_hist[BROKEN_CONTENDED]++;
 		list_del(&seq->list);
 		free(seq);
 		goto end;
@@ -594,7 +601,7 @@ report_lock_release_event(struct trace_release_event *release_event,
 	case SEQ_STATE_RELEASED:
 		/* broken lock sequence, discard it */
 		ls->discard = 1;
-		bad_hist[3]++;
+		bad_hist[BROKEN_RELEASE]++;
 		goto free_seq;
 		break;
 	default:
@@ -713,6 +720,21 @@ process_raw_event(void *data, int cpu, u64 timestamp, struct thread *thread)
 		process_lock_release_event(data, event, cpu, timestamp, thread);
 }
 
+static void print_bad_events(int bad, int total)
+{
+	/* Output for debug, this have to be removed */
+	int i;
+	const char *name[4] =
+		{ "acquire", "acquired", "contended", "release" };
+
+	pr_info("\n=== output for debug===\n\n");
+	pr_info("bad:%d, total:%d\n", bad, total);
+	pr_info("bad rate:%f\n", (double)(bad / total));
+	pr_info("histogram of events caused bad sequence\n");
+	for (i = 0; i < BROKEN_MAX; i++)
+		pr_info(" %10s: %d\n", name[i], bad_hist[i]);
+}
+
 /* TODO: various way to print, coloring, nano or milli sec */
 static void print_result(void)
 {
@@ -762,20 +784,7 @@ static void print_result(void)
 		pr_info("\n");
 	}
 
-	{
-		/* Output for debug, this have to be removed */
-		int i;
-		const char *name[4] =
-			{ "acquire", "acquired", "contended", "release" };
-
-		pr_debug("\n=== output for debug===\n\n");
-		pr_debug("bad:%d, total:%d\n", bad, total);
-		pr_debug("bad rate:%f\n", (double)(bad / total));
-
-		pr_debug("histogram of events caused bad sequence\n");
-		for (i = 0; i < 4; i++)
-			pr_debug(" %10s: %d\n", name[i], bad_hist[i]);
-	}
+	print_bad_events(bad, total);
 }
 
 static int			info_threads;
-- 
cgit v1.2.3


From 84c7a21791eb2e962a27e19bab5b77d5d9e13a34 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 5 May 2010 23:57:25 +0200
Subject: perf: Humanize lock flags in perf lock

Use an enum instead of plain constants for lock flags.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
---
 tools/perf/builtin-lock.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 1e93179c2d3..3b304ed4d2e 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -397,6 +397,11 @@ enum broken_state {
 
 static int bad_hist[BROKEN_MAX];
 
+enum acquire_flags {
+	TRY_LOCK = 1,
+	READ_LOCK = 2,
+};
+
 static void
 report_lock_acquire_event(struct trace_acquire_event *acquire_event,
 			struct event *__event __used,
@@ -421,9 +426,9 @@ report_lock_acquire_event(struct trace_acquire_event *acquire_event,
 		if (!acquire_event->flag) {
 			seq->state = SEQ_STATE_ACQUIRING;
 		} else {
-			if (acquire_event->flag & 1)
+			if (acquire_event->flag & TRY_LOCK)
 				ls->nr_trylock++;
-			if (acquire_event->flag & 2)
+			if (acquire_event->flag & READ_LOCK)
 				ls->nr_readlock++;
 			seq->state = SEQ_STATE_READ_ACQUIRED;
 			seq->read_count = 1;
@@ -431,7 +436,7 @@ report_lock_acquire_event(struct trace_acquire_event *acquire_event,
 		}
 		break;
 	case SEQ_STATE_READ_ACQUIRED:
-		if (acquire_event->flag & 2) {
+		if (acquire_event->flag & READ_LOCK) {
 			seq->read_count++;
 			ls->nr_acquired++;
 			goto end;
-- 
cgit v1.2.3


From 5efe08cf685f33f562566dc68b6077b6f6a4f706 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 6 May 2010 04:55:22 +0200
Subject: perf: Fix perf lock bad rate

Fix the cast made to get the bad rate. It is made in the result
instead of the operands. We need the operands to be cast in double,
otherwise the result will always be zero.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
---
 tools/perf/builtin-lock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 3b304ed4d2e..d7dde9cbbdf 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -733,8 +733,8 @@ static void print_bad_events(int bad, int total)
 		{ "acquire", "acquired", "contended", "release" };
 
 	pr_info("\n=== output for debug===\n\n");
-	pr_info("bad:%d, total:%d\n", bad, total);
-	pr_info("bad rate:%f\n", (double)(bad / total));
+	pr_info("bad: %d, total: %d\n", bad, total);
+	pr_info("bad rate: %f %%\n", (double)bad / (double)total * 100);
 	pr_info("histogram of events caused bad sequence\n");
 	for (i = 0; i < BROKEN_MAX; i++)
 		pr_info(" %10s: %d\n", name[i], bad_hist[i]);
-- 
cgit v1.2.3


From 90c0e5fc7b73d2575c7367e1da70ff9521718e5e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 7 May 2010 02:33:42 +0200
Subject: perf lock: Always check min AND max wait time

When a lock is acquired after beeing contended, we update the
wait time statistics for the given lock.
But if the min wait time is updated, we don't check the max wait
time. This is wrong because the first time we update the wait time,
we want to update both min and max wait time.

Before:
	Name   acquired  contended total wait (ns)   max wait (ns)   min wait (ns)
	key          8          1           21656           0           21656

After:
	Name   acquired  contended total wait (ns)   max wait (ns)   min wait (ns)
	key          8          1           21656           21656           21656

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
---
 tools/perf/builtin-lock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index d7dde9cbbdf..e549f4547b9 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -496,7 +496,7 @@ report_lock_acquired_event(struct trace_acquired_event *acquired_event,
 		ls->wait_time_total += contended_term;
 		if (contended_term < ls->wait_time_min)
 			ls->wait_time_min = contended_term;
-		else if (ls->wait_time_max < contended_term)
+		if (ls->wait_time_max < contended_term)
 			ls->wait_time_max = contended_term;
 		break;
 	case SEQ_STATE_RELEASED:
-- 
cgit v1.2.3


From 883a2a3189dae9d2912c417e47152f51cb922a3f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 8 May 2010 06:16:11 +0200
Subject: tracing: Drop lock_acquired waittime field

Drop the waittime field from the lock_acquired event, we can
calculate it by substracting the lock_acquired event timestamp
with the matching lock_acquire one.

It is not needed and takes useless space in the traces.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/lock.h | 11 ++++-------
 kernel/lockdep.c            |  2 +-
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
index 5c1dcfc16c6..17ca287ae17 100644
--- a/include/trace/events/lock.h
+++ b/include/trace/events/lock.h
@@ -78,24 +78,21 @@ TRACE_EVENT(lock_contended,
 );
 
 TRACE_EVENT(lock_acquired,
-	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
 
-	TP_ARGS(lock, ip, waittime),
+	TP_ARGS(lock, ip),
 
 	TP_STRUCT__entry(
 		__string(name, lock->name)
-		__field(s64, wait_nsec)
 		__field(void *, lockdep_addr)
 	),
 
 	TP_fast_assign(
 		__assign_str(name, lock->name);
-		__entry->wait_nsec = waittime;
 		__entry->lockdep_addr = lock;
 	),
-	TP_printk("%p %s (%llu ns)", __entry->lockdep_addr,
-		  __get_str(name),
-		  __entry->wait_nsec)
+	TP_printk("%p %s", __entry->lockdep_addr,
+		  __get_str(name))
 );
 
 #endif
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 2594e1ce41c..31e22e74236 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3380,7 +3380,7 @@ found_it:
 		hlock->holdtime_stamp = now;
 	}
 
-	trace_lock_acquired(lock, ip, waittime);
+	trace_lock_acquired(lock, ip);
 
 	stats = get_lock_stats(hlock_class(hlock));
 	if (waittime) {
-- 
cgit v1.2.3


From 93135439459920c4d856f4ab8f068c030085c8df Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 8 May 2010 06:24:25 +0200
Subject: tracing: Drop the nested field from lock_release event

Drop the nested field as we don't use it. Every nested state can
be computed from a state machine on post processing already.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/lock.h | 4 ++--
 kernel/lockdep.c            | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
index 17ca287ae17..fde4c385339 100644
--- a/include/trace/events/lock.h
+++ b/include/trace/events/lock.h
@@ -37,9 +37,9 @@ TRACE_EVENT(lock_acquire,
 
 TRACE_EVENT(lock_release,
 
-	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
 
-	TP_ARGS(lock, nested, ip),
+	TP_ARGS(lock, ip),
 
 	TP_STRUCT__entry(
 		__string(name, lock->name)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 31e22e74236..e9c759f06c1 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3227,7 +3227,7 @@ void lock_release(struct lockdep_map *lock, int nested,
 	raw_local_irq_save(flags);
 	check_flags(flags);
 	current->lockdep_recursion = 1;
-	trace_lock_release(lock, nested, ip);
+	trace_lock_release(lock, ip);
 	__lock_release(lock, nested, ip);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
-- 
cgit v1.2.3


From 2c193c736803ceb547daec725e5c5d992d039f20 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 8 May 2010 06:36:02 +0200
Subject: tracing: Factorize lock events in a lock class

lock_acquired, lock_contended and lock_release now share the
same prototype and format. Let's factorize them into a lock
event class.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/lock.h | 48 ++++++++++++++-------------------------------
 1 file changed, 15 insertions(+), 33 deletions(-)

diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
index fde4c385339..2821b86de63 100644
--- a/include/trace/events/lock.h
+++ b/include/trace/events/lock.h
@@ -35,15 +35,15 @@ TRACE_EVENT(lock_acquire,
 		  __get_str(name))
 );
 
-TRACE_EVENT(lock_release,
+DECLARE_EVENT_CLASS(lock,
 
 	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
 
 	TP_ARGS(lock, ip),
 
 	TP_STRUCT__entry(
-		__string(name, lock->name)
-		__field(void *, lockdep_addr)
+		__string(	name, 	lock->name	)
+		__field(	void *, lockdep_addr	)
 	),
 
 	TP_fast_assign(
@@ -51,48 +51,30 @@ TRACE_EVENT(lock_release,
 		__entry->lockdep_addr = lock;
 	),
 
-	TP_printk("%p %s",
-		  __entry->lockdep_addr, __get_str(name))
+	TP_printk("%p %s",  __entry->lockdep_addr, __get_str(name))
 );
 
-#ifdef CONFIG_LOCK_STAT
-
-TRACE_EVENT(lock_contended,
+DEFINE_EVENT(lock, lock_release,
 
 	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
 
-	TP_ARGS(lock, ip),
-
-	TP_STRUCT__entry(
-		__string(name, lock->name)
-		__field(void *, lockdep_addr)
-	),
+	TP_ARGS(lock, ip)
+);
 
-	TP_fast_assign(
-		__assign_str(name, lock->name);
-		__entry->lockdep_addr = lock;
-	),
+#ifdef CONFIG_LOCK_STAT
 
-	TP_printk("%p %s",
-		  __entry->lockdep_addr, __get_str(name))
-);
+DEFINE_EVENT(lock, lock_contended,
 
-TRACE_EVENT(lock_acquired,
 	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
 
-	TP_ARGS(lock, ip),
+	TP_ARGS(lock, ip)
+);
 
-	TP_STRUCT__entry(
-		__string(name, lock->name)
-		__field(void *, lockdep_addr)
-	),
+DEFINE_EVENT(lock, lock_acquired,
 
-	TP_fast_assign(
-		__assign_str(name, lock->name);
-		__entry->lockdep_addr = lock;
-	),
-	TP_printk("%p %s", __entry->lockdep_addr,
-		  __get_str(name))
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
+	TP_ARGS(lock, ip)
 );
 
 #endif
-- 
cgit v1.2.3


From 794e43b56c18b95fc9776c914a2659e7d558a352 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Wed, 5 May 2010 00:27:40 -0500
Subject: perf/live-mode: Handle payload-less events

Some events, such as the PERF_RECORD_FINISHED_ROUND event consist of
only an event header and no data.  In this case, a 0-length payload
will be read, and the 0 return value will be wrongly interpreted as an
'unexpected end of event stream'.

This patch allows for proper handling of data-less events by skipping
0-length reads.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
LKML-Reference: <1273038527.6383.51.camel@tropicana>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 tools/perf/util/session.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 9401909fa28..00ab298bbb4 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -696,15 +696,18 @@ more:
 	p = &event;
 	p += sizeof(struct perf_event_header);
 
-	err = do_read(self->fd, p, size - sizeof(struct perf_event_header));
-	if (err <= 0) {
-		if (err == 0) {
-			pr_err("unexpected end of event stream\n");
-			goto done;
-		}
+	if (size - sizeof(struct perf_event_header)) {
+		err = do_read(self->fd, p,
+			      size - sizeof(struct perf_event_header));
+		if (err <= 0) {
+			if (err == 0) {
+				pr_err("unexpected end of event stream\n");
+				goto done;
+			}
 
-		pr_err("failed to read event data\n");
-		goto out_err;
+			pr_err("failed to read event data\n");
+			goto out_err;
+		}
 	}
 
 	if (size == 0 ||
-- 
cgit v1.2.3


From 139633c6a43781cd44798165b0472a34bf53a1e8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 9 May 2010 11:47:13 -0300
Subject: perf callchain: Move validate_callchain to callchain lib
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c | 15 +--------------
 tools/perf/util/callchain.c |  7 +++++++
 tools/perf/util/callchain.h |  3 +++
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 196473b5125..1cae8771379 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -123,19 +123,6 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 	return 0;
 }
 
-static int validate_chain(struct ip_callchain *chain, event_t *event)
-{
-	unsigned int chain_size;
-
-	chain_size = event->header.size;
-	chain_size -= (unsigned long)&event->ip.__more_data - (unsigned long)event;
-
-	if (chain->nr*sizeof(u64) > chain_size)
-		return -1;
-
-	return 0;
-}
-
 static int add_event_total(struct perf_session *session,
 			   struct sample_data *data,
 			   struct perf_event_attr *attr)
@@ -171,7 +158,7 @@ static int process_sample_event(event_t *event, struct perf_session *session)
 
 		dump_printf("... chain: nr:%Lu\n", data.callchain->nr);
 
-		if (validate_chain(data.callchain, event) < 0) {
+		if (!ip_callchain__valid(data.callchain, event)) {
 			pr_debug("call-chain problem with event, "
 				 "skipping it.\n");
 			return 0;
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index db628af6d20..ac148613afe 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -17,6 +17,13 @@
 
 #include "callchain.h"
 
+bool ip_callchain__valid(struct ip_callchain *chain, event_t *event)
+{
+	unsigned int chain_size = event->header.size;
+	chain_size -= (unsigned long)&event->ip.__more_data - (unsigned long)event;
+	return chain->nr * sizeof(u64) <= chain_size;
+}
+
 #define chain_for_each_child(child, parent)	\
 	list_for_each_entry(child, &parent->children, brothers)
 
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 8a7e8bbd0fd..0f4da093cbd 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -4,6 +4,7 @@
 #include "../perf.h"
 #include <linux/list.h>
 #include <linux/rbtree.h>
+#include "event.h"
 #include "util.h"
 #include "symbol.h"
 
@@ -58,4 +59,6 @@ static inline u64 cumul_hits(struct callchain_node *node)
 int register_callchain_param(struct callchain_param *param);
 int append_chain(struct callchain_node *root, struct ip_callchain *chain,
 		 struct map_symbol *syms);
+
+bool ip_callchain__valid(struct ip_callchain *chain, event_t *event);
 #endif	/* __PERF_CALLCHAIN_H */
-- 
cgit v1.2.3


From 39d1e1b1e26dc84d40bf2792287d0d61e44b57df Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 9 May 2010 12:01:05 -0300
Subject: perf report: Fix leak of resolved callchains array on error path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 1cae8771379..3a70c5807c0 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -83,7 +83,7 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 	struct map_symbol *syms = NULL;
 	struct symbol *parent = NULL;
 	bool hit;
-	int err;
+	int err = -ENOMEM;
 	struct hist_entry *he;
 	struct event_stat_id *stats;
 	struct perf_event_attr *attr;
@@ -101,26 +101,24 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 	else
 		stats = get_stats(self, data->id, 0, 0);
 	if (stats == NULL)
-		return -ENOMEM;
+		goto out_free_syms;
 	he = __perf_session__add_hist_entry(&stats->hists, al, parent,
 					    data->period, &hit);
 	if (he == NULL)
-		return -ENOMEM;
+		goto out_free_syms;
 
 	if (hit)
 		__perf_session__add_count(he, al,  data->period);
 
+	err = 0;
 	if (symbol_conf.use_callchain) {
 		if (!hit)
 			callchain_init(he->callchain);
 		err = append_chain(he->callchain, data->callchain, syms);
-		free(syms);
-
-		if (err)
-			return err;
 	}
-
-	return 0;
+out_free_syms:
+	free(syms);
+	return err;
 }
 
 static int add_event_total(struct perf_session *session,
-- 
cgit v1.2.3


From 28e2a106d16046ca792722795f809e3f80a5af80 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 9 May 2010 13:02:23 -0300
Subject: perf hist: Simplify the insertion of new hist_entry instances
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

And with that fix at least one bug:

The first hit for an entry, the one that calls malloc to create a new
instance in __perf_session__add_hist_entry, wasn't adding the count to
the per cpumode (PERF_RECORD_MISC_USER, etc) total variable.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-annotate.c |  9 +++------
 tools/perf/builtin-diff.c     | 14 +++-----------
 tools/perf/builtin-report.c   | 12 ++----------
 tools/perf/util/hist.c        | 36 +++++++++++++++++++++++-------------
 tools/perf/util/hist.h        |  5 +----
 5 files changed, 32 insertions(+), 44 deletions(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index ee154b58772..c7ac45a59ed 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -72,8 +72,6 @@ static int annotate__hist_hit(struct hist_entry *he, u64 ip)
 	struct sym_priv *priv;
 	struct sym_hist *h;
 
-	he->count++;
-
 	if (!sym || !he->ms.map)
 		return 0;
 
@@ -99,9 +97,8 @@ static int annotate__hist_hit(struct hist_entry *he, u64 ip)
 }
 
 static int perf_session__add_hist_entry(struct perf_session *self,
-					struct addr_location *al, u64 count)
+					struct addr_location *al)
 {
-	bool hit;
 	struct hist_entry *he;
 
 	if (sym_hist_filter != NULL &&
@@ -115,7 +112,7 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 		return 0;
 	}
 
-	he = __perf_session__add_hist_entry(&self->hists, al, NULL, count, &hit);
+	he = __perf_session__add_hist_entry(&self->hists, al, NULL, 1);
 	if (he == NULL)
 		return -ENOMEM;
 
@@ -135,7 +132,7 @@ static int process_sample_event(event_t *event, struct perf_session *session)
 		return -1;
 	}
 
-	if (!al.filtered && perf_session__add_hist_entry(session, &al, 1)) {
+	if (!al.filtered && perf_session__add_hist_entry(session, &al)) {
 		pr_warning("problem incrementing symbol count, "
 			   "skipping event\n");
 		return -1;
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 4cce68f2368..613a5c4f6d8 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -25,17 +25,9 @@ static bool show_displacement;
 static int perf_session__add_hist_entry(struct perf_session *self,
 					struct addr_location *al, u64 count)
 {
-	bool hit;
-	struct hist_entry *he = __perf_session__add_hist_entry(&self->hists,
-							       al, NULL,
-							       count, &hit);
-	if (he == NULL)
-		return -ENOMEM;
-
-	if (hit)
-		__perf_session__add_count(he, al, count);
-
-	return 0;
+	if (__perf_session__add_hist_entry(&self->hists, al, NULL, count) != NULL)
+		return 0;
+	return -ENOMEM;
 }
 
 static int diff__process_sample_event(event_t *event, struct perf_session *session)
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 3a70c5807c0..5e2f47f88ec 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -82,7 +82,6 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 {
 	struct map_symbol *syms = NULL;
 	struct symbol *parent = NULL;
-	bool hit;
 	int err = -ENOMEM;
 	struct hist_entry *he;
 	struct event_stat_id *stats;
@@ -103,19 +102,12 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 	if (stats == NULL)
 		goto out_free_syms;
 	he = __perf_session__add_hist_entry(&stats->hists, al, parent,
-					    data->period, &hit);
+					    data->period);
 	if (he == NULL)
 		goto out_free_syms;
-
-	if (hit)
-		__perf_session__add_count(he, al,  data->period);
-
 	err = 0;
-	if (symbol_conf.use_callchain) {
-		if (!hit)
-			callchain_init(he->callchain);
+	if (symbol_conf.use_callchain)
 		err = append_chain(he->callchain, data->callchain, syms);
-	}
 out_free_syms:
 	free(syms);
 	return err;
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index ad6b22dde27..e0c8a722e11 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -8,13 +8,10 @@ struct callchain_param	callchain_param = {
 	.min_percent = 0.5
 };
 
-void __perf_session__add_count(struct hist_entry *he,
-			struct addr_location *al,
-			u64 count)
+static void perf_session__add_cpumode_count(struct hist_entry *he,
+					    unsigned int cpumode, u64 count)
 {
-	he->count += count;
-
-	switch (al->cpumode) {
+	switch (cpumode) {
 	case PERF_RECORD_MISC_KERNEL:
 		he->count_sys += count;
 		break;
@@ -36,10 +33,24 @@ void __perf_session__add_count(struct hist_entry *he,
  * histogram, sorted on item, collects counts
  */
 
+static struct hist_entry *hist_entry__new(struct hist_entry *template)
+{
+	size_t callchain_size = symbol_conf.use_callchain ? sizeof(struct callchain_node) : 0;
+	struct hist_entry *self = malloc(sizeof(*self) + callchain_size);
+
+	if (self != NULL) {
+		*self = *template;
+		if (symbol_conf.use_callchain)
+			callchain_init(self->callchain);
+	}
+
+	return self;
+}
+
 struct hist_entry *__perf_session__add_hist_entry(struct rb_root *hists,
 						  struct addr_location *al,
 						  struct symbol *sym_parent,
-						  u64 count, bool *hit)
+						  u64 count)
 {
 	struct rb_node **p = &hists->rb_node;
 	struct rb_node *parent = NULL;
@@ -64,8 +75,8 @@ struct hist_entry *__perf_session__add_hist_entry(struct rb_root *hists,
 		cmp = hist_entry__cmp(&entry, he);
 
 		if (!cmp) {
-			*hit = true;
-			return he;
+			he->count += count;
+			goto out;
 		}
 
 		if (cmp < 0)
@@ -74,14 +85,13 @@ struct hist_entry *__perf_session__add_hist_entry(struct rb_root *hists,
 			p = &(*p)->rb_right;
 	}
 
-	he = malloc(sizeof(*he) + (symbol_conf.use_callchain ?
-				    sizeof(struct callchain_node) : 0));
+	he = hist_entry__new(&entry);
 	if (!he)
 		return NULL;
-	*he = entry;
 	rb_link_node(&he->rb_node, parent, p);
 	rb_insert_color(&he->rb_node, hists);
-	*hit = false;
+out:
+	perf_session__add_cpumode_count(he, al->cpumode, count);
 	return he;
 }
 
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 9df1c340ec9..b49013adb34 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -12,13 +12,10 @@ struct addr_location;
 struct symbol;
 struct rb_root;
 
-void __perf_session__add_count(struct hist_entry *he,
-			struct addr_location *al,
-			u64 count);
 struct hist_entry *__perf_session__add_hist_entry(struct rb_root *hists,
 						  struct addr_location *al,
 						  struct symbol *parent,
-						  u64 count, bool *hit);
+						  u64 count);
 extern int64_t hist_entry__cmp(struct hist_entry *, struct hist_entry *);
 extern int64_t hist_entry__collapse(struct hist_entry *, struct hist_entry *);
 int hist_entry__fprintf(struct hist_entry *self,
-- 
cgit v1.2.3


From b1f724c3055fa75a31d272222213647547a2d3d4 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 9 May 2010 08:22:08 -0700
Subject: sched: Add a comment to get_cpu_idle_time_us()

The exported function get_cpu_idle_time_us() has no comment
describing it; add a kerneldoc comment

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: davej@redhat.com
LKML-Reference: <20100509082208.7cb721f0@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-sched.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f25735a767a..358822ee32d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -179,6 +179,20 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 	return now;
 }
 
+/**
+ * get_cpu_idle_time_us - get the total idle time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in
+ *
+ * Return the cummulative idle time (since boot) for a given
+ * CPU, in microseconds. The idle time returned includes
+ * the iowait time (unlike what "top" and co report).
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-- 
cgit v1.2.3


From 595aac488b546c7185be7e29c8ae165a588b2a9f Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 9 May 2010 08:22:45 -0700
Subject: sched: Introduce a function to update the idle statistics

Currently, two places update the idle statistics (and more to
come later in this series).

This patch creates a helper function for updating these
statistics.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: davej@redhat.com
LKML-Reference: <20100509082245.163e67ed@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-sched.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 358822ee32d..59d8762c7e1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,14 +150,25 @@ static void tick_nohz_update_jiffies(ktime_t now)
 	touch_softlockup_watchdog();
 }
 
-static void tick_nohz_stop_idle(int cpu, ktime_t now)
+/*
+ * Updates the per cpu time idle statistics counters
+ */
+static void update_ts_time_stats(struct tick_sched *ts, ktime_t now)
 {
-	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 	ktime_t delta;
 
-	delta = ktime_sub(now, ts->idle_entrytime);
 	ts->idle_lastupdate = now;
-	ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+	if (ts->idle_active) {
+		delta = ktime_sub(now, ts->idle_entrytime);
+		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+	}
+}
+
+static void tick_nohz_stop_idle(int cpu, ktime_t now)
+{
+	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+	update_ts_time_stats(ts, now);
 	ts->idle_active = 0;
 
 	sched_clock_idle_wakeup_event(0);
@@ -165,14 +176,12 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
 
 static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 {
-	ktime_t now, delta;
+	ktime_t now;
 
 	now = ktime_get();
-	if (ts->idle_active) {
-		delta = ktime_sub(now, ts->idle_entrytime);
-		ts->idle_lastupdate = now;
-		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-	}
+
+	update_ts_time_stats(ts, now);
+
 	ts->idle_entrytime = now;
 	ts->idle_active = 1;
 	sched_clock_idle_sleep_event();
-- 
cgit v1.2.3


From 8c7b09f43f4bf570654bcc458ce96819a932303c Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 9 May 2010 08:23:23 -0700
Subject: sched: Update the idle statistics in get_cpu_idle_time_us()

Right now, get_cpu_idle_time_us() only reports the idle
statistics upto the point the CPU entered last idle; not what is
valid right now.

This patch adds an update of the idle statistics to
get_cpu_idle_time_us(), so that calling this function always
returns statistics that are accurate at the point of the call.

This includes resetting the start of the idle time for
accounting purposes to avoid double accounting.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: davej@redhat.com
LKML-Reference: <20100509082323.2d2f1945@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-sched.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 59d8762c7e1..f15d18d82c1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -161,6 +161,7 @@ static void update_ts_time_stats(struct tick_sched *ts, ktime_t now)
 	if (ts->idle_active) {
 		delta = ktime_sub(now, ts->idle_entrytime);
 		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+		ts->idle_entrytime = now;
 	}
 }
 
@@ -205,14 +206,18 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+	ktime_t now;
 
 	if (!tick_nohz_enabled)
 		return -1;
 
+	now = ktime_get();
+	update_ts_time_stats(ts, now);
+
 	if (ts->idle_active)
 		*last_update_time = ktime_to_us(ts->idle_lastupdate);
 	else
-		*last_update_time = ktime_to_us(ktime_get());
+		*last_update_time = ktime_to_us(now);
 
 	return ktime_to_us(ts->idle_sleeptime);
 }
-- 
cgit v1.2.3


From 8d63bf949e330588b80d30ca8f0a27a45297a9e9 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 9 May 2010 08:24:03 -0700
Subject: sched: Fold updating of the last_update_time_info into
 update_ts_time_stats()

This patch folds the updating of the last_update_time into the
update_ts_time_stats() function, and updates the callers.

This allows for further cleanups that are done in the next
patch.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: davej@redhat.com
LKML-Reference: <20100509082403.60072967@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-sched.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f15d18d82c1..e86e1c6674d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -153,7 +153,8 @@ static void tick_nohz_update_jiffies(ktime_t now)
 /*
  * Updates the per cpu time idle statistics counters
  */
-static void update_ts_time_stats(struct tick_sched *ts, ktime_t now)
+static void
+update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
 {
 	ktime_t delta;
 
@@ -163,13 +164,19 @@ static void update_ts_time_stats(struct tick_sched *ts, ktime_t now)
 		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 		ts->idle_entrytime = now;
 	}
+
+	if (ts->idle_active && last_update_time)
+		*last_update_time = ktime_to_us(ts->idle_lastupdate);
+	else
+		*last_update_time = ktime_to_us(now);
+
 }
 
 static void tick_nohz_stop_idle(int cpu, ktime_t now)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 
-	update_ts_time_stats(ts, now);
+	update_ts_time_stats(ts, now, NULL);
 	ts->idle_active = 0;
 
 	sched_clock_idle_wakeup_event(0);
@@ -181,7 +188,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 
 	now = ktime_get();
 
-	update_ts_time_stats(ts, now);
+	update_ts_time_stats(ts, now, NULL);
 
 	ts->idle_entrytime = now;
 	ts->idle_active = 1;
@@ -206,18 +213,11 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
 u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-	ktime_t now;
 
 	if (!tick_nohz_enabled)
 		return -1;
 
-	now = ktime_get();
-	update_ts_time_stats(ts, now);
-
-	if (ts->idle_active)
-		*last_update_time = ktime_to_us(ts->idle_lastupdate);
-	else
-		*last_update_time = ktime_to_us(now);
+	update_ts_time_stats(ts, ktime_get(), last_update_time);
 
 	return ktime_to_us(ts->idle_sleeptime);
 }
-- 
cgit v1.2.3


From e0e37c200f1357db0dd986edb359c41c57d24f6e Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 9 May 2010 08:24:39 -0700
Subject: sched: Eliminate the ts->idle_lastupdate field

Now that the only user of ts->idle_lastupdate is
update_ts_time_stats(), the entire field can be eliminated.

In update_ts_time_stats(), idle_lastupdate is first set to
"now", and a few lines later, the only user is an if() statement
that assigns a variable either to "now" or to
ts->idle_lastupdate, which has the value of "now" at that point.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: davej@redhat.com
LKML-Reference: <20100509082439.2fab0b4f@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/tick.h     | 1 -
 kernel/time/tick-sched.c | 5 +----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index d2ae79e21be..0343eed4061 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -60,7 +60,6 @@ struct tick_sched {
 	ktime_t				idle_waketime;
 	ktime_t				idle_exittime;
 	ktime_t				idle_sleeptime;
-	ktime_t				idle_lastupdate;
 	ktime_t				sleep_length;
 	unsigned long			last_jiffies;
 	unsigned long			next_jiffies;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e86e1c6674d..50953f4c42b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -158,16 +158,13 @@ update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
 {
 	ktime_t delta;
 
-	ts->idle_lastupdate = now;
 	if (ts->idle_active) {
 		delta = ktime_sub(now, ts->idle_entrytime);
 		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
 		ts->idle_entrytime = now;
 	}
 
-	if (ts->idle_active && last_update_time)
-		*last_update_time = ktime_to_us(ts->idle_lastupdate);
-	else
+	if (last_update_time)
 		*last_update_time = ktime_to_us(now);
 
 }
-- 
cgit v1.2.3


From 0224cf4c5ee0d7faec83956b8e21f7d89e3df3bd Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 9 May 2010 08:25:23 -0700
Subject: sched: Intoduce get_cpu_iowait_time_us()

For the ondemand cpufreq governor, it is desired that the iowait
time is microaccounted in a similar way as idle time is.

This patch introduces the infrastructure to account and expose
this information via the get_cpu_iowait_time_us() function.

[akpm@linux-foundation.org: fix CONFIG_NO_HZ=n build]
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: davej@redhat.com
LKML-Reference: <20100509082523.284feab6@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/tick.h     |  4 ++++
 kernel/time/tick-sched.c | 28 ++++++++++++++++++++++++++++
 kernel/time/timer_list.c |  1 +
 3 files changed, 33 insertions(+)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 0343eed4061..b232ccc0ee2 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -42,6 +42,7 @@ enum tick_nohz_mode {
  * @idle_waketime:	Time when the idle was interrupted
  * @idle_exittime:	Time when the idle state was left
  * @idle_sleeptime:	Sum of the time slept in idle with sched tick stopped
+ * @iowait_sleeptime:	Sum of the time slept in idle with sched tick stopped, with IO outstanding
  * @sleep_length:	Duration of the current idle sleep
  * @do_timer_lst:	CPU was the last one doing do_timer before going idle
  */
@@ -60,6 +61,7 @@ struct tick_sched {
 	ktime_t				idle_waketime;
 	ktime_t				idle_exittime;
 	ktime_t				idle_sleeptime;
+	ktime_t				iowait_sleeptime;
 	ktime_t				sleep_length;
 	unsigned long			last_jiffies;
 	unsigned long			next_jiffies;
@@ -123,6 +125,7 @@ extern void tick_nohz_stop_sched_tick(int inidle);
 extern void tick_nohz_restart_sched_tick(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
+extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
 # else
 static inline void tick_nohz_stop_sched_tick(int inidle) { }
 static inline void tick_nohz_restart_sched_tick(void) { }
@@ -133,6 +136,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void)
 	return len;
 }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
+static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
 # endif /* !NO_HZ */
 
 #endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 50953f4c42b..1d7b9bc1c03 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -161,6 +161,8 @@ update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
 	if (ts->idle_active) {
 		delta = ktime_sub(now, ts->idle_entrytime);
 		ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+		if (nr_iowait_cpu() > 0)
+			ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
 		ts->idle_entrytime = now;
 	}
 
@@ -220,6 +222,32 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 }
 EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 
+/*
+ * get_cpu_iowait_time_us - get the total iowait time of a cpu
+ * @cpu: CPU number to query
+ * @last_update_time: variable to store update time in
+ *
+ * Return the cummulative iowait time (since boot) for a given
+ * CPU, in microseconds.
+ *
+ * This time is measured via accounting rather than sampling,
+ * and is as accurate as ktime_get() is.
+ *
+ * This function returns -1 if NOHZ is not enabled.
+ */
+u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
+{
+	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+
+	if (!tick_nohz_enabled)
+		return -1;
+
+	update_ts_time_stats(ts, ktime_get(), last_update_time);
+
+	return ktime_to_us(ts->iowait_sleeptime);
+}
+EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+
 /**
  * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
  *
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1a4a7dd7877..ab8f5e33fa9 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
 		P_ns(idle_waketime);
 		P_ns(idle_exittime);
 		P_ns(idle_sleeptime);
+		P_ns(iowait_sleeptime);
 		P(last_jiffies);
 		P(next_jiffies);
 		P_ns(idle_expires);
-- 
cgit v1.2.3


From 6b8fcd9029f217a9ecce822db645e19111c11080 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 9 May 2010 08:26:06 -0700
Subject: ondemand: Solve a big performance issue by counting IOWAIT time as
 busy

The ondemand cpufreq governor uses CPU busy time (e.g. not-idle
time) as a measure for scaling the CPU frequency up or down.
If the CPU is busy, the CPU frequency scales up, if it's idle,
the CPU frequency scales down. Effectively, it uses the CPU busy
time as proxy variable for the more nebulous "how critical is
performance right now" question.

This algorithm falls flat on its face in the light of workloads
where you're alternatingly disk and CPU bound, such as the ever
popular "git grep", but also things like startup of programs and
maildir using email clients... much to the chagarin of Andrew
Morton.

This patch changes the ondemand algorithm to count iowait time
as busy, not idle, time. As shown in the breakdown cases above,
iowait is performance critical often, and by counting iowait,
the proxy variable becomes a more accurate representation of the
"how critical is performance" question.

The problem and fix are both verified with the "perf timechar"
tool.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dave Jones <davej@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100509082606.3d9f00d0@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/cpufreq/cpufreq_ondemand.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index bd444dc93cf..ed472f8dfb7 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -73,6 +73,7 @@ enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE};
 
 struct cpu_dbs_info_s {
 	cputime64_t prev_cpu_idle;
+	cputime64_t prev_cpu_iowait;
 	cputime64_t prev_cpu_wall;
 	cputime64_t prev_cpu_nice;
 	struct cpufreq_policy *cur_policy;
@@ -148,6 +149,16 @@ static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
 	return idle_time;
 }
 
+static inline cputime64_t get_cpu_iowait_time(unsigned int cpu, cputime64_t *wall)
+{
+	u64 iowait_time = get_cpu_iowait_time_us(cpu, wall);
+
+	if (iowait_time == -1ULL)
+		return 0;
+
+	return iowait_time;
+}
+
 /*
  * Find right freq to be set now with powersave_bias on.
  * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
@@ -470,14 +481,15 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 
 	for_each_cpu(j, policy->cpus) {
 		struct cpu_dbs_info_s *j_dbs_info;
-		cputime64_t cur_wall_time, cur_idle_time;
-		unsigned int idle_time, wall_time;
+		cputime64_t cur_wall_time, cur_idle_time, cur_iowait_time;
+		unsigned int idle_time, wall_time, iowait_time;
 		unsigned int load, load_freq;
 		int freq_avg;
 
 		j_dbs_info = &per_cpu(od_cpu_dbs_info, j);
 
 		cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
+		cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time);
 
 		wall_time = (unsigned int) cputime64_sub(cur_wall_time,
 				j_dbs_info->prev_cpu_wall);
@@ -487,6 +499,10 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 				j_dbs_info->prev_cpu_idle);
 		j_dbs_info->prev_cpu_idle = cur_idle_time;
 
+		iowait_time = (unsigned int) cputime64_sub(cur_iowait_time,
+				j_dbs_info->prev_cpu_iowait);
+		j_dbs_info->prev_cpu_iowait = cur_iowait_time;
+
 		if (dbs_tuners_ins.ignore_nice) {
 			cputime64_t cur_nice;
 			unsigned long cur_nice_jiffies;
@@ -504,6 +520,16 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 			idle_time += jiffies_to_usecs(cur_nice_jiffies);
 		}
 
+		/*
+		 * For the purpose of ondemand, waiting for disk IO is an
+		 * indication that you're performance critical, and not that
+		 * the system is actually idle. So subtract the iowait time
+		 * from the cpu idle time.
+		 */
+
+		if (idle_time >= iowait_time)
+			idle_time -= iowait_time;
+
 		if (unlikely(!wall_time || wall_time < idle_time))
 			continue;
 
-- 
cgit v1.2.3


From 19379b11819efc1fc3b602e64f7e7531050aaddb Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 9 May 2010 08:26:51 -0700
Subject: ondemand: Make the iowait-is-busy time a sysfs tunable

Pavel Machek pointed out that not all CPUs have an efficient
idle at high frequency. Specifically, older Intel and various
AMD cpus would get a higher powerusage when copying files from
USB.

Mike Chan pointed out that the same is true for various ARM
chips as well.

Thomas Renninger suggested to make this a sysfs tunable with a
reasonable default.

This patch adds a sysfs tunable for the new behavior, and uses
a very simple function to determine a reasonable default,
depending on the CPU vendor/type.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: davej@redhat.com
LKML-Reference: <20100509082651.46914d04@infradead.org>
[ minor tidyup ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/cpufreq/cpufreq_ondemand.c | 47 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index ed472f8dfb7..8e9dbdc6c70 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -109,6 +109,7 @@ static struct dbs_tuners {
 	unsigned int down_differential;
 	unsigned int ignore_nice;
 	unsigned int powersave_bias;
+	unsigned int io_is_busy;
 } dbs_tuners_ins = {
 	.up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
 	.down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL,
@@ -260,6 +261,7 @@ static ssize_t show_##file_name						\
 	return sprintf(buf, "%u\n", dbs_tuners_ins.object);		\
 }
 show_one(sampling_rate, sampling_rate);
+show_one(io_is_busy, io_is_busy);
 show_one(up_threshold, up_threshold);
 show_one(ignore_nice_load, ignore_nice);
 show_one(powersave_bias, powersave_bias);
@@ -310,6 +312,23 @@ static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b,
 	return count;
 }
 
+static ssize_t store_io_is_busy(struct kobject *a, struct attribute *b,
+				   const char *buf, size_t count)
+{
+	unsigned int input;
+	int ret;
+
+	ret = sscanf(buf, "%u", &input);
+	if (ret != 1)
+		return -EINVAL;
+
+	mutex_lock(&dbs_mutex);
+	dbs_tuners_ins.io_is_busy = !!input;
+	mutex_unlock(&dbs_mutex);
+
+	return count;
+}
+
 static ssize_t store_up_threshold(struct kobject *a, struct attribute *b,
 				  const char *buf, size_t count)
 {
@@ -392,6 +411,7 @@ static struct global_attr _name = \
 __ATTR(_name, 0644, show_##_name, store_##_name)
 
 define_one_rw(sampling_rate);
+define_one_rw(io_is_busy);
 define_one_rw(up_threshold);
 define_one_rw(ignore_nice_load);
 define_one_rw(powersave_bias);
@@ -403,6 +423,7 @@ static struct attribute *dbs_attributes[] = {
 	&up_threshold.attr,
 	&ignore_nice_load.attr,
 	&powersave_bias.attr,
+	&io_is_busy.attr,
 	NULL
 };
 
@@ -527,7 +548,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 		 * from the cpu idle time.
 		 */
 
-		if (idle_time >= iowait_time)
+		if (dbs_tuners_ins.io_is_busy && idle_time >= iowait_time)
 			idle_time -= iowait_time;
 
 		if (unlikely(!wall_time || wall_time < idle_time))
@@ -643,6 +664,29 @@ static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
 	cancel_delayed_work_sync(&dbs_info->work);
 }
 
+/*
+ * Not all CPUs want IO time to be accounted as busy; this dependson how
+ * efficient idling at a higher frequency/voltage is.
+ * Pavel Machek says this is not so for various generations of AMD and old
+ * Intel systems.
+ * Mike Chan (androidlcom) calis this is also not true for ARM.
+ * Because of this, whitelist specific known (series) of CPUs by default, and
+ * leave all others up to the user.
+ */
+static int should_io_be_busy(void)
+{
+#if defined(CONFIG_X86)
+	/*
+	 * For Intel, Core 2 (model 15) andl later have an efficient idle.
+	 */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+	    boot_cpu_data.x86 == 6 &&
+	    boot_cpu_data.x86_model >= 15)
+		return 1;
+#endif
+	return 0;
+}
+
 static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 				   unsigned int event)
 {
@@ -705,6 +749,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 			dbs_tuners_ins.sampling_rate =
 				max(min_sampling_rate,
 				    latency * LATENCY_MULTIPLIER);
+			dbs_tuners_ins.io_is_busy = should_io_be_busy();
 		}
 		mutex_unlock(&dbs_mutex);
 
-- 
cgit v1.2.3


From 76ba7e846fcc89d9d4b25b89e303c9058de96d60 Mon Sep 17 00:00:00 2001
From: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Date: Sat, 8 May 2010 17:10:29 +0900
Subject: perf lock: Drop "-a" option from cmd_record() default arguments set

This patch drops "-a" from the default arguments passed to
perf record by perf lock.

If a user wants to do a system wide record of lock events,
        perf lock record -a <program> <argument> ...
is enough for this purpose.

This can reduce the size of the perf.data file.

% sudo ./perf lock record whoami
root
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 0.439 MB perf.data (~19170 samples) ]
% sudo ./perf lock record -a whoami   # with -a option
root
[ perf record: Woken up 0 times to write data ]
[ perf record: Captured and wrote 48.962 MB perf.data (~2139197 samples) ]

Signed-off-by: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
LKML-Reference: Message-Id: <1273306229-5216-1-git-send-email-mitake@dcl.info.waseda.ac.jp>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 tools/perf/builtin-lock.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index e549f4547b9..e18dfdc2948 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -930,7 +930,6 @@ static const struct option lock_options[] = {
 
 static const char *record_args[] = {
 	"record",
-	"-a",
 	"-R",
 	"-f",
 	"-m", "1024",
-- 
cgit v1.2.3


From 3ceb0d4438876a65606c258e5d69e03e57460dd6 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 9 May 2010 16:07:01 -0300
Subject: perf symbols: Consider unresolved DSOs in the dso__col_widt
 calculation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

By using BITS_PER_LONG / 4, that is the number of chars that will be
used in such cases as the DSO "name".

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/event.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 23d5dfd4ed7..46563e16c3f 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -676,6 +676,13 @@ int event__preprocess_sample(const event_t *self, struct perf_session *session,
 			dso__calc_col_width(al->map->dso);
 
 		al->sym = map__find_symbol(al->map, al->addr, filter);
+	} else {
+		const unsigned int unresolved_col_width = BITS_PER_LONG / 4;
+
+		if (dsos__col_width < unresolved_col_width &&
+		    !symbol_conf.col_width_list_str && !symbol_conf.field_sep &&
+		    !symbol_conf.dso_list)
+			dsos__col_width = unresolved_col_width;
 	}
 
 	if (symbol_conf.sym_list && al->sym &&
-- 
cgit v1.2.3


From 4cc4945844fe2cf493f1783b6ce938ba1617d5c2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 9 May 2010 21:14:07 -0300
Subject: perf symbols: Check if a struct machine instance was found
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Which can happen when processing old files that had no fake kernel MMAP,
events.

That shouldn't result in perf_session__create_kernel_maps not being
called, this will be fixed in a followup patch, for now do these checks
to avoid segfaulting.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/event.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 46563e16c3f..dfc8bf64d70 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -493,8 +493,10 @@ int event__process_mmap(event_t *self, struct perf_session *session)
 		return 0;
 	}
 
-	thread = perf_session__findnew(session, self->mmap.pid);
 	machine = perf_session__find_host_machine(session);
+	if (machine == NULL)
+		goto out_problem;
+	thread = perf_session__findnew(session, self->mmap.pid);
 	map = map__new(&machine->user_dsos, self->mmap.start,
 			self->mmap.len, self->mmap.pgoff,
 			self->mmap.pid, self->mmap.filename,
@@ -552,6 +554,10 @@ void thread__find_addr_map(struct thread *self,
 	if (cpumode == PERF_RECORD_MISC_KERNEL && perf_host) {
 		al->level = 'k';
 		machine = perf_session__find_host_machine(session);
+		if (machine == NULL) {
+			al->map = NULL;
+			return;
+		}
 		mg = &machine->kmaps;
 	} else if (cpumode == PERF_RECORD_MISC_USER && perf_host) {
 		al->level = '.';
@@ -559,7 +565,7 @@ void thread__find_addr_map(struct thread *self,
 	} else if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL && perf_guest) {
 		al->level = 'g';
 		machine = perf_session__find_machine(session, pid);
-		if (!machine) {
+		if (machine == NULL) {
 			al->map = NULL;
 			return;
 		}
-- 
cgit v1.2.3


From 1f626bc36847ac8dd192f055aed0f9678a781313 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 9 May 2010 19:57:08 -0300
Subject: perf session: Embed the host machine data on perf_session
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We have just one host on a given session, and that is the most common
setup right now, so embed a ->host_machine struct machine instance
directly in the perf_session class, check if we're looking for it before
going to the rb_tree.

This also fixes a problem found when we try to process old perf.data
files where we didn't have MMAP events for the kernel and modules and
thus don't create the kernel maps, do it in event__preprocess_sample if
it wasn't already.

Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Zhang, Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/event.c   | 10 ++++++++++
 tools/perf/util/map.c     | 24 ------------------------
 tools/perf/util/session.c |  8 ++++++++
 tools/perf/util/session.h | 14 ++++++++------
 tools/perf/util/symbol.c  |  2 +-
 tools/perf/util/symbol.h  |  2 ++
 6 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index dfc8bf64d70..d2ea9dd9fdf 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -656,6 +656,16 @@ int event__preprocess_sample(const event_t *self, struct perf_session *session,
 		goto out_filtered;
 
 	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+	/*
+	 * Have we already created the kernel maps for the host machine?
+	 *
+	 * This should have happened earlier, when we processed the kernel MMAP
+	 * events, but for older perf.data files there was no such thing, so do
+	 * it now.
+	 */
+	if (cpumode == PERF_RECORD_MISC_KERNEL &&
+	    session->host_machine.vmlinux_maps[MAP__FUNCTION] == NULL)
+		machine__create_kernel_maps(&session->host_machine);
 
 	thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION,
 			      self->ip.pid, self->ip.ip, al);
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 44a4df68b3c..e672f2fef65 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -579,30 +579,6 @@ struct machine *machines__find(struct rb_root *self, pid_t pid)
 	return default_machine;
 }
 
-/*
- * FIXME: Why repeatedly search for this?
- */
-struct machine *machines__find_host(struct rb_root *self)
-{
-	struct rb_node **p = &self->rb_node;
-	struct rb_node *parent = NULL;
-	struct machine *machine;
-	pid_t pid = HOST_KERNEL_ID;
-
-	while (*p != NULL) {
-		parent = *p;
-		machine = rb_entry(parent, struct machine, rb_node);
-		if (pid < machine->pid)
-			p = &(*p)->rb_left;
-		else if (pid > machine->pid)
-			p = &(*p)->rb_right;
-		else
-			return machine;
-	}
-
-	return NULL;
-}
-
 struct machine *machines__findnew(struct rb_root *self, pid_t pid)
 {
 	char path[PATH_MAX];
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 5d353e70fe2..71bc608e0ec 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -100,6 +100,7 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
 	self->repipe = repipe;
 	self->ordered_samples.flush_limit = ULLONG_MAX;
 	INIT_LIST_HEAD(&self->ordered_samples.samples_head);
+	machine__init(&self->host_machine, "", HOST_KERNEL_ID);
 
 	if (mode == O_RDONLY) {
 		if (perf_session__open(self, force) < 0)
@@ -870,3 +871,10 @@ int perf_session__set_kallsyms_ref_reloc_sym(struct map **maps,
 
 	return 0;
 }
+
+size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp)
+{
+	return __dsos__fprintf(&self->host_machine.kernel_dsos, fp) +
+	       __dsos__fprintf(&self->host_machine.user_dsos, fp) +
+	       machines__fprintf_dsos(&self->machines, fp);
+}
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index f2b2c6a3a49..eb9f179376a 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -25,6 +25,7 @@ struct perf_session {
 	unsigned long		mmap_window;
 	struct rb_root		threads;
 	struct thread		*last_match;
+	struct machine		host_machine;
 	struct rb_root		machines;
 	struct events_stats	events_stats;
 	struct rb_root		stats_by_id;
@@ -107,18 +108,22 @@ int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 static inline
 struct machine *perf_session__find_host_machine(struct perf_session *self)
 {
-	return machines__find_host(&self->machines);
+	return &self->host_machine;
 }
 
 static inline
 struct machine *perf_session__find_machine(struct perf_session *self, pid_t pid)
 {
+	if (pid == HOST_KERNEL_ID)
+		return &self->host_machine;
 	return machines__find(&self->machines, pid);
 }
 
 static inline
 struct machine *perf_session__findnew_machine(struct perf_session *self, pid_t pid)
 {
+	if (pid == HOST_KERNEL_ID)
+		return &self->host_machine;
 	return machines__findnew(&self->machines, pid);
 }
 
@@ -126,14 +131,11 @@ static inline
 void perf_session__process_machines(struct perf_session *self,
 				    machine__process_t process)
 {
+	process(&self->host_machine, self);
 	return machines__process(&self->machines, process, self);
 }
 
-static inline
-size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp)
-{
-	return machines__fprintf_dsos(&self->machines, fp);
-}
+size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp);
 
 static inline
 size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, FILE *fp,
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 4c0146a4906..994efdb531e 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -1889,7 +1889,7 @@ struct dso *__dsos__findnew(struct list_head *head, const char *name)
 	return dso;
 }
 
-static size_t __dsos__fprintf(struct list_head *head, FILE *fp)
+size_t __dsos__fprintf(struct list_head *head, FILE *fp)
 {
 	struct dso *pos;
 	size_t ret = 0;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index a517c17407b..edff866d76b 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -167,6 +167,8 @@ int machine__load_kallsyms(struct machine *self, const char *filename,
 int machine__load_vmlinux_path(struct machine *self, enum map_type type,
 			       symbol_filter_t filter);
 
+size_t __dsos__fprintf(struct list_head *head, FILE *fp);
+
 size_t machines__fprintf_dsos(struct rb_root *self, FILE *fp);
 size_t machines__fprintf_dsos_buildid(struct rb_root *self, FILE *fp, bool with_hits);
 
-- 
cgit v1.2.3


From 232a5c948da5e23dff27e48180abf4a4238f7602 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 9 May 2010 20:28:10 -0300
Subject: perf report: Allow limiting the number of entries to print in
 callchains
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Works by adding a third parameter to the '-g' argument, after the graph
type and minimum percentage, for example:

[root@doppio linux-2.6-tip]# perf report -g fractal,0.5,2

Will show only the first two symbols where at least 0.5% of the samples
took place.

All the other symbols that don't fall outside these constraints will be
put together in the last entry, prefixed with "[...]" and the total
percentage for them.

Suggested-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c |  5 ++++-
 tools/perf/util/callchain.h |  1 +
 tools/perf/util/hist.c      | 10 ++++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 5e2f47f88ec..642a6d8eb5d 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -343,7 +343,7 @@ static int
 parse_callchain_opt(const struct option *opt __used, const char *arg,
 		    int unset)
 {
-	char *tok;
+	char *tok, *tok2;
 	char *endptr;
 
 	/*
@@ -388,10 +388,13 @@ parse_callchain_opt(const struct option *opt __used, const char *arg,
 	if (!tok)
 		goto setup;
 
+	tok2 = strtok(NULL, ",");
 	callchain_param.min_percent = strtod(tok, &endptr);
 	if (tok == endptr)
 		return -1;
 
+	if (tok2)
+		callchain_param.print_limit = strtod(tok2, &endptr);
 setup:
 	if (register_callchain_param(&callchain_param) < 0) {
 		fprintf(stderr, "Can't register callchain params\n");
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 0f4da093cbd..1cba1f5504e 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -34,6 +34,7 @@ typedef void (*sort_chain_func_t)(struct rb_root *, struct callchain_node *,
 
 struct callchain_param {
 	enum chain_mode 	mode;
+	u32			print_limit;
 	double			min_percent;
 	sort_chain_func_t	sort;
 };
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index e0c8a722e11..0f154a530df 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -333,6 +333,7 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 	u64 remaining;
 	size_t ret = 0;
 	int i;
+	uint entries_printed = 0;
 
 	if (callchain_param.mode == CHAIN_GRAPH_REL)
 		new_total = self->children_hit;
@@ -379,6 +380,8 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 						  new_depth_mask | (1 << depth),
 						  left_margin);
 		node = next;
+		if (++entries_printed == callchain_param.print_limit)
+			break;
 	}
 
 	if (callchain_param.mode == CHAIN_GRAPH_REL &&
@@ -404,6 +407,7 @@ static size_t callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 	bool printed = false;
 	int i = 0;
 	int ret = 0;
+	u32 entries_printed = 0;
 
 	list_for_each_entry(chain, &self->val, list) {
 		if (!i++ && sort__first_dimension == SORT_SYM)
@@ -424,6 +428,9 @@ static size_t callchain__fprintf_graph(FILE *fp, struct callchain_node *self,
 			ret += fprintf(fp, " %s\n", chain->ms.sym->name);
 		else
 			ret += fprintf(fp, " %p\n", (void *)(long)chain->ip);
+
+		if (++entries_printed == callchain_param.print_limit)
+			break;
 	}
 
 	ret += __callchain__fprintf_graph(fp, self, total_samples, 1, 1, left_margin);
@@ -462,6 +469,7 @@ static size_t hist_entry_callchain__fprintf(FILE *fp, struct hist_entry *self,
 	struct rb_node *rb_node;
 	struct callchain_node *chain;
 	size_t ret = 0;
+	u32 entries_printed = 0;
 
 	rb_node = rb_first(&self->sorted_chain);
 	while (rb_node) {
@@ -484,6 +492,8 @@ static size_t hist_entry_callchain__fprintf(FILE *fp, struct hist_entry *self,
 			break;
 		}
 		ret += fprintf(fp, "\n");
+		if (++entries_printed == callchain_param.print_limit)
+			break;
 		rb_node = rb_next(rb_node);
 	}
 
-- 
cgit v1.2.3


From 3998d095354d2a3062bdaa821ef07a1e1c82873c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sun, 9 May 2010 22:46:54 -0700
Subject: x86, hypervisor: add missing <linux/module.h>

EXPORT_SYMBOL() needs <linux/module.h> to be included; fixes modular
builds of the VMware balloon driver, and any future modular drivers
which depends on the hypervisor.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Greg KH <greg@kroah.com>
Cc: Hank Janssen <hjanssen@microsoft.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Ky Srinivasan <ksrinivasan@novell.com>
Cc: Dmitry Torokhov <dtor@vmware.com>
LKML-Reference: <4BE49778.6060800@zytor.com>
---
 arch/x86/kernel/cpu/hypervisor.c | 1 +
 arch/x86/kernel/cpu/mshyperv.c   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 4afb5a2130e..dd531cc56a8 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,6 +21,7 @@
  *
  */
 
+#include <linux/module.h>
 #include <asm/processor.h>
 #include <asm/hypervisor.h>
 
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 0f1371724c8..16f41bbe46b 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/types.h>
+#include <linux/module.h>
 #include <asm/processor.h>
 #include <asm/hypervisor.h>
 #include <asm/hyperv.h>
-- 
cgit v1.2.3


From af507ae8a0512a83728b17d8f8c5fa1561669f50 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 10 May 2010 11:24:27 +0800
Subject: sched: Remove a stale comment

This comment should have been removed together with uids_mutex
when removing user sched.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Dhaval Giani <dhaval.giani@gmail.com>
LKML-Reference: <4BE77C6B.5010402@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/user.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/user.c b/kernel/user.c
index 8e1c8c0a496..7e72614b736 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -136,7 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 	struct hlist_head *hashent = uidhashentry(ns, uid);
 	struct user_struct *up, *new;
 
-	/* Make uid_hash_find() + uid_hash_insert() atomic. */
 	spin_lock_irq(&uidhash_lock);
 	up = uid_hash_find(uid, hashent);
 	spin_unlock_irq(&uidhash_lock);
-- 
cgit v1.2.3


From 7f8264539c62378cccbdf9b598927b034bef4a92 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 10 May 2010 10:51:25 -0300
Subject: perf newt: Use newtAddComponent()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of newtAddComponents(just-one-entry, NULL), that is not needed
if, like in this browser, we're adding just one component at a time.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 7a123a94e3f..e283a6e6b6e 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -31,7 +31,7 @@ struct ui_progress *ui_progress__new(const char *title, u64 total)
 		self->scale = newtScale(0, 0, cols, total);
 		if (self->scale == NULL)
 			goto out_free_form;
-		newtFormAddComponents(self->form, self->scale, NULL);
+		newtFormAddComponent(self->form, self->scale);
 		newtRefresh();
 	}
 
@@ -107,7 +107,7 @@ static int popup_menu(int argc, char * const argv[])
 	if (listbox == NULL)
 		goto out_destroy_form;
 
-	newtFormAddComponents(form, listbox, NULL);
+	newtFormAddComponent(form, listbox);
 
 	for (i = 0; i < argc; ++i) {
 		int len = strlen(argv[i]);
@@ -365,7 +365,7 @@ static void map_symbol__annotate_browser(const struct map_symbol *self,
 
 	newtCenteredWindow(max_line_len + 2, rows - 5, self->sym->name);
 	form = newt_form__new();
-	newtFormAddComponents(form, tree, NULL);
+	newtFormAddComponent(form, tree);
 
 	newtFormRun(form, &es);
 	newtFormDestroy(form);
-- 
cgit v1.2.3


From 51c8176472de1551a301b676e36a61884e0e8494 Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Mon, 10 May 2010 15:15:24 +0530
Subject: cifs: remove unused parameter from cifs_posix_open_inode_helper()

..a left over from the commit 3321b791b2e8897323f8c044a0c77ff25781381c.

Cc: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d53d6308bf3..a83541ec971 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -108,8 +108,7 @@ static inline int cifs_get_disposition(unsigned int flags)
 /* all arguments to this function must be checked for validity in caller */
 static inline int
 cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
-			     struct cifsInodeInfo *pCifsInode,
-			     struct cifsFileInfo *pCifsFile, __u32 oplock,
+			     struct cifsInodeInfo *pCifsInode, __u32 oplock,
 			     u16 netfid)
 {
 
@@ -311,7 +310,7 @@ int cifs_open(struct inode *inode, struct file *file)
 
 			pCifsFile = cifs_fill_filedata(file);
 			cifs_posix_open_inode_helper(inode, file, pCifsInode,
-						     pCifsFile, oplock, netfid);
+						     oplock, netfid);
 			goto out;
 		} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
 			if (tcon->ses->serverNOS)
-- 
cgit v1.2.3


From cdd5b75b0cd24c4d6a98b12a219217b1ccfe2586 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 10 May 2010 10:56:50 -0300
Subject: perf callchains: Use zalloc to allocate objects
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/callchain.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index ac148613afe..21a52e0a443 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -167,7 +167,7 @@ create_child(struct callchain_node *parent, bool inherit_children)
 {
 	struct callchain_node *new;
 
-	new = malloc(sizeof(*new));
+	new = zalloc(sizeof(*new));
 	if (!new) {
 		perror("not enough memory to create child for code path tree");
 		return NULL;
@@ -213,7 +213,7 @@ fill_node(struct callchain_node *node, struct resolved_chain *chain, int start)
 	for (i = start; i < chain->nr; i++) {
 		struct callchain_list *call;
 
-		call = malloc(sizeof(*call));
+		call = zalloc(sizeof(*call));
 		if (!call) {
 			perror("not enough memory for the code path tree");
 			return;
@@ -386,7 +386,7 @@ int append_chain(struct callchain_node *root, struct ip_callchain *chain,
 	if (!chain->nr)
 		return 0;
 
-	filtered = malloc(sizeof(*filtered) +
+	filtered = zalloc(sizeof(*filtered) +
 			  chain->nr * sizeof(struct resolved_ip));
 	if (!filtered)
 		return -ENOMEM;
-- 
cgit v1.2.3


From d118f8ba6ac2af2bf11d40cba657c813f0f39ca2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 10 May 2010 12:51:05 -0300
Subject: perf session: create_kernel_maps should use ->host_machine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using machines__create_kernel_maps(..., HOST_KERNEL_ID) it would create
another machine instance for the host machine, and since 1f626bc we have
it out of the machines rb_tree.

Fix it by using machine__create_kernel_maps(&self->host_machine)
directly.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/session.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index c088d8f9b51..4130036a010 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -69,11 +69,10 @@ void perf_session__update_sample_type(struct perf_session *self)
 
 int perf_session__create_kernel_maps(struct perf_session *self)
 {
-	struct rb_root *machines = &self->machines;
-	int ret = machines__create_kernel_maps(machines, HOST_KERNEL_ID);
+	int ret = machine__create_kernel_maps(&self->host_machine);
 
 	if (ret >= 0)
-		ret = machines__create_guest_kernel_maps(machines);
+		ret = machines__create_guest_kernel_maps(&self->machines);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 1c02c4d2e92f2097f1bba63ec71560b0e05a7f36 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 10 May 2010 13:04:11 -0300
Subject: perf hist: Introduce hists class and move lots of methods to it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In cbbc79a we introduced support for multiple events by introducing a
new "event_stat_id" struct and then made several perf_session methods
receive a point to it instead of a pointer to perf_session, and kept the
event_stats and hists rb_tree in perf_session.

While working on the new newt based browser, I realised that it would be
better to introduce a new class, "hists" (short for "histograms"),
renaming the "event_stat_id" struct and the perf_session methods that
were really "hists" methods, as they manipulate only struct hists
members, not touching anything in the other perf_session members.

Other optimizations, such as calculating the maximum lenght of a symbol
name present in an hists instance will be possible as we add them,
avoiding a re-traversal just for finding that information.

The rationale for the name "hists" to replace "event_stat_id" is that we
may have multiple sets of hists for the same event_stat id, as, for
instance, the 'perf diff' tool has, so event stat id is not what
characterizes what this struct and the functions that manipulate it do.

Cc: Eric B Munson <ebmunson@us.ibm.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-annotate.c | 17 ++++----
 tools/perf/builtin-diff.c     | 50 +++++++++++------------
 tools/perf/builtin-report.c   | 71 ++++++++++++++++-----------------
 tools/perf/builtin-trace.c    |  2 +-
 tools/perf/util/event.c       |  2 +-
 tools/perf/util/event.h       | 14 -------
 tools/perf/util/hist.c        | 92 +++++++++++++++++++------------------------
 tools/perf/util/hist.h        | 48 ++++++++++++----------
 tools/perf/util/session.c     |  2 +-
 tools/perf/util/session.h     | 12 ++++--
 10 files changed, 146 insertions(+), 164 deletions(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index c7ac45a59ed..3940964161b 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -96,8 +96,7 @@ static int annotate__hist_hit(struct hist_entry *he, u64 ip)
 	return 0;
 }
 
-static int perf_session__add_hist_entry(struct perf_session *self,
-					struct addr_location *al)
+static int hists__add_entry(struct hists *self, struct addr_location *al)
 {
 	struct hist_entry *he;
 
@@ -112,7 +111,7 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 		return 0;
 	}
 
-	he = __perf_session__add_hist_entry(&self->hists, al, NULL, 1);
+	he = __hists__add_entry(self, al, NULL, 1);
 	if (he == NULL)
 		return -ENOMEM;
 
@@ -132,7 +131,7 @@ static int process_sample_event(event_t *event, struct perf_session *session)
 		return -1;
 	}
 
-	if (!al.filtered && perf_session__add_hist_entry(session, &al)) {
+	if (!al.filtered && hists__add_entry(&session->hists, &al)) {
 		pr_warning("problem incrementing symbol count, "
 			   "skipping event\n");
 		return -1;
@@ -514,11 +513,11 @@ static void annotate_sym(struct hist_entry *he)
 		free_source_line(he, len);
 }
 
-static void perf_session__find_annotations(struct perf_session *self)
+static void hists__find_annotations(struct hists *self)
 {
 	struct rb_node *nd;
 
-	for (nd = rb_first(&self->hists); nd; nd = rb_next(nd)) {
+	for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
 		struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
 		struct sym_priv *priv;
 
@@ -570,9 +569,9 @@ static int __cmd_annotate(void)
 	if (verbose > 2)
 		perf_session__fprintf_dsos(session, stdout);
 
-	perf_session__collapse_resort(&session->hists);
-	perf_session__output_resort(&session->hists, session->event_total[0]);
-	perf_session__find_annotations(session);
+	hists__collapse_resort(&session->hists);
+	hists__output_resort(&session->hists);
+	hists__find_annotations(&session->hists);
 out_delete:
 	perf_session__delete(session);
 
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 613a5c4f6d8..3a95a0260a5 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -22,10 +22,10 @@ static char	  diff__default_sort_order[] = "dso,symbol";
 static bool  force;
 static bool show_displacement;
 
-static int perf_session__add_hist_entry(struct perf_session *self,
-					struct addr_location *al, u64 count)
+static int hists__add_entry(struct hists *self,
+			    struct addr_location *al, u64 count)
 {
-	if (__perf_session__add_hist_entry(&self->hists, al, NULL, count) != NULL)
+	if (__hists__add_entry(self, al, NULL, count) != NULL)
 		return 0;
 	return -ENOMEM;
 }
@@ -49,12 +49,12 @@ static int diff__process_sample_event(event_t *event, struct perf_session *sessi
 
 	event__parse_sample(event, session->sample_type, &data);
 
-	if (perf_session__add_hist_entry(session, &al, data.period)) {
+	if (hists__add_entry(&session->hists, &al, data.period)) {
 		pr_warning("problem incrementing symbol count, skipping event\n");
 		return -1;
 	}
 
-	session->events_stats.total += data.period;
+	session->hists.stats.total += data.period;
 	return 0;
 }
 
@@ -87,35 +87,34 @@ static void perf_session__insert_hist_entry_by_name(struct rb_root *root,
 	rb_insert_color(&he->rb_node, root);
 }
 
-static void perf_session__resort_hist_entries(struct perf_session *self)
+static void hists__resort_entries(struct hists *self)
 {
 	unsigned long position = 1;
 	struct rb_root tmp = RB_ROOT;
-	struct rb_node *next = rb_first(&self->hists);
+	struct rb_node *next = rb_first(&self->entries);
 
 	while (next != NULL) {
 		struct hist_entry *n = rb_entry(next, struct hist_entry, rb_node);
 
 		next = rb_next(&n->rb_node);
-		rb_erase(&n->rb_node, &self->hists);
+		rb_erase(&n->rb_node, &self->entries);
 		n->position = position++;
 		perf_session__insert_hist_entry_by_name(&tmp, n);
 	}
 
-	self->hists = tmp;
+	self->entries = tmp;
 }
 
-static void perf_session__set_hist_entries_positions(struct perf_session *self)
+static void hists__set_positions(struct hists *self)
 {
-	perf_session__output_resort(&self->hists, self->events_stats.total);
-	perf_session__resort_hist_entries(self);
+	hists__output_resort(self);
+	hists__resort_entries(self);
 }
 
-static struct hist_entry *
-perf_session__find_hist_entry(struct perf_session *self,
-			      struct hist_entry *he)
+static struct hist_entry *hists__find_entry(struct hists *self,
+					    struct hist_entry *he)
 {
-	struct rb_node *n = self->hists.rb_node;
+	struct rb_node *n = self->entries.rb_node;
 
 	while (n) {
 		struct hist_entry *iter = rb_entry(n, struct hist_entry, rb_node);
@@ -132,14 +131,13 @@ perf_session__find_hist_entry(struct perf_session *self,
 	return NULL;
 }
 
-static void perf_session__match_hists(struct perf_session *old_session,
-				      struct perf_session *new_session)
+static void hists__match(struct hists *older, struct hists *newer)
 {
 	struct rb_node *nd;
 
-	for (nd = rb_first(&new_session->hists); nd; nd = rb_next(nd)) {
+	for (nd = rb_first(&newer->entries); nd; nd = rb_next(nd)) {
 		struct hist_entry *pos = rb_entry(nd, struct hist_entry, rb_node);
-		pos->pair = perf_session__find_hist_entry(old_session, pos);
+		pos->pair = hists__find_entry(older, pos);
 	}
 }
 
@@ -159,15 +157,13 @@ static int __cmd_diff(void)
 			goto out_delete;
 	}
 
-	perf_session__output_resort(&session[1]->hists,
-				    session[1]->events_stats.total);
+	hists__output_resort(&session[1]->hists);
 	if (show_displacement)
-		perf_session__set_hist_entries_positions(session[0]);
+		hists__set_positions(&session[0]->hists);
 
-	perf_session__match_hists(session[0], session[1]);
-	perf_session__fprintf_hists(&session[1]->hists, session[0],
-				    show_displacement, stdout,
-				    session[1]->events_stats.total);
+	hists__match(&session[0]->hists, &session[1]->hists);
+	hists__fprintf(&session[1]->hists, &session[0]->hists,
+		       show_displacement, stdout);
 out_delete:
 	for (i = 0; i < 2; ++i)
 		perf_session__delete(session[i]);
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 642a6d8eb5d..53077fd973f 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -44,16 +44,17 @@ static char		*pretty_printing_style = default_pretty_printing_style;
 
 static char		callchain_default_opt[] = "fractal,0.5";
 
-static struct event_stat_id *get_stats(struct perf_session *self,
-				       u64 event_stream, u32 type, u64 config)
+static struct hists *perf_session__hists_findnew(struct perf_session *self,
+						 u64 event_stream, u32 type,
+						 u64 config)
 {
-	struct rb_node **p = &self->stats_by_id.rb_node;
+	struct rb_node **p = &self->hists_tree.rb_node;
 	struct rb_node *parent = NULL;
-	struct event_stat_id *iter, *new;
+	struct hists *iter, *new;
 
 	while (*p != NULL) {
 		parent = *p;
-		iter = rb_entry(parent, struct event_stat_id, rb_node);
+		iter = rb_entry(parent, struct hists, rb_node);
 		if (iter->config == config)
 			return iter;
 
@@ -64,15 +65,15 @@ static struct event_stat_id *get_stats(struct perf_session *self,
 			p = &(*p)->rb_left;
 	}
 
-	new = malloc(sizeof(struct event_stat_id));
+	new = malloc(sizeof(struct hists));
 	if (new == NULL)
 		return NULL;
-	memset(new, 0, sizeof(struct event_stat_id));
+	memset(new, 0, sizeof(struct hists));
 	new->event_stream = event_stream;
 	new->config = config;
 	new->type = type;
 	rb_link_node(&new->rb_node, parent, p);
-	rb_insert_color(&new->rb_node, &self->stats_by_id);
+	rb_insert_color(&new->rb_node, &self->hists_tree);
 	return new;
 }
 
@@ -84,7 +85,7 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 	struct symbol *parent = NULL;
 	int err = -ENOMEM;
 	struct hist_entry *he;
-	struct event_stat_id *stats;
+	struct hists *hists;
 	struct perf_event_attr *attr;
 
 	if ((sort__has_parent || symbol_conf.use_callchain) && data->callchain) {
@@ -96,13 +97,12 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 
 	attr = perf_header__find_attr(data->id, &self->header);
 	if (attr)
-		stats = get_stats(self, data->id, attr->type, attr->config);
+		hists = perf_session__hists_findnew(self, data->id, attr->type, attr->config);
 	else
-		stats = get_stats(self, data->id, 0, 0);
-	if (stats == NULL)
+		hists = perf_session__hists_findnew(self, data->id, 0, 0);
+	if (hists == NULL)
 		goto out_free_syms;
-	he = __perf_session__add_hist_entry(&stats->hists, al, parent,
-					    data->period);
+	he = __hists__add_entry(hists, al, parent, data->period);
 	if (he == NULL)
 		goto out_free_syms;
 	err = 0;
@@ -117,18 +117,19 @@ static int add_event_total(struct perf_session *session,
 			   struct sample_data *data,
 			   struct perf_event_attr *attr)
 {
-	struct event_stat_id *stats;
+	struct hists *hists;
 
 	if (attr)
-		stats = get_stats(session, data->id, attr->type, attr->config);
+		hists = perf_session__hists_findnew(session, data->id,
+						    attr->type, attr->config);
 	else
-		stats = get_stats(session, data->id, 0, 0);
+		hists = perf_session__hists_findnew(session, data->id, 0, 0);
 
-	if (!stats)
+	if (!hists)
 		return -ENOMEM;
 
-	stats->stats.total += data->period;
-	session->events_stats.total += data->period;
+	hists->stats.total += data->period;
+	session->hists.stats.total += data->period;
 	return 0;
 }
 
@@ -292,35 +293,33 @@ static int __cmd_report(void)
 	if (verbose > 2)
 		perf_session__fprintf_dsos(session, stdout);
 
-	next = rb_first(&session->stats_by_id);
+	next = rb_first(&session->hists_tree);
 	while (next) {
-		struct event_stat_id *stats;
+		struct hists *hists;
 		u64 nr_hists;
 
-		stats = rb_entry(next, struct event_stat_id, rb_node);
-		perf_session__collapse_resort(&stats->hists);
-		nr_hists = perf_session__output_resort(&stats->hists,
-						       stats->stats.total);
+		hists = rb_entry(next, struct hists, rb_node);
+		hists__collapse_resort(hists);
+		nr_hists = hists__output_resort(hists);
 		if (use_browser)
-			perf_session__browse_hists(&stats->hists, nr_hists,
-						   stats->stats.total, help,
+			perf_session__browse_hists(&hists->entries, nr_hists,
+						   hists->stats.total, help,
 						   input_name);
 		else {
-			if (rb_first(&session->stats_by_id) ==
-			    rb_last(&session->stats_by_id))
+			if (rb_first(&session->hists.entries) ==
+			    rb_last(&session->hists.entries))
 				fprintf(stdout, "# Samples: %Ld\n#\n",
-					stats->stats.total);
+					hists->stats.total);
 			else
 				fprintf(stdout, "# Samples: %Ld %s\n#\n",
-					stats->stats.total,
-					__event_name(stats->type, stats->config));
+					hists->stats.total,
+					__event_name(hists->type, hists->config));
 
-			perf_session__fprintf_hists(&stats->hists, NULL, false, stdout,
-					    stats->stats.total);
+			hists__fprintf(hists, NULL, false, stdout);
 			fprintf(stdout, "\n\n");
 		}
 
-		next = rb_next(&stats->rb_node);
+		next = rb_next(&hists->rb_node);
 	}
 
 	if (!use_browser && sort_order == default_sort_order &&
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 9c483e92e8d..6e268ca761e 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -107,7 +107,7 @@ static int process_sample_event(event_t *event, struct perf_session *session)
 					     data.time, thread->comm);
 	}
 
-	session->events_stats.total += data.period;
+	session->hists.stats.total += data.period;
 	return 0;
 }
 
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index d2ea9dd9fdf..cce006ec8f0 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -368,7 +368,7 @@ int event__process_comm(event_t *self, struct perf_session *session)
 int event__process_lost(event_t *self, struct perf_session *session)
 {
 	dump_printf(": id:%Ld: lost:%Ld\n", self->lost.id, self->lost.lost);
-	session->events_stats.lost += self->lost.lost;
+	session->hists.stats.lost += self->lost.lost;
 	return 0;
 }
 
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 6cc1b1dced5..48c2cc9dae4 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -131,20 +131,6 @@ typedef union event_union {
 	struct build_id_event		build_id;
 } event_t;
 
-struct events_stats {
-	u64 total;
-	u64 lost;
-};
-
-struct event_stat_id {
-	struct rb_node		rb_node;
-	struct rb_root		hists;
-	struct events_stats	stats;
-	u64			config;
-	u64			event_stream;
-	u32			type;
-};
-
 void event__print_totals(void);
 
 struct perf_session;
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 0f154a530df..410cf56c966 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -8,21 +8,21 @@ struct callchain_param	callchain_param = {
 	.min_percent = 0.5
 };
 
-static void perf_session__add_cpumode_count(struct hist_entry *he,
-					    unsigned int cpumode, u64 count)
+static void hist_entry__add_cpumode_count(struct hist_entry *self,
+					  unsigned int cpumode, u64 count)
 {
 	switch (cpumode) {
 	case PERF_RECORD_MISC_KERNEL:
-		he->count_sys += count;
+		self->count_sys += count;
 		break;
 	case PERF_RECORD_MISC_USER:
-		he->count_us += count;
+		self->count_us += count;
 		break;
 	case PERF_RECORD_MISC_GUEST_KERNEL:
-		he->count_guest_sys += count;
+		self->count_guest_sys += count;
 		break;
 	case PERF_RECORD_MISC_GUEST_USER:
-		he->count_guest_us += count;
+		self->count_guest_us += count;
 		break;
 	default:
 		break;
@@ -47,12 +47,11 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template)
 	return self;
 }
 
-struct hist_entry *__perf_session__add_hist_entry(struct rb_root *hists,
-						  struct addr_location *al,
-						  struct symbol *sym_parent,
-						  u64 count)
+struct hist_entry *__hists__add_entry(struct hists *self,
+				      struct addr_location *al,
+				      struct symbol *sym_parent, u64 count)
 {
-	struct rb_node **p = &hists->rb_node;
+	struct rb_node **p = &self->entries.rb_node;
 	struct rb_node *parent = NULL;
 	struct hist_entry *he;
 	struct hist_entry entry = {
@@ -89,9 +88,9 @@ struct hist_entry *__perf_session__add_hist_entry(struct rb_root *hists,
 	if (!he)
 		return NULL;
 	rb_link_node(&he->rb_node, parent, p);
-	rb_insert_color(&he->rb_node, hists);
+	rb_insert_color(&he->rb_node, &self->entries);
 out:
-	perf_session__add_cpumode_count(he, al->cpumode, count);
+	hist_entry__add_cpumode_count(he, al->cpumode, count);
 	return he;
 }
 
@@ -167,7 +166,7 @@ static void collapse__insert_entry(struct rb_root *root, struct hist_entry *he)
 	rb_insert_color(&he->rb_node, root);
 }
 
-void perf_session__collapse_resort(struct rb_root *hists)
+void hists__collapse_resort(struct hists *self)
 {
 	struct rb_root tmp;
 	struct rb_node *next;
@@ -177,28 +176,28 @@ void perf_session__collapse_resort(struct rb_root *hists)
 		return;
 
 	tmp = RB_ROOT;
-	next = rb_first(hists);
+	next = rb_first(&self->entries);
 
 	while (next) {
 		n = rb_entry(next, struct hist_entry, rb_node);
 		next = rb_next(&n->rb_node);
 
-		rb_erase(&n->rb_node, hists);
+		rb_erase(&n->rb_node, &self->entries);
 		collapse__insert_entry(&tmp, n);
 	}
 
-	*hists = tmp;
+	self->entries = tmp;
 }
 
 /*
  * reverse the map, sort on count.
  */
 
-static void perf_session__insert_output_hist_entry(struct rb_root *root,
-						   struct hist_entry *he,
-						   u64 min_callchain_hits)
+static void __hists__insert_output_entry(struct rb_root *entries,
+					 struct hist_entry *he,
+					 u64 min_callchain_hits)
 {
-	struct rb_node **p = &root->rb_node;
+	struct rb_node **p = &entries->rb_node;
 	struct rb_node *parent = NULL;
 	struct hist_entry *iter;
 
@@ -217,10 +216,10 @@ static void perf_session__insert_output_hist_entry(struct rb_root *root,
 	}
 
 	rb_link_node(&he->rb_node, parent, p);
-	rb_insert_color(&he->rb_node, root);
+	rb_insert_color(&he->rb_node, entries);
 }
 
-u64 perf_session__output_resort(struct rb_root *hists, u64 total_samples)
+u64 hists__output_resort(struct hists *self)
 {
 	struct rb_root tmp;
 	struct rb_node *next;
@@ -228,23 +227,21 @@ u64 perf_session__output_resort(struct rb_root *hists, u64 total_samples)
 	u64 min_callchain_hits;
 	u64 nr_hists = 0;
 
-	min_callchain_hits =
-		total_samples * (callchain_param.min_percent / 100);
+	min_callchain_hits = self->stats.total * (callchain_param.min_percent / 100);
 
 	tmp = RB_ROOT;
-	next = rb_first(hists);
+	next = rb_first(&self->entries);
 
 	while (next) {
 		n = rb_entry(next, struct hist_entry, rb_node);
 		next = rb_next(&n->rb_node);
 
-		rb_erase(&n->rb_node, hists);
-		perf_session__insert_output_hist_entry(&tmp, n,
-						       min_callchain_hits);
+		rb_erase(&n->rb_node, &self->entries);
+		__hists__insert_output_entry(&tmp, n, min_callchain_hits);
 		++nr_hists;
 	}
 
-	*hists = tmp;
+	self->entries = tmp;
 	return nr_hists;
 }
 
@@ -500,12 +497,9 @@ static size_t hist_entry_callchain__fprintf(FILE *fp, struct hist_entry *self,
 	return ret;
 }
 
-int hist_entry__snprintf(struct hist_entry *self,
-			   char *s, size_t size,
-			   struct perf_session *pair_session,
-			   bool show_displacement,
-			   long displacement, bool color,
-			   u64 session_total)
+int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size,
+			 struct hists *pair_hists, bool show_displacement,
+			 long displacement, bool color, u64 session_total)
 {
 	struct sort_entry *se;
 	u64 count, total, count_sys, count_us, count_guest_sys, count_guest_us;
@@ -515,9 +509,9 @@ int hist_entry__snprintf(struct hist_entry *self,
 	if (symbol_conf.exclude_other && !self->parent)
 		return 0;
 
-	if (pair_session) {
+	if (pair_hists) {
 		count = self->pair ? self->pair->count : 0;
-		total = pair_session->events_stats.total;
+		total = pair_hists->stats.total;
 		count_sys = self->pair ? self->pair->count_sys : 0;
 		count_us = self->pair ? self->pair->count_us : 0;
 		count_guest_sys = self->pair ? self->pair->count_guest_sys : 0;
@@ -569,7 +563,7 @@ int hist_entry__snprintf(struct hist_entry *self,
 			ret += snprintf(s + ret, size - ret, "%11lld", count);
 	}
 
-	if (pair_session) {
+	if (pair_hists) {
 		char bf[32];
 		double old_percent = 0, new_percent = 0, diff;
 
@@ -615,14 +609,12 @@ int hist_entry__snprintf(struct hist_entry *self,
 	return ret;
 }
 
-int hist_entry__fprintf(struct hist_entry *self,
-			struct perf_session *pair_session,
-			bool show_displacement,
-			long displacement, FILE *fp,
+int hist_entry__fprintf(struct hist_entry *self, struct hists *pair_hists,
+			bool show_displacement, long displacement, FILE *fp,
 			u64 session_total)
 {
 	char bf[512];
-	hist_entry__snprintf(self, bf, sizeof(bf), pair_session,
+	hist_entry__snprintf(self, bf, sizeof(bf), pair_hists,
 			     show_displacement, displacement,
 			     true, session_total);
 	return fprintf(fp, "%s\n", bf);
@@ -644,10 +636,8 @@ static size_t hist_entry__fprintf_callchain(struct hist_entry *self, FILE *fp,
 					     left_margin);
 }
 
-size_t perf_session__fprintf_hists(struct rb_root *hists,
-				   struct perf_session *pair,
-				   bool show_displacement, FILE *fp,
-				   u64 session_total)
+size_t hists__fprintf(struct hists *self, struct hists *pair,
+		      bool show_displacement, FILE *fp)
 {
 	struct sort_entry *se;
 	struct rb_node *nd;
@@ -753,7 +743,7 @@ size_t perf_session__fprintf_hists(struct rb_root *hists,
 	fprintf(fp, "\n#\n");
 
 print_entries:
-	for (nd = rb_first(hists); nd; nd = rb_next(nd)) {
+	for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
 		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
 
 		if (show_displacement) {
@@ -765,10 +755,10 @@ print_entries:
 			++position;
 		}
 		ret += hist_entry__fprintf(h, pair, show_displacement,
-					   displacement, fp, session_total);
+					   displacement, fp, self->stats.total);
 
 		if (symbol_conf.use_callchain)
-			ret += hist_entry__fprintf_callchain(h, fp, session_total);
+			ret += hist_entry__fprintf_callchain(h, fp, self->stats.total);
 
 		if (h->ms.map == NULL && verbose > 1) {
 			__map_groups__fprintf_maps(&h->thread->mg,
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index b49013adb34..bdde81eca69 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -6,34 +6,40 @@
 
 extern struct callchain_param callchain_param;
 
-struct perf_session;
 struct hist_entry;
 struct addr_location;
 struct symbol;
 struct rb_root;
 
-struct hist_entry *__perf_session__add_hist_entry(struct rb_root *hists,
-						  struct addr_location *al,
-						  struct symbol *parent,
-						  u64 count);
+struct events_stats {
+	u64 total;
+	u64 lost;
+};
+
+struct hists {
+	struct rb_node		rb_node;
+	struct rb_root		entries;
+	struct events_stats	stats;
+	u64			config;
+	u64			event_stream;
+	u32			type;
+};
+
+struct hist_entry *__hists__add_entry(struct hists *self,
+				      struct addr_location *al,
+				      struct symbol *parent, u64 count);
 extern int64_t hist_entry__cmp(struct hist_entry *, struct hist_entry *);
 extern int64_t hist_entry__collapse(struct hist_entry *, struct hist_entry *);
-int hist_entry__fprintf(struct hist_entry *self,
-			   struct perf_session *pair_session,
-			   bool show_displacement,
-			   long displacement, FILE *fp,
-			   u64 session_total);
-int hist_entry__snprintf(struct hist_entry *self,
-			 char *bf, size_t size,
-			 struct perf_session *pair_session,
-			 bool show_displacement, long displacement,
-			 bool color, u64 session_total);
+int hist_entry__fprintf(struct hist_entry *self, struct hists *pair_hists,
+			bool show_displacement, long displacement, FILE *fp,
+			u64 total);
+int hist_entry__snprintf(struct hist_entry *self, char *bf, size_t size,
+			 struct hists *pair_hists, bool show_displacement,
+			 long displacement, bool color, u64 total);
 void hist_entry__free(struct hist_entry *);
 
-u64 perf_session__output_resort(struct rb_root *hists, u64 total_samples);
-void perf_session__collapse_resort(struct rb_root *hists);
-size_t perf_session__fprintf_hists(struct rb_root *hists,
-				   struct perf_session *pair,
-				   bool show_displacement, FILE *fp,
-				   u64 session_total);
+u64 hists__output_resort(struct hists *self);
+void hists__collapse_resort(struct hists *self);
+size_t hists__fprintf(struct hists *self, struct hists *pair,
+		      bool show_displacement, FILE *fp);
 #endif	/* __PERF_HIST_H */
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 4130036a010..72a7f6ae029 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -89,7 +89,7 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
 
 	memcpy(self->filename, filename, len);
 	self->threads = RB_ROOT;
-	self->stats_by_id = RB_ROOT;
+	self->hists_tree = RB_ROOT;
 	self->last_match = NULL;
 	self->mmap_window = 32;
 	self->cwd = NULL;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 242d528bfae..46190f94b54 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -1,6 +1,7 @@
 #ifndef __PERF_SESSION_H
 #define __PERF_SESSION_H
 
+#include "hist.h"
 #include "event.h"
 #include "header.h"
 #include "symbol.h"
@@ -28,11 +29,16 @@ struct perf_session {
 	struct thread		*last_match;
 	struct machine		host_machine;
 	struct rb_root		machines;
-	struct events_stats	events_stats;
-	struct rb_root		stats_by_id;
+	struct rb_root		hists_tree;
 	unsigned long		event_total[PERF_RECORD_MAX];
 	unsigned long		unknown_events;
-	struct rb_root		hists;
+	/*
+	 * FIXME: should point to the first entry in hists_tree and
+	 *        be a hists instance. Right now its only 'report'
+	 *        that is using ->hists_tree while all the rest use
+	 *        ->hists.
+	 */
+	struct hists		hists;
 	u64			sample_type;
 	int			fd;
 	bool			fd_pipe;
-- 
cgit v1.2.3


From c9ad488289144ae5ef53b012e15895ef1f5e4bb6 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 6 May 2010 11:45:45 +0300
Subject: x86: Eliminate TS_XSAVE

The fpu code currently uses current->thread_info->status & TS_XSAVE as
a way to distinguish between XSAVE capable processors and older processors.
The decision is not really task specific; instead we use the task status to
avoid a global memory reference - the value should be the same across all
threads.

Eliminate this tie-in into the task structure by using an alternative
instruction keyed off the XSAVE cpu feature; this results in shorter and
faster code, without introducing a global memory reference.

[ hpa: in the future, this probably should use an asm jmp ]

Signed-off-by: Avi Kivity <avi@redhat.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-2-git-send-email-avi@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/i387.h        | 20 ++++++++++++++++----
 arch/x86/include/asm/thread_info.h |  1 -
 arch/x86/kernel/cpu/common.c       |  5 +----
 arch/x86/kernel/i387.c             |  5 +----
 arch/x86/kernel/xsave.c            |  6 +++---
 5 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index da293092450..a301a6825c3 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -56,6 +56,18 @@ extern int restore_i387_xstate_ia32(void __user *buf);
 
 #define X87_FSW_ES (1 << 7)	/* Exception Summary */
 
+static inline bool use_xsave(void)
+{
+	u8 has_xsave;
+
+	alternative_io("mov $0, %0",
+		       "mov $1, %0",
+		       X86_FEATURE_XSAVE,
+		       "=g"(has_xsave));
+
+	return has_xsave;
+}
+
 #ifdef CONFIG_X86_64
 
 /* Ignore delayed exceptions from user space */
@@ -99,7 +111,7 @@ static inline void clear_fpu_state(struct task_struct *tsk)
 	/*
 	 * xsave header may indicate the init state of the FP.
 	 */
-	if ((task_thread_info(tsk)->status & TS_XSAVE) &&
+	if (use_xsave() &&
 	    !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
 		return;
 
@@ -164,7 +176,7 @@ static inline void fxsave(struct task_struct *tsk)
 
 static inline void __save_init_fpu(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_XSAVE)
+	if (use_xsave())
 		xsave(tsk);
 	else
 		fxsave(tsk);
@@ -218,7 +230,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
  */
 static inline void __save_init_fpu(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_XSAVE) {
+	if (use_xsave()) {
 		struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
 		struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
 
@@ -266,7 +278,7 @@ end:
 
 static inline int restore_fpu_checking(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_XSAVE)
+	if (use_xsave())
 		return xrstor_checking(&tsk->thread.xstate->xsave);
 	else
 		return fxrstor_checking(&tsk->thread.xstate->fxsave);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e0d28901e96..e9e341505ab 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -244,7 +244,6 @@ static inline struct thread_info *current_thread_info(void)
 #define TS_POLLING		0x0004	/* true if in idle loop
 					   and not sleeping */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
-#define TS_XSAVE		0x0010	/* Use xsave/xrstor */
 
 #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4868e4a951e..c1c00d0b169 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1243,10 +1243,7 @@ void __cpuinit cpu_init(void)
 	/*
 	 * Force FPU initialization:
 	 */
-	if (cpu_has_xsave)
-		current_thread_info()->status = TS_XSAVE;
-	else
-		current_thread_info()->status = 0;
+	current_thread_info()->status = 0;
 	clear_used_math();
 	mxcsr_feature_mask_init();
 
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 54c31c28548..14ca1dc7a70 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -102,10 +102,7 @@ void __cpuinit fpu_init(void)
 
 	mxcsr_feature_mask_init();
 	/* clean state in init */
-	if (cpu_has_xsave)
-		current_thread_info()->status = TS_XSAVE;
-	else
-		current_thread_info()->status = 0;
+	current_thread_info()->status = 0;
 	clear_used_math();
 }
 #endif	/* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 782c3a362ec..c1b0a11033a 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -99,7 +99,7 @@ int save_i387_xstate(void __user *buf)
 		if (err)
 			return err;
 
-		if (task_thread_info(tsk)->status & TS_XSAVE)
+		if (use_xsave())
 			err = xsave_user(buf);
 		else
 			err = fxsave_user(buf);
@@ -116,7 +116,7 @@ int save_i387_xstate(void __user *buf)
 
 	clear_used_math(); /* trigger finit */
 
-	if (task_thread_info(tsk)->status & TS_XSAVE) {
+	if (use_xsave()) {
 		struct _fpstate __user *fx = buf;
 		struct _xstate __user *x = buf;
 		u64 xstate_bv;
@@ -225,7 +225,7 @@ int restore_i387_xstate(void __user *buf)
 		clts();
 		task_thread_info(current)->status |= TS_USEDFPU;
 	}
-	if (task_thread_info(tsk)->status & TS_XSAVE)
+	if (use_xsave())
 		err = restore_user_xstate(buf);
 	else
 		err = fxrstor_checking((__force struct i387_fxsave_struct *)
-- 
cgit v1.2.3


From 86603283326c9e95e5ad4e9fdddeec93cac5d9ad Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 6 May 2010 11:45:46 +0300
Subject: x86: Introduce 'struct fpu' and related API

Currently all fpu state access is through tsk->thread.xstate.  Since we wish
to generalize fpu access to non-task contexts, wrap the state in a new
'struct fpu' and convert existing access to use an fpu API.

Signal frame handlers are not converted to the API since they will remain
task context only things.

Signed-off-by: Avi Kivity <avi@redhat.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-3-git-send-email-avi@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/i387.h      | 115 +++++++++++++++++++++++++++++----------
 arch/x86/include/asm/processor.h |   6 +-
 arch/x86/include/asm/xsave.h     |   7 ++-
 arch/x86/kernel/i387.c           | 102 +++++++++++++++++-----------------
 arch/x86/kernel/process.c        |  21 +++----
 arch/x86/kernel/process_32.c     |   2 +-
 arch/x86/kernel/process_64.c     |   2 +-
 arch/x86/kernel/xsave.c          |   2 +-
 arch/x86/math-emu/fpu_aux.c      |   6 +-
 9 files changed, 160 insertions(+), 103 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index a301a6825c3..1a8cca33b73 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -16,6 +16,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/regset.h>
 #include <linux/hardirq.h>
+#include <linux/slab.h>
 #include <asm/asm.h>
 #include <asm/processor.h>
 #include <asm/sigcontext.h>
@@ -103,10 +104,10 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
    values. The kernel data segment can be sometimes 0 and sometimes
    new user value. Both should be ok.
    Use the PDA as safe address because it should be already in L1. */
-static inline void clear_fpu_state(struct task_struct *tsk)
+static inline void fpu_clear(struct fpu *fpu)
 {
-	struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
-	struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+	struct xsave_struct *xstate = &fpu->state->xsave;
+	struct i387_fxsave_struct *fx = &fpu->state->fxsave;
 
 	/*
 	 * xsave header may indicate the init state of the FP.
@@ -123,6 +124,11 @@ static inline void clear_fpu_state(struct task_struct *tsk)
 			  X86_FEATURE_FXSAVE_LEAK);
 }
 
+static inline void clear_fpu_state(struct task_struct *tsk)
+{
+	fpu_clear(&tsk->thread.fpu);
+}
+
 static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 {
 	int err;
@@ -147,7 +153,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 	return err;
 }
 
-static inline void fxsave(struct task_struct *tsk)
+static inline void fpu_fxsave(struct fpu *fpu)
 {
 	/* Using "rex64; fxsave %0" is broken because, if the memory operand
 	   uses any extended registers for addressing, a second REX prefix
@@ -157,42 +163,45 @@ static inline void fxsave(struct task_struct *tsk)
 	/* Using "fxsaveq %0" would be the ideal choice, but is only supported
 	   starting with gas 2.16. */
 	__asm__ __volatile__("fxsaveq %0"
-			     : "=m" (tsk->thread.xstate->fxsave));
+			     : "=m" (fpu->state->fxsave));
 #elif 0
 	/* Using, as a workaround, the properly prefixed form below isn't
 	   accepted by any binutils version so far released, complaining that
 	   the same type of prefix is used twice if an extended register is
 	   needed for addressing (fix submitted to mainline 2005-11-21). */
 	__asm__ __volatile__("rex64/fxsave %0"
-			     : "=m" (tsk->thread.xstate->fxsave));
+			     : "=m" (fpu->state->fxsave));
 #else
 	/* This, however, we can work around by forcing the compiler to select
 	   an addressing mode that doesn't require extended registers. */
 	__asm__ __volatile__("rex64/fxsave (%1)"
-			     : "=m" (tsk->thread.xstate->fxsave)
-			     : "cdaSDb" (&tsk->thread.xstate->fxsave));
+			     : "=m" (fpu->state->fxsave)
+			     : "cdaSDb" (&fpu->state->fxsave));
 #endif
 }
 
-static inline void __save_init_fpu(struct task_struct *tsk)
+static inline void fpu_save_init(struct fpu *fpu)
 {
 	if (use_xsave())
-		xsave(tsk);
+		fpu_xsave(fpu);
 	else
-		fxsave(tsk);
+		fpu_fxsave(fpu);
 
-	clear_fpu_state(tsk);
+	fpu_clear(fpu);
+}
+
+static inline void __save_init_fpu(struct task_struct *tsk)
+{
+	fpu_save_init(&tsk->thread.fpu);
 	task_thread_info(tsk)->status &= ~TS_USEDFPU;
 }
 
 #else  /* CONFIG_X86_32 */
 
 #ifdef CONFIG_MATH_EMULATION
-extern void finit_task(struct task_struct *tsk);
+extern void finit_soft_fpu(struct i387_soft_struct *soft);
 #else
-static inline void finit_task(struct task_struct *tsk)
-{
-}
+static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
 #endif
 
 static inline void tolerant_fwait(void)
@@ -228,13 +237,13 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 /*
  * These must be called with preempt disabled
  */
-static inline void __save_init_fpu(struct task_struct *tsk)
+static inline void fpu_save_init(struct fpu *fpu)
 {
 	if (use_xsave()) {
-		struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
-		struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+		struct xsave_struct *xstate = &fpu->state->xsave;
+		struct i387_fxsave_struct *fx = &fpu->state->fxsave;
 
-		xsave(tsk);
+		fpu_xsave(fpu);
 
 		/*
 		 * xsave header may indicate the init state of the FP.
@@ -258,8 +267,8 @@ static inline void __save_init_fpu(struct task_struct *tsk)
 		"fxsave %[fx]\n"
 		"bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
 		X86_FEATURE_FXSR,
-		[fx] "m" (tsk->thread.xstate->fxsave),
-		[fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory");
+		[fx] "m" (fpu->state->fxsave),
+		[fsw] "m" (fpu->state->fxsave.swd) : "memory");
 clear_state:
 	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
 	   is pending.  Clear the x87 state here by setting it to fixed
@@ -271,17 +280,34 @@ clear_state:
 		X86_FEATURE_FXSAVE_LEAK,
 		[addr] "m" (safe_address));
 end:
+	;
+}
+
+static inline void __save_init_fpu(struct task_struct *tsk)
+{
+	fpu_save_init(&tsk->thread.fpu);
 	task_thread_info(tsk)->status &= ~TS_USEDFPU;
 }
 
+
 #endif	/* CONFIG_X86_64 */
 
-static inline int restore_fpu_checking(struct task_struct *tsk)
+static inline int fpu_fxrstor_checking(struct fpu *fpu)
+{
+	return fxrstor_checking(&fpu->state->fxsave);
+}
+
+static inline int fpu_restore_checking(struct fpu *fpu)
 {
 	if (use_xsave())
-		return xrstor_checking(&tsk->thread.xstate->xsave);
+		return fpu_xrstor_checking(fpu);
 	else
-		return fxrstor_checking(&tsk->thread.xstate->fxsave);
+		return fpu_fxrstor_checking(fpu);
+}
+
+static inline int restore_fpu_checking(struct task_struct *tsk)
+{
+	return fpu_restore_checking(&tsk->thread.fpu);
 }
 
 /*
@@ -409,30 +435,59 @@ static inline void clear_fpu(struct task_struct *tsk)
 static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
 {
 	if (cpu_has_fxsr) {
-		return tsk->thread.xstate->fxsave.cwd;
+		return tsk->thread.fpu.state->fxsave.cwd;
 	} else {
-		return (unsigned short)tsk->thread.xstate->fsave.cwd;
+		return (unsigned short)tsk->thread.fpu.state->fsave.cwd;
 	}
 }
 
 static inline unsigned short get_fpu_swd(struct task_struct *tsk)
 {
 	if (cpu_has_fxsr) {
-		return tsk->thread.xstate->fxsave.swd;
+		return tsk->thread.fpu.state->fxsave.swd;
 	} else {
-		return (unsigned short)tsk->thread.xstate->fsave.swd;
+		return (unsigned short)tsk->thread.fpu.state->fsave.swd;
 	}
 }
 
 static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
 {
 	if (cpu_has_xmm) {
-		return tsk->thread.xstate->fxsave.mxcsr;
+		return tsk->thread.fpu.state->fxsave.mxcsr;
 	} else {
 		return MXCSR_DEFAULT;
 	}
 }
 
+static bool fpu_allocated(struct fpu *fpu)
+{
+	return fpu->state != NULL;
+}
+
+static inline int fpu_alloc(struct fpu *fpu)
+{
+	if (fpu_allocated(fpu))
+		return 0;
+	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
+	if (!fpu->state)
+		return -ENOMEM;
+	WARN_ON((unsigned long)fpu->state & 15);
+	return 0;
+}
+
+static inline void fpu_free(struct fpu *fpu)
+{
+	if (fpu->state) {
+		kmem_cache_free(task_xstate_cachep, fpu->state);
+		fpu->state = NULL;
+	}
+}
+
+static inline void fpu_copy(struct fpu *dst, struct fpu *src)
+{
+	memcpy(dst->state, src->state, xstate_size);
+}
+
 #endif /* __ASSEMBLY__ */
 
 #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b753ea59703..b684f587647 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -380,6 +380,10 @@ union thread_xstate {
 	struct xsave_struct		xsave;
 };
 
+struct fpu {
+	union thread_xstate *state;
+};
+
 #ifdef CONFIG_X86_64
 DECLARE_PER_CPU(struct orig_ist, orig_ist);
 
@@ -457,7 +461,7 @@ struct thread_struct {
 	unsigned long		trap_no;
 	unsigned long		error_code;
 	/* floating point and extended processor state */
-	union thread_xstate	*xstate;
+	struct fpu		fpu;
 #ifdef CONFIG_X86_32
 	/* Virtual 86 mode info */
 	struct vm86_struct __user *vm86_info;
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index ddc04ccad03..2c4390cae22 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -37,8 +37,9 @@ extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
 			    void __user *fpstate,
 			    struct _fpx_sw_bytes *sw);
 
-static inline int xrstor_checking(struct xsave_struct *fx)
+static inline int fpu_xrstor_checking(struct fpu *fpu)
 {
+	struct xsave_struct *fx = &fpu->state->xsave;
 	int err;
 
 	asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
@@ -110,12 +111,12 @@ static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
 		     :   "memory");
 }
 
-static inline void xsave(struct task_struct *tsk)
+static inline void fpu_xsave(struct fpu *fpu)
 {
 	/* This, however, we can work around by forcing the compiler to select
 	   an addressing mode that doesn't require extended registers. */
 	__asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27"
-			     : : "D" (&(tsk->thread.xstate->xsave)),
+			     : : "D" (&(fpu->state->xsave)),
 				 "a" (-1), "d"(-1) : "memory");
 }
 #endif
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 14ca1dc7a70..86cef6b3225 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -107,57 +107,57 @@ void __cpuinit fpu_init(void)
 }
 #endif	/* CONFIG_X86_64 */
 
-/*
- * The _current_ task is using the FPU for the first time
- * so initialize it and set the mxcsr to its default
- * value at reset if we support XMM instructions and then
- * remeber the current task has used the FPU.
- */
-int init_fpu(struct task_struct *tsk)
+static void fpu_finit(struct fpu *fpu)
 {
-	if (tsk_used_math(tsk)) {
-		if (HAVE_HWFP && tsk == current)
-			unlazy_fpu(tsk);
-		return 0;
-	}
-
-	/*
-	 * Memory allocation at the first usage of the FPU and other state.
-	 */
-	if (!tsk->thread.xstate) {
-		tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
-						      GFP_KERNEL);
-		if (!tsk->thread.xstate)
-			return -ENOMEM;
-	}
-
 #ifdef CONFIG_X86_32
 	if (!HAVE_HWFP) {
-		memset(tsk->thread.xstate, 0, xstate_size);
-		finit_task(tsk);
-		set_stopped_child_used_math(tsk);
-		return 0;
+		finit_soft_fpu(&fpu->state->soft);
+		return;
 	}
 #endif
 
 	if (cpu_has_fxsr) {
-		struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+		struct i387_fxsave_struct *fx = &fpu->state->fxsave;
 
 		memset(fx, 0, xstate_size);
 		fx->cwd = 0x37f;
 		if (cpu_has_xmm)
 			fx->mxcsr = MXCSR_DEFAULT;
 	} else {
-		struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
+		struct i387_fsave_struct *fp = &fpu->state->fsave;
 		memset(fp, 0, xstate_size);
 		fp->cwd = 0xffff037fu;
 		fp->swd = 0xffff0000u;
 		fp->twd = 0xffffffffu;
 		fp->fos = 0xffff0000u;
 	}
+}
+
+/*
+ * The _current_ task is using the FPU for the first time
+ * so initialize it and set the mxcsr to its default
+ * value at reset if we support XMM instructions and then
+ * remeber the current task has used the FPU.
+ */
+int init_fpu(struct task_struct *tsk)
+{
+	int ret;
+
+	if (tsk_used_math(tsk)) {
+		if (HAVE_HWFP && tsk == current)
+			unlazy_fpu(tsk);
+		return 0;
+	}
+
 	/*
-	 * Only the device not available exception or ptrace can call init_fpu.
+	 * Memory allocation at the first usage of the FPU and other state.
 	 */
+	ret = fpu_alloc(&tsk->thread.fpu);
+	if (ret)
+		return ret;
+
+	fpu_finit(&tsk->thread.fpu);
+
 	set_stopped_child_used_math(tsk);
 	return 0;
 }
@@ -191,7 +191,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
 		return ret;
 
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-				   &target->thread.xstate->fxsave, 0, -1);
+				   &target->thread.fpu.state->fxsave, 0, -1);
 }
 
 int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -208,19 +208,19 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
 		return ret;
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-				 &target->thread.xstate->fxsave, 0, -1);
+				 &target->thread.fpu.state->fxsave, 0, -1);
 
 	/*
 	 * mxcsr reserved bits must be masked to zero for security reasons.
 	 */
-	target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+	target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
 
 	/*
 	 * update the header bits in the xsave header, indicating the
 	 * presence of FP and SSE state.
 	 */
 	if (cpu_has_xsave)
-		target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+		target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
 
 	return ret;
 }
@@ -243,14 +243,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 	 * memory layout in the thread struct, so that we can copy the entire
 	 * xstateregs to the user using one user_regset_copyout().
 	 */
-	memcpy(&target->thread.xstate->fxsave.sw_reserved,
+	memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
 	       xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
 
 	/*
 	 * Copy the xstate memory layout.
 	 */
 	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-				  &target->thread.xstate->xsave, 0, -1);
+				  &target->thread.fpu.state->xsave, 0, -1);
 	return ret;
 }
 
@@ -269,14 +269,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 		return ret;
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-				 &target->thread.xstate->xsave, 0, -1);
+				 &target->thread.fpu.state->xsave, 0, -1);
 
 	/*
 	 * mxcsr reserved bits must be masked to zero for security reasons.
 	 */
-	target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+	target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
 
-	xsave_hdr = &target->thread.xstate->xsave.xsave_hdr;
+	xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
 
 	xsave_hdr->xstate_bv &= pcntxt_mask;
 	/*
@@ -362,7 +362,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
 static void
 convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
 {
-	struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
+	struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
 	struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
 	struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
 	int i;
@@ -402,7 +402,7 @@ static void convert_to_fxsr(struct task_struct *tsk,
 			    const struct user_i387_ia32_struct *env)
 
 {
-	struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
+	struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
 	struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
 	struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
 	int i;
@@ -442,7 +442,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
 
 	if (!cpu_has_fxsr) {
 		return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-					   &target->thread.xstate->fsave, 0,
+					   &target->thread.fpu.state->fsave, 0,
 					   -1);
 	}
 
@@ -472,7 +472,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 
 	if (!cpu_has_fxsr) {
 		return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-					  &target->thread.xstate->fsave, 0, -1);
+					  &target->thread.fpu.state->fsave, 0, -1);
 	}
 
 	if (pos > 0 || count < sizeof(env))
@@ -487,7 +487,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 	 * presence of FP.
 	 */
 	if (cpu_has_xsave)
-		target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
+		target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
 	return ret;
 }
 
@@ -498,7 +498,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
 {
 	struct task_struct *tsk = current;
-	struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
+	struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
 
 	fp->status = fp->swd;
 	if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
@@ -509,7 +509,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
 static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
 {
 	struct task_struct *tsk = current;
-	struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+	struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
 	struct user_i387_ia32_struct env;
 	int err = 0;
 
@@ -544,7 +544,7 @@ static int save_i387_xsave(void __user *buf)
 	 * header as well as change any contents in the memory layout.
 	 * xrestore as part of sigreturn will capture all the changes.
 	 */
-	tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+	tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
 
 	if (save_i387_fxsave(fx) < 0)
 		return -1;
@@ -596,7 +596,7 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
 {
 	struct task_struct *tsk = current;
 
-	return __copy_from_user(&tsk->thread.xstate->fsave, buf,
+	return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
 				sizeof(struct i387_fsave_struct));
 }
 
@@ -607,10 +607,10 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
 	struct user_i387_ia32_struct env;
 	int err;
 
-	err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
+	err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
 			       size);
 	/* mxcsr reserved bits must be masked to zero for security reasons */
-	tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+	tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
 	if (err || __copy_from_user(&env, buf, sizeof(env)))
 		return 1;
 	convert_to_fxsr(tsk, &env);
@@ -626,7 +626,7 @@ static int restore_i387_xsave(void __user *buf)
 	struct i387_fxsave_struct __user *fx =
 		(struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
 	struct xsave_hdr_struct *xsave_hdr =
-				&current->thread.xstate->xsave.xsave_hdr;
+				&current->thread.fpu.state->xsave.xsave_hdr;
 	u64 mask;
 	int err;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 28ad9f4d8b9..f18fd9c1524 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -32,25 +32,22 @@ struct kmem_cache *task_xstate_cachep;
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
+	int ret;
+
 	*dst = *src;
-	if (src->thread.xstate) {
-		dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
-						      GFP_KERNEL);
-		if (!dst->thread.xstate)
-			return -ENOMEM;
-		WARN_ON((unsigned long)dst->thread.xstate & 15);
-		memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
+	if (fpu_allocated(&src->thread.fpu)) {
+		memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
+		ret = fpu_alloc(&dst->thread.fpu);
+		if (ret)
+			return ret;
+		fpu_copy(&dst->thread.fpu, &src->thread.fpu);
 	}
 	return 0;
 }
 
 void free_thread_xstate(struct task_struct *tsk)
 {
-	if (tsk->thread.xstate) {
-		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
-		tsk->thread.xstate = NULL;
-	}
-
+	fpu_free(&tsk->thread.fpu);
 	WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
 
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f6c62667e30..0a7a4f5bbaa 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -317,7 +317,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	/* we're going to use this soon, after a few expensive things */
 	if (preload_fpu)
-		prefetch(next->xstate);
+		prefetch(next->fpu.state);
 
 	/*
 	 * Reload esp0.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 17cb3295cbf..979215f5198 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -396,7 +396,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	/* we're going to use this soon, after a few expensive things */
 	if (preload_fpu)
-		prefetch(next->xstate);
+		prefetch(next->fpu.state);
 
 	/*
 	 * Reload esp0, LDT and the page table pointer:
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index c1b0a11033a..37e68fc5e24 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -109,7 +109,7 @@ int save_i387_xstate(void __user *buf)
 		task_thread_info(tsk)->status &= ~TS_USEDFPU;
 		stts();
 	} else {
-		if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
+		if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
 				   xstate_size))
 			return -1;
 	}
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index aa098708877..62797f93051 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,10 +30,10 @@ static void fclex(void)
 }
 
 /* Needs to be externally visible */
-void finit_task(struct task_struct *tsk)
+void finit_soft_fpu(struct i387_soft_struct *soft)
 {
-	struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
 	struct address *oaddr, *iaddr;
+	memset(soft, 0, sizeof(*soft));
 	soft->cwd = 0x037f;
 	soft->swd = 0;
 	soft->ftop = 0;	/* We don't keep top in the status word internally. */
@@ -52,7 +52,7 @@ void finit_task(struct task_struct *tsk)
 
 void finit(void)
 {
-	finit_task(current);
+	finit_task(&current->thread.fpu);
 }
 
 /*
-- 
cgit v1.2.3


From fae683f764f91f31ab45512e70cc8cc81d4d157b Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Mon, 10 May 2010 20:00:05 +0530
Subject: cifs: add comments explaining cifs_new_fileinfo behavior

The comments make it clear the otherwise subtle behavior of cifs_new_fileinfo().

Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Reviewed-by: Shirish Pargaonkar <shirishp@us.ibm.com>
--
 fs/cifs/dir.c |   18 ++++++++++++++++--
 1 files changed, 16 insertions(+), 2 deletions(-)
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/dir.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index d791d0763a9..bd363df19b3 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -129,6 +129,12 @@ cifs_bp_rename_retry:
 	return full_path;
 }
 
+/*
+ * When called with struct file pointer set to NULL, there is no way we could
+ * update file->private_data, but getting it stuck on openFileList provides a
+ * way to access it from cifs_fill_filedata and thereby set file->private_data
+ * from cifs_open.
+ */
 struct cifsFileInfo *
 cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
 		  struct file *file, struct vfsmount *mnt, unsigned int oflags)
@@ -251,6 +257,10 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 		cifs_fattr_to_inode(*pinode, &fattr);
 	}
 
+	/*
+	 * cifs_fill_filedata() takes care of setting cifsFileInfo pointer to
+	 * file->private_data.
+	 */
 	if (mnt)
 		cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags);
 
@@ -466,8 +476,12 @@ cifs_create_set_dentry:
 		/* mknod case - do not leave file open */
 		CIFSSMBClose(xid, tcon, fileHandle);
 	} else if (!(posix_create) && (newinode)) {
-			cifs_new_fileinfo(newinode, fileHandle, NULL,
-						nd->path.mnt, oflags);
+		/*
+		 * cifs_fill_filedata() takes care of setting cifsFileInfo
+		 * pointer to file->private_data.
+		 */
+		cifs_new_fileinfo(newinode, fileHandle, NULL, nd->path.mnt,
+				  oflags);
 	}
 cifs_create_out:
 	kfree(buf);
-- 
cgit v1.2.3


From 2b3fc35f6919344e3cf722dde8308f47235c0b70 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 20 Apr 2010 16:23:07 +0800
Subject: rcu: optionally leave lockdep enabled after RCU lockdep splat

There is no need to disable lockdep after an RCU lockdep splat,
so remove the debug_lockdeps_off() from lockdep_rcu_dereference().
To avoid repeated lockdep splats, use a static variable in the inlined
rcu_dereference_check() and rcu_dereference_protected() macros so that
a given instance splats only once, but so that multiple instances can
be detected per boot.

This is controlled by a new config variable CONFIG_PROVE_RCU_REPEATEDLY,
which is disabled by default.  This provides the normal lockdep behavior
by default, but permits people who want to find multiple RCU-lockdep
splats per boot to easily do so.

Requested-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Tested-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 15 +++++++++++----
 kernel/lockdep.c         |  3 +++
 lib/Kconfig.debug        | 12 ++++++++++++
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index db266bbed23..4dca2752cfd 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -192,6 +192,15 @@ static inline int rcu_read_lock_sched_held(void)
 
 extern int rcu_my_thread_group_empty(void);
 
+#define __do_rcu_dereference_check(c)					\
+	do {								\
+		static bool __warned;					\
+		if (debug_lockdep_rcu_enabled() && !__warned && !(c)) {	\
+			__warned = true;				\
+			lockdep_rcu_dereference(__FILE__, __LINE__);	\
+		}							\
+	} while (0)
+
 /**
  * rcu_dereference_check - rcu_dereference with debug checking
  * @p: The pointer to read, prior to dereferencing
@@ -221,8 +230,7 @@ extern int rcu_my_thread_group_empty(void);
  */
 #define rcu_dereference_check(p, c) \
 	({ \
-		if (debug_lockdep_rcu_enabled() && !(c)) \
-			lockdep_rcu_dereference(__FILE__, __LINE__); \
+		__do_rcu_dereference_check(c); \
 		rcu_dereference_raw(p); \
 	})
 
@@ -239,8 +247,7 @@ extern int rcu_my_thread_group_empty(void);
  */
 #define rcu_dereference_protected(p, c) \
 	({ \
-		if (debug_lockdep_rcu_enabled() && !(c)) \
-			lockdep_rcu_dereference(__FILE__, __LINE__); \
+		__do_rcu_dereference_check(c); \
 		(p); \
 	})
 
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 2594e1ce41c..3a756ba8d5d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3801,8 +3801,11 @@ void lockdep_rcu_dereference(const char *file, const int line)
 {
 	struct task_struct *curr = current;
 
+#ifndef CONFIG_PROVE_RCU_REPEATEDLY
 	if (!debug_locks_off())
 		return;
+#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
+	/* Note: the following can be executed concurrently, so be careful. */
 	printk("\n===================================================\n");
 	printk(  "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
 	printk(  "---------------------------------------------------\n");
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 935248bdbc4..94090b4bb7d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -512,6 +512,18 @@ config PROVE_RCU
 
 	 Say N if you are unsure.
 
+config PROVE_RCU_REPEATEDLY
+	bool "RCU debugging: don't disable PROVE_RCU on first splat"
+	depends on PROVE_RCU
+	default n
+	help
+	 By itself, PROVE_RCU will disable checking upon issuing the
+	 first warning (or "splat").  This feature prevents such
+	 disabling, allowing multiple RCU-lockdep warnings to be printed
+	 on a single reboot.
+
+	 Say N if you are unsure.
+
 config LOCKDEP
 	bool
 	depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
-- 
cgit v1.2.3


From d25eb9442bb2c38c1e742f0fa764d7132d72593f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 18 Mar 2010 21:36:51 -0700
Subject: rcu: substitute set_need_resched for sending resched IPIs

This patch adds a check to __rcu_pending() that does a local
set_need_resched() if the current CPU is holding up the current grace
period and if force_quiescent_state() will be called soon.  The goal is
to reduce the probability that force_quiescent_state() will need to do
smp_send_reschedule(), which sends an IPI and is therefore more expensive
on most architectures.

Signed-off-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3ec8160fc75..e54c1235122 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1499,6 +1499,16 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 
 	/* Is the RCU core waiting for a quiescent state from this CPU? */
 	if (rdp->qs_pending) {
+
+		/*
+		 * If force_quiescent_state() coming soon and this CPU
+		 * needs a quiescent state, and this is either RCU-sched
+		 * or RCU-bh, force a local reschedule.
+		 */
+		if (!rdp->preemptable &&
+		    ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
+				 jiffies))
+			set_need_resched();
 		rdp->n_rp_qs_pending++;
 		return 1;
 	}
-- 
cgit v1.2.3


From f261414f0d56dd1a0e34888e27d1d4902ad052f3 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sun, 28 Mar 2010 11:15:20 +0800
Subject: rcu: make dead code really dead

cleanup: make dead code really dead

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e54c1235122..6042fb85953 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1236,11 +1236,11 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 		break; /* grace period idle or initializing, ignore. */
 
 	case RCU_SAVE_DYNTICK:
-
-		raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
 		if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
 			break; /* So gcc recognizes the dead code. */
 
+		raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
+
 		/* Record dyntick-idle state. */
 		force_qs_rnp(rsp, dyntick_save_progress_counter);
 		raw_spin_lock(&rnp->lock);  /* irqs already disabled */
-- 
cgit v1.2.3


From 0c34029abdfdea64420cb4264c4e91a776b22157 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sun, 28 Mar 2010 11:12:30 +0800
Subject: rcu: move some code from macro to function

Shrink the RCU_INIT_FLAVOR() macro by moving all but the initialization
of the ->rda[] array to rcu_init_one().  The call to rcu_init_one()
can then be moved to the end of the RCU_INIT_FLAVOR() macro, which is
required because rcu_boot_init_percpu_data(), which is now called from
rcu_init_one(), depends on the initialization of the ->rda[] array.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6042fb85953..86bb9499aae 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1859,6 +1859,14 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 			INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
 		}
 	}
+
+	rnp = rsp->level[NUM_RCU_LVLS - 1];
+	for_each_possible_cpu(i) {
+		if (i > rnp->grphi)
+			rnp++;
+		rsp->rda[i]->mynode = rnp;
+		rcu_boot_init_percpu_data(i, rsp);
+	}
 }
 
 /*
@@ -1869,19 +1877,11 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 #define RCU_INIT_FLAVOR(rsp, rcu_data) \
 do { \
 	int i; \
-	int j; \
-	struct rcu_node *rnp; \
 	\
-	rcu_init_one(rsp); \
-	rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
-	j = 0; \
 	for_each_possible_cpu(i) { \
-		if (i > rnp[j].grphi) \
-			j++; \
-		per_cpu(rcu_data, i).mynode = &rnp[j]; \
 		(rsp)->rda[i] = &per_cpu(rcu_data, i); \
-		rcu_boot_init_percpu_data(i, rsp); \
 	} \
+	rcu_init_one(rsp); \
 } while (0)
 
 void __init rcu_init(void)
-- 
cgit v1.2.3


From 5db356736acb9ba717df1aa9444e4e44cbb30a71 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 30 Mar 2010 18:40:36 +0800
Subject: rcu: ignore offline CPUs in last non-dyntick-idle CPU check

Offline CPUs are not in nohz_cpu_mask, but can be ignored when checking
for the last non-dyntick-idle CPU.  This patch therefore only checks
online CPUs for not being dyntick idle, allowing fast entry into
full-system dyntick-idle state even when there are some offline CPUs.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 79b53bda894..687c4e90722 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1016,7 +1016,7 @@ int rcu_needs_cpu(int cpu)
 
 	/* Don't bother unless we are the last non-dyntick-idle CPU. */
 	for_each_cpu_not(thatcpu, nohz_cpu_mask)
-		if (thatcpu != cpu) {
+		if (cpu_online(thatcpu) && thatcpu != cpu) {
 			per_cpu(rcu_dyntick_drain, cpu) = 0;
 			per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
 			return rcu_needs_cpu_quick_check(cpu);
-- 
cgit v1.2.3


From d20200b591f59847ab6a5c23507084a7d29e23c5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 30 Mar 2010 10:52:21 -0700
Subject: rcu: Fix bogus CONFIG_PROVE_LOCKING in comments to reflect reality.

It is CONFIG_DEBUG_LOCK_ALLOC rather than CONFIG_PROVE_LOCKING, so fix it.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 15 ++++++++-------
 include/linux/srcu.h     |  4 ++--
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 4dca2752cfd..a150af0e5cd 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -106,8 +106,8 @@ extern int debug_lockdep_rcu_enabled(void);
 /**
  * rcu_read_lock_held - might we be in RCU read-side critical section?
  *
- * If CONFIG_PROVE_LOCKING is selected and enabled, returns nonzero iff in
- * an RCU read-side critical section.  In absence of CONFIG_PROVE_LOCKING,
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
+ * read-side critical section.  In absence of CONFIG_DEBUG_LOCK_ALLOC,
  * this assumes we are in an RCU read-side critical section unless it can
  * prove otherwise.
  *
@@ -129,11 +129,12 @@ extern int rcu_read_lock_bh_held(void);
 /**
  * rcu_read_lock_sched_held - might we be in RCU-sched read-side critical section?
  *
- * If CONFIG_PROVE_LOCKING is selected and enabled, returns nonzero iff in an
- * RCU-sched read-side critical section.  In absence of CONFIG_PROVE_LOCKING,
- * this assumes we are in an RCU-sched read-side critical section unless it
- * can prove otherwise.  Note that disabling of preemption (including
- * disabling irqs) counts as an RCU-sched read-side critical section.
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
+ * RCU-sched read-side critical section.  In absence of
+ * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
+ * critical section unless it can prove otherwise.  Note that disabling
+ * of preemption (including disabling irqs) counts as an RCU-sched
+ * read-side critical section.
  *
  * Check rcu_scheduler_active to prevent false positives during boot.
  */
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 4d5ecb222af..9c01f102242 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -84,8 +84,8 @@ long srcu_batches_completed(struct srcu_struct *sp);
 /**
  * srcu_read_lock_held - might we be in SRCU read-side critical section?
  *
- * If CONFIG_PROVE_LOCKING is selected and enabled, returns nonzero iff in
- * an SRCU read-side critical section.  In absence of CONFIG_PROVE_LOCKING,
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU
+ * read-side critical section.  In absence of CONFIG_DEBUG_LOCK_ALLOC,
  * this assumes we are in an SRCU read-side critical section unless it can
  * prove otherwise.
  */
-- 
cgit v1.2.3


From 32c141a0a1dfa29e0a07d78bec0c0919fc4b9f88 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 30 Mar 2010 10:59:28 -0700
Subject: rcu: fix now-bogus rcu_scheduler_active comments.

The rcu_scheduler_active check has been wrapped into the new
debug_lockdep_rcu_enabled() function, so update the comments to
reflect this new reality.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index a150af0e5cd..02537a72aaa 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -111,7 +111,8 @@ extern int debug_lockdep_rcu_enabled(void);
  * this assumes we are in an RCU read-side critical section unless it can
  * prove otherwise.
  *
- * Check rcu_scheduler_active to prevent false positives during boot.
+ * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * and while lockdep is disabled.
  */
 static inline int rcu_read_lock_held(void)
 {
@@ -136,7 +137,8 @@ extern int rcu_read_lock_bh_held(void);
  * of preemption (including disabling irqs) counts as an RCU-sched
  * read-side critical section.
  *
- * Check rcu_scheduler_active to prevent false positives during boot.
+ * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * and while lockdep is disabled.
  */
 #ifdef CONFIG_PREEMPT
 static inline int rcu_read_lock_sched_held(void)
-- 
cgit v1.2.3


From da848c47bc6e873a54a445ea1960423a495b6b32 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 30 Mar 2010 15:46:01 -0700
Subject: rcu: shrink rcutiny by making synchronize_rcu_bh() be inline

Because synchronize_rcu_bh() is identical to synchronize_sched(),
make the former a static inline invoking the latter, saving the
overhead of an EXPORT_SYMBOL_GPL() and the duplicate code.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h |  2 --
 include/linux/rcutiny.h  | 12 +++++++++++-
 include/linux/rcutree.h  |  2 ++
 kernel/rcutiny.c         |  9 ++-------
 4 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 02537a72aaa..d8fb2abcf30 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -56,8 +56,6 @@ struct rcu_head {
 };
 
 /* Exported common interfaces */
-extern void synchronize_rcu_bh(void);
-extern void synchronize_sched(void);
 extern void rcu_barrier(void);
 extern void rcu_barrier_bh(void);
 extern void rcu_barrier_sched(void);
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index a5195875480..bbeb55b7709 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -74,7 +74,17 @@ static inline void rcu_sched_force_quiescent_state(void)
 {
 }
 
-#define synchronize_rcu synchronize_sched
+extern void synchronize_sched(void);
+
+static inline void synchronize_rcu(void)
+{
+	synchronize_sched();
+}
+
+static inline void synchronize_rcu_bh(void)
+{
+	synchronize_sched();
+}
 
 static inline void synchronize_rcu_expedited(void)
 {
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 42cc3a04779..7484fe66a3a 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -86,6 +86,8 @@ static inline void __rcu_read_unlock_bh(void)
 
 extern void call_rcu_sched(struct rcu_head *head,
 			   void (*func)(struct rcu_head *rcu));
+extern void synchronize_rcu_bh(void);
+extern void synchronize_sched(void);
 extern void synchronize_rcu_expedited(void);
 
 static inline void synchronize_rcu_bh_expedited(void)
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 9f6d9ff2572..272c6d21a75 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -187,7 +187,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
  *
  * Cool, huh?  (Due to Josh Triplett.)
  *
- * But we want to make this a static inline later.
+ * But we want to make this a static inline later.  The cond_resched()
+ * currently makes this problematic.
  */
 void synchronize_sched(void)
 {
@@ -195,12 +196,6 @@ void synchronize_sched(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_sched);
 
-void synchronize_rcu_bh(void)
-{
-	synchronize_sched();
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
-
 /*
  * Helper function for call_rcu() and call_rcu_bh().
  */
-- 
cgit v1.2.3


From 99652b54de1ee094236f7171485214071af4ef31 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 30 Mar 2010 15:50:01 -0700
Subject: rcu: rename rcutiny rcu_ctrlblk to rcu_sched_ctrlblk

Make naming line up in preparation for CONFIG_TINY_PREEMPT_RCU.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutiny.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 272c6d21a75..d9f8a623c9f 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,9 +44,9 @@ struct rcu_ctrlblk {
 };
 
 /* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_ctrlblk = {
-	.donetail	= &rcu_ctrlblk.rcucblist,
-	.curtail	= &rcu_ctrlblk.rcucblist,
+static struct rcu_ctrlblk rcu_sched_ctrlblk = {
+	.donetail	= &rcu_sched_ctrlblk.rcucblist,
+	.curtail	= &rcu_sched_ctrlblk.rcucblist,
 };
 
 static struct rcu_ctrlblk rcu_bh_ctrlblk = {
@@ -108,7 +108,8 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
  */
 void rcu_sched_qs(int cpu)
 {
-	if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk))
+	if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
+	    rcu_qsctr_help(&rcu_bh_ctrlblk))
 		raise_softirq(RCU_SOFTIRQ);
 }
 
@@ -173,7 +174,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
  */
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
-	__rcu_process_callbacks(&rcu_ctrlblk);
+	__rcu_process_callbacks(&rcu_sched_ctrlblk);
 	__rcu_process_callbacks(&rcu_bh_ctrlblk);
 }
 
@@ -221,7 +222,7 @@ static void __call_rcu(struct rcu_head *head,
  */
 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 {
-	__call_rcu(head, func, &rcu_ctrlblk);
+	__call_rcu(head, func, &rcu_sched_ctrlblk);
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
-- 
cgit v1.2.3


From 25502a6c13745f4650cc59322bd198194f55e796 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 1 Apr 2010 17:37:01 -0700
Subject: rcu: refactor RCU's context-switch handling

The addition of preemptible RCU to treercu resulted in a bit of
confusion and inefficiency surrounding the handling of context switches
for RCU-sched and for RCU-preempt.  For RCU-sched, a context switch
is a quiescent state, pure and simple, just like it always has been.
For RCU-preempt, a context switch is in no way a quiescent state, but
special handling is required when a task blocks in an RCU read-side
critical section.

However, the callout from the scheduler and the outer loop in ksoftirqd
still calls something named rcu_sched_qs(), whose name is no longer
accurate.  Furthermore, when rcu_check_callbacks() notes an RCU-sched
quiescent state, it ends up unnecessarily (though harmlessly, aside
from the performance hit) enqueuing the current task if it happens to
be running in an RCU-preempt read-side critical section.  This not only
increases the maximum latency of scheduler_tick(), it also needlessly
increases the overhead of the next outermost rcu_read_unlock() invocation.

This patch addresses this situation by separating the notion of RCU's
context-switch handling from that of RCU-sched's quiescent states.
The context-switch handling is covered by rcu_note_context_switch() in
general and by rcu_preempt_note_context_switch() for preemptible RCU.
This permits rcu_sched_qs() to handle quiescent states and only quiescent
states.  It also reduces the maximum latency of scheduler_tick(), though
probably by much less than a microsecond.  Finally, it means that tasks
within preemptible-RCU read-side critical sections avoid incurring the
overhead of queuing unless there really is a context switch.

Suggested-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 include/linux/rcutiny.h |  4 ++++
 include/linux/rcutree.h |  1 +
 kernel/rcutree.c        | 17 ++++++++++++-----
 kernel/rcutree_plugin.h | 11 +++++++----
 kernel/sched.c          |  2 +-
 kernel/softirq.c        |  2 +-
 6 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index bbeb55b7709..ff22b97fb97 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -29,6 +29,10 @@
 
 void rcu_sched_qs(int cpu);
 void rcu_bh_qs(int cpu);
+static inline void rcu_note_context_switch(int cpu)
+{
+	rcu_sched_qs(cpu);
+}
 
 #define __rcu_read_lock()	preempt_disable()
 #define __rcu_read_unlock()	preempt_enable()
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 7484fe66a3a..b9f74606f32 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -34,6 +34,7 @@ struct notifier_block;
 
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
+extern void rcu_note_context_switch(int cpu);
 extern int rcu_needs_cpu(int cpu);
 extern int rcu_expedited_torture_stats(char *page);
 
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 86bb9499aae..e33631354b6 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -97,25 +97,32 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
  */
 void rcu_sched_qs(int cpu)
 {
-	struct rcu_data *rdp;
+	struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
 
-	rdp = &per_cpu(rcu_sched_data, cpu);
 	rdp->passed_quiesc_completed = rdp->gpnum - 1;
 	barrier();
 	rdp->passed_quiesc = 1;
-	rcu_preempt_note_context_switch(cpu);
 }
 
 void rcu_bh_qs(int cpu)
 {
-	struct rcu_data *rdp;
+	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
 
-	rdp = &per_cpu(rcu_bh_data, cpu);
 	rdp->passed_quiesc_completed = rdp->gpnum - 1;
 	barrier();
 	rdp->passed_quiesc = 1;
 }
 
+/*
+ * Note a context switch.  This is a quiescent state for RCU-sched,
+ * and requires special handling for preemptible RCU.
+ */
+void rcu_note_context_switch(int cpu)
+{
+	rcu_sched_qs(cpu);
+	rcu_preempt_note_context_switch(cpu);
+}
+
 #ifdef CONFIG_NO_HZ
 DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
 	.dynticks_nesting = 1,
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 687c4e90722..f9bc83a047d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -75,13 +75,19 @@ EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
  * that this just means that the task currently running on the CPU is
  * not in a quiescent state.  There might be any number of tasks blocked
  * while in an RCU read-side critical section.
+ *
+ * Unlike the other rcu_*_qs() functions, callers to this function
+ * must disable irqs in order to protect the assignment to
+ * ->rcu_read_unlock_special.
  */
 static void rcu_preempt_qs(int cpu)
 {
 	struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
+
 	rdp->passed_quiesc_completed = rdp->gpnum - 1;
 	barrier();
 	rdp->passed_quiesc = 1;
+	current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 }
 
 /*
@@ -144,9 +150,8 @@ static void rcu_preempt_note_context_switch(int cpu)
 	 * grace period, then the fact that the task has been enqueued
 	 * means that we continue to block the current grace period.
 	 */
-	rcu_preempt_qs(cpu);
 	local_irq_save(flags);
-	t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
+	rcu_preempt_qs(cpu);
 	local_irq_restore(flags);
 }
 
@@ -236,7 +241,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
 	 */
 	special = t->rcu_read_unlock_special;
 	if (special & RCU_READ_UNLOCK_NEED_QS) {
-		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 		rcu_preempt_qs(smp_processor_id());
 	}
 
@@ -473,7 +477,6 @@ static void rcu_preempt_check_callbacks(int cpu)
 	struct task_struct *t = current;
 
 	if (t->rcu_read_lock_nesting == 0) {
-		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
 		rcu_preempt_qs(cpu);
 		return;
 	}
diff --git a/kernel/sched.c b/kernel/sched.c
index 3c2a54f70ff..d8a213ccdc3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3706,7 +3706,7 @@ need_resched:
 	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
-	rcu_sched_qs(cpu);
+	rcu_note_context_switch(cpu);
 	prev = rq->curr;
 	switch_count = &prev->nivcsw;
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7c1a67ef027..0db913a5c60 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -716,7 +716,7 @@ static int run_ksoftirqd(void * __bind_cpu)
 			preempt_enable_no_resched();
 			cond_resched();
 			preempt_disable();
-			rcu_sched_qs((long)__bind_cpu);
+			rcu_note_context_switch((long)__bind_cpu);
 		}
 		preempt_enable();
 		set_current_state(TASK_INTERRUPTIBLE);
-- 
cgit v1.2.3


From bbad937983147c017c25406860287cb94da9af7c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 2 Apr 2010 16:17:17 -0700
Subject: rcu: slim down rcutiny by removing rcu_scheduler_active and friends

TINY_RCU does not need rcu_scheduler_active unless CONFIG_DEBUG_LOCK_ALLOC.
So conditionally compile rcu_scheduler_active in order to slim down
rcutiny a bit more.  Also gets rid of an EXPORT_SYMBOL_GPL, which is
responsible for most of the slimming.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h |  4 +---
 include/linux/rcutiny.h  | 13 +++++++++++++
 include/linux/rcutree.h  |  3 +++
 kernel/rcupdate.c        | 19 -------------------
 kernel/rcutiny.c         |  7 +++++++
 kernel/rcutiny_plugin.h  | 39 +++++++++++++++++++++++++++++++++++++++
 kernel/rcutree.c         | 19 +++++++++++++++++++
 7 files changed, 82 insertions(+), 22 deletions(-)
 create mode 100644 kernel/rcutiny_plugin.h

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index d8fb2abcf30..23be3a70251 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -64,8 +64,6 @@ extern int sched_expedited_torture_stats(char *page);
 
 /* Internal to kernel */
 extern void rcu_init(void);
-extern int rcu_scheduler_active;
-extern void rcu_scheduler_starting(void);
 
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
 #include <linux/rcutree.h>
@@ -178,7 +176,7 @@ static inline int rcu_read_lock_bh_held(void)
 #ifdef CONFIG_PREEMPT
 static inline int rcu_read_lock_sched_held(void)
 {
-	return !rcu_scheduler_active || preempt_count() != 0 || irqs_disabled();
+	return preempt_count() != 0 || irqs_disabled();
 }
 #else /* #ifdef CONFIG_PREEMPT */
 static inline int rcu_read_lock_sched_held(void)
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index ff22b97fb97..14e5a76b2c0 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -128,4 +128,17 @@ static inline int rcu_preempt_depth(void)
 	return 0;
 }
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+extern int rcu_scheduler_active __read_mostly;
+extern void rcu_scheduler_starting(void);
+
+#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
+static inline void rcu_scheduler_starting(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
 #endif /* __LINUX_RCUTINY_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index b9f74606f32..48282055e83 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -123,4 +123,7 @@ static inline int rcu_blocking_is_gp(void)
 	return num_online_cpus() == 1;
 }
 
+extern void rcu_scheduler_starting(void);
+extern int rcu_scheduler_active __read_mostly;
+
 #endif /* __LINUX_RCUTREE_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 49d808e833b..72a8dc9567f 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,6 @@
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/module.h>
-#include <linux/kernel_stat.h>
 #include <linux/hardirq.h>
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -64,9 +63,6 @@ struct lockdep_map rcu_sched_lock_map =
 EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
 #endif
 
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
 int debug_lockdep_rcu_enabled(void)
@@ -96,21 +92,6 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
 
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
-/*
- * This function is invoked towards the end of the scheduler's initialization
- * process.  Before this is called, the idle task might contain
- * RCU read-side critical sections (during which time, this idle
- * task is booting the system).  After this function is called, the
- * idle tasks are prohibited from containing RCU read-side critical
- * sections.
- */
-void rcu_scheduler_starting(void)
-{
-	WARN_ON(num_online_cpus() != 1);
-	WARN_ON(nr_context_switches() > 0);
-	rcu_scheduler_active = 1;
-}
-
 /*
  * Awaken the corresponding synchronize_rcu() instance now that a
  * grace period has elapsed.
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index d9f8a623c9f..b1804ff83d5 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -54,6 +54,11 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.curtail	= &rcu_bh_ctrlblk.rcucblist,
 };
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+
 #ifdef CONFIG_NO_HZ
 
 static long rcu_dynticks_nesting = 1;
@@ -276,3 +281,5 @@ void __init rcu_init(void)
 {
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
+
+#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
new file mode 100644
index 00000000000..d223a92bc74
--- /dev/null
+++ b/kernel/rcutiny_plugin.h
@@ -0,0 +1,39 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions that provide either classic
+ * or preemptable semantics.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2009
+ *
+ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+#include <linux/kernel_stat.h>
+
+/*
+ * During boot, we forgive RCU lockdep issues.  After this function is
+ * invoked, we start taking RCU lockdep issues seriously.
+ */
+void rcu_scheduler_starting(void)
+{
+	WARN_ON(nr_context_switches() > 0);
+	rcu_scheduler_active = 1;
+}
+
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e33631354b6..3623f8e1022 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,6 +46,7 @@
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/time.h>
+#include <linux/kernel_stat.h>
 
 #include "rcutree.h"
 
@@ -80,6 +81,9 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
 struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
+int rcu_scheduler_active __read_mostly;
+EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+
 /*
  * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
  * permit this function to be invoked without holding the root rcu_node
@@ -1783,6 +1787,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
+/*
+ * This function is invoked towards the end of the scheduler's initialization
+ * process.  Before this is called, the idle task might contain
+ * RCU read-side critical sections (during which time, this idle
+ * task is booting the system).  After this function is called, the
+ * idle tasks are prohibited from containing RCU read-side critical
+ * sections.  This function also enables RCU lockdep checking.
+ */
+void rcu_scheduler_starting(void)
+{
+	WARN_ON(num_online_cpus() != 1);
+	WARN_ON(nr_context_switches() > 0);
+	rcu_scheduler_active = 1;
+}
+
 /*
  * Compute the per-level fanout, either using the exact fanout specified
  * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
-- 
cgit v1.2.3


From 55ec936ff4e57cc626db336a7bf33b267390e9b4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 13 Apr 2010 12:22:33 -0700
Subject: rcu: enable CPU_STALL_VERBOSE by default

The CPU_STALL_VERBOSE kernel configuration parameter was added to
2.6.34 to identify any preempted/blocked tasks that were preventing
the current grace period from completing when running preemptible
RCU.  As is conventional for new configurations parameters, this
defaulted disabled.  It is now time to enable it by default.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 lib/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 94090b4bb7d..930a9e5eae0 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -805,7 +805,7 @@ config RCU_CPU_STALL_DETECTOR
 config RCU_CPU_STALL_VERBOSE
 	bool "Print additional per-task information for RCU_CPU_STALL_DETECTOR"
 	depends on RCU_CPU_STALL_DETECTOR && TREE_PREEMPT_RCU
-	default n
+	default y
 	help
 	  This option causes RCU to printk detailed per-task information
 	  for any tasks that are stalling the current RCU grace period.
-- 
cgit v1.2.3


From c68de2097a8799549a3c3bf27cbfeea24a604284 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 15 Apr 2010 10:12:40 -0700
Subject: rcu: disable CPU stall warnings upon panic

The current RCU CPU stall warnings remain enabled even after a panic
occurs, which some people have found to be a bit counterproductive.
This patch therefore uses a notifier to disable stall warnings once a
panic occurs.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3623f8e1022..595fb83e9b7 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -449,6 +449,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 
+int rcu_cpu_stall_panicking __read_mostly;
+
 static void record_gp_stall_check_time(struct rcu_state *rsp)
 {
 	rsp->gp_start = jiffies;
@@ -526,6 +528,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 	long delta;
 	struct rcu_node *rnp;
 
+	if (rcu_cpu_stall_panicking)
+		return;
 	delta = jiffies - rsp->jiffies_stall;
 	rnp = rdp->mynode;
 	if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
@@ -540,6 +544,21 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 }
 
+static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
+{
+	rcu_cpu_stall_panicking = 1;
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block rcu_panic_block = {
+	.notifier_call = rcu_panic,
+};
+
+static void __init check_cpu_stall_init(void)
+{
+	atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
+}
+
 #else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 
 static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -550,6 +569,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 }
 
+static void __init check_cpu_stall_init(void)
+{
+}
+
 #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 
 /*
@@ -1934,6 +1957,7 @@ void __init rcu_init(void)
 	cpu_notifier(rcu_cpu_notify, 0);
 	for_each_online_cpu(cpu)
 		rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+	check_cpu_stall_init();
 }
 
 #include "rcutree_plugin.h"
-- 
cgit v1.2.3


From 26845c2860cebebe6ce2d9d01ae3cb3db84b7e29 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 13 Apr 2010 14:19:23 -0700
Subject: rcu: print boot-time console messages if RCU configs out of ordinary

Print boot-time messages if tracing is enabled, if fanout is set
to non-default values, if exact fanout is specified, if accelerated
dyntick-idle grace periods have been enabled, if RCU-lockdep is enabled,
if rcutorture has been boot-time enabled, if the CPU stall detector has
been disabled, or if four-level hierarchy has been enabled.

This is all for TREE_RCU and TREE_PREEMPT_RCU.  TINY_RCU will be handled
separately, if at all.

Suggested-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c        |  6 ------
 kernel/rcutree_plugin.h | 44 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 595fb83e9b7..ec6196fcd1f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1938,12 +1938,6 @@ void __init rcu_init(void)
 	int cpu;
 
 	rcu_bootup_announce();
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-#if NUM_RCU_LVL_4 != 0
-	printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
-#endif /* #if NUM_RCU_LVL_4 != 0 */
 	RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
 	RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
 	__rcu_init_preempt();
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index f9bc83a047d..0ae2339ab04 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -26,6 +26,45 @@
 
 #include <linux/delay.h>
 
+/*
+ * Check the RCU kernel configuration parameters and print informative
+ * messages about anything out of the ordinary.  If you like #ifdef, you
+ * will love this function.
+ */
+static void __init rcu_bootup_announce_oddness(void)
+{
+#ifdef CONFIG_RCU_TRACE
+	printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
+#endif
+#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
+	printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
+	       CONFIG_RCU_FANOUT);
+#endif
+#ifdef CONFIG_RCU_FANOUT_EXACT
+	printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
+#endif
+#ifdef CONFIG_RCU_FAST_NO_HZ
+	printk(KERN_INFO
+	       "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
+#endif
+#ifdef CONFIG_PROVE_RCU
+	printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
+#endif
+#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
+	printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
+#endif
+#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
+	printk(KERN_INFO
+	       "\tRCU-based detection of stalled CPUs is disabled.\n");
+#endif
+#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
+	printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
+#endif
+#if NUM_RCU_LVL_4 != 0
+	printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
+#endif
+}
+
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
 struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
@@ -38,8 +77,8 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
  */
 static void __init rcu_bootup_announce(void)
 {
-	printk(KERN_INFO
-	       "Experimental preemptable hierarchical RCU implementation.\n");
+	printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n");
+	rcu_bootup_announce_oddness();
 }
 
 /*
@@ -757,6 +796,7 @@ void exit_rcu(void)
 static void __init rcu_bootup_announce(void)
 {
 	printk(KERN_INFO "Hierarchical RCU implementation.\n");
+	rcu_bootup_announce_oddness();
 }
 
 /*
-- 
cgit v1.2.3


From 4300aa642cc9ecb35f2e0683dd294fb790ef028c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 13 Apr 2010 16:18:22 -0700
Subject: rcu: improve RCU CPU stall-warning messages

The existing RCU CPU stall-warning messages can be confusing, especially
in the case where one CPU detects a single other stalled CPU.  In addition,
the console messages did not say which flavor of RCU detected the stall,
which can make it difficult to work out exactly what is causing the stall.
This commit improves these messages.

Requested-by: Dhaval Giani <dhaval.giani@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 20 +++++++++++---------
 kernel/rcutree.h |  1 +
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ec6196fcd1f..f391886be8f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -54,8 +54,8 @@
 
 static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
 
-#define RCU_STATE_INITIALIZER(name) { \
-	.level = { &name.node[0] }, \
+#define RCU_STATE_INITIALIZER(structname) { \
+	.level = { &structname.node[0] }, \
 	.levelcnt = { \
 		NUM_RCU_LVL_0,  /* root of hierarchy. */ \
 		NUM_RCU_LVL_1, \
@@ -66,13 +66,14 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
 	.signaled = RCU_GP_IDLE, \
 	.gpnum = -300, \
 	.completed = -300, \
-	.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \
+	.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
 	.orphan_cbs_list = NULL, \
-	.orphan_cbs_tail = &name.orphan_cbs_list, \
+	.orphan_cbs_tail = &structname.orphan_cbs_list, \
 	.orphan_qlen = 0, \
-	.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \
+	.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
 	.n_force_qs = 0, \
 	.n_force_qs_ngp = 0, \
+	.name = #structname, \
 }
 
 struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
@@ -483,7 +484,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 
 	/* OK, time to rat on our buddy... */
 
-	printk(KERN_ERR "INFO: RCU detected CPU stalls:");
+	printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
+	       rsp->name);
 	rcu_for_each_leaf_node(rsp, rnp) {
 		raw_spin_lock_irqsave(&rnp->lock, flags);
 		rcu_print_task_stall(rnp);
@@ -494,7 +496,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 			if (rnp->qsmask & (1UL << cpu))
 				printk(" %d", rnp->grplo + cpu);
 	}
-	printk(" (detected by %d, t=%ld jiffies)\n",
+	printk("} (detected by %d, t=%ld jiffies)\n",
 	       smp_processor_id(), (long)(jiffies - rsp->gp_start));
 	trigger_all_cpu_backtrace();
 
@@ -510,8 +512,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
 	unsigned long flags;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
-			smp_processor_id(), jiffies - rsp->gp_start);
+	printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
+	       rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
 	trigger_all_cpu_backtrace();
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4a525a30e08..11f171121ad 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -326,6 +326,7 @@ struct rcu_state {
 	unsigned long jiffies_stall;		/* Time at which to check */
 						/*  for CPU stalls. */
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+	char *name;				/* Name of structure. */
 };
 
 /* Return values for rcu_preempt_offline_tasks(). */
-- 
cgit v1.2.3


From 4a90a0681cf6cd21cd444184302aa045156486b3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 14 Apr 2010 16:48:11 -0700
Subject: rcu: permit discontiguous cpu_possible_mask CPU numbering

TREE_RCU assumes that CPU numbering is contiguous, but some users need
large holes in the numbering to better map to hardware layout.  This patch
makes TREE_RCU (and TREE_PREEMPT_RCU) tolerate large holes in the CPU
numbering.  However, NR_CPUS must still be greater than the largest
CPU number.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f391886be8f..c60fd74e7ec 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1913,7 +1913,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 
 	rnp = rsp->level[NUM_RCU_LVLS - 1];
 	for_each_possible_cpu(i) {
-		if (i > rnp->grphi)
+		while (i > rnp->grphi)
 			rnp++;
 		rsp->rda[i]->mynode = rnp;
 		rcu_boot_init_percpu_data(i, rsp);
-- 
cgit v1.2.3


From d21670acab9fcb4bc74a40b68a6941059234c55c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 14 Apr 2010 17:39:26 -0700
Subject: rcu: reduce the number of spurious RCU_SOFTIRQ invocations

Lai Jiangshan noted that up to 10% of the RCU_SOFTIRQ are spurious, and
traced this down to the fact that the current grace-period machinery
will uselessly raise RCU_SOFTIRQ when a given CPU needs to go through
a quiescent state, but has not yet done so.  In this situation, there
might well be nothing that RCU_SOFTIRQ can do, and the overhead can be
worth worrying about in the ksoftirqd case.  This patch therefore avoids
raising RCU_SOFTIRQ in this situation.

Changes since v1 (http://lkml.org/lkml/2010/3/30/122 from Lai Jiangshan):

o	Omit the rcu_qs_pending() prechecks, as they aren't that
	much less expensive than the quiescent-state checks.

o	Merge with the set_need_resched() patch that reduces IPIs.

o	Add the new n_rp_report_qs field to the rcu_pending tracing output.

o	Update the tracing documentation accordingly.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/RCU/trace.txt | 35 +++++++++++++++++++----------------
 kernel/rcutree.c            | 11 ++++++-----
 kernel/rcutree.h            |  1 +
 kernel/rcutree_trace.c      |  4 +++-
 4 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 8608fd85e92..efd8cc95c06 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -256,23 +256,23 @@ o	Each element of the form "1/1 0:127 ^0" represents one struct
 The output of "cat rcu/rcu_pending" looks as follows:
 
 rcu_sched:
-  0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
-  1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
-  2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
-  3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723
-  4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110
-  5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456
-  6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834
-  7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888
+  0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
+  1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
+  2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
+  3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723
+  4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110
+  5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456
+  6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834
+  7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888
 rcu_bh:
-  0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314
-  1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180
-  2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936
-  3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863
-  4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671
-  5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235
-  6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
-  7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
+  0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314
+  1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180
+  2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936
+  3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863
+  4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671
+  5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235
+  6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
+  7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
 
 As always, this is once again split into "rcu_sched" and "rcu_bh"
 portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional
@@ -284,6 +284,9 @@ o	"np" is the number of times that __rcu_pending() has been invoked
 o	"qsp" is the number of times that the RCU was waiting for a
 	quiescent state from this CPU.
 
+o	"rpq" is the number of times that the CPU had passed through
+	a quiescent state, but not yet reported it to RCU.
+
 o	"cbr" is the number of times that this CPU had RCU callbacks
 	that had passed through a grace period, and were thus ready
 	to be invoked.
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index c60fd74e7ec..ba6996943e2 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1161,8 +1161,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
  */
 void rcu_check_callbacks(int cpu, int user)
 {
-	if (!rcu_pending(cpu))
-		return; /* if nothing for RCU to do. */
 	if (user ||
 	    (idle_cpu(cpu) && rcu_scheduler_active &&
 	     !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1194,7 +1192,8 @@ void rcu_check_callbacks(int cpu, int user)
 		rcu_bh_qs(cpu);
 	}
 	rcu_preempt_check_callbacks(cpu);
-	raise_softirq(RCU_SOFTIRQ);
+	if (rcu_pending(cpu))
+		raise_softirq(RCU_SOFTIRQ);
 }
 
 #ifdef CONFIG_SMP
@@ -1534,18 +1533,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 	check_cpu_stall(rsp, rdp);
 
 	/* Is the RCU core waiting for a quiescent state from this CPU? */
-	if (rdp->qs_pending) {
+	if (rdp->qs_pending && !rdp->passed_quiesc) {
 
 		/*
 		 * If force_quiescent_state() coming soon and this CPU
 		 * needs a quiescent state, and this is either RCU-sched
 		 * or RCU-bh, force a local reschedule.
 		 */
+		rdp->n_rp_qs_pending++;
 		if (!rdp->preemptable &&
 		    ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
 				 jiffies))
 			set_need_resched();
-		rdp->n_rp_qs_pending++;
+	} else if (rdp->qs_pending && rdp->passed_quiesc) {
+		rdp->n_rp_report_qs++;
 		return 1;
 	}
 
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 11f171121ad..14c040b18ed 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -223,6 +223,7 @@ struct rcu_data {
 	/* 5) __rcu_pending() statistics. */
 	unsigned long n_rcu_pending;	/* rcu_pending() calls since boot. */
 	unsigned long n_rp_qs_pending;
+	unsigned long n_rp_report_qs;
 	unsigned long n_rp_cb_ready;
 	unsigned long n_rp_cpu_needs_gp;
 	unsigned long n_rp_gp_completed;
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d45db2e35d2..36c95b45738 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = {
 static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
 {
 	seq_printf(m, "%3d%cnp=%ld "
-		   "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n",
+		   "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
+		   "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
 		   rdp->cpu,
 		   cpu_is_offline(rdp->cpu) ? '!' : ' ',
 		   rdp->n_rcu_pending,
 		   rdp->n_rp_qs_pending,
+		   rdp->n_rp_report_qs,
 		   rdp->n_rp_cb_ready,
 		   rdp->n_rp_cpu_needs_gp,
 		   rdp->n_rp_gp_completed,
-- 
cgit v1.2.3


From f1d507beeab1d1d60a1c58eac7dc81522c6f4629 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 15 Apr 2010 15:49:46 -0700
Subject: rcu: improve the RCU CPU-stall warning documentation

The existing Documentation/RCU/stallwarn.txt has proven unhelpful, so
rework it a bit.  In particular, show how to interpret the stall-warning
messages.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/RCU/stallwarn.txt | 94 +++++++++++++++++++++++++++++++----------
 1 file changed, 71 insertions(+), 23 deletions(-)

diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 1423d2570d7..44c6dcc93d6 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -3,35 +3,79 @@ Using RCU's CPU Stall Detector
 The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables
 RCU's CPU stall detector, which detects conditions that unduly delay
 RCU grace periods.  The stall detector's idea of what constitutes
-"unduly delayed" is controlled by a pair of C preprocessor macros:
+"unduly delayed" is controlled by a set of C preprocessor macros:
 
 RCU_SECONDS_TILL_STALL_CHECK
 
 	This macro defines the period of time that RCU will wait from
 	the beginning of a grace period until it issues an RCU CPU
-	stall warning.	It is normally ten seconds.
+	stall warning.	This time period is normally ten seconds.
 
 RCU_SECONDS_TILL_STALL_RECHECK
 
 	This macro defines the period of time that RCU will wait after
-	issuing a stall warning until it issues another stall warning.
-	It is normally set to thirty seconds.
+	issuing a stall warning until it issues another stall warning
+	for the same stall.  This time period is normally set to thirty
+	seconds.
 
 RCU_STALL_RAT_DELAY
 
-	The CPU stall detector tries to make the offending CPU rat on itself,
-	as this often gives better-quality stack traces.  However, if
-	the offending CPU does not detect its own stall in the number
-	of jiffies specified by RCU_STALL_RAT_DELAY, then other CPUs will
-	complain.  This is normally set to two jiffies.
+	The CPU stall detector tries to make the offending CPU print its
+	own warnings, as this often gives better-quality stack traces.
+	However, if the offending CPU does not detect its own stall in
+	the number of jiffies specified by RCU_STALL_RAT_DELAY, then
+	some other CPU will complain.  This delay is normally set to
+	two jiffies.
 
-The following problems can result in an RCU CPU stall warning:
+When a CPU detects that it is stalling, it will print a message similar
+to the following:
+
+INFO: rcu_sched_state detected stall on CPU 5 (t=2500 jiffies)
+
+This message indicates that CPU 5 detected that it was causing a stall,
+and that the stall was affecting RCU-sched.  This message will normally be
+followed by a stack dump of the offending CPU.  On TREE_RCU kernel builds,
+RCU and RCU-sched are implemented by the same underlying mechanism,
+while on TREE_PREEMPT_RCU kernel builds, RCU is instead implemented
+by rcu_preempt_state.
+
+On the other hand, if the offending CPU fails to print out a stall-warning
+message quickly enough, some other CPU will print a message similar to
+the following:
+
+INFO: rcu_bh_state detected stalls on CPUs/tasks: { 3 5 } (detected by 2, 2502 jiffies)
+
+This message indicates that CPU 2 detected that CPUs 3 and 5 were both
+causing stalls, and that the stall was affecting RCU-bh.  This message
+will normally be followed by stack dumps for each CPU.  Please note that
+TREE_PREEMPT_RCU builds can be stalled by tasks as well as by CPUs,
+and that the tasks will be indicated by PID, for example, "P3421".
+It is even possible for a rcu_preempt_state stall to be caused by both
+CPUs -and- tasks, in which case the offending CPUs and tasks will all
+be called out in the list.
+
+Finally, if the grace period ends just as the stall warning starts
+printing, there will be a spurious stall-warning message:
+
+INFO: rcu_bh_state detected stalls on CPUs/tasks: { } (detected by 4, 2502 jiffies)
+
+This is rare, but does happen from time to time in real life.
+
+So your kernel printed an RCU CPU stall warning.  The next question is
+"What caused it?"  The following problems can result in RCU CPU stall
+warnings:
 
 o	A CPU looping in an RCU read-side critical section.
 	
-o	A CPU looping with interrupts disabled.
+o	A CPU looping with interrupts disabled.  This condition can
+	result in RCU-sched and RCU-bh stalls.
 
-o	A CPU looping with preemption disabled.
+o	A CPU looping with preemption disabled.  This condition can
+	result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
+	stalls.
+
+o	A CPU looping with bottom halves disabled.  This condition can
+	result in RCU-sched and RCU-bh stalls.
 
 o	For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
 	without invoking schedule().
@@ -39,20 +83,24 @@ o	For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
 o	A bug in the RCU implementation.
 
 o	A hardware failure.  This is quite unlikely, but has occurred
-	at least once in a former life.  A CPU failed in a running system,
+	at least once in real life.  A CPU failed in a running system,
 	becoming unresponsive, but not causing an immediate crash.
 	This resulted in a series of RCU CPU stall warnings, eventually
 	leading the realization that the CPU had failed.
 
-The RCU, RCU-sched, and RCU-bh implementations have CPU stall warning.
-SRCU does not do so directly, but its calls to synchronize_sched() will
-result in RCU-sched detecting any CPU stalls that might be occurring.
-
-To diagnose the cause of the stall, inspect the stack traces.  The offending
-function will usually be near the top of the stack.  If you have a series
-of stall warnings from a single extended stall, comparing the stack traces
-can often help determine where the stall is occurring, which will usually
-be in the function nearest the top of the stack that stays the same from
-trace to trace.
+The RCU, RCU-sched, and RCU-bh implementations have CPU stall
+warning.  SRCU does not have its own CPU stall warnings, but its
+calls to synchronize_sched() will result in RCU-sched detecting
+RCU-sched-related CPU stalls.  Please note that RCU only detects
+CPU stalls when there is a grace period in progress.  No grace period,
+no CPU stall warnings.
+
+To diagnose the cause of the stall, inspect the stack traces.
+The offending function will usually be near the top of the stack.
+If you have a series of stall warnings from a single extended stall,
+comparing the stack traces can often help determine where the stall
+is occurring, which will usually be in the function nearest the top of
+that portion of the stack which remains the same from trace to trace.
+If you can reliably trigger the stall, ftrace can be quite helpful.
 
 RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE.
-- 
cgit v1.2.3


From d14aada8e20bdf81ffd43f433b123972cf575b32 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 19 Apr 2010 22:24:22 -0700
Subject: rcu: make SRCU usable in modules

Add a #include for mutex.h to allow SRCU to be more easily used in
kernel modules.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/srcu.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 9c01f102242..4d5d2f546db 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -27,6 +27,8 @@
 #ifndef _LINUX_SRCU_H
 #define _LINUX_SRCU_H
 
+#include <linux/mutex.h>
+
 struct srcu_struct_array {
 	int c[2];
 };
-- 
cgit v1.2.3


From 77e38ed347162423c6b72e06c865a121081c2bb6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 25 Apr 2010 21:04:29 -0700
Subject: rcu: RCU_FAST_NO_HZ must check RCU dyntick state

The current version of RCU_FAST_NO_HZ reproduces the old CLASSIC_RCU
dyntick-idle bug, as it fails to detect CPUs that have interrupted
or NMIed out of dyntick-idle mode.  Fix this by making rcu_needs_cpu()
check the state in the per-CPU rcu_dynticks variables, thus correctly
detecting the dyntick-idle state from an RCU perspective.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree_plugin.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0ae2339ab04..9b18227e86e 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1051,6 +1051,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
 int rcu_needs_cpu(int cpu)
 {
 	int c = 0;
+	int snap;
+	int snap_nmi;
 	int thatcpu;
 
 	/* Check for being in the holdoff period. */
@@ -1058,12 +1060,18 @@ int rcu_needs_cpu(int cpu)
 		return rcu_needs_cpu_quick_check(cpu);
 
 	/* Don't bother unless we are the last non-dyntick-idle CPU. */
-	for_each_cpu_not(thatcpu, nohz_cpu_mask)
-		if (cpu_online(thatcpu) && thatcpu != cpu) {
+	for_each_online_cpu(thatcpu) {
+		if (thatcpu == cpu)
+			continue;
+		snap = per_cpu(rcu_dynticks, thatcpu)->dynticks;
+		snap_nmi = per_cpu(rcu_dynticks, thatcpu)->dynticks_nmi;
+		smp_mb(); /* Order sampling of snap with end of grace period. */
+		if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
 			per_cpu(rcu_dyntick_drain, cpu) = 0;
 			per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
 			return rcu_needs_cpu_quick_check(cpu);
 		}
+	}
 
 	/* Check and update the rcu_dyntick_drain sequencing. */
 	if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
-- 
cgit v1.2.3


From d822ed1094032ab524344a9a474c93128d9c2159 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 8 May 2010 19:58:22 -0700
Subject: rcu: fix build bug in RCU_FAST_NO_HZ builds

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree_plugin.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 9b18227e86e..ac7d80fa895 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1063,8 +1063,8 @@ int rcu_needs_cpu(int cpu)
 	for_each_online_cpu(thatcpu) {
 		if (thatcpu == cpu)
 			continue;
-		snap = per_cpu(rcu_dynticks, thatcpu)->dynticks;
-		snap_nmi = per_cpu(rcu_dynticks, thatcpu)->dynticks_nmi;
+		snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
+		snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
 		smp_mb(); /* Order sampling of snap with end of grace period. */
 		if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
 			per_cpu(rcu_dyntick_drain, cpu) = 0;
-- 
cgit v1.2.3


From c3f8978ea332cd4be88e12574452a025892ac9af Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 10 May 2010 13:37:16 -0700
Subject: x86, fpu: Unbreak FPU emulation

Unbreak FPU emulation, broken by checkin
86603283326c9e95e5ad4e9fdddeec93cac5d9ad:
x86: Introduce 'struct fpu' and related API

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-3-git-send-email-avi@redhat.com>
---
 arch/x86/math-emu/fpu_aux.c    | 2 +-
 arch/x86/math-emu/fpu_entry.c  | 4 ++--
 arch/x86/math-emu/fpu_system.h | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 62797f93051..dc8adad10a2 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -52,7 +52,7 @@ void finit_soft_fpu(struct i387_soft_struct *soft)
 
 void finit(void)
 {
-	finit_task(&current->thread.fpu);
+	finit_soft_fpu(&current->thread.fpu.state->soft);
 }
 
 /*
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 5d87f586f8d..7718541541d 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -681,7 +681,7 @@ int fpregs_soft_set(struct task_struct *target,
 		    unsigned int pos, unsigned int count,
 		    const void *kbuf, const void __user *ubuf)
 {
-	struct i387_soft_struct *s387 = &target->thread.xstate->soft;
+	struct i387_soft_struct *s387 = &target->thread.fpu.state->soft;
 	void *space = s387->st_space;
 	int ret;
 	int offset, other, i, tags, regnr, tag, newtop;
@@ -733,7 +733,7 @@ int fpregs_soft_get(struct task_struct *target,
 		    unsigned int pos, unsigned int count,
 		    void *kbuf, void __user *ubuf)
 {
-	struct i387_soft_struct *s387 = &target->thread.xstate->soft;
+	struct i387_soft_struct *s387 = &target->thread.fpu.state->soft;
 	const void *space = s387->st_space;
 	int ret;
 	int offset = (S387->ftop & 7) * 10, other = 80 - offset;
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 50fa0ec2c8a..2c614410a5f 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -31,7 +31,7 @@
 #define SEG_EXPAND_DOWN(s)	(((s).b & ((1 << 11) | (1 << 10))) \
 				 == (1 << 10))
 
-#define I387			(current->thread.xstate)
+#define I387			(current->thread.fpu.state)
 #define FPU_info		(I387->soft.info)
 
 #define FPU_CS			(*(unsigned short *) &(FPU_info->regs->cs))
-- 
cgit v1.2.3


From dce8bf4e115aa44d590802ce3554e926840c9042 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 10 May 2010 13:41:41 -0700
Subject: x86, fpu: Use the proper asm constraint in use_xsave()

The proper constraint for a receiving 8-bit variable is "=qm", not
"=g" which equals "=rim"; even though the "i" will never match, bugs
can and do happen due to the difference between "q" and "r".

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-2-git-send-email-avi@redhat.com>
---
 arch/x86/include/asm/i387.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 1a8cca33b73..8002e9ce25f 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -64,7 +64,7 @@ static inline bool use_xsave(void)
 	alternative_io("mov $0, %0",
 		       "mov $1, %0",
 		       X86_FEATURE_XSAVE,
-		       "=g"(has_xsave));
+		       "=qm" (has_xsave));
 
 	return has_xsave;
 }
-- 
cgit v1.2.3


From fefb0b94bbab858be0909a7eb5ef357e0f996a79 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 10 May 2010 13:57:51 -0300
Subject: perf hist: Calculate max_sym name len and nr_entries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Better done when we are adding entries, be it initially of when we're
re-sorting the histograms.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c |  6 +++---
 tools/perf/util/hist.c      | 27 ++++++++++++++++++++-------
 tools/perf/util/hist.h      |  4 +++-
 tools/perf/util/symbol.c    |  5 +++--
 tools/perf/util/symbol.h    |  1 +
 5 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 53077fd973f..d7c75291e78 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -296,13 +296,13 @@ static int __cmd_report(void)
 	next = rb_first(&session->hists_tree);
 	while (next) {
 		struct hists *hists;
-		u64 nr_hists;
 
 		hists = rb_entry(next, struct hists, rb_node);
 		hists__collapse_resort(hists);
-		nr_hists = hists__output_resort(hists);
+		hists__output_resort(hists);
 		if (use_browser)
-			perf_session__browse_hists(&hists->entries, nr_hists,
+			perf_session__browse_hists(&hists->entries,
+						   hists->nr_entries,
 						   hists->stats.total, help,
 						   input_name);
 		else {
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 410cf56c966..e34fd248067 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -47,6 +47,13 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template)
 	return self;
 }
 
+static void hists__inc_nr_entries(struct hists *self, struct hist_entry *entry)
+{
+	if (entry->ms.sym && self->max_sym_namelen < entry->ms.sym->namelen)
+		self->max_sym_namelen = entry->ms.sym->namelen;
+	++self->nr_entries;
+}
+
 struct hist_entry *__hists__add_entry(struct hists *self,
 				      struct addr_location *al,
 				      struct symbol *sym_parent, u64 count)
@@ -89,6 +96,7 @@ struct hist_entry *__hists__add_entry(struct hists *self,
 		return NULL;
 	rb_link_node(&he->rb_node, parent, p);
 	rb_insert_color(&he->rb_node, &self->entries);
+	hists__inc_nr_entries(self, he);
 out:
 	hist_entry__add_cpumode_count(he, al->cpumode, count);
 	return he;
@@ -137,7 +145,7 @@ void hist_entry__free(struct hist_entry *he)
  * collapse the histogram
  */
 
-static void collapse__insert_entry(struct rb_root *root, struct hist_entry *he)
+static bool collapse__insert_entry(struct rb_root *root, struct hist_entry *he)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
@@ -153,7 +161,7 @@ static void collapse__insert_entry(struct rb_root *root, struct hist_entry *he)
 		if (!cmp) {
 			iter->count += he->count;
 			hist_entry__free(he);
-			return;
+			return false;
 		}
 
 		if (cmp < 0)
@@ -164,6 +172,7 @@ static void collapse__insert_entry(struct rb_root *root, struct hist_entry *he)
 
 	rb_link_node(&he->rb_node, parent, p);
 	rb_insert_color(&he->rb_node, root);
+	return true;
 }
 
 void hists__collapse_resort(struct hists *self)
@@ -177,13 +186,16 @@ void hists__collapse_resort(struct hists *self)
 
 	tmp = RB_ROOT;
 	next = rb_first(&self->entries);
+	self->nr_entries = 0;
+	self->max_sym_namelen = 0;
 
 	while (next) {
 		n = rb_entry(next, struct hist_entry, rb_node);
 		next = rb_next(&n->rb_node);
 
 		rb_erase(&n->rb_node, &self->entries);
-		collapse__insert_entry(&tmp, n);
+		if (collapse__insert_entry(&tmp, n))
+			hists__inc_nr_entries(self, n);
 	}
 
 	self->entries = tmp;
@@ -219,30 +231,31 @@ static void __hists__insert_output_entry(struct rb_root *entries,
 	rb_insert_color(&he->rb_node, entries);
 }
 
-u64 hists__output_resort(struct hists *self)
+void hists__output_resort(struct hists *self)
 {
 	struct rb_root tmp;
 	struct rb_node *next;
 	struct hist_entry *n;
 	u64 min_callchain_hits;
-	u64 nr_hists = 0;
 
 	min_callchain_hits = self->stats.total * (callchain_param.min_percent / 100);
 
 	tmp = RB_ROOT;
 	next = rb_first(&self->entries);
 
+	self->nr_entries = 0;
+	self->max_sym_namelen = 0;
+
 	while (next) {
 		n = rb_entry(next, struct hist_entry, rb_node);
 		next = rb_next(&n->rb_node);
 
 		rb_erase(&n->rb_node, &self->entries);
 		__hists__insert_output_entry(&tmp, n, min_callchain_hits);
-		++nr_hists;
+		hists__inc_nr_entries(self, n);
 	}
 
 	self->entries = tmp;
-	return nr_hists;
 }
 
 static size_t callchain__fprintf_left_margin(FILE *fp, int left_margin)
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index bdde81eca69..1b18d04195d 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -19,10 +19,12 @@ struct events_stats {
 struct hists {
 	struct rb_node		rb_node;
 	struct rb_root		entries;
+	u64			nr_entries;
 	struct events_stats	stats;
 	u64			config;
 	u64			event_stream;
 	u32			type;
+	u32			max_sym_namelen;
 };
 
 struct hist_entry *__hists__add_entry(struct hists *self,
@@ -38,7 +40,7 @@ int hist_entry__snprintf(struct hist_entry *self, char *bf, size_t size,
 			 long displacement, bool color, u64 total);
 void hist_entry__free(struct hist_entry *);
 
-u64 hists__output_resort(struct hists *self);
+void hists__output_resort(struct hists *self);
 void hists__collapse_resort(struct hists *self);
 size_t hists__fprintf(struct hists *self, struct hists *pair,
 		      bool show_displacement, FILE *fp);
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 994efdb531e..ecccc8df128 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -130,8 +130,9 @@ static struct symbol *symbol__new(u64 start, u64 len, const char *name)
 	if (symbol_conf.priv_size)
 		self = ((void *)self) + symbol_conf.priv_size;
 
-	self->start = start;
-	self->end   = len ? start + len - 1 : start;
+	self->start   = start;
+	self->end     = len ? start + len - 1 : start;
+	self->namelen = namelen - 1;
 
 	pr_debug4("%s: %s %#Lx-%#Lx\n", __func__, name, start, self->end);
 
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index edff866d76b..6389d1acaf8 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -54,6 +54,7 @@ struct symbol {
 	struct rb_node	rb_node;
 	u64		start;
 	u64		end;
+	u16		namelen;
 	char		name[0];
 };
 
-- 
cgit v1.2.3


From c3f5fd287aa897f710f3305367a1d256c9cf3e83 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 9 May 2010 23:46:52 -0500
Subject: perf/trace/scripting: failed-syscalls script cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A couple small fixes for the failed syscalls script:

- The script description says it can be restricted to a specific comm,
  make it so.

- silence the match output in the shell script

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1273466820-9330-2-git-send-email-tzanussi@gmail.com>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/scripts/perl/bin/failed-syscalls-report | 2 +-
 tools/perf/scripts/perl/failed-syscalls.pl         | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/perf/scripts/perl/bin/failed-syscalls-report b/tools/perf/scripts/perl/bin/failed-syscalls-report
index f6346082a8f..e3a5e55d54f 100644
--- a/tools/perf/scripts/perl/bin/failed-syscalls-report
+++ b/tools/perf/scripts/perl/bin/failed-syscalls-report
@@ -2,7 +2,7 @@
 # description: system-wide failed syscalls
 # args: [comm]
 if [ $# -gt 0 ] ; then
-    if ! expr match "$1" "-" ; then
+    if ! expr match "$1" "-" > /dev/null ; then
 	comm=$1
 	shift
     fi
diff --git a/tools/perf/scripts/perl/failed-syscalls.pl b/tools/perf/scripts/perl/failed-syscalls.pl
index c18e7e27a84..94bc25a347e 100644
--- a/tools/perf/scripts/perl/failed-syscalls.pl
+++ b/tools/perf/scripts/perl/failed-syscalls.pl
@@ -11,6 +11,8 @@ use Perf::Trace::Core;
 use Perf::Trace::Context;
 use Perf::Trace::Util;
 
+my $for_comm = shift;
+
 my %failed_syscalls;
 
 sub raw_syscalls::sys_exit
@@ -33,6 +35,8 @@ sub trace_end
 
     foreach my $comm (sort {$failed_syscalls{$b} <=> $failed_syscalls{$a}}
 		      keys %failed_syscalls) {
-	    printf("%-20s  %10s\n", $comm, $failed_syscalls{$comm});
+	next if ($for_comm && $comm ne $for_comm);
+
+	printf("%-20s  %10s\n", $comm, $failed_syscalls{$comm});
     }
 }
-- 
cgit v1.2.3


From 6922c3d772711239e75ddaea760e6b0535e7e7a6 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 9 May 2010 23:46:53 -0500
Subject: perf/trace/scripting: rw-by-pid script cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some minor fixes for the rw-by-pid script:

- Fix nuisance 'use of uninitialized value' warnings

- Change the failed read/write sections to sort by error counts

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1273466820-9330-3-git-send-email-tzanussi@gmail.com>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/scripts/perl/rw-by-pid.pl | 60 ++++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 23 deletions(-)

diff --git a/tools/perf/scripts/perl/rw-by-pid.pl b/tools/perf/scripts/perl/rw-by-pid.pl
index da601fae1a0..9db23c9daf5 100644
--- a/tools/perf/scripts/perl/rw-by-pid.pl
+++ b/tools/perf/scripts/perl/rw-by-pid.pl
@@ -79,12 +79,12 @@ sub trace_end
     printf("%6s  %-20s  %10s  %10s  %10s\n", "------", "--------------------",
 	   "-----------", "----------", "----------");
 
-    foreach my $pid (sort {$reads{$b}{bytes_read} <=>
-			       $reads{$a}{bytes_read}} keys %reads) {
-	my $comm = $reads{$pid}{comm};
-	my $total_reads = $reads{$pid}{total_reads};
-	my $bytes_requested = $reads{$pid}{bytes_requested};
-	my $bytes_read = $reads{$pid}{bytes_read};
+    foreach my $pid (sort { ($reads{$b}{bytes_read} || 0) <=>
+				($reads{$a}{bytes_read} || 0) } keys %reads) {
+	my $comm = $reads{$pid}{comm} || "";
+	my $total_reads = $reads{$pid}{total_reads} || 0;
+	my $bytes_requested = $reads{$pid}{bytes_requested} || 0;
+	my $bytes_read = $reads{$pid}{bytes_read} || 0;
 
 	printf("%6s  %-20s  %10s  %10s  %10s\n", $pid, $comm,
 	       $total_reads, $bytes_requested, $bytes_read);
@@ -96,16 +96,23 @@ sub trace_end
     printf("%6s  %20s  %6s  %10s\n", "------", "--------------------",
 	   "------", "----------");
 
-    foreach my $pid (keys %reads) {
-	my $comm = $reads{$pid}{comm};
-	foreach my $err (sort {$reads{$b}{comm} cmp $reads{$a}{comm}}
-			 keys %{$reads{$pid}{errors}}) {
-	    my $errors = $reads{$pid}{errors}{$err};
+    my @errcounts = ();
 
-	    printf("%6d  %-20s  %6d  %10s\n", $pid, $comm, $err, $errors);
+    foreach my $pid (keys %reads) {
+	foreach my $error (keys %{$reads{$pid}{errors}}) {
+	    my $comm = $reads{$pid}{comm} || "";
+	    my $errcount = $reads{$pid}{errors}{$error} || 0;
+	    push @errcounts, [$pid, $comm, $error, $errcount];
 	}
     }
 
+    @errcounts = sort { $b->[3] <=> $a->[3] } @errcounts;
+
+    for my $i (0 .. $#errcounts) {
+	printf("%6d  %-20s  %6d  %10s\n", $errcounts[$i][0],
+	       $errcounts[$i][1], $errcounts[$i][2], $errcounts[$i][3]);
+    }
+
     printf("\nwrite counts by pid:\n\n");
 
     printf("%6s  %20s  %10s  %10s\n", "pid", "comm",
@@ -113,11 +120,11 @@ sub trace_end
     printf("%6s  %-20s  %10s  %10s\n", "------", "--------------------",
 	   "-----------", "----------");
 
-    foreach my $pid (sort {$writes{$b}{bytes_written} <=>
-			       $writes{$a}{bytes_written}} keys %writes) {
-	my $comm = $writes{$pid}{comm};
-	my $total_writes = $writes{$pid}{total_writes};
-	my $bytes_written = $writes{$pid}{bytes_written};
+    foreach my $pid (sort { ($writes{$b}{bytes_written} || 0) <=>
+			($writes{$a}{bytes_written} || 0)} keys %writes) {
+	my $comm = $writes{$pid}{comm} || "";
+	my $total_writes = $writes{$pid}{total_writes} || 0;
+	my $bytes_written = $writes{$pid}{bytes_written} || 0;
 
 	printf("%6s  %-20s  %10s  %10s\n", $pid, $comm,
 	       $total_writes, $bytes_written);
@@ -129,16 +136,23 @@ sub trace_end
     printf("%6s  %20s  %6s  %10s\n", "------", "--------------------",
 	   "------", "----------");
 
-    foreach my $pid (keys %writes) {
-	my $comm = $writes{$pid}{comm};
-	foreach my $err (sort {$writes{$b}{comm} cmp $writes{$a}{comm}}
-			 keys %{$writes{$pid}{errors}}) {
-	    my $errors = $writes{$pid}{errors}{$err};
+    @errcounts = ();
 
-	    printf("%6d  %-20s  %6d  %10s\n", $pid, $comm, $err, $errors);
+    foreach my $pid (keys %writes) {
+	foreach my $error (keys %{$writes{$pid}{errors}}) {
+	    my $comm = $writes{$pid}{comm} || "";
+	    my $errcount = $writes{$pid}{errors}{$error} || 0;
+	    push @errcounts, [$pid, $comm, $error, $errcount];
 	}
     }
 
+    @errcounts = sort { $b->[3] <=> $a->[3] } @errcounts;
+
+    for my $i (0 .. $#errcounts) {
+	printf("%6d  %-20s  %6d  %10s\n", $errcounts[$i][0],
+	       $errcounts[$i][1], $errcounts[$i][2], $errcounts[$i][3]);
+    }
+
     print_unhandled();
 }
 
-- 
cgit v1.2.3


From e88a4bfbcda440b1c6b9d5a31a554a6ad9686182 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 9 May 2010 23:46:54 -0500
Subject: perf/trace/scripting: rwtop script cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A couple of fixes for the rwtop script:

- printing the totals and clearing the hashes in the signal handler
  eventually leads to various random and serious problems when running
  the rwtop script continuously.  Moving the print_totals() calls to
  the event handlers solves that problem, and the event handlers are
  invoked frequently enough that it doesn't affect the timeliness of
  the output.

- Fix nuisance 'use of uninitialized value' warnings

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Message-Id: <1273466820-9330-4-git-send-email-tzanussi@gmail.com>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/scripts/perl/rwtop.pl | 48 +++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 13 deletions(-)

diff --git a/tools/perf/scripts/perl/rwtop.pl b/tools/perf/scripts/perl/rwtop.pl
index ec2ab49a6f2..4bb3ecd3347 100644
--- a/tools/perf/scripts/perl/rwtop.pl
+++ b/tools/perf/scripts/perl/rwtop.pl
@@ -21,6 +21,7 @@ use Perf::Trace::Util;
 my $default_interval = 3;
 my $nlines = 20;
 my $print_thread;
+my $print_pending = 0;
 
 my %reads;
 my %writes;
@@ -36,6 +37,8 @@ sub syscalls::sys_exit_read
 	$common_pid, $common_comm,
 	$nr, $ret) = @_;
 
+    print_check();
+
     if ($ret > 0) {
 	$reads{$common_pid}{bytes_read} += $ret;
     } else {
@@ -52,6 +55,8 @@ sub syscalls::sys_enter_read
 	$common_pid, $common_comm,
 	$nr, $fd, $buf, $count) = @_;
 
+    print_check();
+
     $reads{$common_pid}{bytes_requested} += $count;
     $reads{$common_pid}{total_reads}++;
     $reads{$common_pid}{comm} = $common_comm;
@@ -63,6 +68,8 @@ sub syscalls::sys_exit_write
 	$common_pid, $common_comm,
 	$nr, $ret) = @_;
 
+    print_check();
+
     if ($ret <= 0) {
 	$writes{$common_pid}{errors}{$ret}++;
     }
@@ -74,6 +81,8 @@ sub syscalls::sys_enter_write
 	$common_pid, $common_comm,
 	$nr, $fd, $buf, $count) = @_;
 
+    print_check();
+
     $writes{$common_pid}{bytes_written} += $count;
     $writes{$common_pid}{total_writes}++;
     $writes{$common_pid}{comm} = $common_comm;
@@ -81,7 +90,7 @@ sub syscalls::sys_enter_write
 
 sub trace_begin
 {
-    $SIG{ALRM} = \&print_totals;
+    $SIG{ALRM} = \&set_print_pending;
     alarm 1;
 }
 
@@ -91,6 +100,20 @@ sub trace_end
     print_totals();
 }
 
+sub print_check()
+{
+    if ($print_pending == 1) {
+	$print_pending = 0;
+	print_totals();
+    }
+}
+
+sub set_print_pending()
+{
+    $print_pending = 1;
+    alarm $interval;
+}
+
 sub print_totals
 {
     my $count;
@@ -106,12 +129,12 @@ sub print_totals
     printf("%6s  %-20s  %10s  %10s  %10s\n", "------", "--------------------",
 	   "----------", "----------", "----------");
 
-    foreach my $pid (sort {$reads{$b}{bytes_read} <=>
-			       $reads{$a}{bytes_read}} keys %reads) {
-	my $comm = $reads{$pid}{comm};
-	my $total_reads = $reads{$pid}{total_reads};
-	my $bytes_requested = $reads{$pid}{bytes_requested};
-	my $bytes_read = $reads{$pid}{bytes_read};
+    foreach my $pid (sort { ($reads{$b}{bytes_read} || 0) <=>
+			       ($reads{$a}{bytes_read} || 0) } keys %reads) {
+	my $comm = $reads{$pid}{comm} || "";
+	my $total_reads = $reads{$pid}{total_reads} || 0;
+	my $bytes_requested = $reads{$pid}{bytes_requested} || 0;
+	my $bytes_read = $reads{$pid}{bytes_read} || 0;
 
 	printf("%6s  %-20s  %10s  %10s  %10s\n", $pid, $comm,
 	       $total_reads, $bytes_requested, $bytes_read);
@@ -130,11 +153,11 @@ sub print_totals
     printf("%6s  %-20s  %10s  %13s\n", "------", "--------------------",
 	   "----------", "-------------");
 
-    foreach my $pid (sort {$writes{$b}{bytes_written} <=>
-			       $writes{$a}{bytes_written}} keys %writes) {
-	my $comm = $writes{$pid}{comm};
-	my $total_writes = $writes{$pid}{total_writes};
-	my $bytes_written = $writes{$pid}{bytes_written};
+    foreach my $pid (sort { ($writes{$b}{bytes_written} || 0) <=>
+			($writes{$a}{bytes_written} || 0)} keys %writes) {
+	my $comm = $writes{$pid}{comm} || "";
+	my $total_writes = $writes{$pid}{total_writes} || 0;
+	my $bytes_written = $writes{$pid}{bytes_written} || 0;
 
 	printf("%6s  %-20s  %10s  %13s\n", $pid, $comm,
 	       $total_writes, $bytes_written);
@@ -146,7 +169,6 @@ sub print_totals
 
     %reads = ();
     %writes = ();
-    alarm $interval;
 }
 
 my %unhandled;
-- 
cgit v1.2.3


From e366728d57cb8c708f76b282ae194c6044355b5f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 9 May 2010 23:46:55 -0500
Subject: perf/trace/scripting: wakeup-latency script cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some minor fixes for the wakeup-latency script:

 - Fix nuisance 'use of uninitialized value' warnings

 - Avoid divide-by-zero error

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1273466820-9330-5-git-send-email-tzanussi@gmail.com>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/scripts/perl/wakeup-latency.pl | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tools/perf/scripts/perl/wakeup-latency.pl b/tools/perf/scripts/perl/wakeup-latency.pl
index ed58ef284e2..d9143dcec6c 100644
--- a/tools/perf/scripts/perl/wakeup-latency.pl
+++ b/tools/perf/scripts/perl/wakeup-latency.pl
@@ -22,8 +22,8 @@ my %last_wakeup;
 
 my $max_wakeup_latency;
 my $min_wakeup_latency;
-my $total_wakeup_latency;
-my $total_wakeups;
+my $total_wakeup_latency = 0;
+my $total_wakeups = 0;
 
 sub sched::sched_switch
 {
@@ -67,8 +67,12 @@ sub trace_end
 {
     printf("wakeup_latency stats:\n\n");
     print "total_wakeups: $total_wakeups\n";
-    printf("avg_wakeup_latency (ns): %u\n",
-	   avg($total_wakeup_latency, $total_wakeups));
+    if ($total_wakeups) {
+	printf("avg_wakeup_latency (ns): %u\n",
+	       avg($total_wakeup_latency, $total_wakeups));
+    } else {
+	printf("avg_wakeup_latency (ns): N/A\n");
+    }
     printf("min_wakeup_latency (ns): %u\n", $min_wakeup_latency);
     printf("max_wakeup_latency (ns): %u\n", $max_wakeup_latency);
 
-- 
cgit v1.2.3


From a3412d9b358d37fce4527fd67ea601635f2b9496 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 9 May 2010 23:46:56 -0500
Subject: perf/trace/scripting: workqueue-stats script cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some minor fixes for the workqueue-stats script:

 - Fix nuisance 'use of uninitialized value' warnings

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1273466820-9330-6-git-send-email-tzanussi@gmail.com>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/scripts/perl/workqueue-stats.pl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/perf/scripts/perl/workqueue-stats.pl b/tools/perf/scripts/perl/workqueue-stats.pl
index 511302c8a49..b84b12699b7 100644
--- a/tools/perf/scripts/perl/workqueue-stats.pl
+++ b/tools/perf/scripts/perl/workqueue-stats.pl
@@ -71,9 +71,9 @@ sub trace_end
     printf("%3s %6s %6s\t%-20s\n", "---", "---", "----", "----");
     foreach my $pidhash (@cpus) {
 	while ((my $pid, my $wqhash) = each %$pidhash) {
-	    my $ins = $$wqhash{'inserted'};
-	    my $exe = $$wqhash{'executed'};
-	    my $comm = $$wqhash{'comm'};
+	    my $ins = $$wqhash{'inserted'} || 0;
+	    my $exe = $$wqhash{'executed'} || 0;
+	    my $comm = $$wqhash{'comm'} || "";
 	    if ($ins || $exe) {
 		printf("%3u %6u %6u\t%-20s\n", $cpu, $ins, $exe, $comm);
 	    }
@@ -87,9 +87,9 @@ sub trace_end
     printf("%3s %6s %6s\t%-20s\n", "---", "-------", "---------", "----");
     foreach my $pidhash (@cpus) {
 	while ((my $pid, my $wqhash) = each %$pidhash) {
-	    my $created = $$wqhash{'created'};
-	    my $destroyed = $$wqhash{'destroyed'};
-	    my $comm = $$wqhash{'comm'};
+	    my $created = $$wqhash{'created'} || 0;
+	    my $destroyed = $$wqhash{'destroyed'} || 0;
+	    my $comm = $$wqhash{'comm'} || "";
 	    if ($created || $destroyed) {
 		printf("%3u %6u %6u\t%-20s\n", $cpu, $created, $destroyed,
 		       $comm);
-- 
cgit v1.2.3


From 3824a4e8da9791f4eed99d69bfcdb3b42f440426 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 9 May 2010 23:46:57 -0500
Subject: perf/trace/scripting: don't show script start/stop messages by
 default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Only print the script start/stop messages in verbose mode - users
normally don't care and it just clutters up the output.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1273466820-9330-7-git-send-email-tzanussi@gmail.com>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-trace.c                             | 3 +++
 tools/perf/util/scripting-engines/trace-event-perl.c   | 3 ---
 tools/perf/util/scripting-engines/trace-event-python.c | 4 ----
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 6e268ca761e..95fcb0517a9 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -53,6 +53,8 @@ static void setup_scripting(void)
 
 static int cleanup_scripting(void)
 {
+	pr_debug("\nperf trace script stopped\n");
+
 	return scripting_ops->stop_script();
 }
 
@@ -703,6 +705,7 @@ int cmd_trace(int argc, const char **argv, const char *prefix __used)
 		err = scripting_ops->start_script(script_name, argc, argv);
 		if (err)
 			goto out;
+		pr_debug("perf trace started with script %s\n\n", script_name);
 	}
 
 	err = __cmd_trace(session);
diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c
index 5376378e0cf..b059dc50cc2 100644
--- a/tools/perf/util/scripting-engines/trace-event-perl.c
+++ b/tools/perf/util/scripting-engines/trace-event-perl.c
@@ -371,7 +371,6 @@ static int perl_start_script(const char *script, int argc, const char **argv)
 	run_start_sub();
 
 	free(command_line);
-	fprintf(stderr, "perf trace started with Perl script %s\n\n", script);
 	return 0;
 error:
 	perl_free(my_perl);
@@ -394,8 +393,6 @@ static int perl_stop_script(void)
 	perl_destruct(my_perl);
 	perl_free(my_perl);
 
-	fprintf(stderr, "\nperf trace Perl script stopped\n");
-
 	return 0;
 }
 
diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c
index 6a72f14c598..81f39cab3aa 100644
--- a/tools/perf/util/scripting-engines/trace-event-python.c
+++ b/tools/perf/util/scripting-engines/trace-event-python.c
@@ -374,8 +374,6 @@ static int python_start_script(const char *script, int argc, const char **argv)
 	}
 
 	free(command_line);
-	fprintf(stderr, "perf trace started with Python script %s\n\n",
-		script);
 
 	return err;
 error:
@@ -407,8 +405,6 @@ out:
 	Py_XDECREF(main_module);
 	Py_Finalize();
 
-	fprintf(stderr, "\nperf trace Python script stopped\n");
-
 	return err;
 }
 
-- 
cgit v1.2.3


From a4ab0c12975d1286b2696370f5e0576450609bf0 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 9 May 2010 23:46:58 -0500
Subject: perf/trace/scripting: failed-syscalls-by-pid script cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A small fixe for the failed syscalls by pid script:

 - silence the match output in the shell script

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1273466820-9330-8-git-send-email-tzanussi@gmail.com>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/scripts/python/bin/failed-syscalls-by-pid-report | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report
index 8c128eff9c0..30293545fcc 100644
--- a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report
+++ b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report
@@ -2,7 +2,7 @@
 # description: system-wide failed syscalls, by pid
 # args: [comm]
 if [ $# -gt 0 ] ; then
-    if ! expr match "$1" "-" ; then
+    if ! expr match "$1" "-" > /dev/null ; then
 	comm=$1
 	shift
     fi
-- 
cgit v1.2.3


From 79e653f1bf2e52d12a952366e782dadf590b9d1d Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 9 May 2010 23:46:59 -0500
Subject: perf/trace/scripting: syscall-counts-by-pid script cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A small fix for the syscall counts by pid script:

- silence the match output in the shell script

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1273466820-9330-9-git-send-email-tzanussi@gmail.com>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/scripts/python/bin/syscall-counts-by-pid-report | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/scripts/python/bin/syscall-counts-by-pid-report b/tools/perf/scripts/python/bin/syscall-counts-by-pid-report
index c53362e4860..9e9d8ddd72c 100644
--- a/tools/perf/scripts/python/bin/syscall-counts-by-pid-report
+++ b/tools/perf/scripts/python/bin/syscall-counts-by-pid-report
@@ -2,7 +2,7 @@
 # description: system-wide syscall counts, by pid
 # args: [comm]
 if [ $# -gt 0 ] ; then
-    if ! expr match "$1" "-" ; then
+    if ! expr match "$1" "-" > /dev/null ; then
 	comm=$1
 	shift
     fi
-- 
cgit v1.2.3


From e61a639a794063d78fd248a37ce2c21d5c81fc19 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 9 May 2010 23:47:00 -0500
Subject: perf/trace/scripting: syscall-counts script cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A small fix for the syscall counts script:

 - silence the match output in the shell script

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1273466820-9330-10-git-send-email-tzanussi@gmail.com>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/scripts/python/bin/syscall-counts-report | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/scripts/python/bin/syscall-counts-report b/tools/perf/scripts/python/bin/syscall-counts-report
index 8c21552b3cd..dc076b61879 100644
--- a/tools/perf/scripts/python/bin/syscall-counts-report
+++ b/tools/perf/scripts/python/bin/syscall-counts-report
@@ -2,7 +2,7 @@
 # description: system-wide syscall counts
 # args: [comm]
 if [ $# -gt 0 ] ; then
-    if ! expr match "$1" "-" ; then
+    if ! expr match "$1" "-" > /dev/null ; then
 	comm=$1
 	shift
     fi
-- 
cgit v1.2.3


From a5d8e467f83f6672104f276223a88e3b50cbd375 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Sat, 17 Apr 2010 08:48:38 -0400
Subject: Debugobjects transition check

Implement a basic state machine checker in the debugobjects.

This state machine checker detects races and inconsistencies within the "active"
life of a debugobject. The checker only keeps track of the current state; all
the state machine logic is kept at the object instance level.

The checker works by adding a supplementary "unsigned int astate" field to the
debug_obj structure. It keeps track of the current "active state" of the object.

The only constraints that are imposed on the states by the debugobjects system
is that:

- activation of an object sets the current active state to 0,
- deactivation of an object expects the current active state to be 0.

For the rest of the states, the state mapping is determined by the specific
object instance. Therefore, the logic keeping track of the state machine is
within the specialized instance, without any need to know about it at the
debugobject level.

The current object active state is changed by calling:

debug_object_active_state(addr, descr, expect, next)

where "expect" is the expected state and "next" is the next state to move to if
the expected state is found. A warning is generated if the expected is not
found.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: David S. Miller <davem@davemloft.net>
CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
CC: akpm@linux-foundation.org
CC: mingo@elte.hu
CC: laijs@cn.fujitsu.com
CC: dipankar@in.ibm.com
CC: josh@joshtriplett.org
CC: dvhltc@us.ibm.com
CC: niv@us.ibm.com
CC: peterz@infradead.org
CC: rostedt@goodmis.org
CC: Valdis.Kletnieks@vt.edu
CC: dhowells@redhat.com
CC: eric.dumazet@gmail.com
CC: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/debugobjects.h | 11 +++++++++
 lib/debugobjects.c           | 59 +++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 67 insertions(+), 3 deletions(-)

diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h
index 8c243aaa86a..597692f1fc8 100644
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -20,12 +20,14 @@ struct debug_obj_descr;
  * struct debug_obj - representaion of an tracked object
  * @node:	hlist node to link the object into the tracker list
  * @state:	tracked object state
+ * @astate:	current active state
  * @object:	pointer to the real object
  * @descr:	pointer to an object type specific debug description structure
  */
 struct debug_obj {
 	struct hlist_node	node;
 	enum debug_obj_state	state;
+	unsigned int		astate;
 	void			*object;
 	struct debug_obj_descr	*descr;
 };
@@ -60,6 +62,15 @@ extern void debug_object_deactivate(void *addr, struct debug_obj_descr *descr);
 extern void debug_object_destroy   (void *addr, struct debug_obj_descr *descr);
 extern void debug_object_free      (void *addr, struct debug_obj_descr *descr);
 
+/*
+ * Active state:
+ * - Set at 0 upon initialization.
+ * - Must return to 0 before deactivation.
+ */
+extern void
+debug_object_active_state(void *addr, struct debug_obj_descr *descr,
+			  unsigned int expect, unsigned int next);
+
 extern void debug_objects_early_init(void);
 extern void debug_objects_mem_init(void);
 #else
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index b862b30369f..076464fd207 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -141,6 +141,7 @@ alloc_object(void *addr, struct debug_bucket *b, struct debug_obj_descr *descr)
 		obj->object = addr;
 		obj->descr  = descr;
 		obj->state  = ODEBUG_STATE_NONE;
+		obj->astate = 0;
 		hlist_del(&obj->node);
 
 		hlist_add_head(&obj->node, &b->list);
@@ -252,8 +253,10 @@ static void debug_print_object(struct debug_obj *obj, char *msg)
 
 	if (limit < 5 && obj->descr != descr_test) {
 		limit++;
-		WARN(1, KERN_ERR "ODEBUG: %s %s object type: %s\n", msg,
-		       obj_states[obj->state], obj->descr->name);
+		WARN(1, KERN_ERR "ODEBUG: %s %s (active state %u) "
+				 "object type: %s\n",
+			msg, obj_states[obj->state], obj->astate,
+			obj->descr->name);
 	}
 	debug_objects_warnings++;
 }
@@ -447,7 +450,10 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr)
 		case ODEBUG_STATE_INIT:
 		case ODEBUG_STATE_INACTIVE:
 		case ODEBUG_STATE_ACTIVE:
-			obj->state = ODEBUG_STATE_INACTIVE;
+			if (!obj->astate)
+				obj->state = ODEBUG_STATE_INACTIVE;
+			else
+				debug_print_object(obj, "deactivate");
 			break;
 
 		case ODEBUG_STATE_DESTROYED:
@@ -553,6 +559,53 @@ out_unlock:
 	raw_spin_unlock_irqrestore(&db->lock, flags);
 }
 
+/**
+ * debug_object_active_state - debug checks object usage state machine
+ * @addr:	address of the object
+ * @descr:	pointer to an object specific debug description structure
+ * @expect:	expected state
+ * @next:	state to move to if expected state is found
+ */
+void
+debug_object_active_state(void *addr, struct debug_obj_descr *descr,
+			  unsigned int expect, unsigned int next)
+{
+	struct debug_bucket *db;
+	struct debug_obj *obj;
+	unsigned long flags;
+
+	if (!debug_objects_enabled)
+		return;
+
+	db = get_bucket((unsigned long) addr);
+
+	raw_spin_lock_irqsave(&db->lock, flags);
+
+	obj = lookup_object(addr, db);
+	if (obj) {
+		switch (obj->state) {
+		case ODEBUG_STATE_ACTIVE:
+			if (obj->astate == expect)
+				obj->astate = next;
+			else
+				debug_print_object(obj, "active_state");
+			break;
+
+		default:
+			debug_print_object(obj, "active_state");
+			break;
+		}
+	} else {
+		struct debug_obj o = { .object = addr,
+				       .state = ODEBUG_STATE_NOTAVAILABLE,
+				       .descr = descr };
+
+		debug_print_object(&o, "active_state");
+	}
+
+	raw_spin_unlock_irqrestore(&db->lock, flags);
+}
+
 #ifdef CONFIG_DEBUG_OBJECTS_FREE
 static void __debug_check_no_obj_freed(const void *address, unsigned long size)
 {
-- 
cgit v1.2.3


From 4376030a54860dedab9d848dfa7cc700a6025c0b Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Sat, 17 Apr 2010 08:48:39 -0400
Subject: rcu head introduce rcu head init on stack

PEM:
o     Would it be possible to make this bisectable as follows?

      a.      Insert a new patch after current patch 4/6 that
              defines destroy_rcu_head_on_stack(),
              init_rcu_head_on_stack(), and init_rcu_head() with
              their !CONFIG_DEBUG_OBJECTS_RCU_HEAD definitions.

This patch performs this transition.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
CC: David S. Miller <davem@davemloft.net>
CC: akpm@linux-foundation.org
CC: mingo@elte.hu
CC: laijs@cn.fujitsu.com
CC: dipankar@in.ibm.com
CC: josh@joshtriplett.org
CC: dvhltc@us.ibm.com
CC: niv@us.ibm.com
CC: tglx@linutronix.de
CC: peterz@infradead.org
CC: rostedt@goodmis.org
CC: Valdis.Kletnieks@vt.edu
CC: dhowells@redhat.com
CC: eric.dumazet@gmail.com
CC: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 23be3a70251..b653b4aaa8a 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -79,6 +79,14 @@ extern void rcu_init(void);
        (ptr)->next = NULL; (ptr)->func = NULL; \
 } while (0)
 
+static inline void init_rcu_head_on_stack(struct rcu_head *head)
+{
+}
+
+static inline void destroy_rcu_head_on_stack(struct rcu_head *head)
+{
+}
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
 extern struct lockdep_map rcu_lock_map;
-- 
cgit v1.2.3


From d861f7bf14808b9f457cb32c34585e97df60f140 Mon Sep 17 00:00:00 2001
From: Marek Vasut <marek.vasut@gmail.com>
Date: Mon, 10 May 2010 15:35:11 -0700
Subject: Input: iforce - add Guillemot Jet Leader Force Feedback

This device features a RUDDER on the knob.

Signed-off-by: Marek Vasut <marek.vasut@gmail.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/joystick/iforce/iforce-main.c | 4 ++++
 drivers/input/joystick/iforce/iforce-usb.c  | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/input/joystick/iforce/iforce-main.c b/drivers/input/joystick/iforce/iforce-main.c
index b1edd778639..5956d89b1d3 100644
--- a/drivers/input/joystick/iforce/iforce-main.c
+++ b/drivers/input/joystick/iforce/iforce-main.c
@@ -54,6 +54,9 @@ static signed short btn_avb_wheel[] =
 static signed short abs_joystick[] =
 { ABS_X, ABS_Y, ABS_THROTTLE, ABS_HAT0X, ABS_HAT0Y, -1 };
 
+static signed short abs_joystick_rudder[] =
+{ ABS_X, ABS_Y, ABS_THROTTLE, ABS_RUDDER, ABS_HAT0X, ABS_HAT0Y, -1 };
+
 static signed short abs_avb_pegasus[] =
 { ABS_X, ABS_Y, ABS_THROTTLE, ABS_RUDDER, ABS_HAT0X, ABS_HAT0Y,
   ABS_HAT1X, ABS_HAT1Y, -1 };
@@ -76,6 +79,7 @@ static struct iforce_device iforce_device[] = {
 	{ 0x061c, 0xc0a4, "ACT LABS Force RS",                          btn_wheel, abs_wheel, ff_iforce }, //?
 	{ 0x061c, 0xc084, "ACT LABS Force RS",				btn_wheel, abs_wheel, ff_iforce },
 	{ 0x06f8, 0x0001, "Guillemot Race Leader Force Feedback",	btn_wheel, abs_wheel, ff_iforce }, //?
+	{ 0x06f8, 0x0001, "Guillemot Jet Leader Force Feedback",	btn_joystick, abs_joystick_rudder, ff_iforce },
 	{ 0x06f8, 0x0004, "Guillemot Force Feedback Racing Wheel",	btn_wheel, abs_wheel, ff_iforce }, //?
 	{ 0x06f8, 0x0004, "Gullemot Jet Leader 3D",			btn_joystick, abs_joystick, ff_iforce }, //?
 	{ 0x06d6, 0x29bc, "Trust Force Feedback Race Master",		btn_wheel, abs_wheel, ff_iforce },
diff --git a/drivers/input/joystick/iforce/iforce-usb.c b/drivers/input/joystick/iforce/iforce-usb.c
index b41303d3ec5..6c96631ae5d 100644
--- a/drivers/input/joystick/iforce/iforce-usb.c
+++ b/drivers/input/joystick/iforce/iforce-usb.c
@@ -212,6 +212,7 @@ static struct usb_device_id iforce_usb_ids [] = {
 	{ USB_DEVICE(0x061c, 0xc0a4) },         /* ACT LABS Force RS */
 	{ USB_DEVICE(0x061c, 0xc084) },         /* ACT LABS Force RS */
 	{ USB_DEVICE(0x06f8, 0x0001) },		/* Guillemot Race Leader Force Feedback */
+	{ USB_DEVICE(0x06f8, 0x0003) },		/* Guillemot Jet Leader Force Feedback */
 	{ USB_DEVICE(0x06f8, 0x0004) },		/* Guillemot Force Feedback Racing Wheel */
 	{ USB_DEVICE(0x06f8, 0xa302) },		/* Guillemot Jet Leader 3D */
 	{ }					/* Terminating entry */
-- 
cgit v1.2.3


From 513d8be9883fe0a7a73d216c7cecd20e7c9effda Mon Sep 17 00:00:00 2001
From: Marek Vasut <marek.vasut@gmail.com>
Date: Mon, 10 May 2010 15:35:11 -0700
Subject: Input: iforce - fix Guillemot Jet Leader 3D entry

USB ID entry for "Guillemot Jet Leader 3D" in iforce-main.c did not match
one used in iforce-usb.c

Signed-off-by: Marek Vasut <marek.vasut@gmail.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/joystick/iforce/iforce-main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/input/joystick/iforce/iforce-main.c b/drivers/input/joystick/iforce/iforce-main.c
index 5956d89b1d3..405febd94f2 100644
--- a/drivers/input/joystick/iforce/iforce-main.c
+++ b/drivers/input/joystick/iforce/iforce-main.c
@@ -81,7 +81,7 @@ static struct iforce_device iforce_device[] = {
 	{ 0x06f8, 0x0001, "Guillemot Race Leader Force Feedback",	btn_wheel, abs_wheel, ff_iforce }, //?
 	{ 0x06f8, 0x0001, "Guillemot Jet Leader Force Feedback",	btn_joystick, abs_joystick_rudder, ff_iforce },
 	{ 0x06f8, 0x0004, "Guillemot Force Feedback Racing Wheel",	btn_wheel, abs_wheel, ff_iforce }, //?
-	{ 0x06f8, 0x0004, "Gullemot Jet Leader 3D",			btn_joystick, abs_joystick, ff_iforce }, //?
+	{ 0x06f8, 0xa302, "Guillemot Jet Leader 3D",			btn_joystick, abs_joystick, ff_iforce }, //?
 	{ 0x06d6, 0x29bc, "Trust Force Feedback Race Master",		btn_wheel, abs_wheel, ff_iforce },
 	{ 0x0000, 0x0000, "Unknown I-Force Device [%04x:%04x]",		btn_joystick, abs_joystick, ff_iforce }
 };
-- 
cgit v1.2.3


From 0ebf9e3692d640917fb792a7494d05e1f5b1058f Mon Sep 17 00:00:00 2001
From: Daniel T Chen <crimsun@ubuntu.com>
Date: Mon, 10 May 2010 21:50:04 +0200
Subject: ALSA: hda: Fix 0 dB for Lenovo models using Conexant CX20549 (Venice)

Reference: http://mailman.alsa-project.org/pipermail/alsa-devel/2010-May/027525.html

As reported on the mailing list, we also need to cap to the 0 dB offset
for Lenovo models, else the sound will be distorted.

Reported-and-Tested-by: Tim Starling <tstarling@wikimedia.org>
Cc: <stable@kernel.org>
Signed-off-by: Daniel T Chen <crimsun@ubuntu.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_conexant.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
index d8213e2231a..feabb44c7ca 100644
--- a/sound/pci/hda/patch_conexant.c
+++ b/sound/pci/hda/patch_conexant.c
@@ -1197,9 +1197,10 @@ static int patch_cxt5045(struct hda_codec *codec)
 	case 0x103c:
 	case 0x1631:
 	case 0x1734:
-		/* HP, Packard Bell, & Fujitsu-Siemens laptops have really bad
-		 * sound over 0dB on NID 0x17. Fix max PCM level to 0 dB
-		 * (originally it has 0x2b steps with 0dB offset 0x14)
+	case 0x17aa:
+		/* HP, Packard Bell, Fujitsu-Siemens & Lenovo laptops have
+		 * really bad sound over 0dB on NID 0x17. Fix max PCM level to
+		 * 0 dB (originally it has 0x2b steps with 0dB offset 0x14)
 		 */
 		snd_hda_override_amp_caps(codec, 0x17, HDA_INPUT,
 					  (0x14 << AC_AMPCAP_OFFSET_SHIFT) |
-- 
cgit v1.2.3


From e3174cfd2a1e28fff774681f00a0eef3d31da970 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 11 May 2010 08:31:49 +0200
Subject: Revert "perf: Fix exit() vs PERF_FORMAT_GROUP"

This reverts commit 4fd38e4595e2f6c9d27732c042a0e16b2753049c.

It causes various crashes and hangs when events are activated.

The cause is not fully understood yet but we need to revert it
because the effects are severe.

Reported-by: Stephane Eranian <eranian@google.com>
Reported-by: Lin Ming <ming.m.lin@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h | 1 -
 kernel/perf_event.c        | 5 -----
 2 files changed, 6 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4924c96d7e2..3fd5c82e0e1 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -575,7 +575,6 @@ struct pmu {
  * enum perf_event_active_state - the states of a event
  */
 enum perf_event_active_state {
-	PERF_EVENT_STATE_FREE		= -3,
 	PERF_EVENT_STATE_ERROR		= -2,
 	PERF_EVENT_STATE_OFF		= -1,
 	PERF_EVENT_STATE_INACTIVE	=  0,
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 180151ff837..a9047463fd8 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -334,9 +334,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (event->state > PERF_EVENT_STATE_OFF)
 		event->state = PERF_EVENT_STATE_OFF;
 
-	if (event->state > PERF_EVENT_STATE_FREE)
-		return;
-
 	/*
 	 * If this was a group event with sibling events then
 	 * upgrade the siblings to singleton events by adding them
@@ -1871,8 +1868,6 @@ int perf_event_release_kernel(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 
-	event->state = PERF_EVENT_STATE_FREE;
-
 	WARN_ON_ONCE(ctx->parent_ctx);
 	/*
 	 * There are two ways this annotation is useful:
-- 
cgit v1.2.3


From 26ebe0a28986f4845b2c5bea43ac5cc0b9f27f0a Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Tue, 11 May 2010 08:36:29 +0200
Subject: ALSA: hda - Fix mute-LED GPIO pin for HP dv series

Old HP dv series seem to use the GPIO pin 0 for controlling the mute LED
although the pin is a large package, where the newer models use GPIO 3
in such a case.  For fixing the regression from the previous kernels,
set spec->gpio_led statically for these model quirks.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_sigmatel.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c
index 12825aa0310..eb4ea3df5d8 100644
--- a/sound/pci/hda/patch_sigmatel.c
+++ b/sound/pci/hda/patch_sigmatel.c
@@ -4766,6 +4766,9 @@ static void set_hp_led_gpio(struct hda_codec *codec)
 	struct sigmatel_spec *spec = codec->spec;
 	unsigned int gpio;
 
+	if (spec->gpio_led)
+		return;
+
 	gpio = snd_hda_param_read(codec, codec->afg, AC_PAR_GPIO_CAP);
 	gpio &= AC_GPIO_IO_COUNT;
 	if (gpio > 3)
@@ -5683,11 +5686,13 @@ again:
 		 * detection.
 		 */
 		spec->hp_detect = 1;
+		spec->gpio_led = 0x01;
 		break;
 	case STAC_HP_HDX:
 		spec->num_dmics = 1;
 		spec->num_dmuxes = 1;
 		spec->num_smuxes = 1;
+		spec->gpio_led = 0x08;
 		break;
 	}
 
-- 
cgit v1.2.3


From 050735b08ca8a016bbace4445fa025b88fee770b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 11 May 2010 11:51:53 +0200
Subject: perf: Fix exit() vs PERF_FORMAT_GROUP

Both Stephane and Corey reported that PERF_FORMAT_GROUP didn't
work as expected if the task the counters were attached to quit
before the read() call.

The cause is that we unconditionally destroy the grouping when
we remove counters from their context. Fix this by splitting off
the group destroy from the list removal such that
perf_event_remove_from_context() does not do this and change
perf_event_release() to do so.

Reported-by: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Reported-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: <stable@kernel.org> # .34.x
LKML-Reference: <1273571513.5605.3527.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a9047463fd8..c97e8251840 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -308,8 +308,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-	struct perf_event *sibling, *tmp;
-
 	if (list_empty(&event->group_entry))
 		return;
 	ctx->nr_events--;
@@ -333,6 +331,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	 */
 	if (event->state > PERF_EVENT_STATE_OFF)
 		event->state = PERF_EVENT_STATE_OFF;
+}
+
+static void
+perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx)
+{
+	struct perf_event *sibling, *tmp;
 
 	/*
 	 * If this was a group event with sibling events then
@@ -1868,6 +1872,12 @@ int perf_event_release_kernel(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 
+	/*
+	 * Remove from the PMU, can't get re-enabled since we got
+	 * here because the last ref went.
+	 */
+	perf_event_disable(event);
+
 	WARN_ON_ONCE(ctx->parent_ctx);
 	/*
 	 * There are two ways this annotation is useful:
@@ -1882,7 +1892,10 @@ int perf_event_release_kernel(struct perf_event *event)
 	 *     to trigger the AB-BA case.
 	 */
 	mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
-	perf_event_remove_from_context(event);
+	raw_spin_lock_irq(&ctx->lock);
+	list_del_event(event, ctx);
+	perf_destroy_group(event, ctx);
+	raw_spin_unlock_irq(&ctx->lock);
 	mutex_unlock(&ctx->mutex);
 
 	mutex_lock(&event->owner->perf_event_mutex);
-- 
cgit v1.2.3


From 96c21a460a37880abfbc8445d5b098dbab958a29 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 11 May 2010 16:19:10 +0200
Subject: perf: Fix exit() vs event-groups

Corey reported that the value scale times of group siblings are not
updated when the monitored task dies.

The problem appears to be that we only update the group leader's
time values, fix it by updating the whole group.

Reported-by: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: <stable@kernel.org> # .34.x
LKML-Reference: <1273588935.1810.6.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index c97e8251840..a4fa381db3c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -255,6 +255,18 @@ static void update_event_times(struct perf_event *event)
 	event->total_time_running = run_end - event->tstamp_running;
 }
 
+/*
+ * Update total_time_enabled and total_time_running for all events in a group.
+ */
+static void update_group_times(struct perf_event *leader)
+{
+	struct perf_event *event;
+
+	update_event_times(leader);
+	list_for_each_entry(event, &leader->sibling_list, group_entry)
+		update_event_times(event);
+}
+
 static struct list_head *
 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 {
@@ -320,7 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (event->group_leader != event)
 		event->group_leader->nr_siblings--;
 
-	update_event_times(event);
+	update_group_times(event);
 
 	/*
 	 * If event was in error state, then keep it
@@ -501,18 +513,6 @@ retry:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
-/*
- * Update total_time_enabled and total_time_running for all events in a group.
- */
-static void update_group_times(struct perf_event *leader)
-{
-	struct perf_event *event;
-
-	update_event_times(leader);
-	list_for_each_entry(event, &leader->sibling_list, group_entry)
-		update_event_times(event);
-}
-
 /*
  * Cross CPU call to disable a performance event
  */
-- 
cgit v1.2.3


From 8e6d5573af55435160d329f6ae3fe16a0abbdaec Mon Sep 17 00:00:00 2001
From: Lin Ming <ming.m.lin@intel.com>
Date: Sat, 8 May 2010 20:28:41 +1000
Subject: perf, powerpc: Implement group scheduling transactional APIs

[paulus@samba.org: Set cpuhw->event[i]->hw.config in
power_pmu_commit_txn.]

Signed-off-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20100508102841.GA10650@brick.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_event.c | 129 +++++++++++++++++++++------------------
 1 file changed, 68 insertions(+), 61 deletions(-)

diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c
index 08460a2e9f4..43b83c35cf5 100644
--- a/arch/powerpc/kernel/perf_event.c
+++ b/arch/powerpc/kernel/perf_event.c
@@ -35,6 +35,9 @@ struct cpu_hw_events {
 	u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
 	unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
 	unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
+
+	unsigned int group_flag;
+	int n_txn_start;
 };
 DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
 
@@ -718,66 +721,6 @@ static int collect_events(struct perf_event *group, int max_count,
 	return n;
 }
 
-static void event_sched_in(struct perf_event *event)
-{
-	event->state = PERF_EVENT_STATE_ACTIVE;
-	event->oncpu = smp_processor_id();
-	event->tstamp_running += event->ctx->time - event->tstamp_stopped;
-	if (is_software_event(event))
-		event->pmu->enable(event);
-}
-
-/*
- * Called to enable a whole group of events.
- * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
- * Assumes the caller has disabled interrupts and has
- * frozen the PMU with hw_perf_save_disable.
- */
-int hw_perf_group_sched_in(struct perf_event *group_leader,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx)
-{
-	struct cpu_hw_events *cpuhw;
-	long i, n, n0;
-	struct perf_event *sub;
-
-	if (!ppmu)
-		return 0;
-	cpuhw = &__get_cpu_var(cpu_hw_events);
-	n0 = cpuhw->n_events;
-	n = collect_events(group_leader, ppmu->n_counter - n0,
-			   &cpuhw->event[n0], &cpuhw->events[n0],
-			   &cpuhw->flags[n0]);
-	if (n < 0)
-		return -EAGAIN;
-	if (check_excludes(cpuhw->event, cpuhw->flags, n0, n))
-		return -EAGAIN;
-	i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n + n0);
-	if (i < 0)
-		return -EAGAIN;
-	cpuhw->n_events = n0 + n;
-	cpuhw->n_added += n;
-
-	/*
-	 * OK, this group can go on; update event states etc.,
-	 * and enable any software events
-	 */
-	for (i = n0; i < n0 + n; ++i)
-		cpuhw->event[i]->hw.config = cpuhw->events[i];
-	cpuctx->active_oncpu += n;
-	n = 1;
-	event_sched_in(group_leader);
-	list_for_each_entry(sub, &group_leader->sibling_list, group_entry) {
-		if (sub->state != PERF_EVENT_STATE_OFF) {
-			event_sched_in(sub);
-			++n;
-		}
-	}
-	ctx->nr_active += n;
-
-	return 1;
-}
-
 /*
  * Add a event to the PMU.
  * If all events are not already frozen, then we disable and
@@ -805,12 +748,22 @@ static int power_pmu_enable(struct perf_event *event)
 	cpuhw->event[n0] = event;
 	cpuhw->events[n0] = event->hw.config;
 	cpuhw->flags[n0] = event->hw.event_base;
+
+	/*
+	 * If group events scheduling transaction was started,
+	 * skip the schedulability test here, it will be peformed
+	 * at commit time(->commit_txn) as a whole
+	 */
+	if (cpuhw->group_flag & PERF_EVENT_TXN_STARTED)
+		goto nocheck;
+
 	if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1))
 		goto out;
 	if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1))
 		goto out;
-
 	event->hw.config = cpuhw->events[n0];
+
+nocheck:
 	++cpuhw->n_events;
 	++cpuhw->n_added;
 
@@ -896,11 +849,65 @@ static void power_pmu_unthrottle(struct perf_event *event)
 	local_irq_restore(flags);
 }
 
+/*
+ * Start group events scheduling transaction
+ * Set the flag to make pmu::enable() not perform the
+ * schedulability test, it will be performed at commit time
+ */
+void power_pmu_start_txn(const struct pmu *pmu)
+{
+	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
+
+	cpuhw->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuhw->n_txn_start = cpuhw->n_events;
+}
+
+/*
+ * Stop group events scheduling transaction
+ * Clear the flag and pmu::enable() will perform the
+ * schedulability test.
+ */
+void power_pmu_cancel_txn(const struct pmu *pmu)
+{
+	struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events);
+
+	cpuhw->group_flag &= ~PERF_EVENT_TXN_STARTED;
+}
+
+/*
+ * Commit group events scheduling transaction
+ * Perform the group schedulability test as a whole
+ * Return 0 if success
+ */
+int power_pmu_commit_txn(const struct pmu *pmu)
+{
+	struct cpu_hw_events *cpuhw;
+	long i, n;
+
+	if (!ppmu)
+		return -EAGAIN;
+	cpuhw = &__get_cpu_var(cpu_hw_events);
+	n = cpuhw->n_events;
+	if (check_excludes(cpuhw->event, cpuhw->flags, 0, n))
+		return -EAGAIN;
+	i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n);
+	if (i < 0)
+		return -EAGAIN;
+
+	for (i = cpuhw->n_txn_start; i < n; ++i)
+		cpuhw->event[i]->hw.config = cpuhw->events[i];
+
+	return 0;
+}
+
 struct pmu power_pmu = {
 	.enable		= power_pmu_enable,
 	.disable	= power_pmu_disable,
 	.read		= power_pmu_read,
 	.unthrottle	= power_pmu_unthrottle,
+	.start_txn	= power_pmu_start_txn,
+	.cancel_txn	= power_pmu_cancel_txn,
+	.commit_txn	= power_pmu_commit_txn,
 };
 
 /*
-- 
cgit v1.2.3


From a52357259680fe5368c2fabf5949209e231f2aa2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 11 May 2010 17:12:33 +0200
Subject: x86/amd-iommu: Add amd_iommu=off command line option

This patch adds a command line option to tell the AMD IOMMU
driver to not initialize any IOMMU it finds.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 Documentation/kernel-parameters.txt | 2 ++
 arch/x86/kernel/amd_iommu_init.c    | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 839b21b0699..0c6c56076d1 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -324,6 +324,8 @@ and is between 256 and 4096 characters. It is defined in the file
 				    they are unmapped. Otherwise they are
 				    flushed before they will be reused, which
 				    is a lot of faster
+			off	  - do not initialize any AMD IOMMU found in
+				    the system
 
 	amijoy.map=	[HW,JOY] Amiga joystick support
 			Map of devices attached to JOY0DAT and JOY1DAT
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6360abf993d..3bacb4d0844 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -120,6 +120,7 @@ struct ivmd_header {
 bool amd_iommu_dump;
 
 static int __initdata amd_iommu_detected;
+static bool __initdata amd_iommu_disabled;
 
 u16 amd_iommu_last_bdf;			/* largest PCI device id we have
 					   to handle */
@@ -1372,6 +1373,9 @@ void __init amd_iommu_detect(void)
 	if (no_iommu || (iommu_detected && !gart_iommu_aperture))
 		return;
 
+	if (amd_iommu_disabled)
+		return;
+
 	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
 		iommu_detected = 1;
 		amd_iommu_detected = 1;
@@ -1401,6 +1405,8 @@ static int __init parse_amd_iommu_options(char *str)
 	for (; *str; ++str) {
 		if (strncmp(str, "fullflush", 9) == 0)
 			amd_iommu_unmap_flush = true;
+		if (strncmp(str, "off", 3) == 0)
+			amd_iommu_disabled = true;
 	}
 
 	return 1;
-- 
cgit v1.2.3


From fdb3603800e7a65bc3cafdfd5a1797d08f09e582 Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Tue, 11 May 2010 09:46:46 +0530
Subject: cifs: propagate cifs_new_fileinfo() error back to the caller

..otherwise memory allocation errors go undetected.

Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/dir.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index bd363df19b3..86d3c0c82f2 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -261,8 +261,14 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	 * cifs_fill_filedata() takes care of setting cifsFileInfo pointer to
 	 * file->private_data.
 	 */
-	if (mnt)
-		cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags);
+	if (mnt) {
+		struct cifsFileInfo *pfile_info;
+
+		pfile_info = cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt,
+					       oflags);
+		if (pfile_info == NULL)
+			rc = -ENOMEM;
+	}
 
 posix_open_ret:
 	kfree(presp_data);
@@ -476,12 +482,15 @@ cifs_create_set_dentry:
 		/* mknod case - do not leave file open */
 		CIFSSMBClose(xid, tcon, fileHandle);
 	} else if (!(posix_create) && (newinode)) {
+		struct cifsFileInfo *pfile_info;
 		/*
 		 * cifs_fill_filedata() takes care of setting cifsFileInfo
 		 * pointer to file->private_data.
 		 */
-		cifs_new_fileinfo(newinode, fileHandle, NULL, nd->path.mnt,
-				  oflags);
+		pfile_info = cifs_new_fileinfo(newinode, fileHandle, NULL,
+					       nd->path.mnt, oflags);
+		if (pfile_info == NULL)
+			rc = -ENOMEM;
 	}
 cifs_create_out:
 	kfree(buf);
-- 
cgit v1.2.3


From b09e0190acf88c7fe3b05e3c331e1b2ef5310896 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 11 May 2010 11:10:15 -0300
Subject: perf hist: Adopt filter by dso and by thread methods from the newt
 browser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Those are really not specific to the newt code, can be used by other UI
frontends.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-report.c |  5 +--
 tools/perf/util/hist.c      | 59 +++++++++++++++++++++++++++++++++
 tools/perf/util/hist.h      | 15 +++++++++
 tools/perf/util/newt.c      | 80 ++++++++-------------------------------------
 tools/perf/util/session.h   | 15 ---------
 5 files changed, 89 insertions(+), 85 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index d7c75291e78..3d67d6bf22c 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -301,10 +301,7 @@ static int __cmd_report(void)
 		hists__collapse_resort(hists);
 		hists__output_resort(hists);
 		if (use_browser)
-			perf_session__browse_hists(&hists->entries,
-						   hists->nr_entries,
-						   hists->stats.total, help,
-						   input_name);
+			hists__browse(hists, help, input_name);
 		else {
 			if (rb_first(&session->hists.entries) ==
 			    rb_last(&session->hists.entries))
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index e34fd248067..baa55be64d9 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -784,3 +784,62 @@ print_entries:
 
 	return ret;
 }
+
+enum hist_filter {
+	HIST_FILTER__DSO,
+	HIST_FILTER__THREAD,
+};
+
+void hists__filter_by_dso(struct hists *self, const struct dso *dso)
+{
+	struct rb_node *nd;
+
+	self->nr_entries = self->stats.total = 0;
+	self->max_sym_namelen = 0;
+
+	for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
+		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+
+		if (symbol_conf.exclude_other && !h->parent)
+			continue;
+
+		if (dso != NULL && (h->ms.map == NULL || h->ms.map->dso != dso)) {
+			h->filtered |= (1 << HIST_FILTER__DSO);
+			continue;
+		}
+
+		h->filtered &= ~(1 << HIST_FILTER__DSO);
+		if (!h->filtered) {
+			++self->nr_entries;
+			self->stats.total += h->count;
+			if (h->ms.sym &&
+			    self->max_sym_namelen < h->ms.sym->namelen)
+				self->max_sym_namelen = h->ms.sym->namelen;
+		}
+	}
+}
+
+void hists__filter_by_thread(struct hists *self, const struct thread *thread)
+{
+	struct rb_node *nd;
+
+	self->nr_entries = self->stats.total = 0;
+	self->max_sym_namelen = 0;
+
+	for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
+		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
+
+		if (thread != NULL && h->thread != thread) {
+			h->filtered |= (1 << HIST_FILTER__THREAD);
+			continue;
+		}
+		h->filtered &= ~(1 << HIST_FILTER__THREAD);
+		if (!h->filtered) {
+			++self->nr_entries;
+			self->stats.total += h->count;
+			if (h->ms.sym &&
+			    self->max_sym_namelen < h->ms.sym->namelen)
+				self->max_sym_namelen = h->ms.sym->namelen;
+		}
+	}
+}
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 1b18d04195d..1c5f93ac5ab 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -44,4 +44,19 @@ void hists__output_resort(struct hists *self);
 void hists__collapse_resort(struct hists *self);
 size_t hists__fprintf(struct hists *self, struct hists *pair,
 		      bool show_displacement, FILE *fp);
+
+void hists__filter_by_dso(struct hists *self, const struct dso *dso);
+void hists__filter_by_thread(struct hists *self, const struct thread *thread);
+
+#ifdef NO_NEWT_SUPPORT
+static inline int hists__browse(struct hists self __used,
+				const char *helpline __used,
+				const char *input_name __used)
+{
+	return 0;
+}
+#else
+int hists__browse(struct hists *self, const char *helpline,
+		  const char *input_name);
+#endif
 #endif	/* __PERF_HIST_H */
diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index e283a6e6b6e..638b519e72b 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -410,8 +410,8 @@ static void hist_browser__delete(struct hist_browser *self)
 	free(self);
 }
 
-static int hist_browser__populate(struct hist_browser *self, struct rb_root *hists,
-				  u64 nr_hists, u64 session_total, const char *title)
+static int hist_browser__populate(struct hist_browser *self, struct hists *hists,
+				  const char *title)
 {
 	int max_len = 0, idx, cols, rows;
 	struct ui_progress *progress;
@@ -426,7 +426,7 @@ static int hist_browser__populate(struct hist_browser *self, struct rb_root *his
 	}
 
 	snprintf(str, sizeof(str), "Samples: %Ld                            ",
-		 session_total);
+		 hists->stats.total);
 	newtDrawRootText(0, 0, str);
 
 	newtGetScreenSize(NULL, &rows);
@@ -442,24 +442,25 @@ static int hist_browser__populate(struct hist_browser *self, struct rb_root *his
 	newtComponentAddCallback(self->tree, hist_browser__selection,
 				 &self->selection);
 
-	progress = ui_progress__new("Adding entries to the browser...", nr_hists);
+	progress = ui_progress__new("Adding entries to the browser...",
+				    hists->nr_entries);
 	if (progress == NULL)
 		return -1;
 
 	idx = 0;
-	for (nd = rb_first(hists); nd; nd = rb_next(nd)) {
+	for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) {
 		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
 		int len;
 
 		if (h->filtered)
 			continue;
 
-		len = hist_entry__append_browser(h, self->tree, session_total);
+		len = hist_entry__append_browser(h, self->tree, hists->stats.total);
 		if (len > max_len)
 			max_len = len;
 		if (symbol_conf.use_callchain)
 			hist_entry__append_callchain_browser(h, self->tree,
-							     session_total, idx++);
+							     hists->stats.total, idx++);
 		++curr_hist;
 		if (curr_hist % 5)
 			ui_progress__update(progress, curr_hist);
@@ -490,57 +491,6 @@ static int hist_browser__populate(struct hist_browser *self, struct rb_root *his
 	return 0;
 }
 
-enum hist_filter {
-	HIST_FILTER__DSO,
-	HIST_FILTER__THREAD,
-};
-
-static u64 hists__filter_by_dso(struct rb_root *hists, const struct dso *dso,
-				u64 *session_total)
-{
-	struct rb_node *nd;
-	u64 nr_hists = 0;
-
-	*session_total = 0;
-
-	for (nd = rb_first(hists); nd; nd = rb_next(nd)) {
-		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
-
-		if (dso != NULL && (h->ms.map == NULL || h->ms.map->dso != dso)) {
-			h->filtered |= (1 << HIST_FILTER__DSO);
-			continue;
-		}
-		h->filtered &= ~(1 << HIST_FILTER__DSO);
-		++nr_hists;
-		*session_total += h->count;
-	}
-
-	return nr_hists;
-}
-
-static u64 hists__filter_by_thread(struct rb_root *hists, const struct thread *thread,
-				   u64 *session_total)
-{
-	struct rb_node *nd;
-	u64 nr_hists = 0;
-
-	*session_total = 0;
-
-	for (nd = rb_first(hists); nd; nd = rb_next(nd)) {
-		struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node);
-
-		if (thread != NULL && h->thread != thread) {
-			h->filtered |= (1 << HIST_FILTER__THREAD);
-			continue;
-		}
-		h->filtered &= ~(1 << HIST_FILTER__THREAD);
-		++nr_hists;
-		*session_total += h->count;
-	}
-
-	return nr_hists;
-}
-
 static struct thread *hist_browser__selected_thread(struct hist_browser *self)
 {
 	int *indexes;
@@ -577,9 +527,7 @@ static int hist_browser__title(char *bf, size_t size, const char *input_name,
 	return printed ?: snprintf(bf, size, "Report: %s", input_name);
 }
 
-int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
-			       u64 session_total, const char *helpline,
-			       const char *input_name)
+int hists__browse(struct hists *self, const char *helpline, const char *input_name)
 {
 	struct hist_browser *browser = hist_browser__new();
 	const struct thread *thread_filter = NULL;
@@ -595,7 +543,7 @@ int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
 
 	hist_browser__title(msg, sizeof(msg), input_name,
 			    dso_filter, thread_filter);
-	if (hist_browser__populate(browser, hists, nr_hists, session_total, msg) < 0)
+	if (hist_browser__populate(browser, self, msg) < 0)
 		goto out;
 
 	while (1) {
@@ -672,10 +620,10 @@ do_annotate:
 				newtPushHelpLine(msg);
 				dso_filter = dso;
 			}
-			nr_hists = hists__filter_by_dso(hists, dso_filter, &session_total);
+			hists__filter_by_dso(self, dso_filter);
 			hist_browser__title(msg, sizeof(msg), input_name,
 					    dso_filter, thread_filter);
-			if (hist_browser__populate(browser, hists, nr_hists, session_total, msg) < 0)
+			if (hist_browser__populate(browser, self, msg) < 0)
 				goto out;
 		} else if (choice == zoom_thread) {
 			if (thread_filter) {
@@ -689,10 +637,10 @@ do_annotate:
 				newtPushHelpLine(msg);
 				thread_filter = thread;
 			}
-			nr_hists = hists__filter_by_thread(hists, thread_filter, &session_total);
+			hists__filter_by_thread(self, thread_filter);
 			hist_browser__title(msg, sizeof(msg), input_name,
 					    dso_filter, thread_filter);
-			if (hist_browser__populate(browser, hists, nr_hists, session_total, msg) < 0)
+			if (hist_browser__populate(browser, self, msg) < 0)
 				goto out;
 		}
 	}
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 46190f94b54..ce00fa6cded 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -102,21 +102,6 @@ int perf_session__create_kernel_maps(struct perf_session *self);
 int do_read(int fd, void *buf, size_t size);
 void perf_session__update_sample_type(struct perf_session *self);
 
-#ifdef NO_NEWT_SUPPORT
-static inline int perf_session__browse_hists(struct rb_root *hists __used,
-					      u64 nr_hists __used,
-					      u64 session_total __used,
-					     const char *helpline __used,
-					     const char *input_name __used)
-{
-	return 0;
-}
-#else
-int perf_session__browse_hists(struct rb_root *hists, u64 nr_hists,
-			       u64 session_total, const char *helpline,
-			       const char *input_name);
-#endif
-
 static inline
 struct machine *perf_session__find_host_machine(struct perf_session *self)
 {
-- 
cgit v1.2.3


From 6b3c4ef50441e85dc9b2c9b67e95e8ad1185c15e Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 11 May 2010 00:59:53 -0400
Subject: perf probe: Check older elfutils and set NO_DWARF

Check whether elfutils is older than 0.138 (from which version checking
routine has been introduced). And if so, set NO_DWARF because it is hard
to check the API dependency without version checking.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Reported-by: Robert Richter <robert.richter@amd.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20100511045953.9913.19485.stgit@localhost6.localdomain6>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 0ef5cfe52f2..b28bb7359b1 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -506,8 +506,8 @@ PERFLIBS = $(LIB_FILE)
 -include config.mak
 
 ifndef NO_DWARF
-ifneq ($(shell sh -c "(echo '\#include <dwarf.h>'; echo '\#include <libdw.h>'; echo 'int main(void) { Dwarf *dbg; dbg = dwarf_begin(0, DWARF_C_READ); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -I/usr/include/elfutils -ldw -lelf -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
-	msg := $(warning No libdw.h found or old libdw.h found, disables dwarf support. Please install elfutils-devel/libdw-dev);
+ifneq ($(shell sh -c "(echo '\#include <dwarf.h>'; echo '\#include <libdw.h>'; echo '\#include <version.h>'; echo '\#ifndef _ELFUTILS_PREREQ'; echo '\#error'; echo '\#endif'; echo 'int main(void) { Dwarf *dbg; dbg = dwarf_begin(0, DWARF_C_READ); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -I/usr/include/elfutils -ldw -lelf -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
+	msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev);
 	NO_DWARF := 1
 endif # Dwarf support
 endif # NO_DWARF
-- 
cgit v1.2.3


From d11c7addfe0fa501cb54c824c0fac3481d527433 Mon Sep 17 00:00:00 2001
From: Kyle McMartin <kyle@mcmartin.ca>
Date: Mon, 10 May 2010 16:43:35 -0400
Subject: perf symbols: allow forcing use of cplus_demangle

For Fedora, I want to force perf to link against libiberty.a for
cplus_demangle, rather than libbfd.a for bfd_demangle due to licensing insanity
on binutils. (libiberty is LGPL2, libbfd is GPL3.)

If we just rely on autodetection, we'll end up with libbfd linked against us,
since they're both in binutils-static in the buildroot.

Cc: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20100510204335.GA7565@bombadil.infradead.org>
Signed-off-by: Kyle McMartin <kyle@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index b28bb7359b1..0797786aa72 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -592,6 +592,9 @@ endif
 
 ifdef NO_DEMANGLE
 	BASIC_CFLAGS += -DNO_DEMANGLE
+else ifdef HAVE_CPLUS_DEMANGLE
+	EXTLIBS += -liberty
+	BASIC_CFLAGS += -DHAVE_CPLUS_DEMANGLE
 else
 	has_bfd := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) -lbfd "$(QUIET_STDERR)" && echo y")
 
-- 
cgit v1.2.3


From a93d2f1744206827ccf416e2cdc5018aa503314e Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Fri, 7 May 2010 14:33:26 +0800
Subject: sched, wait: Use wrapper functions

epoll should not touch flags in wait_queue_t. This patch introduces a new
function __add_wait_queue_exclusive(), for the users, who use wait queue as a
LIFO queue.

__add_wait_queue_tail_exclusive() is introduced too instead of
add_wait_queue_exclusive_locked(). remove_wait_queue_locked() is removed, as
it is a duplicate of __remove_wait_queue(), disliked by users, and with less
users.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: <containers@lists.linux-foundation.org>
LKML-Reference: <1273214006-2979-1-git-send-email-xiaosuo@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/eventpoll.c       |  3 +--
 include/linux/wait.h | 35 +++++++++++++++--------------------
 kernel/cgroup.c      |  2 +-
 kernel/sched.c       |  3 +--
 4 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bd056a5b4ef..3817149919c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1140,8 +1140,7 @@ retry:
 		 * ep_poll_callback() when events will become available.
 		 */
 		init_waitqueue_entry(&wait, current);
-		wait.flags |= WQ_FLAG_EXCLUSIVE;
-		__add_wait_queue(&ep->wq, &wait);
+		__add_wait_queue_exclusive(&ep->wq, &wait);
 
 		for (;;) {
 			/*
diff --git a/include/linux/wait.h b/include/linux/wait.h
index a48e16b77d5..76d96d035ea 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -127,12 +127,26 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
 /*
  * Used for wake-one threads:
  */
+static inline void __add_wait_queue_exclusive(wait_queue_head_t *q,
+					      wait_queue_t *wait)
+{
+	wait->flags |= WQ_FLAG_EXCLUSIVE;
+	__add_wait_queue(q, wait);
+}
+
 static inline void __add_wait_queue_tail(wait_queue_head_t *head,
-						wait_queue_t *new)
+					 wait_queue_t *new)
 {
 	list_add_tail(&new->task_list, &head->task_list);
 }
 
+static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q,
+					      wait_queue_t *wait)
+{
+	wait->flags |= WQ_FLAG_EXCLUSIVE;
+	__add_wait_queue_tail(q, wait);
+}
+
 static inline void __remove_wait_queue(wait_queue_head_t *head,
 							wait_queue_t *old)
 {
@@ -403,25 +417,6 @@ do {									\
 	__ret;								\
 })
 
-/*
- * Must be called with the spinlock in the wait_queue_head_t held.
- */
-static inline void add_wait_queue_exclusive_locked(wait_queue_head_t *q,
-						   wait_queue_t * wait)
-{
-	wait->flags |= WQ_FLAG_EXCLUSIVE;
-	__add_wait_queue_tail(q,  wait);
-}
-
-/*
- * Must be called with the spinlock in the wait_queue_head_t held.
- */
-static inline void remove_wait_queue_locked(wait_queue_head_t *q,
-					    wait_queue_t * wait)
-{
-	__remove_wait_queue(q,  wait);
-}
-
 /*
  * These are the old interfaces to sleep waiting for an event.
  * They are racy.  DO NOT use them, use the wait_event* interfaces above.
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e2769e13980..4a07d057a26 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3010,7 +3010,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
 	unsigned long flags = (unsigned long)key;
 
 	if (flags & POLLHUP) {
-		remove_wait_queue_locked(event->wqh, &event->wait);
+		__remove_wait_queue(event->wqh, &event->wait);
 		spin_lock(&cgrp->event_list_lock);
 		list_del(&event->list);
 		spin_unlock(&cgrp->event_list_lock);
diff --git a/kernel/sched.c b/kernel/sched.c
index 39aa9c7e22c..b531d793408 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3983,8 +3983,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 	if (!x->done) {
 		DECLARE_WAITQUEUE(wait, current);
 
-		wait.flags |= WQ_FLAG_EXCLUSIVE;
-		__add_wait_queue_tail(&x->wait, &wait);
+		__add_wait_queue_tail_exclusive(&x->wait, &wait);
 		do {
 			if (signal_pending_state(state, current)) {
 				timeout = -ERESTARTSYS;
-- 
cgit v1.2.3


From 0ceed5db321ac0f9782e77dda476ebe28a8e2199 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 11 May 2010 09:53:18 -0700
Subject: ceph: unregister osd request on failure

The osd request wasn't being unregistered when the osd returned a failure
code, even though the result was returned to the caller.  This would cause
it to eventually time out, and then crash the kernel when it tried to
resend the request using a stale page vector.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index c7b4dedaace..8128082a028 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -779,16 +779,18 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	struct ceph_osd_request *req;
 	u64 tid;
 	int numops, object_len, flags;
+	s32 result;
 
 	tid = le64_to_cpu(msg->hdr.tid);
 	if (msg->front.iov_len < sizeof(*rhead))
 		goto bad;
 	numops = le32_to_cpu(rhead->num_ops);
 	object_len = le32_to_cpu(rhead->object_len);
+	result = le32_to_cpu(rhead->result);
 	if (msg->front.iov_len != sizeof(*rhead) + object_len +
 	    numops * sizeof(struct ceph_osd_op))
 		goto bad;
-	dout("handle_reply %p tid %llu\n", msg, tid);
+	dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
 
 	/* lookup */
 	mutex_lock(&osdc->request_mutex);
@@ -834,7 +836,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	dout("handle_reply tid %llu flags %d\n", tid, flags);
 
 	/* either this is a read, or we got the safe response */
-	if ((flags & CEPH_OSD_FLAG_ONDISK) ||
+	if (result < 0 ||
+	    (flags & CEPH_OSD_FLAG_ONDISK) ||
 	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
 		__unregister_request(osdc, req);
 
-- 
cgit v1.2.3


From 04d000eb358919043da538f197d63f2a5924a525 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 7 May 2010 11:26:34 -0700
Subject: ceph: fix open file counting on snapped inodes when mds returns no
 caps

It's possible the MDS will not issue caps on a snapped inode, in which case
an open request may not __ceph_get_fmode(), botching the open file
counting.  (This is actually a server bug, but the client shouldn't BUG out
in this case.)

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 261f3e6c0bc..85b4d2ffdeb 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -733,6 +733,10 @@ no_change:
 				__ceph_get_fmode(ci, cap_fmode);
 			spin_unlock(&inode->i_lock);
 		}
+	} else if (cap_fmode >= 0) {
+		pr_warning("mds issued no caps on %llx.%llx\n",
+			   ceph_vinop(inode));
+		__ceph_get_fmode(ci, cap_fmode);
 	}
 
 	/* update delegation info? */
-- 
cgit v1.2.3


From d85b705663905b3dae30007f824355bdcfcf3f00 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 10 May 2010 10:24:48 -0700
Subject: ceph: resubmit requests on pg mapping change (not just primary
 change)

OSD requests need to be resubmitted on any pg mapping change, not just when
the pg primary changes.  Resending only when the primary changes results in
occasional 'hung' requests during osd cluster recovery or rebalancing.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 19 +++++++++++++++----
 fs/ceph/osd_client.h |  2 ++
 fs/ceph/osdmap.c     | 29 ++++++++++++++++++++++++-----
 fs/ceph/osdmap.h     |  2 ++
 fs/ceph/rados.h      |  1 +
 5 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 8128082a028..3514f71ff85 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -565,7 +565,8 @@ static int __map_osds(struct ceph_osd_client *osdc,
 {
 	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
 	struct ceph_pg pgid;
-	int o = -1;
+	int acting[CEPH_PG_MAX_SIZE];
+	int o = -1, num = 0;
 	int err;
 
 	dout("map_osds %p tid %lld\n", req, req->r_tid);
@@ -576,10 +577,16 @@ static int __map_osds(struct ceph_osd_client *osdc,
 	pgid = reqhead->layout.ol_pgid;
 	req->r_pgid = pgid;
 
-	o = ceph_calc_pg_primary(osdc->osdmap, pgid);
+	err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
+	if (err > 0) {
+		o = acting[0];
+		num = err;
+	}
 
 	if ((req->r_osd && req->r_osd->o_osd == o &&
-	     req->r_sent >= req->r_osd->o_incarnation) ||
+	     req->r_sent >= req->r_osd->o_incarnation &&
+	     req->r_num_pg_osds == num &&
+	     memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
 	    (req->r_osd == NULL && o == -1))
 		return 0;  /* no change */
 
@@ -587,6 +594,10 @@ static int __map_osds(struct ceph_osd_client *osdc,
 	     req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
 	     req->r_osd ? req->r_osd->o_osd : -1);
 
+	/* record full pg acting set */
+	memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
+	req->r_num_pg_osds = num;
+
 	if (req->r_osd) {
 		__cancel_request(req);
 		list_del_init(&req->r_osd_item);
@@ -612,7 +623,7 @@ static int __map_osds(struct ceph_osd_client *osdc,
 		__remove_osd_from_lru(req->r_osd);
 		list_add(&req->r_osd_item, &req->r_osd->o_requests);
 	}
-	err = 1;   /* osd changed */
+	err = 1;   /* osd or pg changed */
 
 out:
 	return err;
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index c5191d62f24..ce776989ef6 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -48,6 +48,8 @@ struct ceph_osd_request {
 	struct list_head r_osd_item;
 	struct ceph_osd *r_osd;
 	struct ceph_pg   r_pgid;
+	int              r_pg_osds[CEPH_PG_MAX_SIZE];
+	int              r_num_pg_osds;
 
 	struct ceph_connection *r_con_filling_msg;
 
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 2e2c15eed82..cfdd8f4388b 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -1040,13 +1040,34 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
 	return osds;
 }
 
+/*
+ * Return acting set for given pgid.
+ */
+int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+			int *acting)
+{
+	int rawosds[CEPH_PG_MAX_SIZE], *osds;
+	int i, o, num = CEPH_PG_MAX_SIZE;
+
+	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+	if (!osds)
+		return -1;
+
+	/* primary is first up osd */
+	o = 0;
+	for (i = 0; i < num; i++)
+		if (ceph_osd_is_up(osdmap, osds[i]))
+			acting[o++] = osds[i];
+	return o;
+}
+
 /*
  * Return primary osd for given pgid, or -1 if none.
  */
 int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
 {
-	int rawosds[10], *osds;
-	int i, num = ARRAY_SIZE(rawosds);
+	int rawosds[CEPH_PG_MAX_SIZE], *osds;
+	int i, num = CEPH_PG_MAX_SIZE;
 
 	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
 	if (!osds)
@@ -1054,9 +1075,7 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
 
 	/* primary is first up osd */
 	for (i = 0; i < num; i++)
-		if (ceph_osd_is_up(osdmap, osds[i])) {
+		if (ceph_osd_is_up(osdmap, osds[i]))
 			return osds[i];
-			break;
-		}
 	return -1;
 }
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
index 8bc9f1e4f56..970b547e510 100644
--- a/fs/ceph/osdmap.h
+++ b/fs/ceph/osdmap.h
@@ -120,6 +120,8 @@ extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
 				   const char *oid,
 				   struct ceph_file_layout *fl,
 				   struct ceph_osdmap *osdmap);
+extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+			       int *acting);
 extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
 				struct ceph_pg pgid);
 
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index a1fc1d017b5..fd56451a871 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -58,6 +58,7 @@ struct ceph_timespec {
 #define CEPH_PG_LAYOUT_LINEAR 2
 #define CEPH_PG_LAYOUT_HYBRID 3
 
+#define CEPH_PG_MAX_SIZE      16  /* max # osds in a single pg */
 
 /*
  * placement group.
-- 
cgit v1.2.3


From 9abf82b8bc93dd904738a71ca69aa5df356d4d24 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 10 May 2010 21:58:38 -0700
Subject: ceph: fix locking for waking session requests after reconnect

The session->s_waiting list is protected by mdsc->mutex, not s_mutex.  This
was causing (rare) s_waiting list corruption.

Fix errors paths too, while we're here.  A more thorough cleanup of this
function is coming soon.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 60a9a4ae47b..eccc0ecad1a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2136,7 +2136,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 	struct ceph_mds_session *session = NULL;
 	struct ceph_msg *reply;
 	struct rb_node *p;
-	int err;
+	int err = -ENOMEM;
 	struct ceph_pagelist *pagelist;
 
 	pr_info("reconnect to recovering mds%d\n", mds);
@@ -2185,7 +2185,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 		goto fail;
 	err = iterate_session_caps(session, encode_caps_cb, pagelist);
 	if (err < 0)
-		goto out;
+		goto fail;
 
 	/*
 	 * snaprealms.  we provide mds with the ino, seq (version), and
@@ -2213,28 +2213,31 @@ send:
 	reply->nr_pages = calc_pages_for(0, pagelist->length);
 	ceph_con_send(&session->s_con, reply);
 
-	if (session) {
-		session->s_state = CEPH_MDS_SESSION_OPEN;
-		__wake_requests(mdsc, &session->s_waiting);
-	}
+	session->s_state = CEPH_MDS_SESSION_OPEN;
+	mutex_unlock(&session->s_mutex);
+
+	mutex_lock(&mdsc->mutex);
+	__wake_requests(mdsc, &session->s_waiting);
+	mutex_unlock(&mdsc->mutex);
+
+	ceph_put_mds_session(session);
 
-out:
 	up_read(&mdsc->snap_rwsem);
-	if (session) {
-		mutex_unlock(&session->s_mutex);
-		ceph_put_mds_session(session);
-	}
 	mutex_lock(&mdsc->mutex);
 	return;
 
 fail:
 	ceph_msg_put(reply);
+	up_read(&mdsc->snap_rwsem);
+	mutex_unlock(&session->s_mutex);
+	ceph_put_mds_session(session);
 fail_nomsg:
 	ceph_pagelist_release(pagelist);
 	kfree(pagelist);
 fail_nopagelist:
-	pr_err("ENOMEM preparing reconnect for mds%d\n", mds);
-	goto out;
+	pr_err("error %d preparing reconnect for mds%d\n", err, mds);
+	mutex_lock(&mdsc->mutex);
+	return;
 }
 
 
-- 
cgit v1.2.3


From 3d69438031b00c601c991ab447cafb7d5c3c59a6 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 11 May 2010 14:59:55 -0400
Subject: cifs: guard against hardlinking directories

When we made serverino the default, we trusted that the field sent by the
server in the "uniqueid" field was actually unique. It turns out that it
isn't reliably so.

Samba, in particular, will just put the st_ino in the uniqueid field when
unix extensions are enabled. When a share spans multiple filesystems, it's
quite possible that there will be collisions. This is a server bug, but
when the inodes in question are a directory (as is often the case) and
there is a collision with the root inode of the mount, the result is a
kernel panic on umount.

Fix this by checking explicitly for directory inodes with the same
uniqueid. If that is the case, then we can assume that using server inode
numbers will be a problem and that they should be disabled.

Fixes Samba bugzilla 7407

Signed-off-by: Jeff Layton <jlayton@redhat.com>
CC: Stable <stable@kernel.org>
Reviewed-and-Tested-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsglob.h |  1 +
 fs/cifs/inode.c    | 21 +++++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ecf0ffbe2b6..0c2fd17439c 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -502,6 +502,7 @@ struct dfs_info3_param {
 #define CIFS_FATTR_DFS_REFERRAL		0x1
 #define CIFS_FATTR_DELETE_PENDING	0x2
 #define CIFS_FATTR_NEED_REVAL		0x4
+#define CIFS_FATTR_INO_COLLISION	0x8
 
 struct cifs_fattr {
 	u32		cf_flags;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 35ec1171621..29b9ea244c8 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -715,6 +715,16 @@ cifs_find_inode(struct inode *inode, void *opaque)
 	if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
 		return 0;
 
+	/*
+	 * uh oh -- it's a directory. We can't use it since hardlinked dirs are
+	 * verboten. Disable serverino and return it as if it were found, the
+	 * caller can discard it, generate a uniqueid and retry the find
+	 */
+	if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) {
+		fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
+		cifs_autodisable_serverino(CIFS_SB(inode->i_sb));
+	}
+
 	return 1;
 }
 
@@ -734,15 +744,22 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
 	unsigned long hash;
 	struct inode *inode;
 
+retry_iget5_locked:
 	cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid));
 
 	/* hash down to 32-bits on 32-bit arch */
 	hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
 
 	inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
-
-	/* we have fattrs in hand, update the inode */
 	if (inode) {
+		/* was there a problematic inode number collision? */
+		if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
+			iput(inode);
+			fattr->cf_uniqueid = iunique(sb, ROOT_I);
+			fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
+			goto retry_iget5_locked;
+		}
+
 		cifs_fattr_to_inode(inode, fattr);
 		if (sb->s_flags & MS_NOATIME)
 			inode->i_flags |= S_NOATIME | S_NOCMTIME;
-- 
cgit v1.2.3


From 3798ed7bc7ade26d3f59506cd06288615dfc7585 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 11 May 2010 18:01:23 -0300
Subject: perf ui: Add ui_helpline methods
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Initially this was just to be able to have a printf like method to
prepare the formatted string and then pass to newtPushHelpLine, but as
we already have for ui_progress, etc, its a step in identifying a
restricted, highlevel set of widgets we can then have implementations
for multiple widget sets (GTK, etc).

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 69 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 50 insertions(+), 19 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 638b519e72b..daa86efffce 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -57,6 +57,43 @@ void ui_progress__delete(struct ui_progress *self)
 	free(self);
 }
 
+static void ui_helpline__pop(void)
+{
+	newtPopHelpLine();
+}
+
+static void ui_helpline__push(const char *msg)
+{
+	newtPushHelpLine(msg);
+}
+
+static void ui_helpline__vpush(const char *fmt, va_list ap)
+{
+	char *s;
+
+	if (vasprintf(&s, fmt, ap) < 0)
+		vfprintf(stderr, fmt, ap);
+	else {
+		ui_helpline__push(s);
+		free(s);
+	}
+}
+
+static void ui_helpline__fpush(const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	ui_helpline__vpush(fmt, ap);
+	va_end(ap);
+}
+
+static void ui_helpline__puts(const char *msg)
+{
+	ui_helpline__pop();
+	ui_helpline__push(msg);
+}
+
 static char browser__last_msg[1024];
 
 int browser__show_help(const char *format, va_list ap)
@@ -69,8 +106,7 @@ int browser__show_help(const char *format, va_list ap)
 	backlog += ret;
 
 	if (browser__last_msg[backlog - 1] == '\n') {
-		newtPopHelpLine();
-		newtPushHelpLine(browser__last_msg);
+		ui_helpline__puts(browser__last_msg);
 		newtRefresh();
 		backlog = 0;
 	}
@@ -340,7 +376,7 @@ static void map_symbol__annotate_browser(const struct map_symbol *self,
 	if (fp == NULL)
 		goto out_free_str;
 
-	newtPushHelpLine("Press ESC to exit");
+	ui_helpline__push("Press ESC to exit");
 	newtGetScreenSize(&cols, &rows);
 	tree = newtListbox(0, 0, rows - 5, NEWT_FLAG_SCROLL);
 
@@ -370,7 +406,7 @@ static void map_symbol__annotate_browser(const struct map_symbol *self,
 	newtFormRun(form, &es);
 	newtFormDestroy(form);
 	newtPopWindow();
-	newtPopHelpLine();
+	ui_helpline__pop();
 out_free_str:
 	free(str);
 }
@@ -539,7 +575,7 @@ int hists__browse(struct hists *self, const char *helpline, const char *input_na
 	if (browser == NULL)
 		return -1;
 
-	newtPushHelpLine(helpline);
+	ui_helpline__push(helpline);
 
 	hist_browser__title(msg, sizeof(msg), input_name,
 			    dso_filter, thread_filter);
@@ -602,8 +638,7 @@ int hists__browse(struct hists *self, const char *helpline, const char *input_na
 do_annotate:
 		if (choice == annotate) {
 			if (browser->selection->map->dso->origin == DSO__ORIG_KERNEL) {
-				newtPopHelpLine();
-				newtPushHelpLine("No vmlinux file found, can't "
+				ui_helpline__puts("No vmlinux file found, can't "
 						 "annotate with just a "
 						 "kallsyms file");
 				continue;
@@ -611,13 +646,11 @@ do_annotate:
 			map_symbol__annotate_browser(browser->selection, input_name);
 		} else if (choice == zoom_dso) {
 			if (dso_filter) {
-				newtPopHelpLine();
+				ui_helpline__pop();
 				dso_filter = NULL;
 			} else {
-				snprintf(msg, sizeof(msg),
-					 "To zoom out press -> + \"Zoom out of %s DSO\"",
-					 dso->kernel ? "the Kernel" : dso->short_name);
-				newtPushHelpLine(msg);
+				ui_helpline__fpush("To zoom out press -> + \"Zoom out of %s DSO\"",
+						   dso->kernel ? "the Kernel" : dso->short_name);
 				dso_filter = dso;
 			}
 			hists__filter_by_dso(self, dso_filter);
@@ -627,14 +660,12 @@ do_annotate:
 				goto out;
 		} else if (choice == zoom_thread) {
 			if (thread_filter) {
-				newtPopHelpLine();
+				ui_helpline__pop();
 				thread_filter = NULL;
 			} else {
-				snprintf(msg, sizeof(msg),
-					 "To zoom out press -> + \"Zoom out of %s(%d) thread\"",
-					 (thread->comm_set ? thread->comm : ""),
-					 thread->pid);
-				newtPushHelpLine(msg);
+				ui_helpline__fpush("To zoom out press -> + \"Zoom out of %s(%d) thread\"",
+						   thread->comm_set ? thread->comm : "",
+						   thread->pid);
 				thread_filter = thread;
 			}
 			hists__filter_by_thread(self, thread_filter);
@@ -658,7 +689,7 @@ void setup_browser(void)
 	use_browser = true;
 	newtInit();
 	newtCls();
-	newtPushHelpLine(" ");
+	ui_helpline__puts(" ");
 }
 
 void exit_browser(bool wait_for_ok)
-- 
cgit v1.2.3


From 45c6ceb547ad2d98215351974a4686bf8cb13e14 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 11 May 2010 15:01:51 -0700
Subject: ceph: zero unused message header, footer fields

We shouldn't leak any prior memory contents to other parties.  And random
data, particularly in the 'version' field, can cause problems down the
line.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 509f57d9ccb..a3a8f368845 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -2085,15 +2085,19 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
 	kref_init(&m->kref);
 	INIT_LIST_HEAD(&m->list_head);
 
+	m->hdr.tid = 0;
 	m->hdr.type = cpu_to_le16(type);
+	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
+	m->hdr.version = 0;
 	m->hdr.front_len = cpu_to_le32(front_len);
 	m->hdr.middle_len = 0;
 	m->hdr.data_len = cpu_to_le32(page_len);
 	m->hdr.data_off = cpu_to_le16(page_off);
-	m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
+	m->hdr.reserved = 0;
 	m->footer.front_crc = 0;
 	m->footer.middle_crc = 0;
 	m->footer.data_crc = 0;
+	m->footer.flags = 0;
 	m->front_max = front_len;
 	m->front_is_vmalloc = false;
 	m->more_to_follow = false;
-- 
cgit v1.2.3


From 72d5a9f7a9542f88397558c65bcfc3b115a65e34 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 10 May 2010 17:12:17 -0700
Subject: rcu: remove all rcu head initializations, except on_stack
 initializations

Remove all rcu head inits. We don't care about the RCU head state before passing
it to call_rcu() anyway. Only leave the "on_stack" variants so debugobjects can
keep track of objects on stack.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/init_task.h | 1 -
 kernel/rcutiny.c          | 6 ++++++
 kernel/rcutorture.c       | 2 ++
 kernel/rcutree.c          | 4 ++++
 kernel/rcutree_plugin.h   | 2 ++
 5 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index b1ed1cd8e2a..7996fc2c9ba 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -49,7 +49,6 @@ extern struct group_info init_groups;
 		{ .first = &init_task.pids[PIDTYPE_PGID].node },	\
 		{ .first = &init_task.pids[PIDTYPE_SID].node },		\
 	},								\
-	.rcu		= RCU_HEAD_INIT,				\
 	.level		= 0,						\
 	.numbers	= { {						\
 		.nr		= 0,					\
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index b1804ff83d5..38729d3cd23 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -245,11 +245,13 @@ void rcu_barrier(void)
 {
 	struct rcu_synchronize rcu;
 
+	init_rcu_head_on_stack(&rcu.head);
 	init_completion(&rcu.completion);
 	/* Will wake me after RCU finished. */
 	call_rcu(&rcu.head, wakeme_after_rcu);
 	/* Wait for it. */
 	wait_for_completion(&rcu.completion);
+	destroy_rcu_head_on_stack(&rcu.head);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier);
 
@@ -257,11 +259,13 @@ void rcu_barrier_bh(void)
 {
 	struct rcu_synchronize rcu;
 
+	init_rcu_head_on_stack(&rcu.head);
 	init_completion(&rcu.completion);
 	/* Will wake me after RCU finished. */
 	call_rcu_bh(&rcu.head, wakeme_after_rcu);
 	/* Wait for it. */
 	wait_for_completion(&rcu.completion);
+	destroy_rcu_head_on_stack(&rcu.head);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
 
@@ -269,11 +273,13 @@ void rcu_barrier_sched(void)
 {
 	struct rcu_synchronize rcu;
 
+	init_rcu_head_on_stack(&rcu.head);
 	init_completion(&rcu.completion);
 	/* Will wake me after RCU finished. */
 	call_rcu_sched(&rcu.head, wakeme_after_rcu);
 	/* Wait for it. */
 	wait_for_completion(&rcu.completion);
+	destroy_rcu_head_on_stack(&rcu.head);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 58df55bf83e..077defb3457 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -464,9 +464,11 @@ static void rcu_bh_torture_synchronize(void)
 {
 	struct rcu_bh_torture_synchronize rcu;
 
+	init_rcu_head_on_stack(&rcu.head);
 	init_completion(&rcu.completion);
 	call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
 	wait_for_completion(&rcu.completion);
+	destroy_rcu_head_on_stack(&rcu.head);
 }
 
 static struct rcu_torture_ops rcu_bh_ops = {
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ba6996943e2..d4437345706 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1484,11 +1484,13 @@ void synchronize_sched(void)
 	if (rcu_blocking_is_gp())
 		return;
 
+	init_rcu_head_on_stack(&rcu.head);
 	init_completion(&rcu.completion);
 	/* Will wake me after RCU finished. */
 	call_rcu_sched(&rcu.head, wakeme_after_rcu);
 	/* Wait for it. */
 	wait_for_completion(&rcu.completion);
+	destroy_rcu_head_on_stack(&rcu.head);
 }
 EXPORT_SYMBOL_GPL(synchronize_sched);
 
@@ -1508,11 +1510,13 @@ void synchronize_rcu_bh(void)
 	if (rcu_blocking_is_gp())
 		return;
 
+	init_rcu_head_on_stack(&rcu.head);
 	init_completion(&rcu.completion);
 	/* Will wake me after RCU finished. */
 	call_rcu_bh(&rcu.head, wakeme_after_rcu);
 	/* Wait for it. */
 	wait_for_completion(&rcu.completion);
+	destroy_rcu_head_on_stack(&rcu.head);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
 
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index ac7d80fa895..0e4f420245d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -557,11 +557,13 @@ void synchronize_rcu(void)
 	if (!rcu_scheduler_active)
 		return;
 
+	init_rcu_head_on_stack(&rcu.head);
 	init_completion(&rcu.completion);
 	/* Will wake me after RCU finished. */
 	call_rcu(&rcu.head, wakeme_after_rcu);
 	/* Wait for it. */
 	wait_for_completion(&rcu.completion);
+	destroy_rcu_head_on_stack(&rcu.head);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
-- 
cgit v1.2.3


From a3c8acd04376d604370dcb6cd2143c9c14078a50 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 11 May 2010 17:47:07 -0700
Subject: x86: Add new static_cpu_has() function using alternatives

For CPU-feature-specific code that touches performance-critical paths,
introduce a static patching version of [boot_]cpu_has().  This is run
at alternatives time and is therefore not appropriate for most
initialization code, but on the other hand initialization code is
generally not performance critical.

On gcc 4.5+ this uses the new "asm goto" feature.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-2-git-send-email-avi@redhat.com>
---
 arch/x86/include/asm/cpufeature.h | 57 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 0cd82d06861..9b11a5cc666 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -175,6 +175,7 @@
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
+#include <asm/asm.h>
 #include <linux/bitops.h>
 
 extern const char * const x86_cap_flags[NCAPINTS*32];
@@ -283,6 +284,62 @@ extern const char * const x86_power_flags[32];
 
 #endif /* CONFIG_X86_64 */
 
+/*
+ * Static testing of CPU features.  Used the same as boot_cpu_has().
+ * These are only valid after alternatives have run, but will statically
+ * patch the target code for additional performance.
+ *
+ */
+static __always_inline __pure bool __static_cpu_has(u8 bit)
+{
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
+		asm goto("1: jmp %l[t_no]\n"
+			 "2:\n"
+			 ".section .altinstructions,\"a\"\n"
+			 _ASM_ALIGN "\n"
+			 _ASM_PTR "1b\n"
+			 _ASM_PTR "0\n" 	/* no replacement */
+			 " .byte %P0\n"		/* feature bit */
+			 " .byte 2b - 1b\n"	/* source len */
+			 " .byte 0\n"		/* replacement len */
+			 " .byte 0xff + 0 - (2b-1b)\n"	/* padding */
+			 ".previous\n"
+			 : : "i" (bit) : : t_no);
+		return true;
+	t_no:
+		return false;
+#else
+		u8 flag;
+		/* Open-coded due to __stringify() in ALTERNATIVE() */
+		asm volatile("1: movb $0,%0\n"
+			     "2:\n"
+			     ".section .altinstructions,\"a\"\n"
+			     _ASM_ALIGN "\n"
+			     _ASM_PTR "1b\n"
+			     _ASM_PTR "3f\n"
+			     " .byte %P1\n"		/* feature bit */
+			     " .byte 2b - 1b\n"		/* source len */
+			     " .byte 4f - 3f\n"		/* replacement len */
+			     " .byte 0xff + (4f-3f) - (2b-1b)\n" /* padding */
+			     ".previous\n"
+			     ".section .altinstr_replacement,\"ax\"\n"
+			     "3: movb $1,%0\n"
+			     "4:\n"
+			     ".previous\n"
+			     : "=qm" (flag) : "i" (bit));
+		return flag;
+#endif
+}
+
+#define static_cpu_has(bit)					\
+(								\
+	__builtin_constant_p(boot_cpu_has(bit)) ?		\
+		boot_cpu_has(bit) :				\
+	(__builtin_constant_p(bit) && !((bit) & ~0xff)) ?	\
+		__static_cpu_has(bit) :				\
+		boot_cpu_has(bit)				\
+)
+
 #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
 
 #endif /* _ASM_X86_CPUFEATURE_H */
-- 
cgit v1.2.3


From c9775b4cc522e5f1b40b1366a993f0f05f600f39 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 11 May 2010 17:49:54 -0700
Subject: x86, fpu: Use static_cpu_has() to implement use_xsave()

use_xsave() is now just a special case of static_cpu_has(), so use
static_cpu_has().

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1273135546-29690-2-git-send-email-avi@redhat.com>
---
 arch/x86/include/asm/i387.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 8002e9ce25f..c991b3a7b90 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -18,6 +18,7 @@
 #include <linux/hardirq.h>
 #include <linux/slab.h>
 #include <asm/asm.h>
+#include <asm/cpufeature.h>
 #include <asm/processor.h>
 #include <asm/sigcontext.h>
 #include <asm/user.h>
@@ -57,16 +58,9 @@ extern int restore_i387_xstate_ia32(void __user *buf);
 
 #define X87_FSW_ES (1 << 7)	/* Exception Summary */
 
-static inline bool use_xsave(void)
+static __always_inline __pure bool use_xsave(void)
 {
-	u8 has_xsave;
-
-	alternative_io("mov $0, %0",
-		       "mov $1, %0",
-		       X86_FEATURE_XSAVE,
-		       "=qm" (has_xsave));
-
-	return has_xsave;
+	return static_cpu_has(X86_FEATURE_XSAVE);
 }
 
 #ifdef CONFIG_X86_64
-- 
cgit v1.2.3


From ef7b93a11904c6ba10604233d318d9e8ec88cddc Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 11 May 2010 23:18:06 -0300
Subject: perf report: Librarize the annotation code and use it in the newt
 browser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now we don't anymore use popen to run 'perf annotate' for the selected
symbol, instead we collect per address samplings when processing samples
in 'perf report' if we're using the newt browser, then we use this data
directly to do annotation.

Done this way we can actually traverse the objdump_line objects
directly, matching the addresses to the collected samples and colouring
them appropriately using lower level slang routines.

The new ui_browser class will be reused for the main, callchain aware,
histogram browser, when it will be made generic and don't assume that
the objects are always instances of the objdump_line class maintained
using list_heads.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile           |   5 +
 tools/perf/builtin-annotate.c | 198 +-----------------------
 tools/perf/builtin-report.c   |  19 ++-
 tools/perf/util/hist.c        | 184 ++++++++++++++++++++++
 tools/perf/util/hist.h        |  29 ++++
 tools/perf/util/newt.c        | 352 ++++++++++++++++++++++++++++++++++++------
 tools/perf/util/sort.h        |   6 -
 7 files changed, 544 insertions(+), 249 deletions(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 0797786aa72..9c4dc30cdc1 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -560,6 +560,8 @@ ifneq ($(shell sh -c "(echo '\#include <newt.h>'; echo 'int main(void) { newtIni
 	msg := $(warning newt not found, disables TUI support. Please install newt-devel or libnewt-dev);
 	BASIC_CFLAGS += -DNO_NEWT_SUPPORT
 else
+	# Fedora has /usr/include/slang/slang.h, but ubuntu /usr/include/slang.h
+	BASIC_CFLAGS += -I/usr/include/slang
 	EXTLIBS += -lnewt
 	LIB_OBJS += $(OUTPUT)util/newt.o
 endif
@@ -948,6 +950,9 @@ $(OUTPUT)builtin-init-db.o: builtin-init-db.c $(OUTPUT)PERF-CFLAGS
 $(OUTPUT)util/config.o: util/config.c $(OUTPUT)PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
 
+$(OUTPUT)util/newt.o: util/newt.c $(OUTPUT)PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DENABLE_SLFUTURE_CONST $<
+
 $(OUTPUT)util/rbtree.o: ../../lib/rbtree.c $(OUTPUT)PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
 
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 3940964161b..fd1b786c8f3 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -34,68 +34,8 @@ static bool		full_paths;
 
 static bool		print_line;
 
-struct sym_hist {
-	u64		sum;
-	u64		ip[0];
-};
-
-struct sym_ext {
-	struct rb_node	node;
-	double		percent;
-	char		*path;
-};
-
-struct sym_priv {
-	struct sym_hist	*hist;
-	struct sym_ext	*ext;
-};
-
 static const char *sym_hist_filter;
 
-static int sym__alloc_hist(struct symbol *self)
-{
-	struct sym_priv *priv = symbol__priv(self);
-	const int size = (sizeof(*priv->hist) +
-			  (self->end - self->start) * sizeof(u64));
-
-	priv->hist = zalloc(size);
-	return priv->hist == NULL ? -1 : 0;
-}
-
-/*
- * collect histogram counts
- */
-static int annotate__hist_hit(struct hist_entry *he, u64 ip)
-{
-	unsigned int sym_size, offset;
-	struct symbol *sym = he->ms.sym;
-	struct sym_priv *priv;
-	struct sym_hist *h;
-
-	if (!sym || !he->ms.map)
-		return 0;
-
-	priv = symbol__priv(sym);
-	if (priv->hist == NULL && sym__alloc_hist(sym) < 0)
-		return -ENOMEM;
-
-	sym_size = sym->end - sym->start;
-	offset = ip - sym->start;
-
-	pr_debug3("%s: ip=%#Lx\n", __func__, he->ms.map->unmap_ip(he->ms.map, ip));
-
-	if (offset >= sym_size)
-		return 0;
-
-	h = priv->hist;
-	h->sum++;
-	h->ip[offset]++;
-
-	pr_debug3("%#Lx %s: count++ [ip: %#Lx, %#Lx] => %Ld\n", he->ms.sym->start,
-		  he->ms.sym->name, ip, ip - he->ms.sym->start, h->ip[offset]);
-	return 0;
-}
-
 static int hists__add_entry(struct hists *self, struct addr_location *al)
 {
 	struct hist_entry *he;
@@ -115,7 +55,7 @@ static int hists__add_entry(struct hists *self, struct addr_location *al)
 	if (he == NULL)
 		return -ENOMEM;
 
-	return annotate__hist_hit(he, al->addr);
+	return hist_entry__inc_addr_samples(he, al->addr);
 }
 
 static int process_sample_event(event_t *event, struct perf_session *session)
@@ -140,101 +80,6 @@ static int process_sample_event(event_t *event, struct perf_session *session)
 	return 0;
 }
 
-struct objdump_line {
-	struct list_head node;
-	s64		 offset;
-	char		 *line;
-};
-
-static struct objdump_line *objdump_line__new(s64 offset, char *line)
-{
-	struct objdump_line *self = malloc(sizeof(*self));
-
-	if (self != NULL) {
-		self->offset = offset;
-		self->line = line;
-	}
-
-	return self;
-}
-
-static void objdump_line__free(struct objdump_line *self)
-{
-	free(self->line);
-	free(self);
-}
-
-static void objdump__add_line(struct list_head *head, struct objdump_line *line)
-{
-	list_add_tail(&line->node, head);
-}
-
-static struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
-						      struct objdump_line *pos)
-{
-	list_for_each_entry_continue(pos, head, node)
-		if (pos->offset >= 0)
-			return pos;
-
-	return NULL;
-}
-
-static int parse_line(FILE *file, struct hist_entry *he,
-		      struct list_head *head)
-{
-	struct symbol *sym = he->ms.sym;
-	struct objdump_line *objdump_line;
-	char *line = NULL, *tmp, *tmp2;
-	size_t line_len;
-	s64 line_ip, offset = -1;
-	char *c;
-
-	if (getline(&line, &line_len, file) < 0)
-		return -1;
-
-	if (!line)
-		return -1;
-
-	c = strchr(line, '\n');
-	if (c)
-		*c = 0;
-
-	line_ip = -1;
-
-	/*
-	 * Strip leading spaces:
-	 */
-	tmp = line;
-	while (*tmp) {
-		if (*tmp != ' ')
-			break;
-		tmp++;
-	}
-
-	if (*tmp) {
-		/*
-		 * Parse hexa addresses followed by ':'
-		 */
-		line_ip = strtoull(tmp, &tmp2, 16);
-		if (*tmp2 != ':')
-			line_ip = -1;
-	}
-
-	if (line_ip != -1) {
-		u64 start = map__rip_2objdump(he->ms.map, sym->start);
-		offset = line_ip - start;
-	}
-
-	objdump_line = objdump_line__new(offset, line);
-	if (objdump_line == NULL) {
-		free(line);
-		return -1;
-	}
-	objdump__add_line(head, objdump_line);
-
-	return 0;
-}
-
 static int objdump_line__print(struct objdump_line *self,
 			       struct list_head *head,
 			       struct hist_entry *he, u64 len)
@@ -439,27 +284,11 @@ static void annotate_sym(struct hist_entry *he)
 	struct symbol *sym = he->ms.sym;
 	const char *filename = dso->long_name, *d_filename;
 	u64 len;
-	char command[PATH_MAX*2];
-	FILE *file;
 	LIST_HEAD(head);
 	struct objdump_line *pos, *n;
 
-	if (!filename)
-		return;
-
-	if (dso->origin == DSO__ORIG_KERNEL) {
-		if (dso->annotate_warned)
-			return;
-		dso->annotate_warned = 1;
-		pr_err("Can't annotate %s: No vmlinux file was found in the "
-		       "path:\n", sym->name);
-		vmlinux_path__fprintf(stderr);
+	if (hist_entry__annotate(he, &head) < 0)
 		return;
-	}
-
-	pr_debug("%s: filename=%s, sym=%s, start=%#Lx, end=%#Lx\n", __func__,
-		 filename, sym->name, map->unmap_ip(map, sym->start),
-		 map->unmap_ip(map, sym->end));
 
 	if (full_paths)
 		d_filename = filename;
@@ -477,29 +306,6 @@ static void annotate_sym(struct hist_entry *he)
 	printf(" Percent |	Source code & Disassembly of %s\n", d_filename);
 	printf("------------------------------------------------\n");
 
-	if (verbose >= 2)
-		printf("annotating [%p] %30s : [%p] %30s\n",
-		       dso, dso->long_name, sym, sym->name);
-
-	sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s|grep -v %s",
-		map__rip_2objdump(map, sym->start),
-		map__rip_2objdump(map, sym->end),
-		filename, filename);
-
-	if (verbose >= 3)
-		printf("doing: %s\n", command);
-
-	file = popen(command, "r");
-	if (!file)
-		return;
-
-	while (!feof(file)) {
-		if (parse_line(file, he, &head) < 0)
-			break;
-	}
-
-	pclose(file);
-
 	if (verbose)
 		hist_entry__print_hits(he);
 
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 3d67d6bf22c..04de3387de3 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -106,8 +106,18 @@ static int perf_session__add_hist_entry(struct perf_session *self,
 	if (he == NULL)
 		goto out_free_syms;
 	err = 0;
-	if (symbol_conf.use_callchain)
+	if (symbol_conf.use_callchain) {
 		err = append_chain(he->callchain, data->callchain, syms);
+		if (err)
+			goto out_free_syms;
+	}
+	/*
+	 * Only in the newt browser we are doing integrated annotation,
+	 * so we don't allocated the extra space needed because the stdio
+	 * code will not use it.
+	 */
+	if (use_browser)
+		err = hist_entry__inc_addr_samples(he, al->addr);
 out_free_syms:
 	free(syms);
 	return err;
@@ -458,6 +468,13 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 
 	if (strcmp(input_name, "-") != 0)
 		setup_browser();
+	/*
+	 * Only in the newt browser we are doing integrated annotation,
+	 * so don't allocate extra space that won't be used in the stdio
+	 * implementation.
+	 */
+	if (use_browser)
+		symbol_conf.priv_size = sizeof(struct sym_priv);
 
 	if (symbol__init() < 0)
 		return -1;
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index baa55be64d9..451d2e45d84 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -843,3 +843,187 @@ void hists__filter_by_thread(struct hists *self, const struct thread *thread)
 		}
 	}
 }
+
+static int symbol__alloc_hist(struct symbol *self)
+{
+	struct sym_priv *priv = symbol__priv(self);
+	const int size = (sizeof(*priv->hist) +
+			  (self->end - self->start) * sizeof(u64));
+
+	priv->hist = zalloc(size);
+	return priv->hist == NULL ? -1 : 0;
+}
+
+int hist_entry__inc_addr_samples(struct hist_entry *self, u64 ip)
+{
+	unsigned int sym_size, offset;
+	struct symbol *sym = self->ms.sym;
+	struct sym_priv *priv;
+	struct sym_hist *h;
+
+	if (!sym || !self->ms.map)
+		return 0;
+
+	priv = symbol__priv(sym);
+	if (priv->hist == NULL && symbol__alloc_hist(sym) < 0)
+		return -ENOMEM;
+
+	sym_size = sym->end - sym->start;
+	offset = ip - sym->start;
+
+	pr_debug3("%s: ip=%#Lx\n", __func__, self->ms.map->unmap_ip(self->ms.map, ip));
+
+	if (offset >= sym_size)
+		return 0;
+
+	h = priv->hist;
+	h->sum++;
+	h->ip[offset]++;
+
+	pr_debug3("%#Lx %s: count++ [ip: %#Lx, %#Lx] => %Ld\n", self->ms.sym->start,
+		  self->ms.sym->name, ip, ip - self->ms.sym->start, h->ip[offset]);
+	return 0;
+}
+
+static struct objdump_line *objdump_line__new(s64 offset, char *line)
+{
+	struct objdump_line *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		self->offset = offset;
+		self->line = line;
+	}
+
+	return self;
+}
+
+void objdump_line__free(struct objdump_line *self)
+{
+	free(self->line);
+	free(self);
+}
+
+static void objdump__add_line(struct list_head *head, struct objdump_line *line)
+{
+	list_add_tail(&line->node, head);
+}
+
+struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
+					       struct objdump_line *pos)
+{
+	list_for_each_entry_continue(pos, head, node)
+		if (pos->offset >= 0)
+			return pos;
+
+	return NULL;
+}
+
+static int hist_entry__parse_objdump_line(struct hist_entry *self, FILE *file,
+					  struct list_head *head)
+{
+	struct symbol *sym = self->ms.sym;
+	struct objdump_line *objdump_line;
+	char *line = NULL, *tmp, *tmp2, *c;
+	size_t line_len;
+	s64 line_ip, offset = -1;
+
+	if (getline(&line, &line_len, file) < 0)
+		return -1;
+
+	if (!line)
+		return -1;
+
+	while (line_len != 0 && isspace(line[line_len - 1]))
+		line[--line_len] = '\0';
+
+	c = strchr(line, '\n');
+	if (c)
+		*c = 0;
+
+	line_ip = -1;
+
+	/*
+	 * Strip leading spaces:
+	 */
+	tmp = line;
+	while (*tmp) {
+		if (*tmp != ' ')
+			break;
+		tmp++;
+	}
+
+	if (*tmp) {
+		/*
+		 * Parse hexa addresses followed by ':'
+		 */
+		line_ip = strtoull(tmp, &tmp2, 16);
+		if (*tmp2 != ':')
+			line_ip = -1;
+	}
+
+	if (line_ip != -1) {
+		u64 start = map__rip_2objdump(self->ms.map, sym->start);
+		offset = line_ip - start;
+	}
+
+	objdump_line = objdump_line__new(offset, line);
+	if (objdump_line == NULL) {
+		free(line);
+		return -1;
+	}
+	objdump__add_line(head, objdump_line);
+
+	return 0;
+}
+
+int hist_entry__annotate(struct hist_entry *self, struct list_head *head)
+{
+	struct symbol *sym = self->ms.sym;
+	struct map *map = self->ms.map;
+	struct dso *dso = map->dso;
+	const char *filename = dso->long_name;
+	char command[PATH_MAX * 2];
+	FILE *file;
+	u64 len;
+
+	if (!filename)
+		return -1;
+
+	if (dso->origin == DSO__ORIG_KERNEL) {
+		if (dso->annotate_warned)
+			return 0;
+		dso->annotate_warned = 1;
+		pr_err("Can't annotate %s: No vmlinux file was found in the "
+		       "path:\n", sym->name);
+		vmlinux_path__fprintf(stderr);
+		return -1;
+	}
+
+	pr_debug("%s: filename=%s, sym=%s, start=%#Lx, end=%#Lx\n", __func__,
+		 filename, sym->name, map->unmap_ip(map, sym->start),
+		 map->unmap_ip(map, sym->end));
+
+	len = sym->end - sym->start;
+
+	pr_debug("annotating [%p] %30s : [%p] %30s\n",
+		 dso, dso->long_name, sym, sym->name);
+
+	snprintf(command, sizeof(command),
+		 "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s|grep -v %s|expand",
+		 map__rip_2objdump(map, sym->start),
+		 map__rip_2objdump(map, sym->end),
+		 filename, filename);
+
+	pr_debug("Executing: %s\n", command);
+
+	file = popen(command, "r");
+	if (!file)
+		return -1;
+
+	while (!feof(file))
+		if (hist_entry__parse_objdump_line(self, file, head) < 0)
+			break;
+
+	pclose(file);
+	return 0;
+}
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 1c5f93ac5ab..ed9c0673496 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -11,6 +11,32 @@ struct addr_location;
 struct symbol;
 struct rb_root;
 
+struct objdump_line {
+	struct list_head node;
+	s64		 offset;
+	char		 *line;
+};
+
+void objdump_line__free(struct objdump_line *self);
+struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
+					       struct objdump_line *pos);
+
+struct sym_hist {
+	u64		sum;
+	u64		ip[0];
+};
+
+struct sym_ext {
+	struct rb_node	node;
+	double		percent;
+	char		*path;
+};
+
+struct sym_priv {
+	struct sym_hist	*hist;
+	struct sym_ext	*ext;
+};
+
 struct events_stats {
 	u64 total;
 	u64 lost;
@@ -45,6 +71,9 @@ void hists__collapse_resort(struct hists *self);
 size_t hists__fprintf(struct hists *self, struct hists *pair,
 		      bool show_displacement, FILE *fp);
 
+int hist_entry__inc_addr_samples(struct hist_entry *self, u64 ip);
+int hist_entry__annotate(struct hist_entry *self, struct list_head *head);
+
 void hists__filter_by_dso(struct hists *self, const struct dso *dso);
 void hists__filter_by_thread(struct hists *self, const struct thread *thread);
 
diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index daa86efffce..ba6acd04c08 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -2,6 +2,7 @@
 #include <stdio.h>
 #undef _GNU_SOURCE
 
+#include <slang.h>
 #include <stdlib.h>
 #include <newt.h>
 #include <sys/ttydefaults.h>
@@ -171,6 +172,254 @@ static bool dialog_yesno(const char *msg)
 	return newtWinChoice(NULL, yes, no, (char *)msg) == 1;
 }
 
+#define HE_COLORSET_TOP		50
+#define HE_COLORSET_MEDIUM	51
+#define HE_COLORSET_NORMAL	52
+#define HE_COLORSET_SELECTED	53
+#define HE_COLORSET_CODE	54
+
+static int ui_browser__percent_color(double percent, bool current)
+{
+	if (current)
+		return HE_COLORSET_SELECTED;
+	if (percent >= MIN_RED)
+		return HE_COLORSET_TOP;
+	if (percent >= MIN_GREEN)
+		return HE_COLORSET_MEDIUM;
+	return HE_COLORSET_NORMAL;
+}
+
+struct ui_browser {
+	newtComponent	form, sb;
+	u64		index, first_visible_entry_idx;
+	void		*first_visible_entry, *entries;
+	u16		top, left, width, height;
+	void		*priv;
+	u32		nr_entries;
+};
+
+static void ui_browser__refresh_dimensions(struct ui_browser *self)
+{
+	int cols, rows;
+	newtGetScreenSize(&cols, &rows);
+
+	if (self->width > cols - 4)
+		self->width = cols - 4;
+	self->height = rows - 5;
+	if (self->height > self->nr_entries)
+		self->height = self->nr_entries;
+	self->top  = (rows - self->height) / 2;
+	self->left = (cols - self->width) / 2;
+}
+
+static void ui_browser__reset_index(struct ui_browser *self)
+{
+        self->index = self->first_visible_entry_idx = 0;
+        self->first_visible_entry = NULL;
+}
+
+static int objdump_line__show(struct objdump_line *self, struct list_head *head,
+			      int width, struct hist_entry *he, int len,
+			      bool current_entry)
+{
+	if (self->offset != -1) {
+		struct symbol *sym = he->ms.sym;
+		unsigned int hits = 0;
+		double percent = 0.0;
+		int color;
+		struct sym_priv *priv = symbol__priv(sym);
+		struct sym_ext *sym_ext = priv->ext;
+		struct sym_hist *h = priv->hist;
+		s64 offset = self->offset;
+		struct objdump_line *next = objdump__get_next_ip_line(head, self);
+
+		while (offset < (s64)len &&
+		       (next == NULL || offset < next->offset)) {
+			if (sym_ext) {
+				percent += sym_ext[offset].percent;
+			} else
+				hits += h->ip[offset];
+
+			++offset;
+		}
+
+		if (sym_ext == NULL && h->sum)
+			percent = 100.0 * hits / h->sum;
+
+		color = ui_browser__percent_color(percent, current_entry);
+		SLsmg_set_color(color);
+		SLsmg_printf(" %7.2f ", percent);
+		if (!current_entry)
+			SLsmg_set_color(HE_COLORSET_CODE);
+	} else {
+		int color = ui_browser__percent_color(0, current_entry);
+		SLsmg_set_color(color);
+		SLsmg_write_nstring(" ", 9);
+	}
+
+	SLsmg_write_char(':');
+	SLsmg_write_nstring(" ", 8);
+	if (!*self->line)
+		SLsmg_write_nstring(" ", width - 18);
+	else
+		SLsmg_write_nstring(self->line, width - 18);
+
+	return 0;
+}
+
+static int ui_browser__refresh_entries(struct ui_browser *self)
+{
+	struct objdump_line *pos;
+	struct list_head *head = self->entries;
+	struct hist_entry *he = self->priv;
+	int row = 0;
+	int len = he->ms.sym->end - he->ms.sym->start;
+
+	if (self->first_visible_entry == NULL || self->first_visible_entry == self->entries)
+                self->first_visible_entry = head->next;
+
+	pos = list_entry(self->first_visible_entry, struct objdump_line, node);
+
+	list_for_each_entry_from(pos, head, node) {
+		bool current_entry = (self->first_visible_entry_idx + row) == self->index;
+		SLsmg_gotorc(self->top + row, self->left);
+		objdump_line__show(pos, head, self->width,
+				   he, len, current_entry);
+		if (++row == self->height)
+			break;
+	}
+
+	SLsmg_set_color(HE_COLORSET_NORMAL);
+	SLsmg_fill_region(self->top + row, self->left,
+			  self->height - row, self->width, ' ');
+
+	return 0;
+}
+
+static int ui_browser__run(struct ui_browser *self, const char *title,
+			   struct newtExitStruct *es)
+{
+	if (self->form) {
+		newtFormDestroy(self->form);
+		newtPopWindow();
+	}
+
+	ui_browser__refresh_dimensions(self);
+	newtCenteredWindow(self->width + 2, self->height, title);
+	self->form = newt_form__new();
+	if (self->form == NULL)
+		return -1;
+
+	self->sb = newtVerticalScrollbar(self->width + 1, 0, self->height,
+					 HE_COLORSET_NORMAL,
+					 HE_COLORSET_SELECTED);
+	if (self->sb == NULL)
+		return -1;
+
+	newtFormAddHotKey(self->form, NEWT_KEY_UP);
+	newtFormAddHotKey(self->form, NEWT_KEY_DOWN);
+	newtFormAddHotKey(self->form, NEWT_KEY_PGUP);
+	newtFormAddHotKey(self->form, NEWT_KEY_PGDN);
+	newtFormAddHotKey(self->form, NEWT_KEY_HOME);
+	newtFormAddHotKey(self->form, NEWT_KEY_END);
+
+	if (ui_browser__refresh_entries(self) < 0)
+		return -1;
+	newtFormAddComponent(self->form, self->sb);
+
+	while (1) {
+		unsigned int offset;
+
+		newtFormRun(self->form, es);
+
+		if (es->reason != NEWT_EXIT_HOTKEY)
+			break;
+		switch (es->u.key) {
+		case NEWT_KEY_DOWN:
+			if (self->index == self->nr_entries - 1)
+				break;
+			++self->index;
+			if (self->index == self->first_visible_entry_idx + self->height) {
+				struct list_head *pos = self->first_visible_entry;
+				++self->first_visible_entry_idx;
+				self->first_visible_entry = pos->next;
+			}
+			break;
+		case NEWT_KEY_UP:
+			if (self->index == 0)
+				break;
+			--self->index;
+			if (self->index < self->first_visible_entry_idx) {
+				struct list_head *pos = self->first_visible_entry;
+				--self->first_visible_entry_idx;
+				self->first_visible_entry = pos->prev;
+			}
+			break;
+		case NEWT_KEY_PGDN:
+			if (self->first_visible_entry_idx + self->height > self->nr_entries - 1)
+				break;
+
+			offset = self->height;
+			if (self->index + offset > self->nr_entries - 1)
+				offset = self->nr_entries - 1 - self->index;
+			self->index += offset;
+			self->first_visible_entry_idx += offset;
+
+			while (offset--) {
+				struct list_head *pos = self->first_visible_entry;
+				self->first_visible_entry = pos->next;
+			}
+
+			break;
+		case NEWT_KEY_PGUP:
+			if (self->first_visible_entry_idx == 0)
+				break;
+
+			if (self->first_visible_entry_idx < self->height)
+				offset = self->first_visible_entry_idx;
+			else
+				offset = self->height;
+
+			self->index -= offset;
+			self->first_visible_entry_idx -= offset;
+
+			while (offset--) {
+				struct list_head *pos = self->first_visible_entry;
+				self->first_visible_entry = pos->prev;
+			}
+			break;
+		case NEWT_KEY_HOME:
+			ui_browser__reset_index(self);
+			break;
+		case NEWT_KEY_END: {
+			struct list_head *head = self->entries;
+			offset = self->height - 1;
+
+			if (offset > self->nr_entries)
+				offset = self->nr_entries;
+
+			self->index = self->first_visible_entry_idx = self->nr_entries - 1 - offset;
+			self->first_visible_entry = head->prev;
+			while (offset-- != 0) {
+				struct list_head *pos = self->first_visible_entry;
+				self->first_visible_entry = pos->prev;
+			}
+		}
+			break;
+		case NEWT_KEY_ESCAPE:
+		case CTRL('c'):
+		case 'Q':
+		case 'q':
+			return 0;
+		default:
+			continue;
+		}
+		if (ui_browser__refresh_entries(self) < 0)
+			return -1;
+	}
+	return 0;
+}
+
 /*
  * When debugging newt problems it was useful to be able to "unroll"
  * the calls to newtCheckBoxTreeAdd{Array,Item}, so that we can generate
@@ -353,62 +602,40 @@ static size_t hist_entry__append_browser(struct hist_entry *self,
 	return ret;
 }
 
-static void map_symbol__annotate_browser(const struct map_symbol *self,
-					 const char *input_name)
+static void hist_entry__annotate_browser(struct hist_entry *self)
 {
-	FILE *fp;
-	int cols, rows;
-	newtComponent form, tree;
+	struct ui_browser browser;
 	struct newtExitStruct es;
-	char *str;
-	size_t line_len, max_line_len = 0;
-	size_t max_usable_width;
-	char *line = NULL;
+	struct objdump_line *pos, *n;
+	LIST_HEAD(head);
 
-	if (self->sym == NULL)
+	if (self->ms.sym == NULL)
 		return;
 
-	if (asprintf(&str, "perf annotate -i \"%s\" -d \"%s\" %s 2>&1 | expand",
-		     input_name, self->map->dso->name, self->sym->name) < 0)
+	if (hist_entry__annotate(self, &head) < 0)
 		return;
 
-	fp = popen(str, "r");
-	if (fp == NULL)
-		goto out_free_str;
-
 	ui_helpline__push("Press ESC to exit");
-	newtGetScreenSize(&cols, &rows);
-	tree = newtListbox(0, 0, rows - 5, NEWT_FLAG_SCROLL);
-
-	while (!feof(fp)) {
-		if (getline(&line, &line_len, fp) < 0 || !line_len)
-			break;
-		while (line_len != 0 && isspace(line[line_len - 1]))
-			line[--line_len] = '\0';
 
-		if (line_len > max_line_len)
-			max_line_len = line_len;
-		newtListboxAppendEntry(tree, line, NULL);
+	memset(&browser, 0, sizeof(browser));
+	browser.entries = &head;
+	browser.priv = self;
+	list_for_each_entry(pos, &head, node) {
+		size_t line_len = strlen(pos->line);
+		if (browser.width < line_len)
+			browser.width = line_len;
+		++browser.nr_entries;
 	}
-	fclose(fp);
-	free(line);
-
-	max_usable_width = cols - 22;
-	if (max_line_len > max_usable_width)
-		max_line_len = max_usable_width;
-
-	newtListboxSetWidth(tree, max_line_len);
 
-	newtCenteredWindow(max_line_len + 2, rows - 5, self->sym->name);
-	form = newt_form__new();
-	newtFormAddComponent(form, tree);
-
-	newtFormRun(form, &es);
-	newtFormDestroy(form);
+	browser.width += 18; /* Percentage */
+	ui_browser__run(&browser, self->ms.sym->name, &es);
+	newtFormDestroy(browser.form);
 	newtPopWindow();
+	list_for_each_entry_safe(pos, n, &head, node) {
+		list_del(&pos->node);
+		objdump_line__free(pos);
+	}
 	ui_helpline__pop();
-out_free_str:
-	free(str);
 }
 
 static const void *newt__symbol_tree_get_current(newtComponent self)
@@ -527,7 +754,7 @@ static int hist_browser__populate(struct hist_browser *self, struct hists *hists
 	return 0;
 }
 
-static struct thread *hist_browser__selected_thread(struct hist_browser *self)
+static struct hist_entry *hist_browser__selected_entry(struct hist_browser *self)
 {
 	int *indexes;
 
@@ -543,7 +770,13 @@ static struct thread *hist_browser__selected_thread(struct hist_browser *self)
 	}
 	return NULL;
 out:
-	return *(struct thread **)(self->selection + 1);
+	return container_of(self->selection, struct hist_entry, ms);
+}
+
+static struct thread *hist_browser__selected_thread(struct hist_browser *self)
+{
+	struct hist_entry *he = hist_browser__selected_entry(self);
+	return he ? he->thread : NULL;
 }
 
 static int hist_browser__title(char *bf, size_t size, const char *input_name,
@@ -637,13 +870,20 @@ int hists__browse(struct hists *self, const char *helpline, const char *input_na
 			continue;
 do_annotate:
 		if (choice == annotate) {
+			struct hist_entry *he;
+
 			if (browser->selection->map->dso->origin == DSO__ORIG_KERNEL) {
 				ui_helpline__puts("No vmlinux file found, can't "
 						 "annotate with just a "
 						 "kallsyms file");
 				continue;
 			}
-			map_symbol__annotate_browser(browser->selection, input_name);
+
+			he = hist_browser__selected_entry(browser);
+			if (he == NULL)
+				continue;
+
+			hist_entry__annotate_browser(he);
 		} else if (choice == zoom_dso) {
 			if (dso_filter) {
 				ui_helpline__pop();
@@ -681,8 +921,23 @@ out:
 	return err;
 }
 
+static struct newtPercentTreeColors {
+	const char *topColorFg, *topColorBg;
+	const char *mediumColorFg, *mediumColorBg;
+	const char *normalColorFg, *normalColorBg;
+	const char *selColorFg, *selColorBg;
+	const char *codeColorFg, *codeColorBg;
+} defaultPercentTreeColors = {
+	"red",       "lightgray",
+	"green",     "lightgray",
+	"black",     "lightgray",
+	"lightgray", "magenta",
+	"blue",	     "lightgray",
+};
+
 void setup_browser(void)
 {
+	struct newtPercentTreeColors *c = &defaultPercentTreeColors;
 	if (!isatty(1))
 		return;
 
@@ -690,6 +945,11 @@ void setup_browser(void)
 	newtInit();
 	newtCls();
 	ui_helpline__puts(" ");
+	SLtt_set_color(HE_COLORSET_TOP, NULL, c->topColorFg, c->topColorBg);
+	SLtt_set_color(HE_COLORSET_MEDIUM, NULL, c->mediumColorFg, c->mediumColorBg);
+	SLtt_set_color(HE_COLORSET_NORMAL, NULL, c->normalColorFg, c->normalColorBg);
+	SLtt_set_color(HE_COLORSET_SELECTED, NULL, c->selColorFg, c->selColorBg);
+	SLtt_set_color(HE_COLORSET_CODE, NULL, c->codeColorFg, c->codeColorBg);
 }
 
 void exit_browser(bool wait_for_ok)
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index b7c54eeed9c..af301acc461 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -48,12 +48,6 @@ struct hist_entry {
 	u64			count_us;
 	u64			count_guest_sys;
 	u64			count_guest_us;
-
-	/*
-	 * XXX WARNING!
-	 * thread _has_ to come after ms, see
-	 * hist_browser__selected_thread in util/newt.c
-	 */
 	struct map_symbol	ms;
 	struct thread		*thread;
 	u64			ip;
-- 
cgit v1.2.3


From aa3e5572c538d753dce11bf93532a75f95d22b40 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Wed, 12 May 2010 02:24:12 +0000
Subject: [CIFS] drop quota operation stubs

CIFS has stubs for XFS-style quotas without an actual implementation backing
them, hidden behind a config option not visible in Kconfig.  Remove these
stubs for now as the quota operations will see some major changes and this
code simply gets in the way.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jeff Layton <jlayton@samba.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c | 107 -------------------------------------------------------
 1 file changed, 107 deletions(-)

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 09842d3f7e1..833166372a0 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -49,10 +49,6 @@
 #include "cifs_spnego.h"
 #define CIFS_MAGIC_NUMBER 0xFF534D42	/* the first four bytes of SMB PDUs */
 
-#ifdef CONFIG_CIFS_QUOTA
-static const struct quotactl_ops cifs_quotactl_ops;
-#endif /* QUOTA */
-
 int cifsFYI = 0;
 int cifsERROR = 1;
 int traceSMB = 0;
@@ -135,9 +131,6 @@ cifs_read_super(struct super_block *sb, void *data,
 /*	if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
 	    sb->s_blocksize =
 		cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
-#ifdef CONFIG_CIFS_QUOTA
-	sb->s_qcop = &cifs_quotactl_ops;
-#endif
 	sb->s_blocksize = CIFS_MAX_MSGSIZE;
 	sb->s_blocksize_bits = 14;	/* default 2**14 = CIFS_MAX_MSGSIZE */
 	inode = cifs_root_iget(sb, ROOT_I);
@@ -418,106 +411,6 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
 	return 0;
 }
 
-#ifdef CONFIG_CIFS_QUOTA
-int cifs_xquota_set(struct super_block *sb, int quota_type, qid_t qid,
-		struct fs_disk_quota *pdquota)
-{
-	int xid;
-	int rc = 0;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *pTcon;
-
-	if (cifs_sb)
-		pTcon = cifs_sb->tcon;
-	else
-		return -EIO;
-
-
-	xid = GetXid();
-	if (pTcon) {
-		cFYI(1, "set type: 0x%x id: %d", quota_type, qid);
-	} else
-		rc = -EIO;
-
-	FreeXid(xid);
-	return rc;
-}
-
-int cifs_xquota_get(struct super_block *sb, int quota_type, qid_t qid,
-		    struct fs_disk_quota *pdquota)
-{
-	int xid;
-	int rc = 0;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *pTcon;
-
-	if (cifs_sb)
-		pTcon = cifs_sb->tcon;
-	else
-		return -EIO;
-
-	xid = GetXid();
-	if (pTcon) {
-		cFYI(1, "set type: 0x%x id: %d", quota_type, qid);
-	} else
-		rc = -EIO;
-
-	FreeXid(xid);
-	return rc;
-}
-
-int cifs_xstate_set(struct super_block *sb, unsigned int flags, int operation)
-{
-	int xid;
-	int rc = 0;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *pTcon;
-
-	if (cifs_sb)
-		pTcon = cifs_sb->tcon;
-	else
-		return -EIO;
-
-	xid = GetXid();
-	if (pTcon) {
-		cFYI(1, "flags: 0x%x operation: 0x%x", flags, operation);
-	} else
-		rc = -EIO;
-
-	FreeXid(xid);
-	return rc;
-}
-
-int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
-{
-	int xid;
-	int rc = 0;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-	struct cifsTconInfo *pTcon;
-
-	if (cifs_sb)
-		pTcon = cifs_sb->tcon;
-	else
-		return -EIO;
-
-	xid = GetXid();
-	if (pTcon) {
-		cFYI(1, "pqstats %p", qstats);
-	} else
-		rc = -EIO;
-
-	FreeXid(xid);
-	return rc;
-}
-
-static const struct quotactl_ops cifs_quotactl_ops = {
-	.set_xquota	= cifs_xquota_set,
-	.get_xquota	= cifs_xquota_get,
-	.set_xstate	= cifs_xstate_set,
-	.get_xstate	= cifs_xstate_get,
-};
-#endif
-
 static void cifs_umount_begin(struct super_block *sb)
 {
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
-- 
cgit v1.2.3


From f818a73674c5d197f66b636a46d7d578d7258129 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 11 May 2010 20:56:31 -0700
Subject: ceph: fix cap removal races

The iterate_session_caps helper traverses the session caps list and tries
to grab an inode reference.  However, the __ceph_remove_cap was clearing
the inode backpointer _before_ removing itself from the session list,
causing a null pointer dereference.

Clear cap->ci under protection of s_cap_lock to avoid the race, and to
tightly couple the list and backpointer state.  Use a local flag to
indicate whether we are releasing the cap, as cap->session may be modified
by a racing thread in iterate_session_caps.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       | 19 ++++++++++++-------
 fs/ceph/mds_client.c |  5 +++--
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 0c168180686..d9400534b27 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -858,6 +858,8 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
 }
 
 /*
+ * Remove a cap.  Take steps to deal with a racing iterate_session_caps.
+ *
  * caller should hold i_lock.
  * caller will not hold session s_mutex if called from destroy_inode.
  */
@@ -866,15 +868,10 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 	struct ceph_mds_session *session = cap->session;
 	struct ceph_inode_info *ci = cap->ci;
 	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+	int removed = 0;
 
 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
 
-	/* remove from inode list */
-	rb_erase(&cap->ci_node, &ci->i_caps);
-	cap->ci = NULL;
-	if (ci->i_auth_cap == cap)
-		ci->i_auth_cap = NULL;
-
 	/* remove from session list */
 	spin_lock(&session->s_cap_lock);
 	if (session->s_cap_iterator == cap) {
@@ -885,10 +882,18 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 		list_del_init(&cap->session_caps);
 		session->s_nr_caps--;
 		cap->session = NULL;
+		removed = 1;
 	}
+	/* protect backpointer with s_cap_lock: see iterate_session_caps */
+	cap->ci = NULL;
 	spin_unlock(&session->s_cap_lock);
 
-	if (cap->session == NULL)
+	/* remove from inode list */
+	rb_erase(&cap->ci_node, &ci->i_caps);
+	if (ci->i_auth_cap == cap)
+		ci->i_auth_cap = NULL;
+
+	if (removed)
 		ceph_put_cap(cap);
 
 	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index eccc0ecad1a..24561a557e0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -736,9 +736,10 @@ static void cleanup_cap_releases(struct ceph_mds_session *session)
 }
 
 /*
- * Helper to safely iterate over all caps associated with a session.
+ * Helper to safely iterate over all caps associated with a session, with
+ * special care taken to handle a racing __ceph_remove_cap().
  *
- * caller must hold session s_mutex
+ * Caller must hold session s_mutex.
  */
 static int iterate_session_caps(struct ceph_mds_session *session,
 				 int (*cb)(struct inode *, struct ceph_cap *,
-- 
cgit v1.2.3


From e84346b726ea90a8ed470bc81c4136a7b8710ea5 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 11 May 2010 21:20:38 -0700
Subject: ceph: preserve seq # on requeued messages after transient transport
 errors

If the tcp connection drops and we reconnect to reestablish a stateful
session (with the mds), we need to resend previously sent (and possibly
received) messages with the _same_ seq # so that they can be dropped on
the other end if needed.  Only assign a new seq once after the message is
queued.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 11 ++++++++++-
 fs/ceph/messenger.h |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index a3a8f368845..cd4fadb6491 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -492,7 +492,14 @@ static void prepare_write_message(struct ceph_connection *con)
 		list_move_tail(&m->list_head, &con->out_sent);
 	}
 
-	m->hdr.seq = cpu_to_le64(++con->out_seq);
+	/*
+	 * only assign outgoing seq # if we haven't sent this message
+	 * yet.  if it is requeued, resend with it's original seq.
+	 */
+	if (m->needs_out_seq) {
+		m->hdr.seq = cpu_to_le64(++con->out_seq);
+		m->needs_out_seq = false;
+	}
 
 	dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
 	     m, con->out_seq, le16_to_cpu(m->hdr.type),
@@ -1986,6 +1993,8 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 
 	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
 
+	msg->needs_out_seq = true;
+
 	/* queue */
 	mutex_lock(&con->mutex);
 	BUG_ON(!list_empty(&msg->list_head));
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a343dae73cd..a5caf91cc97 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -86,6 +86,7 @@ struct ceph_msg {
 	struct kref kref;
 	bool front_is_vmalloc;
 	bool more_to_follow;
+	bool needs_out_seq;
 	int front_max;
 
 	struct ceph_msgpool *pool;
-- 
cgit v1.2.3


From 0fe1ac48bef018bed896307cd12f6ca9b5e704ab Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 13 Apr 2010 20:46:04 +0000
Subject: powerpc/perf_event: Fix oops due to perf_event_do_pending call

Anton Blanchard found that large POWER systems would occasionally
crash in the exception exit path when profiling with perf_events.
The symptom was that an interrupt would occur late in the exit path
when the MSR[RI] (recoverable interrupt) bit was clear.  Interrupts
should be hard-disabled at this point but they were enabled.  Because
the interrupt was not recoverable the system panicked.

The reason is that the exception exit path was calling
perf_event_do_pending after hard-disabling interrupts, and
perf_event_do_pending will re-enable interrupts.

The simplest and cleanest fix for this is to use the same mechanism
that 32-bit powerpc does, namely to cause a self-IPI by setting the
decrementer to 1.  This means we can remove the tests in the exception
exit path and raw_local_irq_restore.

This also makes sure that the call to perf_event_do_pending from
timer_interrupt() happens within irq_enter/irq_exit.  (Note that
calling perf_event_do_pending from timer_interrupt does not mean that
there is a possible 1/HZ latency; setting the decrementer to 1 ensures
that the timer interrupt will happen immediately, i.e. within one
timebase tick, which is a few nanoseconds or 10s of nanoseconds.)

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: stable@kernel.org
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/hw_irq.h | 38 -------------------------
 arch/powerpc/kernel/asm-offsets.c |  1 -
 arch/powerpc/kernel/entry_64.S    |  9 ------
 arch/powerpc/kernel/irq.c         |  6 ----
 arch/powerpc/kernel/time.c        | 60 +++++++++++++++++++++++++++++++--------
 5 files changed, 48 insertions(+), 66 deletions(-)

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 9f4c9d4f580..bd100fcf40d 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -130,43 +130,5 @@ static inline int irqs_disabled_flags(unsigned long flags)
  */
 struct irq_chip;
 
-#ifdef CONFIG_PERF_EVENTS
-
-#ifdef CONFIG_PPC64
-static inline unsigned long test_perf_event_pending(void)
-{
-	unsigned long x;
-
-	asm volatile("lbz %0,%1(13)"
-		: "=r" (x)
-		: "i" (offsetof(struct paca_struct, perf_event_pending)));
-	return x;
-}
-
-static inline void set_perf_event_pending(void)
-{
-	asm volatile("stb %0,%1(13)" : :
-		"r" (1),
-		"i" (offsetof(struct paca_struct, perf_event_pending)));
-}
-
-static inline void clear_perf_event_pending(void)
-{
-	asm volatile("stb %0,%1(13)" : :
-		"r" (0),
-		"i" (offsetof(struct paca_struct, perf_event_pending)));
-}
-#endif /* CONFIG_PPC64 */
-
-#else  /* CONFIG_PERF_EVENTS */
-
-static inline unsigned long test_perf_event_pending(void)
-{
-	return 0;
-}
-
-static inline void clear_perf_event_pending(void) {}
-#endif /* CONFIG_PERF_EVENTS */
-
 #endif	/* __KERNEL__ */
 #endif	/* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 957ceb7059c..c09138d150d 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -133,7 +133,6 @@ int main(void)
 	DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
 	DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
 	DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
-	DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_event_pending));
 	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
 #ifdef CONFIG_PPC_MM_SLICES
 	DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct,
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 07109d84378..42e9d908914 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -556,15 +556,6 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
 2:
 	TRACE_AND_RESTORE_IRQ(r5);
 
-#ifdef CONFIG_PERF_EVENTS
-	/* check paca->perf_event_pending if we're enabling ints */
-	lbz	r3,PACAPERFPEND(r13)
-	and.	r3,r3,r5
-	beq	27f
-	bl	.perf_event_do_pending
-27:
-#endif /* CONFIG_PERF_EVENTS */
-
 	/* extract EE bit and use it to restore paca->hard_enabled */
 	ld	r3,_MSR(r1)
 	rldicl	r4,r3,49,63		/* r0 = (r3 >> 15) & 1 */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 64f6f2031c2..066bd31551d 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -53,7 +53,6 @@
 #include <linux/bootmem.h>
 #include <linux/pci.h>
 #include <linux/debugfs.h>
-#include <linux/perf_event.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -145,11 +144,6 @@ notrace void raw_local_irq_restore(unsigned long en)
 	}
 #endif /* CONFIG_PPC_STD_MMU_64 */
 
-	if (test_perf_event_pending()) {
-		clear_perf_event_pending();
-		perf_event_do_pending();
-	}
-
 	/*
 	 * if (get_paca()->hard_enabled) return;
 	 * But again we need to take care that gcc gets hard_enabled directly
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 1b16b9a3e49..0441bbdadbd 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -532,25 +532,60 @@ void __init iSeries_time_init_early(void)
 }
 #endif /* CONFIG_PPC_ISERIES */
 
-#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_PPC32)
-DEFINE_PER_CPU(u8, perf_event_pending);
+#ifdef CONFIG_PERF_EVENTS
 
-void set_perf_event_pending(void)
+/*
+ * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
+ */
+#ifdef CONFIG_PPC64
+static inline unsigned long test_perf_event_pending(void)
 {
-	get_cpu_var(perf_event_pending) = 1;
-	set_dec(1);
-	put_cpu_var(perf_event_pending);
+	unsigned long x;
+
+	asm volatile("lbz %0,%1(13)"
+		: "=r" (x)
+		: "i" (offsetof(struct paca_struct, perf_event_pending)));
+	return x;
 }
 
+static inline void set_perf_event_pending_flag(void)
+{
+	asm volatile("stb %0,%1(13)" : :
+		"r" (1),
+		"i" (offsetof(struct paca_struct, perf_event_pending)));
+}
+
+static inline void clear_perf_event_pending(void)
+{
+	asm volatile("stb %0,%1(13)" : :
+		"r" (0),
+		"i" (offsetof(struct paca_struct, perf_event_pending)));
+}
+
+#else /* 32-bit */
+
+DEFINE_PER_CPU(u8, perf_event_pending);
+
+#define set_perf_event_pending_flag()	__get_cpu_var(perf_event_pending) = 1
 #define test_perf_event_pending()	__get_cpu_var(perf_event_pending)
 #define clear_perf_event_pending()	__get_cpu_var(perf_event_pending) = 0
 
-#else  /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */
+#endif /* 32 vs 64 bit */
+
+void set_perf_event_pending(void)
+{
+	preempt_disable();
+	set_perf_event_pending_flag();
+	set_dec(1);
+	preempt_enable();
+}
+
+#else  /* CONFIG_PERF_EVENTS */
 
 #define test_perf_event_pending()	0
 #define clear_perf_event_pending()
 
-#endif /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */
+#endif /* CONFIG_PERF_EVENTS */
 
 /*
  * For iSeries shared processors, we have to let the hypervisor
@@ -582,10 +617,6 @@ void timer_interrupt(struct pt_regs * regs)
 	set_dec(DECREMENTER_MAX);
 
 #ifdef CONFIG_PPC32
-	if (test_perf_event_pending()) {
-		clear_perf_event_pending();
-		perf_event_do_pending();
-	}
 	if (atomic_read(&ppc_n_lost_interrupts) != 0)
 		do_IRQ(regs);
 #endif
@@ -604,6 +635,11 @@ void timer_interrupt(struct pt_regs * regs)
 
 	calculate_steal_time();
 
+	if (test_perf_event_pending()) {
+		clear_perf_event_pending();
+		perf_event_do_pending();
+	}
+
 #ifdef CONFIG_PPC_ISERIES
 	if (firmware_has_feature(FW_FEATURE_ISERIES))
 		get_lppaca()->int_dword.fields.decr_int = 0;
-- 
cgit v1.2.3


From 1c1e093cbf6d3a7576ba0bd10363362a1c5c74ee Mon Sep 17 00:00:00 2001
From: Stefan Weinhuber <wein@de.ibm.com>
Date: Wed, 12 May 2010 09:32:11 +0200
Subject: [S390] dasd: fix race between tasklet and dasd_sleep_on

The various dasd_sleep_on functions use a global wait queue when
waiting for a cqr. The wait condition checks the status and devlist
fields of the cqr to determine if it is safe to continue. This
evaluation may return true, although the tasklet has not finished
processing of the cqr and the callback function has not been called
yet. When the callback is finally called, the data in the cqr may
already be invalid. The sleep_on wait condition needs a safe way to
determine if the tasklet has finished processing. Use the
callback_data field of the cqr to store a token, which is set by
the callback function itself.

Cc: <stable@kernel.org>
Signed-off-by: Stefan Weinhuber <wein@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/block/dasd.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index acf222f91f5..fa2339cb168 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -37,6 +37,9 @@
  */
 #define DASD_CHANQ_MAX_SIZE 4
 
+#define DASD_SLEEPON_START_TAG	(void *) 1
+#define DASD_SLEEPON_END_TAG	(void *) 2
+
 /*
  * SECTION: exported variables of dasd.c
  */
@@ -1472,7 +1475,10 @@ void dasd_add_request_tail(struct dasd_ccw_req *cqr)
  */
 static void dasd_wakeup_cb(struct dasd_ccw_req *cqr, void *data)
 {
-	wake_up((wait_queue_head_t *) data);
+	spin_lock_irq(get_ccwdev_lock(cqr->startdev->cdev));
+	cqr->callback_data = DASD_SLEEPON_END_TAG;
+	spin_unlock_irq(get_ccwdev_lock(cqr->startdev->cdev));
+	wake_up(&generic_waitq);
 }
 
 static inline int _wait_for_wakeup(struct dasd_ccw_req *cqr)
@@ -1482,10 +1488,7 @@ static inline int _wait_for_wakeup(struct dasd_ccw_req *cqr)
 
 	device = cqr->startdev;
 	spin_lock_irq(get_ccwdev_lock(device->cdev));
-	rc = ((cqr->status == DASD_CQR_DONE ||
-	       cqr->status == DASD_CQR_NEED_ERP ||
-	       cqr->status == DASD_CQR_TERMINATED) &&
-	      list_empty(&cqr->devlist));
+	rc = (cqr->callback_data == DASD_SLEEPON_END_TAG);
 	spin_unlock_irq(get_ccwdev_lock(device->cdev));
 	return rc;
 }
@@ -1573,7 +1576,7 @@ static int _dasd_sleep_on(struct dasd_ccw_req *maincqr, int interruptible)
 			wait_event(generic_waitq, !(device->stopped));
 
 		cqr->callback = dasd_wakeup_cb;
-		cqr->callback_data = (void *) &generic_waitq;
+		cqr->callback_data = DASD_SLEEPON_START_TAG;
 		dasd_add_request_tail(cqr);
 		if (interruptible) {
 			rc = wait_event_interruptible(
@@ -1652,7 +1655,7 @@ int dasd_sleep_on_immediatly(struct dasd_ccw_req *cqr)
 	}
 
 	cqr->callback = dasd_wakeup_cb;
-	cqr->callback_data = (void *) &generic_waitq;
+	cqr->callback_data = DASD_SLEEPON_START_TAG;
 	cqr->status = DASD_CQR_QUEUED;
 	list_add(&cqr->devlist, &device->ccw_queue);
 
-- 
cgit v1.2.3


From 545c174d1f093a462b4bb9131b23d5ea72a600e1 Mon Sep 17 00:00:00 2001
From: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Date: Wed, 12 May 2010 09:32:12 +0200
Subject: [S390] ptrace: fix return value of do_syscall_trace_enter()

strace may change the system call number, so regs->gprs[2] must not
be read before tracehook_report_syscall_entry(). This fixes a bug
where "strace -f" will hang after a vfork().

Cc: <stable@kernel.org>
Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/ptrace.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 33fdc5a7976..9f654da4cec 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -640,7 +640,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 
 asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 {
-	long ret;
+	long ret = 0;
 
 	/* Do the secure computing check first. */
 	secure_computing(regs->gprs[2]);
@@ -649,7 +649,6 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 	 * The sysc_tracesys code in entry.S stored the system
 	 * call number to gprs[2].
 	 */
-	ret = regs->gprs[2];
 	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
 	    (tracehook_report_syscall_entry(regs) ||
 	     regs->gprs[2] >= NR_syscalls)) {
@@ -671,7 +670,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 				    regs->gprs[2], regs->orig_gpr2,
 				    regs->gprs[3], regs->gprs[4],
 				    regs->gprs[5]);
-	return ret;
+	return ret ?: regs->gprs[2];
 }
 
 asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
-- 
cgit v1.2.3


From 57d84906f0f3005d4d22e13a3f5102a16a7fc4a2 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 12 May 2010 09:32:13 +0200
Subject: [S390] correct address of _stext with CONFIG_SHARED_KERNEL=y

As of git commit 1844c9bc0b2fed3023551c1affe033ab38e90b9a head64.S/head31.S
are not included in head.S anymore but build as an extra object. This breaks
shared kernel support because the .org statement in head64.S/head31.S for
CONFIG_SHARED_KERNEL=y will have a different effect. The end address of the
head.text section in head.o will be added to the .org value, to compensate
for this subtract 0x11000 to get the required value of 0x100000 again.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/head31.S | 2 +-
 arch/s390/kernel/head64.S | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/kernel/head31.S b/arch/s390/kernel/head31.S
index 1bbcc499d45..b8f8dc12610 100644
--- a/arch/s390/kernel/head31.S
+++ b/arch/s390/kernel/head31.S
@@ -82,7 +82,7 @@ startup_continue:
 _ehead:
 
 #ifdef CONFIG_SHARED_KERNEL
-	.org	0x100000
+	.org	0x100000 - 0x11000	# head.o ends at 0x11000
 #endif
 
 #
diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S
index 1f70970de0a..cdef6871741 100644
--- a/arch/s390/kernel/head64.S
+++ b/arch/s390/kernel/head64.S
@@ -80,7 +80,7 @@ startup_continue:
 _ehead:
 
 #ifdef CONFIG_SHARED_KERNEL
-	.org	0x100000
+	.org	0x100000 - 0x11000	# head.o ends at 0x11000
 #endif
 
 #
-- 
cgit v1.2.3


From 2a6ce6e5fda4721b35f309acedf4cac61ecbfb04 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 12 May 2010 10:16:20 +0200
Subject: ALSA: hda - Add hp-dv4 model for IDT 92HD71bx

It turned out that HP dv series have inconsistent the mute-LED GPIO
mapping among various models.  dv4/7 seem to use GPIO 0 while dv 5/6
seem to use GPIO 3.  The previous commit
  26ebe0a28986f4845b2c5bea43ac5cc0b9f27f0a
  ALSA: hda - Fix mute-LED GPIO pin for HP dv series
breaks dv5/6.

This patch adds the new quirk model, hp-dv4, to handle HP dv4/7
separately from HP dv5/6.

Tested-by: Kunal Gangakhedkar <kunal.gangakhedkar@gmail.com> (for dv6-1110ax)
Acked-by: Kunal Gangakhedkar <kunal.gangakhedkar@gmail.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/hda/patch_sigmatel.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c
index eb4ea3df5d8..a0e06d82da1 100644
--- a/sound/pci/hda/patch_sigmatel.c
+++ b/sound/pci/hda/patch_sigmatel.c
@@ -104,6 +104,7 @@ enum {
 	STAC_DELL_M4_2,
 	STAC_DELL_M4_3,
 	STAC_HP_M4,
+	STAC_HP_DV4,
 	STAC_HP_DV5,
 	STAC_HP_HDX,
 	STAC_HP_DV4_1222NR,
@@ -1691,6 +1692,7 @@ static unsigned int *stac92hd71bxx_brd_tbl[STAC_92HD71BXX_MODELS] = {
 	[STAC_DELL_M4_2]	= dell_m4_2_pin_configs,
 	[STAC_DELL_M4_3]	= dell_m4_3_pin_configs,
 	[STAC_HP_M4]		= NULL,
+	[STAC_HP_DV4]		= NULL,
 	[STAC_HP_DV5]		= NULL,
 	[STAC_HP_HDX]           = NULL,
 	[STAC_HP_DV4_1222NR]	= NULL,
@@ -1703,6 +1705,7 @@ static const char *stac92hd71bxx_models[STAC_92HD71BXX_MODELS] = {
 	[STAC_DELL_M4_2] = "dell-m4-2",
 	[STAC_DELL_M4_3] = "dell-m4-3",
 	[STAC_HP_M4] = "hp-m4",
+	[STAC_HP_DV4] = "hp-dv4",
 	[STAC_HP_DV5] = "hp-dv5",
 	[STAC_HP_HDX] = "hp-hdx",
 	[STAC_HP_DV4_1222NR] = "hp-dv4-1222nr",
@@ -1721,7 +1724,7 @@ static struct snd_pci_quirk stac92hd71bxx_cfg_tbl[] = {
 	SND_PCI_QUIRK_MASK(PCI_VENDOR_ID_HP, 0xfff0, 0x3080,
 		      "HP", STAC_HP_DV5),
 	SND_PCI_QUIRK_MASK(PCI_VENDOR_ID_HP, 0xfff0, 0x30f0,
-		      "HP dv4-7", STAC_HP_DV5),
+		      "HP dv4-7", STAC_HP_DV4),
 	SND_PCI_QUIRK_MASK(PCI_VENDOR_ID_HP, 0xfff0, 0x3600,
 		      "HP dv4-7", STAC_HP_DV5),
 	SND_PCI_QUIRK(PCI_VENDOR_ID_HP, 0x3610,
@@ -5678,6 +5681,9 @@ again:
 		spec->num_smuxes = 1;
 		spec->num_dmuxes = 1;
 		/* fallthrough */
+	case STAC_HP_DV4:
+		spec->gpio_led = 0x01;
+		/* fallthrough */
 	case STAC_HP_DV5:
 		snd_hda_codec_set_pincfg(codec, 0x0d, 0x90170010);
 		stac92xx_auto_set_pinctl(codec, 0x0d, AC_PINCTL_OUT_EN);
@@ -5686,7 +5692,6 @@ again:
 		 * detection.
 		 */
 		spec->hp_detect = 1;
-		spec->gpio_led = 0x01;
 		break;
 	case STAC_HP_HDX:
 		spec->num_dmics = 1;
@@ -5749,7 +5754,8 @@ again:
 	}
 
 	/* enable bass on HP dv7 */
-	if (spec->board_config == STAC_HP_DV5) {
+	if (spec->board_config == STAC_HP_DV4 ||
+	    spec->board_config == STAC_HP_DV5) {
 		unsigned int cap;
 		cap = snd_hda_param_read(codec, 0x1, AC_PAR_GPIO_CAP);
 		cap &= AC_GPIO_IO_COUNT;
-- 
cgit v1.2.3


From 6a45f7822544c54a2cf070d84f4e85f2fb32ec02 Mon Sep 17 00:00:00 2001
From: Clemens Ladisch <clemens@ladisch.de>
Date: Tue, 11 May 2010 16:34:39 +0200
Subject: ALSA: virtuoso: fix Xonar D1/DX front panel microphone

Commit 65c3ac885ce9852852b895a4a62212f62cb5f2e9 in 2.6.33 accidentally
left out the initialization of the AC97 codec FMIC2MIC bit, which broke
recording from the front panel microphone.

Signed-off-by: Clemens Ladisch <clemens@ladisch.de>
Cc: <stable@kernel.org>
Signed-off-by: Jaroslav Kysela <perex@perex.cz>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/oxygen/xonar_cs43xx.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sound/pci/oxygen/xonar_cs43xx.c b/sound/pci/oxygen/xonar_cs43xx.c
index 16c226bfcd2..7c4986b27f2 100644
--- a/sound/pci/oxygen/xonar_cs43xx.c
+++ b/sound/pci/oxygen/xonar_cs43xx.c
@@ -56,6 +56,7 @@
 #include <sound/pcm_params.h>
 #include <sound/tlv.h>
 #include "xonar.h"
+#include "cm9780.h"
 #include "cs4398.h"
 #include "cs4362a.h"
 
@@ -172,6 +173,8 @@ static void xonar_d1_init(struct oxygen *chip)
 	oxygen_clear_bits16(chip, OXYGEN_GPIO_DATA,
 			    GPIO_D1_FRONT_PANEL | GPIO_D1_INPUT_ROUTE);
 
+	oxygen_ac97_set_bits(chip, 0, CM9780_JACK, CM9780_FMIC2MIC);
+
 	xonar_init_cs53x1(chip);
 	xonar_enable_output(chip);
 
-- 
cgit v1.2.3


From 9fe17b5d47d3d3c85b35623dea8f571a184134c0 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 12 May 2010 10:32:42 +0200
Subject: ALSA: pcm - Use pgprot_noncached() for MIPS non-coherent archs

MIPS non-coherent archs need the noncached pgprot in mmap of PCM buffers.
But, since the coherency needs to be checked dynamically via
plat_device_is_coherent(), we need an ugly check dependent on MIPS
in ALSA core code.

This should be cleaned up in MIPS arch side (e.g. creating
dma_mmap_coherent()) in near future.

Tested-by: Wu Zhangjin <wuzhangjin@gmail.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/core/pcm_native.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 87288762403..20b5982c996 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -36,6 +36,9 @@
 #include <sound/timer.h>
 #include <sound/minors.h>
 #include <asm/io.h>
+#if defined(CONFIG_MIPS) && defined(CONFIG_DMA_NONCOHERENT)
+#include <dma-coherence.h>
+#endif
 
 /*
  *  Compatibility
@@ -3184,6 +3187,10 @@ static int snd_pcm_default_mmap(struct snd_pcm_substream *substream,
 					 substream->runtime->dma_area,
 					 substream->runtime->dma_addr,
 					 area->vm_end - area->vm_start);
+#elif defined(CONFIG_MIPS) && defined(CONFIG_DMA_NONCOHERENT)
+	if (substream->dma_buffer.dev.type == SNDRV_DMA_TYPE_DEV &&
+	    !plat_device_is_coherent(substream->dma_buffer.dev.dev))
+		area->vm_page_prot = pgprot_noncached(area->vm_page_prot);
 #endif /* ARCH_HAS_DMA_MMAP_COHERENT */
 	/* mmap with fault handler */
 	area->vm_ops = &snd_pcm_vm_ops_data_fault;
-- 
cgit v1.2.3


From f4a27aed48584b9e4699e9ee76288618a8574d77 Mon Sep 17 00:00:00 2001
From: Srinidhi Kasagar <srinidhi.kasagar@stericsson.com>
Date: Wed, 12 May 2010 05:52:18 +0100
Subject: ARM: 6125/1: ARM TWD: move TWD registers to common header

This moves the TWD register set of MPcore to a common
existing file so that watchdog driver can access it

Signed-off-by: srinidhi kasagar <srinidhi.kasagar@stericsson.com>
Acked-by: Linus Walleij <linus.walleij@stericsson.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/smp_twd.h | 17 +++++++++++++++++
 arch/arm/kernel/smp_twd.c      | 17 -----------------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/arm/include/asm/smp_twd.h b/arch/arm/include/asm/smp_twd.h
index 7be0978b262..634f357be6b 100644
--- a/arch/arm/include/asm/smp_twd.h
+++ b/arch/arm/include/asm/smp_twd.h
@@ -1,6 +1,23 @@
 #ifndef __ASMARM_SMP_TWD_H
 #define __ASMARM_SMP_TWD_H
 
+#define TWD_TIMER_LOAD			0x00
+#define TWD_TIMER_COUNTER		0x04
+#define TWD_TIMER_CONTROL		0x08
+#define TWD_TIMER_INTSTAT		0x0C
+
+#define TWD_WDOG_LOAD			0x20
+#define TWD_WDOG_COUNTER		0x24
+#define TWD_WDOG_CONTROL		0x28
+#define TWD_WDOG_INTSTAT		0x2C
+#define TWD_WDOG_RESETSTAT		0x30
+#define TWD_WDOG_DISABLE		0x34
+
+#define TWD_TIMER_CONTROL_ENABLE	(1 << 0)
+#define TWD_TIMER_CONTROL_ONESHOT	(0 << 1)
+#define TWD_TIMER_CONTROL_PERIODIC	(1 << 1)
+#define TWD_TIMER_CONTROL_IT_ENABLE	(1 << 2)
+
 struct clock_event_device;
 
 extern void __iomem *twd_base;
diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c
index ea02a7b1c24..7c5f0c024db 100644
--- a/arch/arm/kernel/smp_twd.c
+++ b/arch/arm/kernel/smp_twd.c
@@ -21,23 +21,6 @@
 #include <asm/smp_twd.h>
 #include <asm/hardware/gic.h>
 
-#define TWD_TIMER_LOAD 			0x00
-#define TWD_TIMER_COUNTER		0x04
-#define TWD_TIMER_CONTROL		0x08
-#define TWD_TIMER_INTSTAT		0x0C
-
-#define TWD_WDOG_LOAD			0x20
-#define TWD_WDOG_COUNTER		0x24
-#define TWD_WDOG_CONTROL		0x28
-#define TWD_WDOG_INTSTAT		0x2C
-#define TWD_WDOG_RESETSTAT		0x30
-#define TWD_WDOG_DISABLE		0x34
-
-#define TWD_TIMER_CONTROL_ENABLE	(1 << 0)
-#define TWD_TIMER_CONTROL_ONESHOT	(0 << 1)
-#define TWD_TIMER_CONTROL_PERIODIC	(1 << 1)
-#define TWD_TIMER_CONTROL_IT_ENABLE	(1 << 2)
-
 /* set up by the platform code */
 void __iomem *twd_base;
 
-- 
cgit v1.2.3


From 98af057092f8f0dabe63c5df08adc2bbfbddb1d2 Mon Sep 17 00:00:00 2001
From: Srinidhi Kasagar <srinidhi.kasagar@stericsson.com>
Date: Wed, 12 May 2010 05:53:26 +0100
Subject: ARM: 6126/1: ARM mpcore_wdt: fix build failure and other fixes

This fixes the build failures seen when building mpcore_wdt and it
also removes the nonexistent ARM_MPCORE_PLATFORM dependency, instead
make it dependent on HAVE_ARM_TWD.

Also this fixes spinlock usage appropriately.

Signed-off-by: srinidhi kasagar <srinidhi.kasagar@stericsson.com>
Acked-by: Linus Walleij <linus.walleij@stericsson.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/watchdog/Kconfig      |  2 +-
 drivers/watchdog/mpcore_wdt.c | 21 +++++++++++----------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 0bf5020d0d3..b87ba23442d 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -175,7 +175,7 @@ config SA1100_WATCHDOG
 
 config MPCORE_WATCHDOG
 	tristate "MPcore watchdog"
-	depends on ARM_MPCORE_PLATFORM && LOCAL_TIMERS
+	depends on HAVE_ARM_TWD
 	help
 	  Watchdog timer embedded into the MPcore system.
 
diff --git a/drivers/watchdog/mpcore_wdt.c b/drivers/watchdog/mpcore_wdt.c
index 016c6a791ca..b8ec7aca3c8 100644
--- a/drivers/watchdog/mpcore_wdt.c
+++ b/drivers/watchdog/mpcore_wdt.c
@@ -31,8 +31,9 @@
 #include <linux/platform_device.h>
 #include <linux/uaccess.h>
 #include <linux/slab.h>
+#include <linux/io.h>
 
-#include <asm/hardware/arm_twd.h>
+#include <asm/smp_twd.h>
 
 struct mpcore_wdt {
 	unsigned long	timer_alive;
@@ -44,7 +45,7 @@ struct mpcore_wdt {
 };
 
 static struct platform_device *mpcore_wdt_dev;
-extern unsigned int mpcore_timer_rate;
+static DEFINE_SPINLOCK(wdt_lock);
 
 #define TIMER_MARGIN	60
 static int mpcore_margin = TIMER_MARGIN;
@@ -94,13 +95,15 @@ static irqreturn_t mpcore_wdt_fire(int irq, void *arg)
  */
 static void mpcore_wdt_keepalive(struct mpcore_wdt *wdt)
 {
-	unsigned int count;
+	unsigned long count;
 
+	spin_lock(&wdt_lock);
 	/* Assume prescale is set to 256 */
-	count = (mpcore_timer_rate / 256) * mpcore_margin;
+	count =  __raw_readl(wdt->base + TWD_WDOG_COUNTER);
+	count = (0xFFFFFFFFU - count) * (HZ / 5);
+	count = (count / 256) * mpcore_margin;
 
 	/* Reload the counter */
-	spin_lock(&wdt_lock);
 	writel(count + wdt->perturb, wdt->base + TWD_WDOG_LOAD);
 	wdt->perturb = wdt->perturb ? 0 : 1;
 	spin_unlock(&wdt_lock);
@@ -119,7 +122,6 @@ static void mpcore_wdt_start(struct mpcore_wdt *wdt)
 {
 	dev_printk(KERN_INFO, wdt->dev, "enabling watchdog.\n");
 
-	spin_lock(&wdt_lock);
 	/* This loads the count register but does NOT start the count yet */
 	mpcore_wdt_keepalive(wdt);
 
@@ -130,7 +132,6 @@ static void mpcore_wdt_start(struct mpcore_wdt *wdt)
 		/* Enable watchdog - prescale=256, watchdog mode=1, enable=1 */
 		writel(0x0000FF09, wdt->base + TWD_WDOG_CONTROL);
 	}
-	spin_unlock(&wdt_lock);
 }
 
 static int mpcore_wdt_set_heartbeat(int t)
@@ -360,7 +361,7 @@ static int __devinit mpcore_wdt_probe(struct platform_device *dev)
 	mpcore_wdt_miscdev.parent = &dev->dev;
 	ret = misc_register(&mpcore_wdt_miscdev);
 	if (ret) {
-		dev_printk(KERN_ERR, _dev,
+		dev_printk(KERN_ERR, wdt->dev,
 			"cannot register miscdev on minor=%d (err=%d)\n",
 							WATCHDOG_MINOR, ret);
 		goto err_misc;
@@ -369,13 +370,13 @@ static int __devinit mpcore_wdt_probe(struct platform_device *dev)
 	ret = request_irq(wdt->irq, mpcore_wdt_fire, IRQF_DISABLED,
 							"mpcore_wdt", wdt);
 	if (ret) {
-		dev_printk(KERN_ERR, _dev,
+		dev_printk(KERN_ERR, wdt->dev,
 			"cannot register IRQ%d for watchdog\n", wdt->irq);
 		goto err_irq;
 	}
 
 	mpcore_wdt_stop(wdt);
-	platform_set_drvdata(&dev->dev, wdt);
+	platform_set_drvdata(dev, wdt);
 	mpcore_wdt_dev = dev;
 
 	return 0;
-- 
cgit v1.2.3


From 8213466596bf10b75887754773ee13c10cf86f5c Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Wed, 12 May 2010 16:43:32 +0200
Subject: ALSA: ice1724 - Fix ESI Maya44 capture source control

The capture source control of maya44 was wrongly coded with the bit
shift instead of the bit mask.  Also, the slot for line-in was
wrongly assigned (slot 5 instead of 4).

Reported-by: Alex Chernyshoff <alexdsp@gmail.com>
Cc: <stable@kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 sound/pci/ice1712/maya44.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sound/pci/ice1712/maya44.c b/sound/pci/ice1712/maya44.c
index 3e1c20ae2f1..726fd4b92e1 100644
--- a/sound/pci/ice1712/maya44.c
+++ b/sound/pci/ice1712/maya44.c
@@ -347,7 +347,7 @@ static int maya_gpio_sw_put(struct snd_kcontrol *kcontrol,
 
 /* known working input slots (0-4) */
 #define MAYA_LINE_IN	1	/* in-2 */
-#define MAYA_MIC_IN	4	/* in-5 */
+#define MAYA_MIC_IN	3	/* in-4 */
 
 static void wm8776_select_input(struct snd_maya44 *chip, int idx, int line)
 {
@@ -393,8 +393,8 @@ static int maya_rec_src_put(struct snd_kcontrol *kcontrol,
 	int changed;
 
 	mutex_lock(&chip->mutex);
-	changed = maya_set_gpio_bits(chip->ice, GPIO_MIC_RELAY,
-				     sel ? GPIO_MIC_RELAY : 0);
+	changed = maya_set_gpio_bits(chip->ice, 1 << GPIO_MIC_RELAY,
+				     sel ? (1 << GPIO_MIC_RELAY) : 0);
 	wm8776_select_input(chip, 0, sel ? MAYA_MIC_IN : MAYA_LINE_IN);
 	mutex_unlock(&chip->mutex);
 	return changed;
-- 
cgit v1.2.3


From e7b702b1a8f2a6961367da903217e669be0f099f Mon Sep 17 00:00:00 2001
From: Russell King <rmk@arm.linux.org.uk>
Date: Sun, 18 Apr 2010 21:25:11 +0100
Subject: Inotify: undefined reference to `anon_inode_getfd'

Fix:

fs/built-in.o: In function `sys_inotify_init1':
summary.c:(.text+0x347a4): undefined reference to `anon_inode_getfd'

found by kautobuild with arms bcmring_defconfig, which ends up with
INOTIFY_USER enabled (through the 'default y') but leaves ANON_INODES
unset.  However, inotify_user.c uses anon_inode_getfd().

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index b3a159b21cf..4427f8c1423 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -17,6 +17,7 @@ config INOTIFY_USER
 	bool "Inotify support for userspace"
 	select ANON_INODES
 	select FSNOTIFY
+	select ANON_INODES
 	default y
 	---help---
 	  Say Y here to enable inotify support for userspace, including the
-- 
cgit v1.2.3


From 91af70814105f4c05e6e11b51c3269907b71794b Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <walken@google.com>
Date: Wed, 12 May 2010 11:38:45 +0100
Subject: rwsem: Test for no active locks in __rwsem_do_wake undo code

If there are no active threasd using a semaphore, it is always correct
to unqueue blocked threads.  This seems to be what was intended in the
undo code.

What was done instead, was to look for a sem count of zero - this is an
impossible situation, given that at least one thread is known to be
queued on the semaphore.  The code might be correct as written, but it's
hard to reason about and it's not what was intended (otherwise the goto
out would have been unconditional).

Go for checking the active count - the alternative is not worth the
headache.

Signed-off-by: Michel Lespinasse <walken@google.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/rwsem.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/rwsem.c b/lib/rwsem.c
index 3e3365e5665..ceba8e28807 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -136,9 +136,10 @@ __rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
  out:
 	return sem;
 
-	/* undo the change to count, but check for a transition 1->0 */
+	/* undo the change to the active count, but check for a transition
+	 * 1->0 */
  undo:
-	if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) != 0)
+	if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) & RWSEM_ACTIVE_MASK)
 		goto out;
 	goto try_again;
 }
-- 
cgit v1.2.3


From 7ac512aa8237c43331ffaf77a4fd8b8d684819ba Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 12 May 2010 15:34:03 +0100
Subject: CacheFiles: Fix error handling in
 cachefiles_determine_cache_security()

cachefiles_determine_cache_security() is expected to return with a
security override in place.  However, if set_create_files_as() fails, we
fail to do this.  In this case, we should just reinstate the security
override that was set by the caller.

Furthermore, if set_create_files_as() fails, we should dispose of the
new credentials we were in the process of creating.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/cachefiles/security.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
index b5808cdb223..039b5011d83 100644
--- a/fs/cachefiles/security.c
+++ b/fs/cachefiles/security.c
@@ -77,6 +77,8 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
 /*
  * check the security details of the on-disk cache
  * - must be called with security override in force
+ * - must return with a security override in force - even in the case of an
+ *   error
  */
 int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
 					struct dentry *root,
@@ -99,6 +101,8 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
 	 * which create files */
 	ret = set_create_files_as(new, root->d_inode);
 	if (ret < 0) {
+		abort_creds(new);
+		cachefiles_begin_secure(cache, _saved_cred);
 		_leave(" = %d [cfa]", ret);
 		return ret;
 	}
-- 
cgit v1.2.3


From 769d9968e42c995eaaf61ac5583d998f32e0769a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 12 May 2010 18:39:45 -0700
Subject: Revert "PCI: update bridge resources to get more big ranges in PCI
 assign unssigned"

This reverts commit 977d17bb1749517b353874ccdc9b85abc7a58c2a, because it
can cause problems with some devices not getting any resources at all
when the resource tree is re-allocated.

For an example of this, see

	https://bugzilla.kernel.org/show_bug.cgi?id=15960
	(originally https://bugtrack.alsa-project.org/alsa-bug/view.php?id=4982)
	(lkml thread: http://lkml.org/lkml/2010/4/19/20)

where Peter Henriksson reported his Xonar DX sound card gone, because
the IO port region was no longer allocated.

Reported-bisected-and-tested-by: Peter Henriksson <peter.henriksson@gmail.com>
Requested-by: Andrew Morton <akpm@linux-foundation.org>
Requested-by: Clemens Ladisch <clemens@ladisch.de>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pci/setup-bus.c | 114 +-----------------------------------------------
 1 file changed, 2 insertions(+), 112 deletions(-)

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 4fe36d2e104..19b111383f6 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -838,65 +838,11 @@ static void pci_bus_dump_resources(struct pci_bus *bus)
 	}
 }
 
-static int __init pci_bus_get_depth(struct pci_bus *bus)
-{
-	int depth = 0;
-	struct pci_dev *dev;
-
-	list_for_each_entry(dev, &bus->devices, bus_list) {
-		int ret;
-		struct pci_bus *b = dev->subordinate;
-		if (!b)
-			continue;
-
-		ret = pci_bus_get_depth(b);
-		if (ret + 1 > depth)
-			depth = ret + 1;
-	}
-
-	return depth;
-}
-static int __init pci_get_max_depth(void)
-{
-	int depth = 0;
-	struct pci_bus *bus;
-
-	list_for_each_entry(bus, &pci_root_buses, node) {
-		int ret;
-
-		ret = pci_bus_get_depth(bus);
-		if (ret > depth)
-			depth = ret;
-	}
-
-	return depth;
-}
-
-/*
- * first try will not touch pci bridge res
- * second  and later try will clear small leaf bridge res
- * will stop till to the max  deepth if can not find good one
- */
 void __init
 pci_assign_unassigned_resources(void)
 {
 	struct pci_bus *bus;
-	int tried_times = 0;
-	enum release_type rel_type = leaf_only;
-	struct resource_list_x head, *list;
-	unsigned long type_mask = IORESOURCE_IO | IORESOURCE_MEM |
-				  IORESOURCE_PREFETCH;
-	unsigned long failed_type;
-	int max_depth = pci_get_max_depth();
-	int pci_try_num;
 
-	head.next = NULL;
-
-	pci_try_num = max_depth + 1;
-	printk(KERN_DEBUG "PCI: max bus depth: %d pci_try_num: %d\n",
-		 max_depth, pci_try_num);
-
-again:
 	/* Depth first, calculate sizes and alignments of all
 	   subordinate buses. */
 	list_for_each_entry(bus, &pci_root_buses, node) {
@@ -904,65 +850,9 @@ again:
 	}
 	/* Depth last, allocate resources and update the hardware. */
 	list_for_each_entry(bus, &pci_root_buses, node) {
-		__pci_bus_assign_resources(bus, &head);
-	}
-	tried_times++;
-
-	/* any device complain? */
-	if (!head.next)
-		goto enable_and_dump;
-	failed_type = 0;
-	for (list = head.next; list;) {
-		failed_type |= list->flags;
-		list = list->next;
-	}
-	/*
-	 * io port are tight, don't try extra
-	 * or if reach the limit, don't want to try more
-	 */
-	failed_type &= type_mask;
-	if ((failed_type == IORESOURCE_IO) || (tried_times >= pci_try_num)) {
-		free_failed_list(&head);
-		goto enable_and_dump;
-	}
-
-	printk(KERN_DEBUG "PCI: No. %d try to assign unassigned res\n",
-			 tried_times + 1);
-
-	/* third times and later will not check if it is leaf */
-	if ((tried_times + 1) > 2)
-		rel_type = whole_subtree;
-
-	/*
-	 * Try to release leaf bridge's resources that doesn't fit resource of
-	 * child device under that bridge
-	 */
-	for (list = head.next; list;) {
-		bus = list->dev->bus;
-		pci_bus_release_bridge_resources(bus, list->flags & type_mask,
-						  rel_type);
-		list = list->next;
-	}
-	/* restore size and flags */
-	for (list = head.next; list;) {
-		struct resource *res = list->res;
-
-		res->start = list->start;
-		res->end = list->end;
-		res->flags = list->flags;
-		if (list->dev->subordinate)
-			res->flags = 0;
-
-		list = list->next;
-	}
-	free_failed_list(&head);
-
-	goto again;
-
-enable_and_dump:
-	/* Depth last, update the hardware. */
-	list_for_each_entry(bus, &pci_root_buses, node)
+		pci_bus_assign_resources(bus);
 		pci_enable_bridges(bus);
+	}
 
 	/* dump the resource on buses */
 	list_for_each_entry(bus, &pci_root_buses, node) {
-- 
cgit v1.2.3


From 46a47b1ed118cda1a08b7f6077b837a00fbc112b Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 23 Apr 2010 14:03:38 -0300
Subject: KVM: convert ioapic lock to spinlock

kvm_set_irq is used from non sleepable contexes, so convert ioapic from
mutex to spinlock.

KVM-Stable-Tag.
Tested-by: Ralf Bonenkamp <ralf.bonenkamp@swyx.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 virt/kvm/ioapic.c | 30 +++++++++++++++---------------
 virt/kvm/ioapic.h |  2 +-
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 03a5eb22da2..7c79c1d76d0 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -197,7 +197,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 	union kvm_ioapic_redirect_entry entry;
 	int ret = 1;
 
-	mutex_lock(&ioapic->lock);
+	spin_lock(&ioapic->lock);
 	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
 		entry = ioapic->redirtbl[irq];
 		level ^= entry.fields.polarity;
@@ -214,7 +214,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 		}
 		trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
 	}
-	mutex_unlock(&ioapic->lock);
+	spin_unlock(&ioapic->lock);
 
 	return ret;
 }
@@ -238,9 +238,9 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
 		 * is dropped it will be put into irr and will be delivered
 		 * after ack notifier returns.
 		 */
-		mutex_unlock(&ioapic->lock);
+		spin_unlock(&ioapic->lock);
 		kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
-		mutex_lock(&ioapic->lock);
+		spin_lock(&ioapic->lock);
 
 		if (trigger_mode != IOAPIC_LEVEL_TRIG)
 			continue;
@@ -259,9 +259,9 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
 	smp_rmb();
 	if (!test_bit(vector, ioapic->handled_vectors))
 		return;
-	mutex_lock(&ioapic->lock);
+	spin_lock(&ioapic->lock);
 	__kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
-	mutex_unlock(&ioapic->lock);
+	spin_unlock(&ioapic->lock);
 }
 
 static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
@@ -287,7 +287,7 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
 	ASSERT(!(addr & 0xf));	/* check alignment */
 
 	addr &= 0xff;
-	mutex_lock(&ioapic->lock);
+	spin_lock(&ioapic->lock);
 	switch (addr) {
 	case IOAPIC_REG_SELECT:
 		result = ioapic->ioregsel;
@@ -301,7 +301,7 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
 		result = 0;
 		break;
 	}
-	mutex_unlock(&ioapic->lock);
+	spin_unlock(&ioapic->lock);
 
 	switch (len) {
 	case 8:
@@ -338,7 +338,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
 	}
 
 	addr &= 0xff;
-	mutex_lock(&ioapic->lock);
+	spin_lock(&ioapic->lock);
 	switch (addr) {
 	case IOAPIC_REG_SELECT:
 		ioapic->ioregsel = data;
@@ -356,7 +356,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
 	default:
 		break;
 	}
-	mutex_unlock(&ioapic->lock);
+	spin_unlock(&ioapic->lock);
 	return 0;
 }
 
@@ -386,7 +386,7 @@ int kvm_ioapic_init(struct kvm *kvm)
 	ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
 	if (!ioapic)
 		return -ENOMEM;
-	mutex_init(&ioapic->lock);
+	spin_lock_init(&ioapic->lock);
 	kvm->arch.vioapic = ioapic;
 	kvm_ioapic_reset(ioapic);
 	kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
@@ -419,9 +419,9 @@ int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 	if (!ioapic)
 		return -EINVAL;
 
-	mutex_lock(&ioapic->lock);
+	spin_lock(&ioapic->lock);
 	memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
-	mutex_unlock(&ioapic->lock);
+	spin_unlock(&ioapic->lock);
 	return 0;
 }
 
@@ -431,9 +431,9 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 	if (!ioapic)
 		return -EINVAL;
 
-	mutex_lock(&ioapic->lock);
+	spin_lock(&ioapic->lock);
 	memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
 	update_handled_vectors(ioapic);
-	mutex_unlock(&ioapic->lock);
+	spin_unlock(&ioapic->lock);
 	return 0;
 }
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 8a751b78a43..0b190c34ccc 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -45,7 +45,7 @@ struct kvm_ioapic {
 	struct kvm_io_device dev;
 	struct kvm *kvm;
 	void (*ack_notifier)(void *opaque, int irq);
-	struct mutex lock;
+	spinlock_t lock;
 	DECLARE_BITMAP(handled_vectors, 256);
 };
 
-- 
cgit v1.2.3


From 061e2fd16863009c8005b4b5fdfb75c7215c0b99 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 5 May 2010 16:04:43 +0200
Subject: KVM: SVM: Fix wrong intercept masks on 32 bit

This patch makes KVM on 32 bit SVM working again by
correcting the masks used for iret interception. With the
wrong masks the upper 32 bits of the intercepts are masked
out which leaves vmrun unintercepted. This is not legal on
svm and the vmrun fails.
Bug was introduced by commits 95ba827313 and 3cfc3092.

Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2ba58206812..737361fcd50 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2067,7 +2067,7 @@ static int cpuid_interception(struct vcpu_svm *svm)
 static int iret_interception(struct vcpu_svm *svm)
 {
 	++svm->vcpu.stat.nmi_window_exits;
-	svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
 	svm->vcpu.arch.hflags |= HF_IRET_MASK;
 	return 1;
 }
@@ -2479,7 +2479,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 
 	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
 	vcpu->arch.hflags |= HF_NMI_MASK;
-	svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+	svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
 	++vcpu->stat.nmi_injections;
 }
 
@@ -2539,10 +2539,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 	if (masked) {
 		svm->vcpu.arch.hflags |= HF_NMI_MASK;
-		svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+		svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
 	} else {
 		svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
-		svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+		svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
 	}
 }
 
-- 
cgit v1.2.3


From fe19c5a46b4c519153fddd4d5efe32a3e4cfa694 Mon Sep 17 00:00:00 2001
From: Dongxiao Xu <dongxiao.xu@intel.com>
Date: Tue, 11 May 2010 18:21:33 +0800
Subject: KVM: x86: Call vcpu_load and vcpu_put in cpuid_update

cpuid_update may operate VMCS, so vcpu_load() and vcpu_put()
should be called to ensure correctness.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c4ca98ad27..c4f35b545c1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1712,6 +1712,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	if (copy_from_user(cpuid_entries, entries,
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
 		goto out_free;
+	vcpu_load(vcpu);
 	for (i = 0; i < cpuid->nent; i++) {
 		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
 		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1729,6 +1730,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	r = 0;
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
+	vcpu_put(vcpu);
 
 out_free:
 	vfree(cpuid_entries);
@@ -1749,9 +1751,11 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
 		goto out;
+	vcpu_load(vcpu);
 	vcpu->arch.cpuid_nent = cpuid->nent;
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
+	vcpu_put(vcpu);
 	return 0;
 
 out:
-- 
cgit v1.2.3


From f8c5fae16649445e15656667f72bd51d777f7766 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 11 May 2010 15:16:46 +0200
Subject: KVM: VMX: blocked-by-sti must not defer NMI injections

As the processor may not consider GUEST_INTR_STATE_STI as a reason for
blocking NMI, it could return immediately with EXIT_REASON_NMI_WINDOW
when we asked for it. But as we consider this state as NMI-blocking, we
can run into an endless loop.

Resolve this by allowing NMI injection if just GUEST_INTR_STATE_STI is
active (originally suggested by Gleb). Intel confirmed that this is
safe, the processor will never complain about NMI injection in this
state.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
KVM-Stable-Tag
Acked-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bc933cfb4e6..2f8db0ec8ae 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2703,8 +2703,7 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 		return 0;
 
 	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
-				GUEST_INTR_STATE_NMI));
+			(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
 }
 
 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 4f018c513a81ba243165bfc4fcf44254986ad002 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Sun, 9 May 2010 17:26:47 +0200
Subject: KVM: PPC: Keep index within boundaries in kvmppc_44x_emul_tlbwe()

An index of KVM44x_GUEST_TLB_SIZE is already one too large.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Hollis Blanchard <hollis@penguinppc.org>
Acked-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/powerpc/kvm/44x_tlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 2570fcc7665..812312542e5 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -440,7 +440,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 	unsigned int gtlb_index;
 
 	gtlb_index = kvmppc_get_gpr(vcpu, ra);
-	if (gtlb_index > KVM44x_GUEST_TLB_SIZE) {
+	if (gtlb_index >= KVM44x_GUEST_TLB_SIZE) {
 		printk("%s: index %d\n", __func__, gtlb_index);
 		kvmppc_dump_vcpu(vcpu);
 		return EMULATE_FAIL;
-- 
cgit v1.2.3


From 46db2c3205ca6e24adbb9b038441bc8f65360535 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 30 Mar 2010 18:27:39 -0300
Subject: perf record: Add a fallback to the reference relocation symbol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Usually "_text" is enough, but I received reports that its not always
available, so fallback to "_stext" for the symbol we use to check if we
need to apply any relocation to all the symbols in the kernel symtab,
for when, for instance, kexec is being used.

Reported-by: Darren Hart <dvhltc@us.ibm.com>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 3b8b6387c47..f1411e9cdf4 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -563,6 +563,9 @@ static int __cmd_record(int argc, const char **argv)
 
 	err = event__synthesize_kernel_mmap(process_synthesized_event,
 					    session, "_text");
+	if (err < 0)
+		err = event__synthesize_kernel_mmap(process_synthesized_event,
+						    session, "_stext");
 	if (err < 0) {
 		pr_err("Couldn't record kernel reference relocation symbol.\n");
 		return err;
-- 
cgit v1.2.3


From 720019908fd5a1bb442bb0a35a6027ba21864d25 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed, 12 May 2010 21:42:42 +0400
Subject: x86, perf: P4 PMU -- use hash for p4_get_escr_idx()

Linear search over all p4 MSRs should be fine if only
we would not use it in events scheduling routine which
is pretty time critical. Lets use hashes. It should speed
scheduling up significantly.

v2: Steven proposed to use more gentle approach than issue
    BUG on error, so we use WARN_ONCE now

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
LKML-Reference: <20100512174242.GA5190@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 126 ++++++++++++++++++++----------------
 1 file changed, 71 insertions(+), 55 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index a603930271f..cb875b1e2e8 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -668,66 +668,80 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 	}
 }
 
-/* ESCRs are not sequential in memory so we need a map */
-static const unsigned int p4_escr_map[ARCH_P4_TOTAL_ESCR] = {
-	MSR_P4_ALF_ESCR0,	/*  0 */
-	MSR_P4_ALF_ESCR1,	/*  1 */
-	MSR_P4_BPU_ESCR0,	/*  2 */
-	MSR_P4_BPU_ESCR1,	/*  3 */
-	MSR_P4_BSU_ESCR0,	/*  4 */
-	MSR_P4_BSU_ESCR1,	/*  5 */
-	MSR_P4_CRU_ESCR0,	/*  6 */
-	MSR_P4_CRU_ESCR1,	/*  7 */
-	MSR_P4_CRU_ESCR2,	/*  8 */
-	MSR_P4_CRU_ESCR3,	/*  9 */
-	MSR_P4_CRU_ESCR4,	/* 10 */
-	MSR_P4_CRU_ESCR5,	/* 11 */
-	MSR_P4_DAC_ESCR0,	/* 12 */
-	MSR_P4_DAC_ESCR1,	/* 13 */
-	MSR_P4_FIRM_ESCR0,	/* 14 */
-	MSR_P4_FIRM_ESCR1,	/* 15 */
-	MSR_P4_FLAME_ESCR0,	/* 16 */
-	MSR_P4_FLAME_ESCR1,	/* 17 */
-	MSR_P4_FSB_ESCR0,	/* 18 */
-	MSR_P4_FSB_ESCR1,	/* 19 */
-	MSR_P4_IQ_ESCR0,	/* 20 */
-	MSR_P4_IQ_ESCR1,	/* 21 */
-	MSR_P4_IS_ESCR0,	/* 22 */
-	MSR_P4_IS_ESCR1,	/* 23 */
-	MSR_P4_ITLB_ESCR0,	/* 24 */
-	MSR_P4_ITLB_ESCR1,	/* 25 */
-	MSR_P4_IX_ESCR0,	/* 26 */
-	MSR_P4_IX_ESCR1,	/* 27 */
-	MSR_P4_MOB_ESCR0,	/* 28 */
-	MSR_P4_MOB_ESCR1,	/* 29 */
-	MSR_P4_MS_ESCR0,	/* 30 */
-	MSR_P4_MS_ESCR1,	/* 31 */
-	MSR_P4_PMH_ESCR0,	/* 32 */
-	MSR_P4_PMH_ESCR1,	/* 33 */
-	MSR_P4_RAT_ESCR0,	/* 34 */
-	MSR_P4_RAT_ESCR1,	/* 35 */
-	MSR_P4_SAAT_ESCR0,	/* 36 */
-	MSR_P4_SAAT_ESCR1,	/* 37 */
-	MSR_P4_SSU_ESCR0,	/* 38 */
-	MSR_P4_SSU_ESCR1,	/* 39 */
-	MSR_P4_TBPU_ESCR0,	/* 40 */
-	MSR_P4_TBPU_ESCR1,	/* 41 */
-	MSR_P4_TC_ESCR0,	/* 42 */
-	MSR_P4_TC_ESCR1,	/* 43 */
-	MSR_P4_U2L_ESCR0,	/* 44 */
-	MSR_P4_U2L_ESCR1,	/* 45 */
+/*
+ * ESCR address hashing is tricky, ESCRs are not sequential
+ * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03e0) and
+ * the metric between any ESCRs is laid in range [0xa0,0xe1]
+ *
+ * so we make ~70% filled hashtable
+ */
+
+#define P4_ESCR_MSR_BASE		0x000003a0
+#define P4_ESCR_MSR_MAX			0x000003e1
+#define P4_ESCR_MSR_TABLE_SIZE		(P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
+#define P4_ESCR_MSR_IDX(msr)		(msr - P4_ESCR_MSR_BASE)
+#define P4_ESCR_MSR_TABLE_ENTRY(msr)	[P4_ESCR_MSR_IDX(msr)] = msr
+
+static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
+	P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
 };
 
 static int p4_get_escr_idx(unsigned int addr)
 {
-	unsigned int i;
+	unsigned int idx = P4_ESCR_MSR_IDX(addr);
 
-	for (i = 0; i < ARRAY_SIZE(p4_escr_map); i++) {
-		if (addr == p4_escr_map[i])
-			return i;
+	if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
+			!p4_escr_table[idx])) {
+		WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
+		return -1;
 	}
 
-	return -1;
+	return idx;
 }
 
 static int p4_next_cntr(int thread, unsigned long *used_mask,
@@ -747,7 +761,7 @@ static int p4_next_cntr(int thread, unsigned long *used_mask,
 static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 {
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	unsigned long escr_mask[BITS_TO_LONGS(ARCH_P4_TOTAL_ESCR)];
+	unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
 	int cpu = raw_smp_processor_id();
 	struct hw_perf_event *hwc;
 	struct p4_event_bind *bind;
@@ -755,7 +769,7 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 	int cntr_idx, escr_idx;
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
-	bitmap_zero(escr_mask, ARCH_P4_TOTAL_ESCR);
+	bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
 
 	for (i = 0, num = n; i < n; i++, num--) {
 
@@ -763,6 +777,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 		thread = p4_ht_thread(cpu);
 		bind = p4_config_get_bind(hwc->config);
 		escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
+		if (unlikely(escr_idx == -1))
+			goto done;
 
 		if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
 			cntr_idx = hwc->idx;
-- 
cgit v1.2.3


From 0d5961b7f612f8f54fd6fbe1942cdfb141cddfb9 Mon Sep 17 00:00:00 2001
From: "Steven J. Magnani" <steve@digidescorp.com>
Date: Tue, 27 Apr 2010 13:00:23 -0500
Subject: microblaze: re-enable interrupts before calling schedule

schedule() should not be called with interrupts disabled.

Signed-off-by: Steven J. Magnani <steve@digidescorp.com>
Signed-off-by: Michal Simek <monstr@monstr.eu>
---
 arch/microblaze/kernel/entry-nommu.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/microblaze/kernel/entry-nommu.S b/arch/microblaze/kernel/entry-nommu.S
index 391d6197fc3..8cc18cd2cce 100644
--- a/arch/microblaze/kernel/entry-nommu.S
+++ b/arch/microblaze/kernel/entry-nommu.S
@@ -476,6 +476,8 @@ ENTRY(ret_from_fork)
 	nop
 
 work_pending:
+	enable_irq
+
 	andi	r11, r19, _TIF_NEED_RESCHED
 	beqi	r11, 1f
 	bralid	r15, schedule
-- 
cgit v1.2.3


From 538722ca3b762023ac65cec214901a1ebff8b575 Mon Sep 17 00:00:00 2001
From: "Steven J. Magnani" <steve@digidescorp.com>
Date: Thu, 6 May 2010 16:38:33 -0500
Subject: microblaze: fix get_user/put_user side-effects

The Microblaze implementations of get_user() and (MMU) put_user() evaluate
the address argument more than once. This causes unexpected side-effects for
invocations that include increment operators, i.e. get_user(foo, bar++).

This patch also removes the distinction between MMU and noMMU put_user().

Without the patch:
  $ echo 1234567890 > /proc/sys/kernel/core_pattern
  $ cat /proc/sys/kernel/core_pattern
  12345

Signed-off-by: Steven J. Magnani <steve@digidescorp.com>
---
 arch/microblaze/include/asm/uaccess.h | 87 +++++++++++++++++++++++++++--------
 1 file changed, 69 insertions(+), 18 deletions(-)

diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h
index 446bec29b14..26460d15b33 100644
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -182,6 +182,39 @@ extern long __user_bad(void);
  * Returns zero on success, or -EFAULT on error.
  * On error, the variable @x is set to zero.
  */
+#define get_user(x, ptr)						\
+	__get_user_check((x), (ptr), sizeof(*(ptr)))
+
+#define __get_user_check(x, ptr, size)					\
+({									\
+	unsigned long __gu_val = 0;					\
+	const typeof(*(ptr)) __user *__gu_addr = (ptr);			\
+	int __gu_err = 0;						\
+									\
+	if (access_ok(VERIFY_READ, __gu_addr, size)) {			\
+		switch (size) {						\
+		case 1:							\
+			__get_user_asm("lbu", __gu_addr, __gu_val,	\
+				       __gu_err);			\
+			break;						\
+		case 2:							\
+			__get_user_asm("lhu", __gu_addr, __gu_val,	\
+				       __gu_err);			\
+			break;						\
+		case 4:							\
+			__get_user_asm("lw", __gu_addr, __gu_val,	\
+				       __gu_err);			\
+			break;						\
+		default:						\
+			__gu_err = __user_bad();			\
+			break;						\
+		}							\
+	} else {							\
+		__gu_err = -EFAULT;					\
+	}								\
+	x = (typeof(*(ptr)))__gu_val;					\
+	__gu_err;							\
+})
 
 #define __get_user(x, ptr)						\
 ({									\
@@ -206,12 +239,6 @@ extern long __user_bad(void);
 })
 
 
-#define get_user(x, ptr)						\
-({									\
-	access_ok(VERIFY_READ, (ptr), sizeof(*(ptr)))			\
-		? __get_user((x), (ptr)) : -EFAULT;			\
-})
-
 #define __put_user_asm(insn, __gu_ptr, __gu_val, __gu_err)	\
 ({								\
 	__asm__ __volatile__ (					\
@@ -266,6 +293,42 @@ extern long __user_bad(void);
  *
  * Returns zero on success, or -EFAULT on error.
  */
+#define put_user(x, ptr)						\
+	__put_user_check((x), (ptr), sizeof(*(ptr)))
+
+#define __put_user_check(x, ptr, size)					\
+({									\
+	typeof(*(ptr)) __pu_val;					\
+	typeof(*(ptr)) __user *__pu_addr = (ptr);			\
+	int __pu_err = 0;						\
+									\
+	__pu_val = (x);							\
+	if (access_ok(VERIFY_WRITE, __pu_addr, size)) {			\
+		switch (size) {						\
+		case 1:							\
+			__put_user_asm("sb", __pu_addr, __pu_val,	\
+				       __pu_err);			\
+			break;						\
+		case 2:							\
+			__put_user_asm("sh", __pu_addr, __pu_val,	\
+				       __pu_err);			\
+			break;						\
+		case 4:							\
+			__put_user_asm("sw", __pu_addr, __pu_val,	\
+				       __pu_err);			\
+			break;						\
+		case 8:							\
+			__put_user_asm_8(__pu_addr, __pu_val, __pu_err);\
+			break;						\
+		default:						\
+			__pu_err = __user_bad();			\
+			break;						\
+		}							\
+	} else {							\
+		__pu_err = -EFAULT;					\
+	}								\
+	__pu_err;							\
+})
 
 #define __put_user(x, ptr)						\
 ({									\
@@ -290,18 +353,6 @@ extern long __user_bad(void);
 	__gu_err;							\
 })
 
-#ifndef CONFIG_MMU
-
-#define put_user(x, ptr)	__put_user((x), (ptr))
-
-#else /* CONFIG_MMU */
-
-#define put_user(x, ptr)						\
-({									\
-	access_ok(VERIFY_WRITE, (ptr), sizeof(*(ptr)))			\
-		? __put_user((x), (ptr)) : -EFAULT;			\
-})
-#endif /* CONFIG_MMU */
 
 /* copy_to_from_user */
 #define __copy_from_user(to, from, n)	\
-- 
cgit v1.2.3


From 504e8beed161bd11a2c6cbb8aaf352c14d39b5bb Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 13 May 2010 00:41:15 -0700
Subject: Input: elantech - use all 3 bytes when checking version

Apparently all 3 bytes returned by ETP_FW_VERSION_QUERY are significant
and should be taken into account when matching hardware version/features.

Tested-by: Eric Piel <eric.piel@tremplin-utc.net>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/mouse/elantech.c | 24 ++++++++++++------------
 drivers/input/mouse/elantech.h |  5 ++---
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c
index 2cbf3fc4729..1ac12f7c872 100644
--- a/drivers/input/mouse/elantech.c
+++ b/drivers/input/mouse/elantech.c
@@ -184,7 +184,7 @@ static void elantech_report_absolute_v1(struct psmouse *psmouse)
 	int fingers;
 	static int old_fingers;
 
-	if (etd->fw_version_maj == 0x01) {
+	if (etd->fw_version < 0x020000) {
 		/*
 		 * byte 0:  D   U  p1  p2   1  p3   R   L
 		 * byte 1:  f   0  th  tw  x9  x8  y9  y8
@@ -226,7 +226,7 @@ static void elantech_report_absolute_v1(struct psmouse *psmouse)
 	input_report_key(dev, BTN_LEFT, packet[0] & 0x01);
 	input_report_key(dev, BTN_RIGHT, packet[0] & 0x02);
 
-	if ((etd->fw_version_maj == 0x01) &&
+	if (etd->fw_version < 0x020000 &&
 	    (etd->capabilities & ETP_CAP_HAS_ROCKER)) {
 		/* rocker up */
 		input_report_key(dev, BTN_FORWARD, packet[0] & 0x40);
@@ -320,7 +320,7 @@ static int elantech_check_parity_v1(struct psmouse *psmouse)
 	unsigned char p1, p2, p3;
 
 	/* Parity bits are placed differently */
-	if (etd->fw_version_maj == 0x01) {
+	if (etd->fw_version < 0x020000) {
 		/* byte 0:  D   U  p1  p2   1  p3   R   L */
 		p1 = (packet[0] & 0x20) >> 5;
 		p2 = (packet[0] & 0x10) >> 4;
@@ -456,7 +456,7 @@ static void elantech_set_input_params(struct psmouse *psmouse)
 	switch (etd->hw_version) {
 	case 1:
 		/* Rocker button */
-		if ((etd->fw_version_maj == 0x01) &&
+		if (etd->fw_version < 0x020000 &&
 		    (etd->capabilities & ETP_CAP_HAS_ROCKER)) {
 			__set_bit(BTN_FORWARD, dev->keybit);
 			__set_bit(BTN_BACK, dev->keybit);
@@ -685,15 +685,14 @@ int elantech_init(struct psmouse *psmouse)
 		pr_err("elantech.c: failed to query firmware version.\n");
 		goto init_fail;
 	}
-	etd->fw_version_maj = param[0];
-	etd->fw_version_min = param[2];
+
+	etd->fw_version = (param[0] << 16) | (param[1] << 8) | param[2];
 
 	/*
 	 * Assume every version greater than this is new EeePC style
 	 * hardware with 6 byte packets
 	 */
-	if ((etd->fw_version_maj == 0x02 && etd->fw_version_min >= 0x30) ||
-	    etd->fw_version_maj > 0x02) {
+	if (etd->fw_version >= 0x020030) {
 		etd->hw_version = 2;
 		/* For now show extra debug information */
 		etd->debug = 1;
@@ -703,8 +702,9 @@ int elantech_init(struct psmouse *psmouse)
 		etd->hw_version = 1;
 		etd->paritycheck = 1;
 	}
-	pr_info("elantech.c: assuming hardware version %d, firmware version %d.%d\n",
-		etd->hw_version, etd->fw_version_maj, etd->fw_version_min);
+
+	pr_info("elantech.c: assuming hardware version %d, firmware version %d.%d.%d\n",
+		etd->hw_version, param[0], param[1], param[2]);
 
 	if (synaptics_send_cmd(psmouse, ETP_CAPABILITIES_QUERY, param)) {
 		pr_err("elantech.c: failed to query capabilities.\n");
@@ -719,8 +719,8 @@ int elantech_init(struct psmouse *psmouse)
 	 * a touch action starts causing the mouse cursor or scrolled page
 	 * to jump. Enable a workaround.
 	 */
-	if (etd->fw_version_maj == 0x02 && etd->fw_version_min == 0x22) {
-		pr_info("elantech.c: firmware version 2.34 detected, "
+	if (etd->fw_version == 0x020022) {
+		pr_info("elantech.c: firmware version 2.0.34 detected, "
 			"enabling jumpy cursor workaround\n");
 		etd->jumpy_cursor = 1;
 	}
diff --git a/drivers/input/mouse/elantech.h b/drivers/input/mouse/elantech.h
index feac5f7af96..ac57bde1bb9 100644
--- a/drivers/input/mouse/elantech.h
+++ b/drivers/input/mouse/elantech.h
@@ -100,11 +100,10 @@ struct elantech_data {
 	unsigned char reg_26;
 	unsigned char debug;
 	unsigned char capabilities;
-	unsigned char fw_version_maj;
-	unsigned char fw_version_min;
-	unsigned char hw_version;
 	unsigned char paritycheck;
 	unsigned char jumpy_cursor;
+	unsigned char hw_version;
+	unsigned int  fw_version;
 	unsigned char parity[256];
 };
 
-- 
cgit v1.2.3


From ef110b24e28f36620f63dab94708a17c7e267358 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 13 May 2010 00:42:23 -0700
Subject: Input: psmouse - reset all types of mice before reconnecting

Synaptics hardware requires resetting device after suspend to ram
in order for the device to be operational. The reset lives in
synaptics-specific reconnect handler, but it is not being invoked
if synaptics support is disabled and the device is handled as a
standard PS/2 device (bare or IntelliMouse protocol).

Let's add reset into generic reconnect handler as well.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/mouse/psmouse-base.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/input/mouse/psmouse-base.c b/drivers/input/mouse/psmouse-base.c
index cbc80726494..a3c97315a47 100644
--- a/drivers/input/mouse/psmouse-base.c
+++ b/drivers/input/mouse/psmouse-base.c
@@ -1394,6 +1394,7 @@ static int psmouse_reconnect(struct serio *serio)
 	struct psmouse *psmouse = serio_get_drvdata(serio);
 	struct psmouse *parent = NULL;
 	struct serio_driver *drv = serio->drv;
+	unsigned char type;
 	int rc = -1;
 
 	if (!drv || !psmouse) {
@@ -1413,10 +1414,15 @@ static int psmouse_reconnect(struct serio *serio)
 	if (psmouse->reconnect) {
 		if (psmouse->reconnect(psmouse))
 			goto out;
-	} else if (psmouse_probe(psmouse) < 0 ||
-		   psmouse->type != psmouse_extensions(psmouse,
-						psmouse_max_proto, false)) {
-		goto out;
+	} else {
+		psmouse_reset(psmouse);
+
+		if (psmouse_probe(psmouse) < 0)
+			goto out;
+
+		type = psmouse_extensions(psmouse, psmouse_max_proto, false);
+		if (psmouse->type != type)
+			goto out;
 	}
 
 	/* ok, the device type (and capabilities) match the old one,
-- 
cgit v1.2.3


From 3843384a055496dfed3c93ae883d964d8290fdab Mon Sep 17 00:00:00 2001
From: Oskar Schirmer <os@emlix.com>
Date: Thu, 13 May 2010 00:42:23 -0700
Subject: Input: ad7877 - keep dma rx buffers in seperate cache lines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With dma based spi transmission, data corruption is observed
occasionally. With dma buffers located right next to msg and
xfer fields, cache lines correctly flushed in preparation for
dma usage may be polluted again when writing to fields in the
same cache line.

Make sure cache fields used with dma do not share cache lines
with fields changed during dma handling. As both fields are part
of a struct that is allocated via kzalloc, thus cache aligned,
moving the fields to the 1st position and insert padding for
alignment does the job.

Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Daniel Glöckner <dg@emlix.com>
Signed-off-by: Oliver Schneidewind <osw@emlix.com>
Signed-off-by: Johannes Weiner <jw@emlix.com>
Acked-by: Mike Frysinger <vapier@gentoo.org>
[dtor@mail.ru - changed to use ___cacheline_aligned as suggested
 by akpm]
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/touchscreen/ad7877.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/input/touchscreen/ad7877.c b/drivers/input/touchscreen/ad7877.c
index e019d53d1ab..0d2d7e54b46 100644
--- a/drivers/input/touchscreen/ad7877.c
+++ b/drivers/input/touchscreen/ad7877.c
@@ -156,9 +156,14 @@ struct ser_req {
 	u16			reset;
 	u16			ref_on;
 	u16			command;
-	u16			sample;
 	struct spi_message	msg;
 	struct spi_transfer	xfer[6];
+
+	/*
+	 * DMA (thus cache coherency maintenance) requires the
+	 * transfer buffers to live in their own cache lines.
+	 */
+	u16 sample ____cacheline_aligned;
 };
 
 struct ad7877 {
@@ -182,8 +187,6 @@ struct ad7877 {
 	u8			averaging;
 	u8			pen_down_acc_interval;
 
-	u16			conversion_data[AD7877_NR_SENSE];
-
 	struct spi_transfer	xfer[AD7877_NR_SENSE + 2];
 	struct spi_message	msg;
 
@@ -195,6 +198,12 @@ struct ad7877 {
 	spinlock_t		lock;
 	struct timer_list	timer;		/* P: lock */
 	unsigned		pending:1;	/* P: lock */
+
+	/*
+	 * DMA (thus cache coherency maintenance) requires the
+	 * transfer buffers to live in their own cache lines.
+	 */
+	u16 conversion_data[AD7877_NR_SENSE] ____cacheline_aligned;
 };
 
 static int gpio3;
-- 
cgit v1.2.3


From e1733d2c397476c245a681ba0b54c88858b7a0be Mon Sep 17 00:00:00 2001
From: "Steven J. Magnani" <steve@digidescorp.com>
Date: Tue, 27 Apr 2010 13:00:35 -0500
Subject: microblaze: export assembly functions used by modules

Modules that use copy_{to,from}_user(), memcpy(), and memset() fail to build
in certain circumstances.

Signed-off-by: Steven J. Magnani <steve@digidescorp.com>
Signed-off-by: Michal Simek <monstr@monstr.eu>
---
 arch/microblaze/kernel/microblaze_ksyms.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/microblaze/kernel/microblaze_ksyms.c b/arch/microblaze/kernel/microblaze_ksyms.c
index bc4dcb7d386..ecfb852cd1c 100644
--- a/arch/microblaze/kernel/microblaze_ksyms.c
+++ b/arch/microblaze/kernel/microblaze_ksyms.c
@@ -52,3 +52,13 @@ EXPORT_SYMBOL_GPL(_ebss);
 extern void _mcount(void);
 EXPORT_SYMBOL(_mcount);
 #endif
+
+/*
+ * Assembly functions that may be used (directly or indirectly) by modules
+ */
+EXPORT_SYMBOL(__copy_tofrom_user);
+
+#ifdef CONFIG_OPT_LIB_ASM
+EXPORT_SYMBOL(memcpy);
+EXPORT_SYMBOL(memmove);
+#endif
-- 
cgit v1.2.3


From ddfbc935eae68294834dc29998f93147a5422a0d Mon Sep 17 00:00:00 2001
From: Michal Simek <monstr@monstr.eu>
Date: Thu, 13 May 2010 10:55:47 +0200
Subject: microblaze: Remove compilation warnings in cache macro

CC      arch/microblaze/kernel/cpu/cache.o
arch/microblaze/kernel/cpu/cache.c: In function '__invalidate_dcache_range_wb':
arch/microblaze/kernel/cpu/cache.c:398: warning: ISO C90 forbids mixed declarations and code
arch/microblaze/kernel/cpu/cache.c: In function '__flush_dcache_range_wb':
arch/microblaze/kernel/cpu/cache.c:509: warning: ISO C90 forbids mixed declara

Signed-off-by: Michal Simek <monstr@monstr.eu>
---
 arch/microblaze/kernel/cpu/cache.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/microblaze/kernel/cpu/cache.c b/arch/microblaze/kernel/cpu/cache.c
index 21c3a92394d..109876e8d64 100644
--- a/arch/microblaze/kernel/cpu/cache.c
+++ b/arch/microblaze/kernel/cpu/cache.c
@@ -137,8 +137,9 @@ do {									\
 do {									\
 	int step = -line_length;					\
 	int align = ~(line_length - 1);					\
+	int count;							\
 	end = ((end & align) == end) ? end - line_length : end & align;	\
-	int count = end - start;					\
+	count = end - start;						\
 	WARN_ON(count < 0);						\
 									\
 	__asm__ __volatile__ (" 1:	" #op "	%0, %1;			\
-- 
cgit v1.2.3


From 1ce2470aa544a0aa37b575c45cba366770860af7 Mon Sep 17 00:00:00 2001
From: Michal Simek <monstr@monstr.eu>
Date: Thu, 13 May 2010 12:09:54 +0200
Subject: microblaze: Remove powerpc code from Microblaze port

Remove eeh_add_device_tree_late which is powerpc specific code.

Signed-off-by: Michal Simek <monstr@monstr.eu>
---
 arch/microblaze/pci/pci-common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index 01c8c97c15b..9cb782b8e03 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -1507,7 +1507,7 @@ void pcibios_finish_adding_to_bus(struct pci_bus *bus)
 	pci_bus_add_devices(bus);
 
 	/* Fixup EEH */
-	eeh_add_device_tree_late(bus);
+	/* eeh_add_device_tree_late(bus); */
 }
 EXPORT_SYMBOL_GPL(pcibios_finish_adding_to_bus);
 
-- 
cgit v1.2.3


From ee4bcdf1d248c4ebe5f73e11631c3bd3f76d777b Mon Sep 17 00:00:00 2001
From: Michal Simek <monstr@monstr.eu>
Date: Thu, 13 May 2010 12:11:42 +0200
Subject: microblaze: export assembly functions used by modules

Export __strncpy_user, memory_size, ioremap_bot for modules.

Signed-off-by: Michal Simek <monstr@monstr.eu>
---
 arch/microblaze/kernel/microblaze_ksyms.c | 1 +
 arch/microblaze/mm/init.c                 | 1 +
 arch/microblaze/mm/pgtable.c              | 1 +
 3 files changed, 3 insertions(+)

diff --git a/arch/microblaze/kernel/microblaze_ksyms.c b/arch/microblaze/kernel/microblaze_ksyms.c
index ecfb852cd1c..ff85f771803 100644
--- a/arch/microblaze/kernel/microblaze_ksyms.c
+++ b/arch/microblaze/kernel/microblaze_ksyms.c
@@ -57,6 +57,7 @@ EXPORT_SYMBOL(_mcount);
  * Assembly functions that may be used (directly or indirectly) by modules
  */
 EXPORT_SYMBOL(__copy_tofrom_user);
+EXPORT_SYMBOL(__strncpy_user);
 
 #ifdef CONFIG_OPT_LIB_ASM
 EXPORT_SYMBOL(memcpy);
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index f42c2dde8b1..cca3579d426 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -47,6 +47,7 @@ unsigned long memory_start;
 EXPORT_SYMBOL(memory_start);
 unsigned long memory_end; /* due to mm/nommu.c */
 unsigned long memory_size;
+EXPORT_SYMBOL(memory_size);
 
 /*
  * paging_init() sets up the page tables - in fact we've already done this.
diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c
index 784557fb28c..59bf2335a4c 100644
--- a/arch/microblaze/mm/pgtable.c
+++ b/arch/microblaze/mm/pgtable.c
@@ -42,6 +42,7 @@
 
 unsigned long ioremap_base;
 unsigned long ioremap_bot;
+EXPORT_SYMBOL(ioremap_bot);
 
 /* The maximum lowmem defaults to 768Mb, but this can be configured to
  * another value.
-- 
cgit v1.2.3


From 5051d411ec87381693433d24c4488b2fa4a6306c Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Fri, 2 Apr 2010 13:08:39 +0100
Subject: mfd: Clean up after WM83xx AUXADC interrupt if it arrives late

In certain circumstances, especially under heavy load, the AUXADC
completion interrupt may be detected after we've timed out waiting for
it.  That conversion would still succeed but the next conversion will
see the completion that was signalled by the interrupt for the previous
conversion and therefore not wait for the AUXADC conversion to run,
causing it to report failure.

Provide a simple, non-invasive cleanup by using try_wait_for_completion()
to ensure that the completion is not signalled before we wait.  Since
the AUXADC is run within a mutex we know there can only have been at
most one AUXADC interrupt outstanding.  A more involved change should
follow for the next merge window.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 drivers/mfd/wm831x-core.c | 3 +++
 drivers/mfd/wm8350-core.c | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/drivers/mfd/wm831x-core.c b/drivers/mfd/wm831x-core.c
index a3d5728b644..f2ab025ad97 100644
--- a/drivers/mfd/wm831x-core.c
+++ b/drivers/mfd/wm831x-core.c
@@ -349,6 +349,9 @@ int wm831x_auxadc_read(struct wm831x *wm831x, enum wm831x_auxadc input)
 		goto disable;
 	}
 
+	/* If an interrupt arrived late clean up after it */
+	try_wait_for_completion(&wm831x->auxadc_done);
+
 	/* Ignore the result to allow us to soldier on without IRQ hookup */
 	wait_for_completion_timeout(&wm831x->auxadc_done, msecs_to_jiffies(5));
 
diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c
index e400a3bed06..b5807484b4c 100644
--- a/drivers/mfd/wm8350-core.c
+++ b/drivers/mfd/wm8350-core.c
@@ -363,6 +363,10 @@ int wm8350_read_auxadc(struct wm8350 *wm8350, int channel, int scale, int vref)
 	reg |= 1 << channel | WM8350_AUXADC_POLL;
 	wm8350_reg_write(wm8350, WM8350_DIGITISER_CONTROL_1, reg);
 
+	/* If a late IRQ left the completion signalled then consume
+	 * the completion. */
+	try_wait_for_completion(&wm8350->auxadc_done);
+
 	/* We ignore the result of the completion and just check for a
 	 * conversion result, allowing us to soldier on if the IRQ
 	 * infrastructure is not set up for the chip. */
-- 
cgit v1.2.3


From 002baeecf53677d2034113e34197ec221f42e037 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 13 May 2010 12:52:57 +0200
Subject: vfs: Fix O_NOFOLLOW behavior for paths with trailing slashes

According to specification

	mkdir d; ln -s d a; open("a/", O_NOFOLLOW | O_RDONLY)

should return success but currently it returns ELOOP.  This is a
regression caused by path lookup cleanup patch series.

Fix the code to ignore O_NOFOLLOW in case the provided path has trailing
slashes.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Reported-by: Marius Tolzmann <tolzmann@molgen.mpg.de>
Acked-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index a7dce91a7e4..16df7277a92 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1641,7 +1641,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (nd->last.name[nd->last.len]) {
 		if (open_flag & O_CREAT)
 			goto exit;
-		nd->flags |= LOOKUP_DIRECTORY;
+		nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
 	}
 
 	/* just plain open? */
@@ -1830,6 +1830,8 @@ reval:
 	}
 	if (open_flag & O_DIRECTORY)
 		nd.flags |= LOOKUP_DIRECTORY;
+	if (!(open_flag & O_NOFOLLOW))
+		nd.flags |= LOOKUP_FOLLOW;
 	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path holder;
@@ -1837,7 +1839,7 @@ reval:
 		void *cookie;
 		error = -ELOOP;
 		/* S_ISDIR part is a temporary automount kludge */
-		if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode))
+		if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))
 			goto exit_dput;
 		if (count++ == 32)
 			goto exit_dput;
-- 
cgit v1.2.3


From e6114fa1d1d6a588766f28081b971047dd3e5def Mon Sep 17 00:00:00 2001
From: Anatolij Gustschin <agust@denx.de>
Date: Wed, 5 May 2010 00:18:59 +0200
Subject: serial: mpc52xx_uart: fix null pointer dereference

Commit 6acc6833510db8f72b5ef343296d97480555fda9
introduced NULL pointer dereference and kernel crash
on ppc32 machines while booting. Fix this bug now.

Reported-by: Leonardo Chiquitto <leonardo.lists@gmail.com>
Tested-by: Leonardo Chiquitto <leonardo.lists@gmail.com>
Signed-off-by: Anatolij Gustschin <agust@denx.de>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/serial/mpc52xx_uart.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/serial/mpc52xx_uart.c b/drivers/serial/mpc52xx_uart.c
index a176ab4bd65..02469c31bf0 100644
--- a/drivers/serial/mpc52xx_uart.c
+++ b/drivers/serial/mpc52xx_uart.c
@@ -1467,7 +1467,7 @@ mpc52xx_uart_init(void)
 	/*
 	 * Map the PSC FIFO Controller and init if on MPC512x.
 	 */
-	if (psc_ops->fifoc_init) {
+	if (psc_ops && psc_ops->fifoc_init) {
 		ret = psc_ops->fifoc_init();
 		if (ret)
 			return ret;
-- 
cgit v1.2.3


From 77945febbe60a69e9dcab7f49d33a1aa1e436973 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Tue, 4 May 2010 20:42:36 +0100
Subject: tty: Fix unbalanced BKL handling in error path

Arnd noted:

After the "retry_open:" label, we first get the tty_mutex
and then the BKL. However a the end of tty_open, we jump
back to retry_open with the BKL still held. If we run into
this case, the tty_open function will be left with the BKL
still held.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/char/tty_io.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 6da962c9b21..d71f0fc34b4 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -1875,6 +1875,7 @@ got_driver:
 		 */
 		if (filp->f_op == &hung_up_tty_fops)
 			filp->f_op = &tty_fops;
+		unlock_kernel();
 		goto retry_open;
 	}
 	unlock_kernel();
-- 
cgit v1.2.3


From 1c5250d6163dac28be3afabdfb6c723f107051b7 Mon Sep 17 00:00:00 2001
From: Valentin Longchamp <valentin.longchamp@epfl.ch>
Date: Wed, 5 May 2010 11:47:07 +0200
Subject: serial: imx.c: fix CTS trigger level lower to avoid lost chars
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The imx CTS trigger level is left at its reset value that is 32
chars. Since the RX FIFO has 32 entries, when CTS is raised, the
FIFO already is full. However, some serial port devices first empty
their TX FIFO before stopping when CTS is raised, resulting in lost
chars.

This patch sets the trigger level lower so that other chars arrive
after CTS is raised, there is still room for 16 of them.

Signed-off-by: Valentin Longchamp<valentin.longchamp@epfl.ch>
Tested-by: Philippe Rétornaz<philippe.retornaz@epfl.ch>
Acked-by: Wolfram Sang<w.sang@pengutronix.de>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/serial/imx.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 4315b23590b..eacb588a934 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -120,7 +120,8 @@
 #define  MX2_UCR3_RXDMUXSEL	 (1<<2)  /* RXD Muxed Input Select, on mx2/mx3 */
 #define  UCR3_INVT  	 (1<<1)  /* Inverted Infrared transmission */
 #define  UCR3_BPEN  	 (1<<0)  /* Preset registers enable */
-#define  UCR4_CTSTL_32   (32<<10) /* CTS trigger level (32 chars) */
+#define  UCR4_CTSTL_SHF  10      /* CTS trigger level shift */
+#define  UCR4_CTSTL_MASK 0x3F    /* CTS trigger is 6 bits wide */
 #define  UCR4_INVR  	 (1<<9)  /* Inverted infrared reception */
 #define  UCR4_ENIRI 	 (1<<8)  /* Serial infrared interrupt enable */
 #define  UCR4_WKEN  	 (1<<7)  /* Wake interrupt enable */
@@ -591,6 +592,9 @@ static int imx_setup_ufcr(struct imx_port *sport, unsigned int mode)
 	return 0;
 }
 
+/* half the RX buffer size */
+#define CTSTL 16
+
 static int imx_startup(struct uart_port *port)
 {
 	struct imx_port *sport = (struct imx_port *)port;
@@ -607,6 +611,10 @@ static int imx_startup(struct uart_port *port)
 	if (USE_IRDA(sport))
 		temp |= UCR4_IRSC;
 
+	/* set the trigger level for CTS */
+	temp &= ~(UCR4_CTSTL_MASK<<  UCR4_CTSTL_SHF);
+	temp |= CTSTL<<  UCR4_CTSTL_SHF;
+
 	writel(temp & ~UCR4_DREN, sport->port.membase + UCR4);
 
 	if (USE_IRDA(sport)) {
-- 
cgit v1.2.3


From 8769e1c7177fd9f6981042bcc6c2851c99a4a7e7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 13 May 2010 19:22:58 +0200
Subject: perf hist: Fix hists__browse no-newt case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix mistake in a parameter type of the no-newt hists__browse()
version.

Fixes:
	builtin-report.c: In function ‘__cmd_report’:
	builtin-report.c:314: erreur: incompatible type for argument 1 of ‘hists__browse’

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1273771378-8577-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/hist.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index ed9c0673496..0b4c8df914b 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -78,7 +78,7 @@ void hists__filter_by_dso(struct hists *self, const struct dso *dso);
 void hists__filter_by_thread(struct hists *self, const struct thread *thread);
 
 #ifdef NO_NEWT_SUPPORT
-static inline int hists__browse(struct hists self __used,
+static inline int hists__browse(struct hists *self __used,
 				const char *helpline __used,
 				const char *input_name __used)
 {
-- 
cgit v1.2.3


From 8a0ecfb8b47dc765fdf460913231876bbc95385e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 13 May 2010 19:47:16 +0200
Subject: perf hist: Fix missing getline declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

hist.c needs to include util.h so that it gets stdio.h
inclusion with __GNU_SOURCE defined.

Fixes:
	util/hist.c: In function ‘hist_entry__parse_objdump_line’:
	util/hist.c:931: erreur: implicit declaration of function ‘getline’
	util/hist.c:931: erreur: nested extern declaration of ‘getline’

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1273772836-11533-1-git-send-regression-fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/hist.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 451d2e45d84..5dc4f8429ed 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1,3 +1,4 @@
+#include "util.h"
 #include "hist.h"
 #include "session.h"
 #include "sort.h"
-- 
cgit v1.2.3


From 2e6cdf996ba43ce0b090ffbf754f83e17362cd69 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Wed, 12 May 2010 10:40:01 +0200
Subject: perf tools: change event inheritance logic in stat and record
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

By default, event inheritance across fork and pthread_create was on but the -i
option of stat and record, which enabled inheritance, led to believe it was off
by default.

This patch fixes this logic by inverting the meaning of the -i option.  By
default inheritance is on whether you attach to a process (-p), a thread (-t)
or start a process. If you pass -i, then you turn off inheritance. Turning off
inheritance if you don't need it, helps limit perf resource usage as well.

The patch also fixes perf stat -t xxxx and perf record -t xxxx which did not
start the counters.

Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <4bea9d2f.d60ce30a.0b5b.08e1@mx.google.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-record.txt |  4 ++--
 tools/perf/Documentation/perf-stat.txt   |  4 ++--
 tools/perf/builtin-record.c              | 12 ++++++------
 tools/perf/builtin-stat.c                | 10 +++++-----
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 020d871c793..34e255fc3e2 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -69,8 +69,8 @@ OPTIONS
 	Output file name.
 
 -i::
---inherit::
-	Child tasks inherit counters.
+--no-inherit::
+	Child tasks do not inherit counters.
 -F::
 --freq=::
 	Profile at this frequency.
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 484080dd5b6..2cab8e8c33d 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -31,8 +31,8 @@ OPTIONS
 	 hexadecimal event descriptor.
 
 -i::
---inherit::
-        child tasks inherit counters
+--no-inherit::
+        child tasks do not inherit counters
 -p::
 --pid=<pid>::
         stat events on existing pid
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 6b77b285fe1..0f467cf7aa7 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -54,7 +54,7 @@ static pid_t			target_tid			=     -1;
 static pid_t			*all_tids			=      NULL;
 static int			thread_num			=      0;
 static pid_t			child_pid			=     -1;
-static bool			inherit				=   true;
+static bool			no_inherit			=  false;
 static enum write_mode_t	write_mode			= WRITE_FORCE;
 static bool			call_graph			=  false;
 static bool			inherit_stat			=  false;
@@ -298,8 +298,8 @@ static void create_counter(int counter, int cpu)
 
 	attr->mmap		= track;
 	attr->comm		= track;
-	attr->inherit		= inherit;
-	if (target_pid == -1 && !system_wide) {
+	attr->inherit		= !no_inherit;
+	if (target_pid == -1 && target_tid == -1 && !system_wide) {
 		attr->disabled = 1;
 		attr->enable_on_exec = 1;
 	}
@@ -641,7 +641,7 @@ static int __cmd_record(int argc, const char **argv)
 		close(child_ready_pipe[0]);
 	}
 
-	if ((!system_wide && !inherit) || profile_cpu != -1) {
+	if ((!system_wide && no_inherit) || profile_cpu != -1) {
 		open_counters(profile_cpu);
 	} else {
 		nr_cpus = read_cpu_map();
@@ -821,8 +821,8 @@ static const struct option options[] = {
 		    "event period to sample"),
 	OPT_STRING('o', "output", &output_name, "file",
 		    "output file name"),
-	OPT_BOOLEAN('i', "inherit", &inherit,
-		    "child tasks inherit counters"),
+	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
+		    "child tasks do not inherit counters"),
 	OPT_INTEGER('F', "freq", &user_freq,
 		    "profile at this frequency"),
 	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index e619ac89dff..ff8c413b7e7 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -72,7 +72,7 @@ static unsigned int		nr_cpus				=  0;
 static int			run_idx				=  0;
 
 static int			run_count			=  1;
-static bool			inherit				=  true;
+static bool			no_inherit			= false;
 static bool			scale				=  true;
 static pid_t			target_pid			= -1;
 static pid_t			target_tid			= -1;
@@ -167,8 +167,8 @@ static int create_perf_stat_counter(int counter)
 				++ncreated;
 		}
 	} else {
-		attr->inherit	     = inherit;
-		if (target_pid == -1) {
+		attr->inherit = !no_inherit;
+		if (target_pid == -1 && target_tid == -1) {
 			attr->disabled = 1;
 			attr->enable_on_exec = 1;
 		}
@@ -518,8 +518,8 @@ static const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
 		     "event selector. use 'perf list' to list available events",
 		     parse_events),
-	OPT_BOOLEAN('i', "inherit", &inherit,
-		    "child tasks inherit counters"),
+	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
+		    "child tasks do not inherit counters"),
 	OPT_INTEGER('p', "pid", &target_pid,
 		    "stat events on existing process id"),
 	OPT_INTEGER('t', "tid", &target_tid,
-- 
cgit v1.2.3


From 5d2be7cb198a0a6bc6088d3806fb7261b184ad89 Mon Sep 17 00:00:00 2001
From: Kirill Smelkov <kirr@mns.spb.ru>
Date: Thu, 13 May 2010 14:39:25 +0400
Subject: perf trace scripts: Fix typos in perf-trace-python.txt

option option   -> option
special special -> special

Signed-off-by: Kirill Smelkov <kirr@mns.spb.ru>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <1273747165-17242-1-git-send-email-kirr@mns.spb.ru>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Documentation/perf-trace-python.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-trace-python.txt b/tools/perf/Documentation/perf-trace-python.txt
index 864aac283a7..693be804dd3 100644
--- a/tools/perf/Documentation/perf-trace-python.txt
+++ b/tools/perf/Documentation/perf-trace-python.txt
@@ -182,7 +182,7 @@ mean either that the record step recorded event types that it wasn't
 really interested in, or the script was run against a trace file that
 doesn't correspond to the script.
 
-The script generated by -g option option simply prints a line for each
+The script generated by -g option simply prints a line for each
 event found in the trace stream i.e. it basically just dumps the event
 and its parameter values to stdout.  The print_header() function is
 simply a utility function used for that purpose.  Let's rename the
@@ -582,7 +582,7 @@ files:
   flag_str(event_name, field_name, field_value) - returns the string represention corresponding to field_value for the flag field field_name of event event_name
   symbol_str(event_name, field_name, field_value) - returns the string represention corresponding to field_value for the symbolic field field_name of event event_name
 
-The *autodict* function returns a special special kind of Python
+The *autodict* function returns a special kind of Python
 dictionary that implements Perl's 'autovivifying' hashes in Python
 i.e. with autovivifying hashes, you can assign nested hash values
 without having to go to the trouble of creating intermediate levels if
-- 
cgit v1.2.3


From f01487119dda3d9f58c9729c7361ecc50a61c188 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <herrmann.der.user@googlemail.com>
Date: Tue, 27 Apr 2010 12:13:48 +0200
Subject: x86, amd: Check X86_FEATURE_OSVW bit before accessing OSVW MSRs

If host CPU is exposed to a guest the OSVW MSRs are not guaranteed
to be present and a GP fault occurs. Thus checking the feature flag is
essential.

Cc: <stable@kernel.org> # .32.x .33.x
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
LKML-Reference: <20100427101348.GC4489@alberich.amd.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/process.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 28ad9f4d8b9..0415c3ef91b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -546,11 +546,13 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
 		 * check OSVW bit for CPUs that are not affected
 		 * by erratum #400
 		 */
-		rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
-		if (val >= 2) {
-			rdmsrl(MSR_AMD64_OSVW_STATUS, val);
-			if (!(val & BIT(1)))
-				goto no_c1e_idle;
+		if (cpu_has(c, X86_FEATURE_OSVW)) {
+			rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
+			if (val >= 2) {
+				rdmsrl(MSR_AMD64_OSVW_STATUS, val);
+				if (!(val & BIT(1)))
+					goto no_c1e_idle;
+			}
 		}
 		return 1;
 	}
-- 
cgit v1.2.3


From 9e565292270a2d55524be38835104c564ac8f795 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Thu, 13 May 2010 21:43:03 -0700
Subject: x86: Use .cfi_sections for assembly code

The newer assemblers support the .cfi_sections directive so we can put
the CFI from .S files into the .debug_frame section that is preserved
in unstripped vmlinux and in separate debuginfo, rather than the
.eh_frame section that is now discarded by vmlinux.lds.S.

Signed-off-by: Roland McGrath <roland@redhat.com>
LKML-Reference: <20100514044303.A6FE7400BE@magilla.sf.frob.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Makefile             |  5 +++--
 arch/x86/include/asm/dwarf2.h | 12 ++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 0a43dc515e4..8aa1b59b907 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -95,8 +95,9 @@ sp-$(CONFIG_X86_64) := rsp
 cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
 # is .cfi_signal_frame supported too?
 cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe)
+cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index ae6253ab902..733f7e91e7a 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -34,6 +34,18 @@
 #define CFI_SIGNAL_FRAME
 #endif
 
+#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__)
+	/*
+	 * Emit CFI data in .debug_frame sections, not .eh_frame sections.
+	 * The latter we currently just discard since we don't do DWARF
+	 * unwinding at runtime.  So only the offline DWARF information is
+	 * useful to anyone.  Note we should not use this directive if this
+	 * file is used in the vDSO assembly, or if vmlinux.lds.S gets
+	 * changed so it doesn't discard .eh_frame.
+	 */
+	.cfi_sections .debug_frame
+#endif
+
 #else
 
 /*
-- 
cgit v1.2.3


From a5e48b88da225580394f825ffe67e444b050074b Mon Sep 17 00:00:00 2001
From: Michal Simek <monstr@monstr.eu>
Date: Fri, 14 May 2010 07:40:46 +0200
Subject: microblaze: Fix module loading on system with WB cache

There is necessary to flush whole dcache. Icache work should be
done in kernel/module.c.

Signed-off-by: Michal Simek <monstr@monstr.eu>
---
 arch/microblaze/kernel/module.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/microblaze/kernel/module.c b/arch/microblaze/kernel/module.c
index cbecf110dc3..0e73f660654 100644
--- a/arch/microblaze/kernel/module.c
+++ b/arch/microblaze/kernel/module.c
@@ -16,6 +16,7 @@
 #include <linux/string.h>
 
 #include <asm/pgtable.h>
+#include <asm/cacheflush.h>
 
 void *module_alloc(unsigned long size)
 {
@@ -151,6 +152,7 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab,
 int module_finalize(const Elf32_Ehdr *hdr, const Elf_Shdr *sechdrs,
 		struct module *module)
 {
+	flush_dcache();
 	return 0;
 }
 
-- 
cgit v1.2.3


From 4ae69e6b718589abe97c9625ccbb1e0bc95a8c0e Mon Sep 17 00:00:00 2001
From: Kees Cook <kees.cook@canonical.com>
Date: Thu, 22 Apr 2010 12:19:17 -0700
Subject: mmap_min_addr check CAP_SYS_RAWIO only for write

Redirecting directly to lsm, here's the patch discussed on lkml:
http://lkml.org/lkml/2010/4/22/219

The mmap_min_addr value is useful information for an admin to see without
being root ("is my system vulnerable to kernel NULL pointer attacks?") and
its setting is trivially easy for an attacker to determine by calling
mmap() in PAGE_SIZE increments starting at 0, so trying to keep it private
has no value.

Only require CAP_SYS_RAWIO if changing the value, not reading it.

Comment from Serge :

  Me, I like to write my passwords with light blue pen on dark blue
  paper, pasted on my window - if you're going to get my password, you're
  gonna get a headache.

Signed-off-by: Kees Cook <kees.cook@canonical.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
(cherry picked from commit 822cceec7248013821d655545ea45d1c6a9d15b3)
---
 security/min_addr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/security/min_addr.c b/security/min_addr.c
index e86f297522b..f728728f193 100644
--- a/security/min_addr.c
+++ b/security/min_addr.c
@@ -33,7 +33,7 @@ int mmap_min_addr_handler(struct ctl_table *table, int write,
 {
 	int ret;
 
-	if (!capable(CAP_SYS_RAWIO))
+	if (write && !capable(CAP_SYS_RAWIO))
 		return -EPERM;
 
 	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
-- 
cgit v1.2.3


From c8446b9bdabcb0caa61bb341bd73c58f7104b503 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 14 May 2010 10:36:42 -0300
Subject: perf hist: Make event__totals per hists
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is one more thing that started global but are more useful per hist
or per session.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-annotate.c |  2 +-
 tools/perf/builtin-report.c   |  8 +++++++-
 tools/perf/util/event.c       | 17 +++++++++++++++++
 tools/perf/util/event.h       |  2 ++
 tools/perf/util/hist.c        | 21 +++++++++++++++++++++
 tools/perf/util/hist.h        |  6 ++++++
 tools/perf/util/session.c     | 36 ++----------------------------------
 tools/perf/util/session.h     |  8 ++++++--
 8 files changed, 62 insertions(+), 38 deletions(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index fd1b786c8f3..77bcc9b130f 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -365,7 +365,7 @@ static int __cmd_annotate(void)
 		goto out_delete;
 
 	if (dump_trace) {
-		event__print_totals();
+		perf_session__fprintf_nr_events(session, stdout);
 		goto out_delete;
 	}
 
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 04de3387de3..f13cda1ef05 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -139,6 +139,12 @@ static int add_event_total(struct perf_session *session,
 		return -ENOMEM;
 
 	hists->stats.total += data->period;
+	/*
+	 * FIXME: add_event_total should be moved from here to
+	 * perf_session__process_event so that the proper hist is passed to
+	 * the event_op methods.
+	 */
+	hists__inc_nr_events(hists, PERF_RECORD_SAMPLE);
 	session->hists.stats.total += data->period;
 	return 0;
 }
@@ -293,7 +299,7 @@ static int __cmd_report(void)
 		goto out_delete;
 
 	if (dump_trace) {
-		event__print_totals();
+		perf_session__fprintf_nr_events(session, stdout);
 		goto out_delete;
 	}
 
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index cce006ec8f0..3e8fec17304 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -7,6 +7,23 @@
 #include "strlist.h"
 #include "thread.h"
 
+const char *event__name[] = {
+	[0]			 = "TOTAL",
+	[PERF_RECORD_MMAP]	 = "MMAP",
+	[PERF_RECORD_LOST]	 = "LOST",
+	[PERF_RECORD_COMM]	 = "COMM",
+	[PERF_RECORD_EXIT]	 = "EXIT",
+	[PERF_RECORD_THROTTLE]	 = "THROTTLE",
+	[PERF_RECORD_UNTHROTTLE] = "UNTHROTTLE",
+	[PERF_RECORD_FORK]	 = "FORK",
+	[PERF_RECORD_READ]	 = "READ",
+	[PERF_RECORD_SAMPLE]	 = "SAMPLE",
+	[PERF_RECORD_HEADER_ATTR]	 = "ATTR",
+	[PERF_RECORD_HEADER_EVENT_TYPE]	 = "EVENT_TYPE",
+	[PERF_RECORD_HEADER_TRACING_DATA]	 = "TRACING_DATA",
+	[PERF_RECORD_HEADER_BUILD_ID]	 = "BUILD_ID",
+};
+
 static pid_t event__synthesize_comm(pid_t pid, int full,
 				    event__handler_t process,
 				    struct perf_session *session)
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 48c2cc9dae4..8577085db06 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -160,4 +160,6 @@ int event__preprocess_sample(const event_t *self, struct perf_session *session,
 			     struct addr_location *al, symbol_filter_t filter);
 int event__parse_sample(event_t *event, u64 type, struct sample_data *data);
 
+extern const char *event__name[];
+
 #endif /* __PERF_RECORD_H */
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 5dc4f8429ed..1614ad71004 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -1028,3 +1028,24 @@ int hist_entry__annotate(struct hist_entry *self, struct list_head *head)
 	pclose(file);
 	return 0;
 }
+
+void hists__inc_nr_events(struct hists *self, u32 type)
+{
+	++self->hists.stats.nr_events[0];
+	++self->hists.stats.nr_events[type];
+}
+
+size_t hists__fprintf_nr_events(struct hists *self, FILE *fp)
+{
+	int i;
+	size_t ret = 0;
+
+	for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) {
+		if (!event__name[i])
+			continue;
+		ret += fprintf(fp, "%10s events: %10d\n",
+			       event__name[i], self->stats.nr_events[i]);
+	}
+
+	return ret;
+}
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 0b4c8df914b..97b8962ff69 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -40,6 +40,8 @@ struct sym_priv {
 struct events_stats {
 	u64 total;
 	u64 lost;
+	u32 nr_events[PERF_RECORD_HEADER_MAX];
+	u32 nr_unknown_events;
 };
 
 struct hists {
@@ -68,6 +70,10 @@ void hist_entry__free(struct hist_entry *);
 
 void hists__output_resort(struct hists *self);
 void hists__collapse_resort(struct hists *self);
+
+void hists__inc_nr_events(struct hists *self, u32 type);
+size_t hists__fprintf_nr_events(struct hists *self, FILE *fp);
+
 size_t hists__fprintf(struct hists *self, struct hists *pair,
 		      bool show_displacement, FILE *fp);
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 72a7f6ae029..7231f6b19fb 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -94,7 +94,6 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
 	self->mmap_window = 32;
 	self->cwd = NULL;
 	self->cwdlen = 0;
-	self->unknown_events = 0;
 	self->machines = RB_ROOT;
 	self->repipe = repipe;
 	INIT_LIST_HEAD(&self->ordered_samples.samples_head);
@@ -241,36 +240,6 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
 	}
 }
 
-static const char *event__name[] = {
-	[0]			 = "TOTAL",
-	[PERF_RECORD_MMAP]	 = "MMAP",
-	[PERF_RECORD_LOST]	 = "LOST",
-	[PERF_RECORD_COMM]	 = "COMM",
-	[PERF_RECORD_EXIT]	 = "EXIT",
-	[PERF_RECORD_THROTTLE]	 = "THROTTLE",
-	[PERF_RECORD_UNTHROTTLE] = "UNTHROTTLE",
-	[PERF_RECORD_FORK]	 = "FORK",
-	[PERF_RECORD_READ]	 = "READ",
-	[PERF_RECORD_SAMPLE]	 = "SAMPLE",
-	[PERF_RECORD_HEADER_ATTR]	 = "ATTR",
-	[PERF_RECORD_HEADER_EVENT_TYPE]	 = "EVENT_TYPE",
-	[PERF_RECORD_HEADER_TRACING_DATA]	 = "TRACING_DATA",
-	[PERF_RECORD_HEADER_BUILD_ID]	 = "BUILD_ID",
-};
-
-unsigned long event__total[PERF_RECORD_HEADER_MAX];
-
-void event__print_totals(void)
-{
-	int i;
-	for (i = 0; i < PERF_RECORD_HEADER_MAX; ++i) {
-		if (!event__name[i])
-			continue;
-		pr_info("%10s events: %10ld\n",
-			event__name[i], event__total[i]);
-	}
-}
-
 void mem_bswap_64(void *src, int byte_size)
 {
 	u64 *m = src;
@@ -580,8 +549,7 @@ static int perf_session__process_event(struct perf_session *self,
 		dump_printf("%#Lx [%#x]: PERF_RECORD_%s",
 			    offset + head, event->header.size,
 			    event__name[event->header.type]);
-		++event__total[0];
-		++event__total[event->header.type];
+		hists__inc_nr_events(self, event->header.type);
 	}
 
 	if (self->header.needs_swap && event__swap_ops[event->header.type])
@@ -619,7 +587,7 @@ static int perf_session__process_event(struct perf_session *self,
 	case PERF_RECORD_FINISHED_ROUND:
 		return ops->finished_round(event, self, ops);
 	default:
-		self->unknown_events++;
+		++self->hists.stats.nr_unknown_events;
 		return -1;
 	}
 }
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index ce00fa6cded..e7fce486ebe 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -30,8 +30,6 @@ struct perf_session {
 	struct machine		host_machine;
 	struct rb_root		machines;
 	struct rb_root		hists_tree;
-	unsigned long		event_total[PERF_RECORD_MAX];
-	unsigned long		unknown_events;
 	/*
 	 * FIXME: should point to the first entry in hists_tree and
 	 *        be a hists instance. Right now its only 'report'
@@ -140,4 +138,10 @@ size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, FILE *fp,
 {
 	return machines__fprintf_dsos_buildid(&self->machines, fp, with_hits);
 }
+
+static inline
+size_t perf_session__fprintf_nr_events(struct perf_session *self, FILE *fp)
+{
+	return hists__fprintf_nr_events(&self->hists, fp);
+}
 #endif /* __PERF_SESSION_H */
-- 
cgit v1.2.3


From 3dbc6fb6a3c8a7dc164ae330ab024a3fe65ae53e Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 11 May 2010 17:16:23 -0400
Subject: inotify: clean up the inotify_add_watch out path

inotify_add_watch explictly frees the unused inode mark, but it can just
use the generic code.  Just do that.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 472cdf29ef8..40da732eb73 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -578,16 +578,13 @@ retry:
 	/* return the watch descriptor for this new entry */
 	ret = tmp_ientry->wd;
 
-	/* match the ref from fsnotify_init_markentry() */
-	fsnotify_put_mark(&tmp_ientry->fsn_entry);
-
 	/* if this mark added a new event update the group mask */
 	if (mask & ~group->mask)
 		fsnotify_recalc_group_mask(group);
 
 out_err:
-	if (ret < 0)
-		kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
+	/* match the ref from fsnotify_init_markentry() */
+	fsnotify_put_mark(&tmp_ientry->fsn_entry);
 
 	return ret;
 }
-- 
cgit v1.2.3


From e08733446e72b983fed850fc5d8bd21b386feb29 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 11 May 2010 17:17:40 -0400
Subject: inotify: race use after free/double free in inotify inode marks

There is a race in the inotify add/rm watch code.  A task can find and
remove a mark which doesn't have all of it's references.  This can
result in a use after free/double free situation.

Task A					Task B
------------				-----------
inotify_new_watch()
 allocate a mark (refcnt == 1)
 add it to the idr
					inotify_rm_watch()
					 inotify_remove_from_idr()
					  fsnotify_put_mark()
					      refcnt hits 0, free
 take reference because we are on idr
 [at this point it is a use after free]
 [time goes on]
 refcnt may hit 0 again, double free

The fix is to take the reference BEFORE the object can be found in the
idr.

Signed-off-by: Eric Paris <eparis@redhat.com>
Cc: <stable@kernel.org>
---
 fs/notify/inotify/inotify_user.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 40da732eb73..e46ca685b9b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -546,21 +546,24 @@ retry:
 	if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
 		goto out_err;
 
+	/* we are putting the mark on the idr, take a reference */
+	fsnotify_get_mark(&tmp_ientry->fsn_entry);
+
 	spin_lock(&group->inotify_data.idr_lock);
 	ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
 				group->inotify_data.last_wd+1,
 				&tmp_ientry->wd);
 	spin_unlock(&group->inotify_data.idr_lock);
 	if (ret) {
+		/* we didn't get on the idr, drop the idr reference */
+		fsnotify_put_mark(&tmp_ientry->fsn_entry);
+
 		/* idr was out of memory allocate and try again */
 		if (ret == -EAGAIN)
 			goto retry;
 		goto out_err;
 	}
 
-	/* we put the mark on the idr, take a reference */
-	fsnotify_get_mark(&tmp_ientry->fsn_entry);
-
 	/* we are on the idr, now get on the inode */
 	ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
 	if (ret) {
-- 
cgit v1.2.3


From b3b38d842fa367d862b83e7670af4e0fd6a80fc0 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 12 May 2010 15:34:07 -0700
Subject: inotify: don't leak user struct on inotify release

inotify_new_group() receives a get_uid-ed user_struct and saves the
reference on group->inotify_data.user.  The problem is that free_uid() is
never called on it.

Issue seem to be introduced by 63c882a0 (inotify: reimplement inotify
using fsnotify) after 2.6.30.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Eric Paris <eparis@parisplace.org>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_fsnotify.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 1afb0a10229..e27960cd76a 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -28,6 +28,7 @@
 #include <linux/path.h> /* struct path */
 #include <linux/slab.h> /* kmem_* */
 #include <linux/types.h>
+#include <linux/sched.h>
 
 #include "inotify.h"
 
@@ -146,6 +147,7 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
 	idr_for_each(&group->inotify_data.idr, idr_callback, group);
 	idr_remove_all(&group->inotify_data.idr);
 	idr_destroy(&group->inotify_data.idr);
+	free_uid(group->inotify_data.user);
 }
 
 void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
-- 
cgit v1.2.3


From cee75ac7ecc27084accdb9d9d6fde65a09f047ae Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 14 May 2010 13:16:55 -0300
Subject: perf hist: Clarify events_stats fields usage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The events_stats.total field is too generic, rename it to .total_period,
and also add a comment explaining that it is the sum of all the .period
fields in samples, that is needed because we use auto-freq to avoid
sampling artifacts.

Ditto for events_stats.lost, that is the sum of all lost_event.lost
fields, i.e. the number of events the kernel dropped.

Looking at the users, builtin-sched.c can make use of these fields and
stop doing it again.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-diff.c   |  2 +-
 tools/perf/builtin-report.c |  8 ++++----
 tools/perf/builtin-sched.c  | 17 ++++++-----------
 tools/perf/builtin-trace.c  |  2 +-
 tools/perf/util/event.c     |  2 +-
 tools/perf/util/hist.c      | 20 ++++++++++----------
 tools/perf/util/hist.h      | 16 ++++++++++++++--
 tools/perf/util/newt.c      |  6 +++---
 tools/perf/util/session.c   |  2 +-
 9 files changed, 41 insertions(+), 34 deletions(-)

diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 3a95a0260a5..6dd4bdae8a8 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -54,7 +54,7 @@ static int diff__process_sample_event(event_t *event, struct perf_session *sessi
 		return -1;
 	}
 
-	session->hists.stats.total += data.period;
+	session->hists.stats.total_period += data.period;
 	return 0;
 }
 
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f13cda1ef05..b8f47ded628 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -138,14 +138,14 @@ static int add_event_total(struct perf_session *session,
 	if (!hists)
 		return -ENOMEM;
 
-	hists->stats.total += data->period;
+	hists->stats.total_period += data->period;
 	/*
 	 * FIXME: add_event_total should be moved from here to
 	 * perf_session__process_event so that the proper hist is passed to
 	 * the event_op methods.
 	 */
 	hists__inc_nr_events(hists, PERF_RECORD_SAMPLE);
-	session->hists.stats.total += data->period;
+	session->hists.stats.total_period += data->period;
 	return 0;
 }
 
@@ -322,10 +322,10 @@ static int __cmd_report(void)
 			if (rb_first(&session->hists.entries) ==
 			    rb_last(&session->hists.entries))
 				fprintf(stdout, "# Samples: %Ld\n#\n",
-					hists->stats.total);
+					hists->stats.total_period);
 			else
 				fprintf(stdout, "# Samples: %Ld %s\n#\n",
-					hists->stats.total,
+					hists->stats.total_period,
 					__event_name(hists->type, hists->config));
 
 			hists__fprintf(hists, NULL, false, stdout);
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index aef6ed0e119..be7bc926471 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1641,19 +1641,10 @@ static int process_sample_event(event_t *event, struct perf_session *session)
 	return 0;
 }
 
-static int process_lost_event(event_t *event __used,
-			      struct perf_session *session __used)
-{
-	nr_lost_chunks++;
-	nr_lost_events += event->lost.lost;
-
-	return 0;
-}
-
 static struct perf_event_ops event_ops = {
 	.sample			= process_sample_event,
 	.comm			= event__process_comm,
-	.lost			= process_lost_event,
+	.lost			= event__process_lost,
 	.ordered_samples	= true,
 };
 
@@ -1664,8 +1655,12 @@ static int read_events(void)
 	if (session == NULL)
 		return -ENOMEM;
 
-	if (perf_session__has_traces(session, "record -R"))
+	if (perf_session__has_traces(session, "record -R")) {
 		err = perf_session__process_events(session, &event_ops);
+		nr_events      = session->hists.stats.nr_events[0];
+		nr_lost_events = session->hists.stats.total_lost;
+		nr_lost_chunks = session->hists.stats.nr_events[PERF_RECORD_LOST];
+	}
 
 	perf_session__delete(session);
 	return err;
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 95fcb0517a9..dddf3f01b5a 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -109,7 +109,7 @@ static int process_sample_event(event_t *event, struct perf_session *session)
 					     data.time, thread->comm);
 	}
 
-	session->hists.stats.total += data.period;
+	session->hists.stats.total_period += data.period;
 	return 0;
 }
 
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 3e8fec17304..50771b5813e 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -385,7 +385,7 @@ int event__process_comm(event_t *self, struct perf_session *session)
 int event__process_lost(event_t *self, struct perf_session *session)
 {
 	dump_printf(": id:%Ld: lost:%Ld\n", self->lost.id, self->lost.lost);
-	session->hists.stats.lost += self->lost.lost;
+	session->hists.stats.total_lost += self->lost.lost;
 	return 0;
 }
 
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 1614ad71004..c5922451808 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -239,7 +239,7 @@ void hists__output_resort(struct hists *self)
 	struct hist_entry *n;
 	u64 min_callchain_hits;
 
-	min_callchain_hits = self->stats.total * (callchain_param.min_percent / 100);
+	min_callchain_hits = self->stats.total_period * (callchain_param.min_percent / 100);
 
 	tmp = RB_ROOT;
 	next = rb_first(&self->entries);
@@ -525,7 +525,7 @@ int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size,
 
 	if (pair_hists) {
 		count = self->pair ? self->pair->count : 0;
-		total = pair_hists->stats.total;
+		total = pair_hists->stats.total_period;
 		count_sys = self->pair ? self->pair->count_sys : 0;
 		count_us = self->pair ? self->pair->count_us : 0;
 		count_guest_sys = self->pair ? self->pair->count_guest_sys : 0;
@@ -769,10 +769,10 @@ print_entries:
 			++position;
 		}
 		ret += hist_entry__fprintf(h, pair, show_displacement,
-					   displacement, fp, self->stats.total);
+					   displacement, fp, self->stats.total_period);
 
 		if (symbol_conf.use_callchain)
-			ret += hist_entry__fprintf_callchain(h, fp, self->stats.total);
+			ret += hist_entry__fprintf_callchain(h, fp, self->stats.total_period);
 
 		if (h->ms.map == NULL && verbose > 1) {
 			__map_groups__fprintf_maps(&h->thread->mg,
@@ -795,7 +795,7 @@ void hists__filter_by_dso(struct hists *self, const struct dso *dso)
 {
 	struct rb_node *nd;
 
-	self->nr_entries = self->stats.total = 0;
+	self->nr_entries = self->stats.total_period = 0;
 	self->max_sym_namelen = 0;
 
 	for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
@@ -812,7 +812,7 @@ void hists__filter_by_dso(struct hists *self, const struct dso *dso)
 		h->filtered &= ~(1 << HIST_FILTER__DSO);
 		if (!h->filtered) {
 			++self->nr_entries;
-			self->stats.total += h->count;
+			self->stats.total_period += h->count;
 			if (h->ms.sym &&
 			    self->max_sym_namelen < h->ms.sym->namelen)
 				self->max_sym_namelen = h->ms.sym->namelen;
@@ -824,7 +824,7 @@ void hists__filter_by_thread(struct hists *self, const struct thread *thread)
 {
 	struct rb_node *nd;
 
-	self->nr_entries = self->stats.total = 0;
+	self->nr_entries = self->stats.total_period = 0;
 	self->max_sym_namelen = 0;
 
 	for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
@@ -837,7 +837,7 @@ void hists__filter_by_thread(struct hists *self, const struct thread *thread)
 		h->filtered &= ~(1 << HIST_FILTER__THREAD);
 		if (!h->filtered) {
 			++self->nr_entries;
-			self->stats.total += h->count;
+			self->stats.total_period += h->count;
 			if (h->ms.sym &&
 			    self->max_sym_namelen < h->ms.sym->namelen)
 				self->max_sym_namelen = h->ms.sym->namelen;
@@ -1031,8 +1031,8 @@ int hist_entry__annotate(struct hist_entry *self, struct list_head *head)
 
 void hists__inc_nr_events(struct hists *self, u32 type)
 {
-	++self->hists.stats.nr_events[0];
-	++self->hists.stats.nr_events[type];
+	++self->stats.nr_events[0];
+	++self->stats.nr_events[type];
 }
 
 size_t hists__fprintf_nr_events(struct hists *self, FILE *fp)
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 97b8962ff69..da6b84814a5 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -37,9 +37,21 @@ struct sym_priv {
 	struct sym_ext	*ext;
 };
 
+/*
+ * The kernel collects the number of events it couldn't send in a stretch and
+ * when possible sends this number in a PERF_RECORD_LOST event. The number of
+ * such "chunks" of lost events is stored in .nr_events[PERF_EVENT_LOST] while
+ * total_lost tells exactly how many events the kernel in fact lost, i.e. it is
+ * the sum of all struct lost_event.lost fields reported.
+ *
+ * The total_period is needed because by default auto-freq is used, so
+ * multipling nr_events[PERF_EVENT_SAMPLE] by a frequency isn't possible to get
+ * the total number of low level events, it is necessary to to sum all struct
+ * sample_event.period and stash the result in total_period.
+ */
 struct events_stats {
-	u64 total;
-	u64 lost;
+	u64 total_period;
+	u64 total_lost;
 	u32 nr_events[PERF_RECORD_HEADER_MAX];
 	u32 nr_unknown_events;
 };
diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index ba6acd04c08..010bacf4016 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -689,7 +689,7 @@ static int hist_browser__populate(struct hist_browser *self, struct hists *hists
 	}
 
 	snprintf(str, sizeof(str), "Samples: %Ld                            ",
-		 hists->stats.total);
+		 hists->stats.total_period);
 	newtDrawRootText(0, 0, str);
 
 	newtGetScreenSize(NULL, &rows);
@@ -718,12 +718,12 @@ static int hist_browser__populate(struct hist_browser *self, struct hists *hists
 		if (h->filtered)
 			continue;
 
-		len = hist_entry__append_browser(h, self->tree, hists->stats.total);
+		len = hist_entry__append_browser(h, self->tree, hists->stats.total_period);
 		if (len > max_len)
 			max_len = len;
 		if (symbol_conf.use_callchain)
 			hist_entry__append_callchain_browser(h, self->tree,
-							     hists->stats.total, idx++);
+							     hists->stats.total_period, idx++);
 		++curr_hist;
 		if (curr_hist % 5)
 			ui_progress__update(progress, curr_hist);
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 7231f6b19fb..25bfca4f10f 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -549,7 +549,7 @@ static int perf_session__process_event(struct perf_session *self,
 		dump_printf("%#Lx [%#x]: PERF_RECORD_%s",
 			    offset + head, event->header.size,
 			    event__name[event->header.type]);
-		hists__inc_nr_events(self, event->header.type);
+		hists__inc_nr_events(&self->hists, event->header.type);
 	}
 
 	if (self->header.needs_swap && event__swap_ops[event->header.type])
-- 
cgit v1.2.3


From c82ee828aa20487d254a5225d256cd422acee459 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 14 May 2010 14:19:35 -0300
Subject: perf report: Report number of events, not samples
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Number of samples is meaningless after we switched to auto-freq, so
report the number of events, i.e. not the sum of the different periods,
but the number PERF_RECORD_SAMPLE emitted by the kernel.

While doing this I noticed that naming "count" to the sum of all the
event periods can be confusing, so rename it to .period, just like in
struct sample.data, so that we become more consistent.

This helps with the next step, that was to record in struct hist_entry
the number of sample events for each instance, we need that because we
use it to generate the number of events when applying filters to the
tree of hist entries like it is being done in the TUI report browser.

Suggested-by: Ingo Molnar <mingo@elte.hu>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-diff.c   |  6 ++--
 tools/perf/builtin-report.c | 32 +++++++++++------
 tools/perf/util/hist.c      | 88 ++++++++++++++++++++++++---------------------
 tools/perf/util/hist.h      |  2 +-
 tools/perf/util/newt.c      |  8 +++--
 tools/perf/util/sort.h      | 11 +++---
 tools/perf/util/util.c      | 22 ++++++++++++
 tools/perf/util/util.h      |  1 +
 8 files changed, 107 insertions(+), 63 deletions(-)

diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 6dd4bdae8a8..a6e2fdc7a04 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -23,9 +23,9 @@ static bool  force;
 static bool show_displacement;
 
 static int hists__add_entry(struct hists *self,
-			    struct addr_location *al, u64 count)
+			    struct addr_location *al, u64 period)
 {
-	if (__hists__add_entry(self, al, NULL, count) != NULL)
+	if (__hists__add_entry(self, al, NULL, period) != NULL)
 		return 0;
 	return -ENOMEM;
 }
@@ -50,7 +50,7 @@ static int diff__process_sample_event(event_t *event, struct perf_session *sessi
 	event__parse_sample(event, session->sample_type, &data);
 
 	if (hists__add_entry(&session->hists, &al, data.period)) {
-		pr_warning("problem incrementing symbol count, skipping event\n");
+		pr_warning("problem incrementing symbol period, skipping event\n");
 		return -1;
 	}
 
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index b8f47ded628..68265120ee0 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -188,14 +188,14 @@ static int process_sample_event(event_t *event, struct perf_session *session)
 		return 0;
 
 	if (perf_session__add_hist_entry(session, &al, &data)) {
-		pr_debug("problem incrementing symbol count, skipping event\n");
+		pr_debug("problem incrementing symbol period, skipping event\n");
 		return -1;
 	}
 
 	attr = perf_header__find_attr(data.id, &session->header);
 
 	if (add_event_total(session, &data, attr)) {
-		pr_debug("problem adding event count\n");
+		pr_debug("problem adding event period\n");
 		return -1;
 	}
 
@@ -269,11 +269,25 @@ static struct perf_event_ops event_ops = {
 
 extern volatile int session_done;
 
-static void sig_handler(int sig __attribute__((__unused__)))
+static void sig_handler(int sig __used)
 {
 	session_done = 1;
 }
 
+static size_t hists__fprintf_nr_sample_events(struct hists *self,
+					      const char *evname, FILE *fp)
+{
+	size_t ret;
+	char unit;
+	unsigned long nr_events = self->stats.nr_events[PERF_RECORD_SAMPLE];
+
+	nr_events = convert_unit(nr_events, &unit);
+	ret = fprintf(fp, "# Events: %lu%c", nr_events, unit);
+	if (evname != NULL)
+		ret += fprintf(fp, " %s", evname);
+	return ret + fprintf(fp, "\n#\n");
+}
+
 static int __cmd_report(void)
 {
 	int ret = -EINVAL;
@@ -319,14 +333,12 @@ static int __cmd_report(void)
 		if (use_browser)
 			hists__browse(hists, help, input_name);
 		else {
-			if (rb_first(&session->hists.entries) ==
+			const char *evname = NULL;
+			if (rb_first(&session->hists.entries) !=
 			    rb_last(&session->hists.entries))
-				fprintf(stdout, "# Samples: %Ld\n#\n",
-					hists->stats.total_period);
-			else
-				fprintf(stdout, "# Samples: %Ld %s\n#\n",
-					hists->stats.total_period,
-					__event_name(hists->type, hists->config));
+				evname = __event_name(hists->type, hists->config);
+
+			hists__fprintf_nr_sample_events(hists, evname, stdout);
 
 			hists__fprintf(hists, NULL, false, stdout);
 			fprintf(stdout, "\n\n");
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index c5922451808..f75c5f62401 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -9,21 +9,21 @@ struct callchain_param	callchain_param = {
 	.min_percent = 0.5
 };
 
-static void hist_entry__add_cpumode_count(struct hist_entry *self,
-					  unsigned int cpumode, u64 count)
+static void hist_entry__add_cpumode_period(struct hist_entry *self,
+					   unsigned int cpumode, u64 period)
 {
 	switch (cpumode) {
 	case PERF_RECORD_MISC_KERNEL:
-		self->count_sys += count;
+		self->period_sys += period;
 		break;
 	case PERF_RECORD_MISC_USER:
-		self->count_us += count;
+		self->period_us += period;
 		break;
 	case PERF_RECORD_MISC_GUEST_KERNEL:
-		self->count_guest_sys += count;
+		self->period_guest_sys += period;
 		break;
 	case PERF_RECORD_MISC_GUEST_USER:
-		self->count_guest_us += count;
+		self->period_guest_us += period;
 		break;
 	default:
 		break;
@@ -31,7 +31,7 @@ static void hist_entry__add_cpumode_count(struct hist_entry *self,
 }
 
 /*
- * histogram, sorted on item, collects counts
+ * histogram, sorted on item, collects periods
  */
 
 static struct hist_entry *hist_entry__new(struct hist_entry *template)
@@ -41,6 +41,7 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template)
 
 	if (self != NULL) {
 		*self = *template;
+		self->nr_events = 1;
 		if (symbol_conf.use_callchain)
 			callchain_init(self->callchain);
 	}
@@ -57,7 +58,7 @@ static void hists__inc_nr_entries(struct hists *self, struct hist_entry *entry)
 
 struct hist_entry *__hists__add_entry(struct hists *self,
 				      struct addr_location *al,
-				      struct symbol *sym_parent, u64 count)
+				      struct symbol *sym_parent, u64 period)
 {
 	struct rb_node **p = &self->entries.rb_node;
 	struct rb_node *parent = NULL;
@@ -70,7 +71,7 @@ struct hist_entry *__hists__add_entry(struct hists *self,
 		},
 		.ip	= al->addr,
 		.level	= al->level,
-		.count	= count,
+		.period	= period,
 		.parent = sym_parent,
 	};
 	int cmp;
@@ -82,7 +83,8 @@ struct hist_entry *__hists__add_entry(struct hists *self,
 		cmp = hist_entry__cmp(&entry, he);
 
 		if (!cmp) {
-			he->count += count;
+			he->period += period;
+			++he->nr_events;
 			goto out;
 		}
 
@@ -99,7 +101,7 @@ struct hist_entry *__hists__add_entry(struct hists *self,
 	rb_insert_color(&he->rb_node, &self->entries);
 	hists__inc_nr_entries(self, he);
 out:
-	hist_entry__add_cpumode_count(he, al->cpumode, count);
+	hist_entry__add_cpumode_period(he, al->cpumode, period);
 	return he;
 }
 
@@ -160,7 +162,7 @@ static bool collapse__insert_entry(struct rb_root *root, struct hist_entry *he)
 		cmp = hist_entry__collapse(iter, he);
 
 		if (!cmp) {
-			iter->count += he->count;
+			iter->period += he->period;
 			hist_entry__free(he);
 			return false;
 		}
@@ -203,7 +205,7 @@ void hists__collapse_resort(struct hists *self)
 }
 
 /*
- * reverse the map, sort on count.
+ * reverse the map, sort on period.
  */
 
 static void __hists__insert_output_entry(struct rb_root *entries,
@@ -222,7 +224,7 @@ static void __hists__insert_output_entry(struct rb_root *entries,
 		parent = *p;
 		iter = rb_entry(parent, struct hist_entry, rb_node);
 
-		if (he->count > iter->count)
+		if (he->period > iter->period)
 			p = &(*p)->rb_left;
 		else
 			p = &(*p)->rb_right;
@@ -288,7 +290,7 @@ static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask,
 }
 
 static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain,
-				     int depth, int depth_mask, int count,
+				     int depth, int depth_mask, int period,
 				     u64 total_samples, int hits,
 				     int left_margin)
 {
@@ -301,7 +303,7 @@ static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain,
 			ret += fprintf(fp, "|");
 		else
 			ret += fprintf(fp, " ");
-		if (!count && i == depth - 1) {
+		if (!period && i == depth - 1) {
 			double percent;
 
 			percent = hits * 100.0 / total_samples;
@@ -516,7 +518,7 @@ int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size,
 			 long displacement, bool color, u64 session_total)
 {
 	struct sort_entry *se;
-	u64 count, total, count_sys, count_us, count_guest_sys, count_guest_us;
+	u64 period, total, period_sys, period_us, period_guest_sys, period_guest_us;
 	const char *sep = symbol_conf.field_sep;
 	int ret;
 
@@ -524,57 +526,57 @@ int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size,
 		return 0;
 
 	if (pair_hists) {
-		count = self->pair ? self->pair->count : 0;
+		period = self->pair ? self->pair->period : 0;
 		total = pair_hists->stats.total_period;
-		count_sys = self->pair ? self->pair->count_sys : 0;
-		count_us = self->pair ? self->pair->count_us : 0;
-		count_guest_sys = self->pair ? self->pair->count_guest_sys : 0;
-		count_guest_us = self->pair ? self->pair->count_guest_us : 0;
+		period_sys = self->pair ? self->pair->period_sys : 0;
+		period_us = self->pair ? self->pair->period_us : 0;
+		period_guest_sys = self->pair ? self->pair->period_guest_sys : 0;
+		period_guest_us = self->pair ? self->pair->period_guest_us : 0;
 	} else {
-		count = self->count;
+		period = self->period;
 		total = session_total;
-		count_sys = self->count_sys;
-		count_us = self->count_us;
-		count_guest_sys = self->count_guest_sys;
-		count_guest_us = self->count_guest_us;
+		period_sys = self->period_sys;
+		period_us = self->period_us;
+		period_guest_sys = self->period_guest_sys;
+		period_guest_us = self->period_guest_us;
 	}
 
 	if (total) {
 		if (color)
 			ret = percent_color_snprintf(s, size,
 						     sep ? "%.2f" : "   %6.2f%%",
-						     (count * 100.0) / total);
+						     (period * 100.0) / total);
 		else
 			ret = snprintf(s, size, sep ? "%.2f" : "   %6.2f%%",
-				       (count * 100.0) / total);
+				       (period * 100.0) / total);
 		if (symbol_conf.show_cpu_utilization) {
 			ret += percent_color_snprintf(s + ret, size - ret,
 					sep ? "%.2f" : "   %6.2f%%",
-					(count_sys * 100.0) / total);
+					(period_sys * 100.0) / total);
 			ret += percent_color_snprintf(s + ret, size - ret,
 					sep ? "%.2f" : "   %6.2f%%",
-					(count_us * 100.0) / total);
+					(period_us * 100.0) / total);
 			if (perf_guest) {
 				ret += percent_color_snprintf(s + ret,
 						size - ret,
 						sep ? "%.2f" : "   %6.2f%%",
-						(count_guest_sys * 100.0) /
+						(period_guest_sys * 100.0) /
 								total);
 				ret += percent_color_snprintf(s + ret,
 						size - ret,
 						sep ? "%.2f" : "   %6.2f%%",
-						(count_guest_us * 100.0) /
+						(period_guest_us * 100.0) /
 								total);
 			}
 		}
 	} else
-		ret = snprintf(s, size, sep ? "%lld" : "%12lld ", count);
+		ret = snprintf(s, size, sep ? "%lld" : "%12lld ", period);
 
 	if (symbol_conf.show_nr_samples) {
 		if (sep)
-			ret += snprintf(s + ret, size - ret, "%c%lld", *sep, count);
+			ret += snprintf(s + ret, size - ret, "%c%lld", *sep, period);
 		else
-			ret += snprintf(s + ret, size - ret, "%11lld", count);
+			ret += snprintf(s + ret, size - ret, "%11lld", period);
 	}
 
 	if (pair_hists) {
@@ -582,9 +584,9 @@ int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size,
 		double old_percent = 0, new_percent = 0, diff;
 
 		if (total > 0)
-			old_percent = (count * 100.0) / total;
+			old_percent = (period * 100.0) / total;
 		if (session_total > 0)
-			new_percent = (self->count * 100.0) / session_total;
+			new_percent = (self->period * 100.0) / session_total;
 
 		diff = new_percent - old_percent;
 
@@ -796,6 +798,7 @@ void hists__filter_by_dso(struct hists *self, const struct dso *dso)
 	struct rb_node *nd;
 
 	self->nr_entries = self->stats.total_period = 0;
+	self->stats.nr_events[PERF_RECORD_SAMPLE] = 0;
 	self->max_sym_namelen = 0;
 
 	for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
@@ -812,7 +815,8 @@ void hists__filter_by_dso(struct hists *self, const struct dso *dso)
 		h->filtered &= ~(1 << HIST_FILTER__DSO);
 		if (!h->filtered) {
 			++self->nr_entries;
-			self->stats.total_period += h->count;
+			self->stats.total_period += h->period;
+			self->stats.nr_events[PERF_RECORD_SAMPLE] += h->nr_events;
 			if (h->ms.sym &&
 			    self->max_sym_namelen < h->ms.sym->namelen)
 				self->max_sym_namelen = h->ms.sym->namelen;
@@ -825,6 +829,7 @@ void hists__filter_by_thread(struct hists *self, const struct thread *thread)
 	struct rb_node *nd;
 
 	self->nr_entries = self->stats.total_period = 0;
+	self->stats.nr_events[PERF_RECORD_SAMPLE] = 0;
 	self->max_sym_namelen = 0;
 
 	for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
@@ -837,7 +842,8 @@ void hists__filter_by_thread(struct hists *self, const struct thread *thread)
 		h->filtered &= ~(1 << HIST_FILTER__THREAD);
 		if (!h->filtered) {
 			++self->nr_entries;
-			self->stats.total_period += h->count;
+			self->stats.total_period += h->period;
+			self->stats.nr_events[PERF_RECORD_SAMPLE] += h->nr_events;
 			if (h->ms.sym &&
 			    self->max_sym_namelen < h->ms.sym->namelen)
 				self->max_sym_namelen = h->ms.sym->namelen;
@@ -881,7 +887,7 @@ int hist_entry__inc_addr_samples(struct hist_entry *self, u64 ip)
 	h->sum++;
 	h->ip[offset]++;
 
-	pr_debug3("%#Lx %s: count++ [ip: %#Lx, %#Lx] => %Ld\n", self->ms.sym->start,
+	pr_debug3("%#Lx %s: period++ [ip: %#Lx, %#Lx] => %Ld\n", self->ms.sym->start,
 		  self->ms.sym->name, ip, ip - self->ms.sym->start, h->ip[offset]);
 	return 0;
 }
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index da6b84814a5..6f17dcd8412 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -69,7 +69,7 @@ struct hists {
 
 struct hist_entry *__hists__add_entry(struct hists *self,
 				      struct addr_location *al,
-				      struct symbol *parent, u64 count);
+				      struct symbol *parent, u64 period);
 extern int64_t hist_entry__cmp(struct hist_entry *, struct hist_entry *);
 extern int64_t hist_entry__collapse(struct hist_entry *, struct hist_entry *);
 int hist_entry__fprintf(struct hist_entry *self, struct hists *pair_hists,
diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 010bacf4016..3402453812b 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -680,16 +680,18 @@ static int hist_browser__populate(struct hist_browser *self, struct hists *hists
 	struct ui_progress *progress;
 	struct rb_node *nd;
 	u64 curr_hist = 0;
-	char seq[] = ".";
+	char seq[] = ".", unit;
 	char str[256];
+	unsigned long nr_events = hists->stats.nr_events[PERF_RECORD_SAMPLE];
 
 	if (self->form) {
 		newtFormDestroy(self->form);
 		newtPopWindow();
 	}
 
-	snprintf(str, sizeof(str), "Samples: %Ld                            ",
-		 hists->stats.total_period);
+	nr_events = convert_unit(nr_events, &unit);
+	snprintf(str, sizeof(str), "Events: %lu%c                            ",
+		 nr_events, unit);
 	newtDrawRootText(0, 0, str);
 
 	newtGetScreenSize(NULL, &rows);
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index af301acc461..eab2e0b3b74 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -43,14 +43,15 @@ extern enum sort_type sort__first_dimension;
 
 struct hist_entry {
 	struct rb_node		rb_node;
-	u64			count;
-	u64			count_sys;
-	u64			count_us;
-	u64			count_guest_sys;
-	u64			count_guest_us;
+	u64			period;
+	u64			period_sys;
+	u64			period_us;
+	u64			period_guest_sys;
+	u64			period_guest_us;
 	struct map_symbol	ms;
 	struct thread		*thread;
 	u64			ip;
+	u32			nr_events;
 	char			level;
 	u8			filtered;
 	struct symbol		*parent;
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index f9b890fde68..214265674dd 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -92,3 +92,25 @@ out_close_from:
 out:
 	return err;
 }
+
+unsigned long convert_unit(unsigned long value, char *unit)
+{
+	*unit = ' ';
+
+	if (value > 1000) {
+		value /= 1000;
+		*unit = 'K';
+	}
+
+	if (value > 1000) {
+		value /= 1000;
+		*unit = 'M';
+	}
+
+	if (value > 1000) {
+		value /= 1000;
+		*unit = 'G';
+	}
+
+	return value;
+}
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index fbf45d1b26f..0795bf304b1 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -423,6 +423,7 @@ char **argv_split(const char *str, int *argcp);
 void argv_free(char **argv);
 bool strglobmatch(const char *str, const char *pat);
 bool strlazymatch(const char *str, const char *pat);
+unsigned long convert_unit(unsigned long value, char *unit);
 
 #define _STR(x) #x
 #define STR(x) _STR(x)
-- 
cgit v1.2.3


From b9af5ddf8a34ff3c911372173c2e51c6f8a6ca8f Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Thu, 13 May 2010 22:51:51 +0400
Subject: DA830: fix USB 2.0 clock entry

DA8xx OHCI driver fails to load due to failing clk_get() call for the USB 2.0
clock. Arrange matching USB 2.0 clock by the clock name instead of the device.
(Adding another CLK() entry for "ohci.0" device won't do -- in the future I'll
also have to enable USB 2.0 clock to configure CPPI 4.1 module, in which case
I won't have any device at all.)

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Kevin Hilman <khilman@deeprootsystems.com>
---
 arch/arm/mach-davinci/da830.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/mach-davinci/da830.c b/arch/arm/mach-davinci/da830.c
index 122e61a9f50..e8cb982f5e8 100644
--- a/arch/arm/mach-davinci/da830.c
+++ b/arch/arm/mach-davinci/da830.c
@@ -410,7 +410,7 @@ static struct clk_lookup da830_clks[] = {
 	CLK("davinci-mcasp.0",	NULL,		&mcasp0_clk),
 	CLK("davinci-mcasp.1",	NULL,		&mcasp1_clk),
 	CLK("davinci-mcasp.2",	NULL,		&mcasp2_clk),
-	CLK("musb_hdrc",	NULL,		&usb20_clk),
+	CLK(NULL,		"usb20",	&usb20_clk),
 	CLK(NULL,		"aemif",	&aemif_clk),
 	CLK(NULL,		"aintc",	&aintc_clk),
 	CLK(NULL,		"secu_mgr",	&secu_mgr_clk),
-- 
cgit v1.2.3


From ade029e2aaacc8965a548b0b0f80c5bee97ffc68 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Sat, 24 Apr 2010 09:56:53 +0200
Subject: x86, k8: Fix build error when K8_NB is disabled

K8_NB depends on PCI and when the last is disabled (allnoconfig) we fail
at the final linking stage due to missing exported num_k8_northbridges.
Add a header stub for that.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20100503183036.GJ26107@aftab>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: <stable@kernel.org>
---
 arch/x86/include/asm/k8.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index f70e60071fe..af00bd1d208 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -16,11 +16,16 @@ extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
 extern int k8_scan_nodes(void);
 
 #ifdef CONFIG_K8_NB
+extern int num_k8_northbridges;
+
 static inline struct pci_dev *node_to_k8_nb_misc(int node)
 {
 	return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
 }
+
 #else
+#define num_k8_northbridges 0
+
 static inline struct pci_dev *node_to_k8_nb_misc(int node)
 {
 	return NULL;
-- 
cgit v1.2.3


From 7f284d3cc96e02468a42e045f77af11e5ff8b095 Mon Sep 17 00:00:00 2001
From: Frank Arnold <frank.arnold@amd.com>
Date: Thu, 22 Apr 2010 16:06:59 +0200
Subject: x86, cacheinfo: Turn off L3 cache index disable feature in
 virtualized environments

When running a quest kernel on xen we get:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000038
IP: [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
PGD 0
Oops: 0000 [#1] SMP
last sysfs file:
CPU 0
Modules linked in:

Pid: 0, comm: swapper Tainted: G        W  2.6.34-rc3 #1 /HVM domU
RIP: 0010:[<ffffffff8142f2fb>]  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x
2ca/0x3df
RSP: 0018:ffff880002203e08  EFLAGS: 00010046
RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000060
RDX: 0000000000000000 RSI: 0000000000000040 RDI: 0000000000000000
RBP: ffff880002203ed8 R08: 00000000000017c0 R09: ffff880002203e38
R10: ffff8800023d5d40 R11: ffffffff81a01e28 R12: ffff880187e6f5c0
R13: ffff880002203e34 R14: ffff880002203e58 R15: ffff880002203e68
FS:  0000000000000000(0000) GS:ffff880002200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000038 CR3: 0000000001a3c000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffffffff81a00000, task ffffffff81a44020)
Stack:
 ffffffff810d7ecb ffff880002203e20 ffffffff81059140 ffff880002203e30
<0> ffffffff810d7ec9 0000000002203e40 000000000050d140 ffff880002203e70
<0> 0000000002008140 0000000000000086 ffff880040020140 ffffffff81068b8b
Call Trace:
 <IRQ>
 [<ffffffff810d7ecb>] ? sync_supers_timer_fn+0x0/0x1c
 [<ffffffff81059140>] ? mod_timer+0x23/0x25
 [<ffffffff810d7ec9>] ? arm_supers_timer+0x34/0x36
 [<ffffffff81068b8b>] ? hrtimer_get_next_event+0xa7/0xc3
 [<ffffffff81058e85>] ? get_next_timer_interrupt+0x19a/0x20d
 [<ffffffff8142fa23>] get_cpu_leaves+0x5c/0x232
 [<ffffffff8106a7b1>] ? sched_clock_local+0x1c/0x82
 [<ffffffff8106a9a0>] ? sched_clock_tick+0x75/0x7a
 [<ffffffff8107748c>] generic_smp_call_function_single_interrupt+0xae/0xd0
 [<ffffffff8101f6ef>] smp_call_function_single_interrupt+0x18/0x27
 [<ffffffff8100a773>] call_function_single_interrupt+0x13/0x20
 <EOI>
 [<ffffffff8143c468>] ? notifier_call_chain+0x14/0x63
 [<ffffffff810295c6>] ? native_safe_halt+0xc/0xd
 [<ffffffff810114eb>] ? default_idle+0x36/0x53
 [<ffffffff81008c22>] cpu_idle+0xaa/0xe4
 [<ffffffff81423a9a>] rest_init+0x7e/0x80
 [<ffffffff81b10dd2>] start_kernel+0x40e/0x419
 [<ffffffff81b102c8>] x86_64_start_reservations+0xb3/0xb7
 [<ffffffff81b103c4>] x86_64_start_kernel+0xf8/0x107
Code: 14 d5 40 ff ae 81 8b 14 02 31 c0 3b 15 47 1c 8b 00 7d 0e 48 8b 05 36 1c 8b
 00 48 63 d2 48 8b 04 d0 c7 85 5c ff ff ff 00 00 00 00 <8b> 70 38 48 8d 8d 5c ff
 ff ff 48 8b 78 10 ba c4 01 00 00 e8 eb
RIP  [<ffffffff8142f2fb>] cpuid4_cache_lookup_regs+0x2ca/0x3df
 RSP <ffff880002203e08>
CR2: 0000000000000038
---[ end trace a7919e7f17c0a726 ]---

The L3 cache index disable feature of AMD CPUs has to be disabled if the
kernel is running as guest on top of a hypervisor because northbridge
devices are not available to the guest. Currently, this fixes a boot
crash on top of Xen. In the future this will become an issue on KVM as
well.

Check if northbridge devices are present and do not enable the feature
if there are none.

[ hpa: backported to 2.6.34 ]

Signed-off-by: Frank Arnold <frank.arnold@amd.com>
LKML-Reference: <1271945222-5283-3-git-send-email-bp@amd64.org>
Acked-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: <stable@kernel.org>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index b3eeb66c0a5..95962a93f99 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -340,6 +340,10 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 	     (boot_cpu_data.x86_mask  < 0x1)))
 		return;
 
+	/* not in virtualized environments */
+	if (num_k8_northbridges == 0)
+		return;
+
 	this_leaf->can_disable = true;
 	this_leaf->l3_indices  = amd_calc_l3_indices();
 }
-- 
cgit v1.2.3


From e9b1d5d0ff4d3ae86050dc4c91b3147361c7af9e Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 14 May 2010 13:55:57 -0700
Subject: x86, mrst: Don't blindly access extended config space

Do not blindly access extended configuration space unless we actively
know we're on a Moorestown platform.  The fixed-size BAR capability
lives in the extended configuration space, and thus is not applicable
if the configuration space isn't appropriately sized.

This fixes booting certain VMware configurations with CONFIG_MRST=y.

Moorestown will add a fake PCI-X 266 capability to advertise the
presence of extended configuration space.

Reported-and-tested-by: Petr Vandrovec <petr@vandrovec.name>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Acked-by: Jacob Pan <jacob.jun.pan@intel.com>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
LKML-Reference: <AANLkTiltKUa3TrKR1M51eGw8FLNoQJSLT0k0_K5X3-OJ@mail.gmail.com>
---
 arch/x86/pci/mrst.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 8bf2fcb88d0..1cdc02cf8fa 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -247,6 +247,10 @@ static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev)
 	u32 size;
 	int i;
 
+	/* Must have extended configuration space */
+	if (dev->cfg_size < PCIE_CAP_OFFSET + 4)
+		return;
+
 	/* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */
 	offset = fixed_bar_cap(dev->bus, dev->devfn);
 	if (!offset || PCI_DEVFN(2, 0) == dev->devfn ||
-- 
cgit v1.2.3


From 3e1bbdc3a721f4b1ed44f4554402a8dbc60fa97f Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 14 May 2010 20:05:21 -0300
Subject: perf newt: Make <- zoom out filters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After we use the filters to zoom into DSOs or threads, we can use <-
(left arrow) to zoom out from the last filter applied.

It is still possible to zoom out of order by using the popup menu.

With this we now have the zoom out operation on the browsing fast path,
by allowing fast navigation using just the four arrors and the enter key
to expand collapse callchains.

Suggested-by: Ingo Molnar <mingo@elte.hu>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile      |  2 ++
 tools/perf/util/newt.c   | 34 ++++++++++++++++++++--
 tools/perf/util/pstack.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/pstack.h | 12 ++++++++
 4 files changed, 120 insertions(+), 3 deletions(-)
 create mode 100644 tools/perf/util/pstack.c
 create mode 100644 tools/perf/util/pstack.h

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 9c4dc30cdc1..a9281cca411 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -416,6 +416,7 @@ LIB_H += util/thread.h
 LIB_H += util/trace-event.h
 LIB_H += util/probe-finder.h
 LIB_H += util/probe-event.h
+LIB_H += util/pstack.h
 LIB_H += util/cpumap.h
 
 LIB_OBJS += $(OUTPUT)util/abspath.o
@@ -451,6 +452,7 @@ LIB_OBJS += $(OUTPUT)util/callchain.o
 LIB_OBJS += $(OUTPUT)util/values.o
 LIB_OBJS += $(OUTPUT)util/debug.o
 LIB_OBJS += $(OUTPUT)util/map.o
+LIB_OBJS += $(OUTPUT)util/pstack.o
 LIB_OBJS += $(OUTPUT)util/session.o
 LIB_OBJS += $(OUTPUT)util/thread.o
 LIB_OBJS += $(OUTPUT)util/trace-event-parse.o
diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 3402453812b..e74df1240ef 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -9,6 +9,7 @@
 
 #include "cache.h"
 #include "hist.h"
+#include "pstack.h"
 #include "session.h"
 #include "sort.h"
 #include "symbol.h"
@@ -750,6 +751,7 @@ static int hist_browser__populate(struct hist_browser *self, struct hists *hists
 	newtFormAddHotKey(self->form, 'A');
 	newtFormAddHotKey(self->form, 'a');
 	newtFormAddHotKey(self->form, NEWT_KEY_RIGHT);
+	newtFormAddHotKey(self->form, NEWT_KEY_LEFT);
 	newtFormAddComponents(self->form, self->tree, NULL);
 	self->selection = newt__symbol_tree_get_current(self->tree);
 
@@ -801,6 +803,7 @@ static int hist_browser__title(char *bf, size_t size, const char *input_name,
 int hists__browse(struct hists *self, const char *helpline, const char *input_name)
 {
 	struct hist_browser *browser = hist_browser__new();
+	struct pstack *fstack = pstack__new(2);
 	const struct thread *thread_filter = NULL;
 	const struct dso *dso_filter = NULL;
 	struct newtExitStruct es;
@@ -810,12 +813,16 @@ int hists__browse(struct hists *self, const char *helpline, const char *input_na
 	if (browser == NULL)
 		return -1;
 
+	fstack = pstack__new(2);
+	if (fstack == NULL)
+		goto out;
+
 	ui_helpline__push(helpline);
 
 	hist_browser__title(msg, sizeof(msg), input_name,
 			    dso_filter, thread_filter);
 	if (hist_browser__populate(browser, self, msg) < 0)
-		goto out;
+		goto out_free_stack;
 
 	while (1) {
 		const struct thread *thread;
@@ -836,6 +843,19 @@ int hists__browse(struct hists *self, const char *helpline, const char *input_na
 				else
 					continue;
 			}
+
+			if (es.u.key == NEWT_KEY_LEFT) {
+				const void *top;
+
+				if (pstack__empty(fstack))
+					continue;
+				top = pstack__pop(fstack);
+				if (top == &dso_filter)
+					goto zoom_out_dso;
+				if (top == &thread_filter)
+					goto zoom_out_thread;
+				continue;
+			}
 		}
 
 		if (browser->selection->sym != NULL &&
@@ -888,12 +908,15 @@ do_annotate:
 			hist_entry__annotate_browser(he);
 		} else if (choice == zoom_dso) {
 			if (dso_filter) {
+				pstack__remove(fstack, &dso_filter);
+zoom_out_dso:
 				ui_helpline__pop();
 				dso_filter = NULL;
 			} else {
-				ui_helpline__fpush("To zoom out press -> + \"Zoom out of %s DSO\"",
+				ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s DSO\"",
 						   dso->kernel ? "the Kernel" : dso->short_name);
 				dso_filter = dso;
+				pstack__push(fstack, &dso_filter);
 			}
 			hists__filter_by_dso(self, dso_filter);
 			hist_browser__title(msg, sizeof(msg), input_name,
@@ -902,13 +925,16 @@ do_annotate:
 				goto out;
 		} else if (choice == zoom_thread) {
 			if (thread_filter) {
+				pstack__remove(fstack, &thread_filter);
+zoom_out_thread:
 				ui_helpline__pop();
 				thread_filter = NULL;
 			} else {
-				ui_helpline__fpush("To zoom out press -> + \"Zoom out of %s(%d) thread\"",
+				ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s(%d) thread\"",
 						   thread->comm_set ? thread->comm : "",
 						   thread->pid);
 				thread_filter = thread;
+				pstack__push(fstack, &thread_filter);
 			}
 			hists__filter_by_thread(self, thread_filter);
 			hist_browser__title(msg, sizeof(msg), input_name,
@@ -918,6 +944,8 @@ do_annotate:
 		}
 	}
 	err = 0;
+out_free_stack:
+	pstack__delete(fstack);
 out:
 	hist_browser__delete(browser);
 	return err;
diff --git a/tools/perf/util/pstack.c b/tools/perf/util/pstack.c
new file mode 100644
index 00000000000..13d36faf64e
--- /dev/null
+++ b/tools/perf/util/pstack.c
@@ -0,0 +1,75 @@
+/*
+ * Simple pointer stack
+ *
+ * (c) 2010 Arnaldo Carvalho de Melo <acme@redhat.com>
+ */
+
+#include "util.h"
+#include "pstack.h"
+#include <linux/kernel.h>
+#include <stdlib.h>
+
+struct pstack {
+	unsigned short	top;
+	unsigned short	max_nr_entries;
+	void		*entries[0];
+};
+
+struct pstack *pstack__new(unsigned short max_nr_entries)
+{
+	struct pstack *self = zalloc((sizeof(*self) +
+				     max_nr_entries * sizeof(void *)));
+	if (self != NULL)
+		self->max_nr_entries = max_nr_entries;
+	return self;
+}
+
+void pstack__delete(struct pstack *self)
+{
+	free(self);
+}
+
+bool pstack__empty(const struct pstack *self)
+{
+	return self->top == 0;
+}
+
+void pstack__remove(struct pstack *self, void *key)
+{
+	unsigned short i = self->top, last_index = self->top - 1;
+
+	while (i-- != 0) {
+		if (self->entries[i] == key) {
+			if (i < last_index)
+				memmove(self->entries + i,
+					self->entries + i + 1,
+					(last_index - i) * sizeof(void *));
+			--self->top;
+			return;
+		}
+	}
+	pr_err("%s: %p not on the pstack!\n", __func__, key);
+}
+
+void pstack__push(struct pstack *self, void *key)
+{
+	if (self->top == self->max_nr_entries) {
+		pr_err("%s: top=%d, overflow!\n", __func__, self->top);
+		return;
+	}
+	self->entries[self->top++] = key;
+}
+
+void *pstack__pop(struct pstack *self)
+{
+	void *ret;
+
+	if (self->top == 0) {
+		pr_err("%s: underflow!\n", __func__);
+		return NULL;
+	}
+
+	ret = self->entries[--self->top];
+	self->entries[self->top] = NULL;
+	return ret;
+}
diff --git a/tools/perf/util/pstack.h b/tools/perf/util/pstack.h
new file mode 100644
index 00000000000..5ad07023504
--- /dev/null
+++ b/tools/perf/util/pstack.h
@@ -0,0 +1,12 @@
+#ifndef _PERF_PSTACK_
+#define _PERF_PSTACK_
+
+struct pstack;
+struct pstack *pstack__new(unsigned short max_nr_entries);
+void pstack__delete(struct pstack *self);
+bool pstack__empty(const struct pstack *self);
+void pstack__remove(struct pstack *self, void *key);
+void pstack__push(struct pstack *self, void *key);
+void *pstack__pop(struct pstack *self);
+
+#endif /* _PERF_PSTACK_ */
-- 
cgit v1.2.3


From bfcc6e2eca507819e297a4c758ebd6b26e625330 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 14 May 2010 19:40:35 -0700
Subject: hughd: update email address

My old address will shut down in a couple of weeks: update the tree.

Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index d5b0b1b6dc5..d329b053a71 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5492,7 +5492,7 @@ S:	Maintained
 F:	drivers/mmc/host/tmio_mmc.*
 
 TMPFS (SHMEM FILESYSTEM)
-M:	Hugh Dickins <hugh.dickins@tiscali.co.uk>
+M:	Hugh Dickins <hughd@google.com>
 L:	linux-mm@kvack.org
 S:	Maintained
 F:	include/linux/shmem_fs.h
-- 
cgit v1.2.3


From 16a2164bb03612efe79a76c73da6da44445b9287 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 14 May 2010 19:44:10 -0700
Subject: profile: fix stats and data leakage

If the kernel is large or the profiling step small, /proc/profile
leaks data and readprofile shows silly stats, until readprofile -r
has reset the buffer: clear the prof_buffer when it is vmalloc()ed.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/profile.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/profile.c b/kernel/profile.c
index a55d3a367ae..dfadc5b729f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -127,8 +127,10 @@ int __ref profile_init(void)
 		return 0;
 
 	prof_buffer = vmalloc(buffer_bytes);
-	if (prof_buffer)
+	if (prof_buffer) {
+		memset(prof_buffer, 0, buffer_bytes);
 		return 0;
+	}
 
 	free_cpumask_var(prof_cpu_mask);
 	return -ENOMEM;
-- 
cgit v1.2.3


From 1ff3d7d79204612ebe2e611d2592f8898908ca00 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Fri, 14 May 2010 23:08:15 +0400
Subject: x86, perf: P4 PMU - fix counters management logic

Jaswinder reported this #GP:

 |
 | Message from syslogd@ht at May 14 09:39:32 ...
 | kernel:[  314.908612] EIP: [<c100ccca>]
 | x86_perf_event_set_period+0x19d/0x1b2 SS:ESP 0068:edac3d70
 |

Ming has narrowed it down to a comparision issue
between arguments with different sizes and
signs. As result event index reached a wrong
value which in turn led to a GP fault.

At the same time it was found that p4_next_cntr
has broken logic and should return the counter
index only if it was not yet borrowed for
another event.

Reported-by: Jaswinder Singh Rajput <jaswinderlinux@gmail.com>
Reported-by: Lin Ming <ming.m.lin@intel.com>
Bisected-by: Lin Ming <ming.m.lin@intel.com>
Tested-by: Jaswinder Singh Rajput <jaswinderlinux@gmail.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100514190815.GG13509@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index cb875b1e2e8..424fc8de68e 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -18,7 +18,7 @@
 struct p4_event_bind {
 	unsigned int opcode;			/* Event code and ESCR selector */
 	unsigned int escr_msr[2];		/* ESCR MSR for this event */
-	unsigned char cntr[2][P4_CNTR_LIMIT];	/* counter index (offset), -1 on abscence */
+	char cntr[2][P4_CNTR_LIMIT];		/* counter index (offset), -1 on abscence */
 };
 
 struct p4_cache_event_bind {
@@ -747,11 +747,11 @@ static int p4_get_escr_idx(unsigned int addr)
 static int p4_next_cntr(int thread, unsigned long *used_mask,
 			struct p4_event_bind *bind)
 {
-	int i = 0, j;
+	int i, j;
 
 	for (i = 0; i < P4_CNTR_LIMIT; i++) {
-		j = bind->cntr[thread][i++];
-		if (j == -1 || !test_bit(j, used_mask))
+		j = bind->cntr[thread][i];
+		if (j != -1 && !test_bit(j, used_mask))
 			return j;
 	}
 
-- 
cgit v1.2.3


From d83c49f3e36cecd2e8823b6c48ffba083b8a5704 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 30 Apr 2010 17:17:09 -0400
Subject: Fix the regression created by "set S_DEAD on unlink()..." commit

1) i_flags simply doesn't work for mount/unlink race prevention;
we may have many links to file and rm on one of those obviously
shouldn't prevent bind on top of another later on.  To fix it
right way we need to mark _dentry_ as unsuitable for mounting
upon; new flag (DCACHE_CANT_MOUNT) is protected by d_flags and
i_mutex on the inode in question.  Set it (with dont_mount(dentry))
in unlink/rmdir/etc., check (with cant_mount(dentry)) in places
in namespace.c that used to check for S_DEAD.  Setting S_DEAD
is still needed in places where we used to set it (for directories
getting killed), since we rely on it for readdir/rmdir race
prevention.

2) rename()/mount() protection has another bogosity - we unhash
the target before we'd checked that it's not a mountpoint.  Fixed.

3) ancient bogosity in pivot_root() - we locked i_mutex on the
right directory, but checked S_DEAD on the different (and wrong)
one.  Noticed and fixed.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/usb/core/inode.c |  1 +
 fs/configfs/dir.c        |  4 ++++
 fs/namei.c               | 21 +++++++++++++--------
 fs/namespace.c           |  6 +++---
 include/linux/dcache.h   | 14 ++++++++++++++
 5 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index 4a6366a4212..111a01a747f 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -380,6 +380,7 @@ static int usbfs_rmdir(struct inode *dir, struct dentry *dentry)
 	mutex_lock(&inode->i_mutex);
 	dentry_unhash(dentry);
 	if (usbfs_empty(dentry)) {
+		dont_mount(dentry);
 		drop_nlink(dentry->d_inode);
 		drop_nlink(dentry->d_inode);
 		dput(dentry);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8e48b52205a..0b502f80c69 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -645,6 +645,7 @@ static void detach_groups(struct config_group *group)
 
 		configfs_detach_group(sd->s_element);
 		child->d_inode->i_flags |= S_DEAD;
+		dont_mount(child);
 
 		mutex_unlock(&child->d_inode->i_mutex);
 
@@ -840,6 +841,7 @@ static int configfs_attach_item(struct config_item *parent_item,
 			mutex_lock(&dentry->d_inode->i_mutex);
 			configfs_remove_dir(item);
 			dentry->d_inode->i_flags |= S_DEAD;
+			dont_mount(dentry);
 			mutex_unlock(&dentry->d_inode->i_mutex);
 			d_delete(dentry);
 		}
@@ -882,6 +884,7 @@ static int configfs_attach_group(struct config_item *parent_item,
 		if (ret) {
 			configfs_detach_item(item);
 			dentry->d_inode->i_flags |= S_DEAD;
+			dont_mount(dentry);
 		}
 		configfs_adjust_dir_dirent_depth_after_populate(sd);
 		mutex_unlock(&dentry->d_inode->i_mutex);
@@ -1725,6 +1728,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
 	mutex_unlock(&configfs_symlink_mutex);
 	configfs_detach_group(&group->cg_item);
 	dentry->d_inode->i_flags |= S_DEAD;
+	dont_mount(dentry);
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	d_delete(dentry);
diff --git a/fs/namei.c b/fs/namei.c
index 16df7277a92..b86b96fe1dc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2176,8 +2176,10 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 		error = security_inode_rmdir(dir, dentry);
 		if (!error) {
 			error = dir->i_op->rmdir(dir, dentry);
-			if (!error)
+			if (!error) {
 				dentry->d_inode->i_flags |= S_DEAD;
+				dont_mount(dentry);
+			}
 		}
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
@@ -2261,7 +2263,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 		if (!error) {
 			error = dir->i_op->unlink(dir, dentry);
 			if (!error)
-				dentry->d_inode->i_flags |= S_DEAD;
+				dont_mount(dentry);
 		}
 	}
 	mutex_unlock(&dentry->d_inode->i_mutex);
@@ -2572,17 +2574,20 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
 		return error;
 
 	target = new_dentry->d_inode;
-	if (target) {
+	if (target)
 		mutex_lock(&target->i_mutex);
-		dentry_unhash(new_dentry);
-	}
 	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
 		error = -EBUSY;
-	else 
+	else {
+		if (target)
+			dentry_unhash(new_dentry);
 		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+	}
 	if (target) {
-		if (!error)
+		if (!error) {
 			target->i_flags |= S_DEAD;
+			dont_mount(new_dentry);
+		}
 		mutex_unlock(&target->i_mutex);
 		if (d_unhashed(new_dentry))
 			d_rehash(new_dentry);
@@ -2614,7 +2619,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
 		error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
 	if (!error) {
 		if (target)
-			target->i_flags |= S_DEAD;
+			dont_mount(new_dentry);
 		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
 			d_move(old_dentry, new_dentry);
 	}
diff --git a/fs/namespace.c b/fs/namespace.c
index 8174c8ab5c7..f20cb57d106 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1432,7 +1432,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
 
 	err = -ENOENT;
 	mutex_lock(&path->dentry->d_inode->i_mutex);
-	if (IS_DEADDIR(path->dentry->d_inode))
+	if (cant_mount(path->dentry))
 		goto out_unlock;
 
 	err = security_sb_check_sb(mnt, path);
@@ -1623,7 +1623,7 @@ static int do_move_mount(struct path *path, char *old_name)
 
 	err = -ENOENT;
 	mutex_lock(&path->dentry->d_inode->i_mutex);
-	if (IS_DEADDIR(path->dentry->d_inode))
+	if (cant_mount(path->dentry))
 		goto out1;
 
 	if (d_unlinked(path->dentry))
@@ -2234,7 +2234,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	if (!check_mnt(root.mnt))
 		goto out2;
 	error = -ENOENT;
-	if (IS_DEADDIR(new.dentry->d_inode))
+	if (cant_mount(old.dentry))
 		goto out2;
 	if (d_unlinked(new.dentry))
 		goto out2;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 30b93b2a01a..eebb617c17d 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -186,6 +186,8 @@ d_iput:		no		no		no       yes
 
 #define DCACHE_FSNOTIFY_PARENT_WATCHED	0x0080 /* Parent inode is watched by some fsnotify listener */
 
+#define DCACHE_CANT_MOUNT	0x0100
+
 extern spinlock_t dcache_lock;
 extern seqlock_t rename_lock;
 
@@ -358,6 +360,18 @@ static inline int d_unlinked(struct dentry *dentry)
 	return d_unhashed(dentry) && !IS_ROOT(dentry);
 }
 
+static inline int cant_mount(struct dentry *dentry)
+{
+	return (dentry->d_flags & DCACHE_CANT_MOUNT);
+}
+
+static inline void dont_mount(struct dentry *dentry)
+{
+	spin_lock(&dentry->d_lock);
+	dentry->d_flags |= DCACHE_CANT_MOUNT;
+	spin_unlock(&dentry->d_lock);
+}
+
 static inline struct dentry *dget_parent(struct dentry *dentry)
 {
 	struct dentry *ret;
-- 
cgit v1.2.3


From 265624495f5acf6077f8f8d264f8170573d8d752 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 28 Apr 2010 20:57:02 -0400
Subject: Fix double-free in logfs

iput() is needed *until* we'd done successful d_alloc_root()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/logfs/super.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 5866ee6e132..d7c23ed8349 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -333,27 +333,27 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
 		goto fail;
 
 	sb->s_root = d_alloc_root(rootdir);
-	if (!sb->s_root)
-		goto fail2;
+	if (!sb->s_root) {
+		iput(rootdir);
+		goto fail;
+	}
 
 	super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
 	if (!super->s_erase_page)
-		goto fail2;
+		goto fail;
 	memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
 
 	/* FIXME: check for read-only mounts */
 	err = logfs_make_writeable(sb);
 	if (err)
-		goto fail3;
+		goto fail1;
 
 	log_super("LogFS: Finished mounting\n");
 	simple_set_mnt(mnt, sb);
 	return 0;
 
-fail3:
+fail1:
 	__free_page(super->s_erase_page);
-fail2:
-	iput(rootdir);
 fail:
 	iput(logfs_super(sb)->s_master_inode);
 	return -EIO;
-- 
cgit v1.2.3


From 404e781249f003a37a140756fc4aeae463dcb217 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Wed, 21 Apr 2010 12:30:32 +0200
Subject: fs/sysv: dereferencing ERR_PTR()

I moved the dir_put_page() inside the if condition so we don't dereference
"page", if it's an ERR_PTR().

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sysv/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 4e50286a4cc..1dabed286b4 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -164,8 +164,8 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
 							name, de->name))
 					goto found;
 			}
+			dir_put_page(page);
 		}
-		dir_put_page(page);
 
 		if (++n >= npages)
 			n = 0;
-- 
cgit v1.2.3


From 684bdc7ff95e0c1d4b0bcf236491840b55a54189 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@suse.de>
Date: Mon, 12 Apr 2010 16:44:08 -0700
Subject: JFS: Free sbi memory in error path

I spotted the missing kfree() while removing the BKL.

[akpm@linux-foundation.org: avoid multiple returns so it doesn't happen again]
Signed-off-by: Jan Blunck <jblunck@suse.de>
Cc: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/jfs/super.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 157382fa625..b66832ac33a 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -446,10 +446,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	/* initialize the mount flag and determine the default error handler */
 	flag = JFS_ERR_REMOUNT_RO;
 
-	if (!parse_options((char *) data, sb, &newLVSize, &flag)) {
-		kfree(sbi);
-		return -EINVAL;
-	}
+	if (!parse_options((char *) data, sb, &newLVSize, &flag))
+		goto out_kfree;
 	sbi->flag = flag;
 
 #ifdef CONFIG_JFS_POSIX_ACL
@@ -458,7 +456,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (newLVSize) {
 		printk(KERN_ERR "resize option for remount only\n");
-		return -EINVAL;
+		goto out_kfree;
 	}
 
 	/*
@@ -478,7 +476,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	inode = new_inode(sb);
 	if (inode == NULL) {
 		ret = -ENOMEM;
-		goto out_kfree;
+		goto out_unload;
 	}
 	inode->i_ino = 0;
 	inode->i_nlink = 1;
@@ -550,9 +548,10 @@ out_mount_failed:
 	make_bad_inode(sbi->direct_inode);
 	iput(sbi->direct_inode);
 	sbi->direct_inode = NULL;
-out_kfree:
+out_unload:
 	if (sbi->nls_tab)
 		unload_nls(sbi->nls_tab);
+out_kfree:
 	kfree(sbi);
 	return ret;
 }
-- 
cgit v1.2.3


From 5dc6416414fb3ec6e2825fd4d20c8bf1d7fe0395 Mon Sep 17 00:00:00 2001
From: Dan Rosenberg <dan.j.rosenberg@gmail.com>
Date: Sat, 15 May 2010 11:27:37 -0400
Subject: Btrfs: check for read permission on src file in the clone ioctl

The existing code would have allowed you to clone a file that was
only open for writing

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2b7dd88fc54..9de6c3a75bf 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1480,12 +1480,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		ret = -EBADF;
 		goto out_drop_write;
 	}
+
 	src = src_file->f_dentry->d_inode;
 
 	ret = -EINVAL;
 	if (src == inode)
 		goto out_fput;
 
+	/* the src must be open for reading */
+	if (!(src_file->f_mode & FMODE_READ))
+		goto out_fput;
+
 	ret = -EISDIR;
 	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
 		goto out_fput;
-- 
cgit v1.2.3


From bdef2fe88b1e4bde7458aedd207929ce3f9d66ee Mon Sep 17 00:00:00 2001
From: Nicolas Ferre <nicolas.ferre@atmel.com>
Date: Sat, 15 May 2010 12:32:31 -0400
Subject: mmc: at91_mci: modify cache flush routines

As we were using an internal dma flushing routine, this patch changes to
the DMA API flush_kernel_dcache_page().  Driver is able to compile now.

[akpm@linux-foundation.org: flush_kernel_dcache_page() comes before kunmap_atomic()]
Signed-off-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mmc/host/at91_mci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mmc/host/at91_mci.c b/drivers/mmc/host/at91_mci.c
index a6dd7da3735..336d9f553f3 100644
--- a/drivers/mmc/host/at91_mci.c
+++ b/drivers/mmc/host/at91_mci.c
@@ -314,8 +314,8 @@ static void at91_mci_post_dma_read(struct at91mci_host *host)
 			dmabuf = (unsigned *)tmpv;
 		}
 
+		flush_kernel_dcache_page(sg_page(sg));
 		kunmap_atomic(sgbuffer, KM_BIO_SRC_IRQ);
-		dmac_flush_range((void *)sgbuffer, ((void *)sgbuffer) + amount);
 		data->bytes_xfered += amount;
 		if (size == 0)
 			break;
-- 
cgit v1.2.3


From 43aa7ac736a4e21aae4882bd8f7c67403aed45b8 Mon Sep 17 00:00:00 2001
From: "kirjanov@gmail.com" <kirjanov@gmail.com>
Date: Sat, 15 May 2010 12:32:34 -0400
Subject: lib/btree: fix possible NULL pointer dereference

mempool_alloc() can return null in atomic case.

Signed-off-by: Denis Kirjanov <kirjanov@gmail.com>
Cc: Joern Engel <joern@logfs.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/btree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/btree.c b/lib/btree.c
index 41859a82021..c9c6f035152 100644
--- a/lib/btree.c
+++ b/lib/btree.c
@@ -95,7 +95,8 @@ static unsigned long *btree_node_alloc(struct btree_head *head, gfp_t gfp)
 	unsigned long *node;
 
 	node = mempool_alloc(head->mempool, gfp);
-	memset(node, 0, NODESIZE);
+	if (likely(node))
+		memset(node, 0, NODESIZE);
 	return node;
 }
 
-- 
cgit v1.2.3


From 95e8f634d7a3ea5af40ec3fa42c8a152fd3a0624 Mon Sep 17 00:00:00 2001
From: Shane McDonald <mcdonald.shane@gmail.com>
Date: Thu, 6 May 2010 23:26:57 -0600
Subject:     MIPS FPU emulator: allow Cause bits of FCSR to be writeable by
 ctc1

    In the FPU emulator code of the MIPS, the Cause bits of the FCSR register
    are not currently writeable by the ctc1 instruction.  In odd corner cases,
    this can cause problems.  For example, a case existed where a divide-by-zero
    exception was generated by the FPU, and the signal handler attempted to
    restore the FPU registers to their state before the exception occurred.  In
    this particular setup, writing the old value to the FCSR register would
    cause another divide-by-zero exception to occur immediately.  The solution
    is to change the ctc1 instruction emulator code to allow the Cause bits of
    the FCSR register to be writeable.  This is the behaviour of the hardware
    that the code is emulating.

    This problem was found by Shane McDonald, but the credit for the fix goes
    to Kevin Kissell.  In Kevin's words:

    I submit that the bug is indeed in that ctc_op:  case of the emulator.  The
    Cause bits (17:12) are supposed to be writable by that instruction, but the
    CTC1 emulation won't let them be updated by the instruction.  I think that
    actually if you just completely removed lines 387-388 [...] things would
    work a good deal better.  At least, it would be a more accurate emulation of
    the architecturally defined FPU.  If I wanted to be really, really pedantic
    (which I sometimes do), I'd also protect the reserved bits that aren't
    necessarily writable.

    Signed-off-by: Shane McDonald <mcdonald.shane@gmail.com>
    To: anemo@mba.ocn.ne.jp
    To: kevink@paralogos.com
    To: sshtylyov@mvista.com
    Patchwork: http://patchwork.linux-mips.org/patch/1205/
    Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

---
---
 arch/mips/include/asm/mipsregs.h |  9 ++++++++-
 arch/mips/math-emu/cp1emu.c      | 15 +++++++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h
index 49382d5e891..c6e3c93ce7c 100644
--- a/arch/mips/include/asm/mipsregs.h
+++ b/arch/mips/include/asm/mipsregs.h
@@ -134,6 +134,12 @@
 #define FPU_CSR_COND6   0x40000000      /* $fcc6 */
 #define FPU_CSR_COND7   0x80000000      /* $fcc7 */
 
+/*
+ * Bits 18 - 20 of the FPU Status Register will be read as 0,
+ * and should be written as zero.
+ */
+#define FPU_CSR_RSVD	0x001c0000
+
 /*
  * X the exception cause indicator
  * E the exception enable
@@ -161,7 +167,8 @@
 #define FPU_CSR_UDF_S   0x00000008
 #define FPU_CSR_INE_S   0x00000004
 
-/* rounding mode */
+/* Bits 0 and 1 of FPU Status Register specify the rounding mode */
+#define FPU_CSR_RM	0x00000003
 #define FPU_CSR_RN      0x0     /* nearest */
 #define FPU_CSR_RZ      0x1     /* towards zero */
 #define FPU_CSR_RU      0x2     /* towards +Infinity */
diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c
index 8f2f8e9d8b2..f2338d1c0b4 100644
--- a/arch/mips/math-emu/cp1emu.c
+++ b/arch/mips/math-emu/cp1emu.c
@@ -78,6 +78,9 @@ DEFINE_PER_CPU(struct mips_fpu_emulator_stats, fpuemustats);
 #define FPCREG_RID	0	/* $0  = revision id */
 #define FPCREG_CSR	31	/* $31 = csr */
 
+/* Determine rounding mode from the RM bits of the FCSR */
+#define modeindex(v) ((v) & FPU_CSR_RM)
+
 /* Convert Mips rounding mode (0..3) to IEEE library modes. */
 static const unsigned char ieee_rm[4] = {
 	[FPU_CSR_RN] = IEEE754_RN,
@@ -384,10 +387,14 @@ static int cop1Emulate(struct pt_regs *xcp, struct mips_fpu_struct *ctx)
 					(void *) (xcp->cp0_epc),
 					MIPSInst_RT(ir), value);
 #endif
-				value &= (FPU_CSR_FLUSH | FPU_CSR_ALL_E | FPU_CSR_ALL_S | 0x03);
-				ctx->fcr31 &= ~(FPU_CSR_FLUSH | FPU_CSR_ALL_E | FPU_CSR_ALL_S | 0x03);
-				/* convert to ieee library modes */
-				ctx->fcr31 |= (value & ~0x3) | ieee_rm[value & 0x3];
+
+				/*
+				 * Don't write reserved bits,
+				 * and convert to ieee library modes
+				 */
+				ctx->fcr31 = (value &
+						~(FPU_CSR_RSVD | FPU_CSR_RM)) |
+						ieee_rm[modeindex(value)];
 			}
 			if ((ctx->fcr31 >> 5) & ctx->fcr31 & FPU_CSR_ALL_E) {
 				return SIGFPE;
-- 
cgit v1.2.3


From 46afb8296c2494bfce17064124b253eb9b176ef9 Mon Sep 17 00:00:00 2001
From: Chandrakala Chavva <cchavva@caviumnetworks.com>
Date: Mon, 10 May 2010 17:11:54 -0700
Subject:     MIPS: N32: Use compat version for sys_ppoll.

    The sys_ppoll() takes struct 'struct timespec'. This is different for the
    N32 and N64 ABIs. Use the compat version to do the proper conversions.

    Signed-off-by: David Daney <ddaney@caviumnetworks.com>
    To: linux-mips@linux-mips.org
    Patchwork: http://patchwork.linux-mips.org/patch/1210/
    Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

---
---
 arch/mips/kernel/scall64-n32.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S
index 44337ba0371..a5297e2a353 100644
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -385,7 +385,7 @@ EXPORT(sysn32_call_table)
 	PTR	sys_fchmodat
 	PTR	sys_faccessat
 	PTR	compat_sys_pselect6
-	PTR	sys_ppoll			/* 6265 */
+	PTR	compat_sys_ppoll		/* 6265 */
 	PTR	sys_unshare
 	PTR	sys_splice
 	PTR	sys_sync_file_range
-- 
cgit v1.2.3


From 4e73238d163c6fcf001264832701d2a6d4927672 Mon Sep 17 00:00:00 2001
From: Wu Zhangjin <wuzhangjin@gmail.com>
Date: Fri, 7 May 2010 00:59:46 +0800
Subject:     MIPS: Oprofile: Fix Loongson irq handler

    The interrupt enable bit for the performance counters is in the Control
    Register $24, not in the counter register.
    loongson2_perfcount_handler(), we need to use

    Reported-by: Xu Hengyang <hengyang@mail.ustc.edu.cn>
    Signed-off-by: Wu Zhangjin <wuzhangjin@gmail.com>
    Cc: linux-mips@linux-mips.org
    Patchwork: http://patchwork.linux-mips.org/patch/1198/
    Signed-off-by: Ralf Baechle <ralf@linux-mips.org>

---
---
 arch/mips/oprofile/op_model_loongson2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/oprofile/op_model_loongson2.c b/arch/mips/oprofile/op_model_loongson2.c
index 29e2326b625..fa3bf661ae2 100644
--- a/arch/mips/oprofile/op_model_loongson2.c
+++ b/arch/mips/oprofile/op_model_loongson2.c
@@ -122,7 +122,7 @@ static irqreturn_t loongson2_perfcount_handler(int irq, void *dev_id)
 	 */
 
 	/* Check whether the irq belongs to me */
-	enabled = read_c0_perfcnt() & LOONGSON2_PERFCNT_INT_EN;
+	enabled = read_c0_perfctrl() & LOONGSON2_PERFCNT_INT_EN;
 	if (!enabled)
 		return IRQ_NONE;
 	enabled = reg.cnt1_enabled | reg.cnt2_enabled;
-- 
cgit v1.2.3


From 605539034f208d365f76af8e2152cb25f702367d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 15 May 2010 20:40:34 -0300
Subject: perf newt: Make <- exit the ui_browser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Right now that means that pressing the left arrow willl make the symbol
annotation window to exit back to the main symbol histogram browser.

This is another improvement on the UI fastpath, i.e. just the arrows and
enter are enough for most browsing.

Suggested-by: Ingo Molnar <mingo@elte.hu>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index e74df1240ef..5ded2f0d3e4 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -323,6 +323,7 @@ static int ui_browser__run(struct ui_browser *self, const char *title,
 	newtFormAddHotKey(self->form, NEWT_KEY_PGDN);
 	newtFormAddHotKey(self->form, NEWT_KEY_HOME);
 	newtFormAddHotKey(self->form, NEWT_KEY_END);
+	newtFormAddHotKey(self->form, NEWT_KEY_LEFT);
 
 	if (ui_browser__refresh_entries(self) < 0)
 		return -1;
@@ -408,6 +409,7 @@ static int ui_browser__run(struct ui_browser *self, const char *title,
 		}
 			break;
 		case NEWT_KEY_ESCAPE:
+		case NEWT_KEY_LEFT:
 		case CTRL('c'):
 		case 'Q':
 		case 'q':
@@ -616,7 +618,7 @@ static void hist_entry__annotate_browser(struct hist_entry *self)
 	if (hist_entry__annotate(self, &head) < 0)
 		return;
 
-	ui_helpline__push("Press ESC to exit");
+	ui_helpline__push("Press <- or ESC to exit");
 
 	memset(&browser, 0, sizeof(browser));
 	browser.entries = &head;
-- 
cgit v1.2.3


From c1ec5fefd9cd9ccb020966a49a3c7f44b25d9e84 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 15 May 2010 20:45:31 -0300
Subject: perf newt: Fix the 'A'/'a' shortcut for annotate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reported-by: Ingo Molnar <mingo@elte.hu>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 5ded2f0d3e4..094887f45d0 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -892,10 +892,10 @@ int hists__browse(struct hists *self, const char *helpline, const char *input_na
 
 		if (choice == -1)
 			continue;
-do_annotate:
+
 		if (choice == annotate) {
 			struct hist_entry *he;
-
+do_annotate:
 			if (browser->selection->map->dso->origin == DSO__ORIG_KERNEL) {
 				ui_helpline__puts("No vmlinux file found, can't "
 						 "annotate with just a "
-- 
cgit v1.2.3


From 29351db6a05e7e42be457569428425520a18beec Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 15 May 2010 21:06:58 -0300
Subject: perf newt: Exit browser unconditionally when CTRL+C, q or Q is
 pressed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ESC still asks for confirmation.

Suggested-by: Ingo Molnar <mingo@elte.hu>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 094887f45d0..139eb1a16cd 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -837,9 +837,10 @@ int hists__browse(struct hists *self, const char *helpline, const char *input_na
 		if (es.reason == NEWT_EXIT_HOTKEY) {
 			if (toupper(es.u.key) == 'A')
 				goto do_annotate;
-			if (es.u.key == NEWT_KEY_ESCAPE ||
-			    toupper(es.u.key) == 'Q' ||
-			    es.u.key == CTRL('c')) {
+			if (toupper(es.u.key) == 'Q' ||
+			    es.u.key == CTRL('c'))
+				break;
+			if (es.u.key == NEWT_KEY_ESCAPE) {
 				if (dialog_yesno("Do you really want to exit?"))
 					break;
 				else
-- 
cgit v1.2.3


From 9d192e118a094087494997ea1c8a2faf39af38c5 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 15 May 2010 21:15:01 -0300
Subject: perf newt: Add single key shortcuts for zoom into DSO and threads
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

'D'/'d' for zooming into the DSO in the current highlighted hist entry,
'T'/'t' for zooming into the current thread.

Suggested-by: Ingo Molnar <mingo@elte.hu>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 139eb1a16cd..6974431d212 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -752,6 +752,10 @@ static int hist_browser__populate(struct hist_browser *self, struct hists *hists
 
 	newtFormAddHotKey(self->form, 'A');
 	newtFormAddHotKey(self->form, 'a');
+	newtFormAddHotKey(self->form, 'D');
+	newtFormAddHotKey(self->form, 'd');
+	newtFormAddHotKey(self->form, 'T');
+	newtFormAddHotKey(self->form, 't');
 	newtFormAddHotKey(self->form, NEWT_KEY_RIGHT);
 	newtFormAddHotKey(self->form, NEWT_KEY_LEFT);
 	newtFormAddComponents(self->form, self->tree, NULL);
@@ -834,9 +838,20 @@ int hists__browse(struct hists *self, const char *helpline, const char *input_na
 		    annotate = -2, zoom_dso = -2, zoom_thread = -2;
 
 		newtFormRun(browser->form, &es);
+
+		thread = hist_browser__selected_thread(browser);
+		dso = browser->selection->map ? browser->selection->map->dso : NULL;
+
 		if (es.reason == NEWT_EXIT_HOTKEY) {
-			if (toupper(es.u.key) == 'A')
+			switch (toupper(es.u.key)) {
+			case 'A':
 				goto do_annotate;
+			case 'D':
+				goto zoom_dso;
+			case 'T':
+				goto zoom_thread;
+			default:;
+			}
 			if (toupper(es.u.key) == 'Q' ||
 			    es.u.key == CTRL('c'))
 				break;
@@ -866,7 +881,6 @@ int hists__browse(struct hists *self, const char *helpline, const char *input_na
 			     browser->selection->sym->name) > 0)
 			annotate = nr_options++;
 
-		thread = hist_browser__selected_thread(browser);
 		if (thread != NULL &&
 		    asprintf(&options[nr_options], "Zoom %s %s(%d) thread",
 			     (thread_filter ? "out of" : "into"),
@@ -874,7 +888,6 @@ int hists__browse(struct hists *self, const char *helpline, const char *input_na
 			     thread->pid) > 0)
 			zoom_thread = nr_options++;
 
-		dso = browser->selection->map ? browser->selection->map->dso : NULL;
 		if (dso != NULL &&
 		    asprintf(&options[nr_options], "Zoom %s %s DSO",
 			     (dso_filter ? "out of" : "into"),
@@ -910,12 +923,15 @@ do_annotate:
 
 			hist_entry__annotate_browser(he);
 		} else if (choice == zoom_dso) {
+zoom_dso:
 			if (dso_filter) {
 				pstack__remove(fstack, &dso_filter);
 zoom_out_dso:
 				ui_helpline__pop();
 				dso_filter = NULL;
 			} else {
+				if (dso == NULL)
+					continue;
 				ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s DSO\"",
 						   dso->kernel ? "the Kernel" : dso->short_name);
 				dso_filter = dso;
@@ -927,6 +943,7 @@ zoom_out_dso:
 			if (hist_browser__populate(browser, self, msg) < 0)
 				goto out;
 		} else if (choice == zoom_thread) {
+zoom_thread:
 			if (thread_filter) {
 				pstack__remove(fstack, &thread_filter);
 zoom_out_thread:
-- 
cgit v1.2.3


From e40152ee1e1c7a63f4777791863215e3faa37a86 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 16 May 2010 14:17:36 -0700
Subject: Linus 2.6.34

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 701bc65b395..ebc8225f7a9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 34
-EXTRAVERSION = -rc7
+EXTRAVERSION =
 NAME = Sheep on Meth
 
 # *DOCUMENTATION*
-- 
cgit v1.2.3


From a308f3a868185d4f804fe71d0400e2b058c6d9af Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 16 May 2010 20:29:38 -0300
Subject: perf tui: Make <- exit menus too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In fact it is now added to the hot key list when newt_form__new is used,
allowing us to remove the explicit assignment in all its users.

The visible change is that <- will exit the menu that pops up when -> is
pressed (and Enter when callchains are not being used).

Suggested-by: Ingo Molnar <mingo@elte.hu>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 6974431d212..59a63e4405f 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -118,6 +118,7 @@ int browser__show_help(const char *format, va_list ap)
 
 static void newt_form__set_exit_keys(newtComponent self)
 {
+	newtFormAddHotKey(self, NEWT_KEY_LEFT);
 	newtFormAddHotKey(self, NEWT_KEY_ESCAPE);
 	newtFormAddHotKey(self, 'Q');
 	newtFormAddHotKey(self, 'q');
@@ -323,7 +324,6 @@ static int ui_browser__run(struct ui_browser *self, const char *title,
 	newtFormAddHotKey(self->form, NEWT_KEY_PGDN);
 	newtFormAddHotKey(self->form, NEWT_KEY_HOME);
 	newtFormAddHotKey(self->form, NEWT_KEY_END);
-	newtFormAddHotKey(self->form, NEWT_KEY_LEFT);
 
 	if (ui_browser__refresh_entries(self) < 0)
 		return -1;
@@ -757,7 +757,6 @@ static int hist_browser__populate(struct hist_browser *self, struct hists *hists
 	newtFormAddHotKey(self->form, 'T');
 	newtFormAddHotKey(self->form, 't');
 	newtFormAddHotKey(self->form, NEWT_KEY_RIGHT);
-	newtFormAddHotKey(self->form, NEWT_KEY_LEFT);
 	newtFormAddComponents(self->form, self->tree, NULL);
 	self->selection = newt__symbol_tree_get_current(self->tree);
 
-- 
cgit v1.2.3


From a9a4ab747e2d45bf08fddbc1568f080091486af9 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sun, 16 May 2010 21:04:27 -0300
Subject: perf tui: Add help window to show key associations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Suggested-by: Ingo Molnar <mingo@elte.hu>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 59a63e4405f..2001d26a557 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -167,6 +167,48 @@ out_destroy_form:
 	return rc;
 }
 
+static int ui__help_window(const char *text)
+{
+	struct newtExitStruct es;
+	newtComponent tb, form = newt_form__new();
+	int rc = -1;
+	int max_len = 0, nr_lines = 0;
+	const char *t;
+
+	if (form == NULL)
+		return -1;
+
+	t = text;
+	while (1) {
+		const char *sep = strchr(t, '\n');
+		int len;
+
+		if (sep == NULL)
+			sep = strchr(t, '\0');
+		len = sep - t;
+		if (max_len < len)
+			max_len = len;
+		++nr_lines;
+		if (*sep == '\0')
+			break;
+		t = sep + 1;
+	}
+
+	tb = newtTextbox(0, 0, max_len, nr_lines, 0);
+	if (tb == NULL)
+		goto out_destroy_form;
+
+	newtTextboxSetText(tb, text);
+	newtFormAddComponent(form, tb);
+	newtCenteredWindow(max_len, nr_lines, NULL);
+	newtFormRun(form, &es);
+	newtPopWindow();
+	rc = 0;
+out_destroy_form:
+	newtFormDestroy(form);
+	return rc;
+}
+
 static bool dialog_yesno(const char *msg)
 {
 	/* newtWinChoice should really be accepting const char pointers... */
@@ -756,6 +798,10 @@ static int hist_browser__populate(struct hist_browser *self, struct hists *hists
 	newtFormAddHotKey(self->form, 'd');
 	newtFormAddHotKey(self->form, 'T');
 	newtFormAddHotKey(self->form, 't');
+	newtFormAddHotKey(self->form, '?');
+	newtFormAddHotKey(self->form, 'H');
+	newtFormAddHotKey(self->form, 'h');
+	newtFormAddHotKey(self->form, NEWT_KEY_F1);
 	newtFormAddHotKey(self->form, NEWT_KEY_RIGHT);
 	newtFormAddComponents(self->form, self->tree, NULL);
 	self->selection = newt__symbol_tree_get_current(self->tree);
@@ -842,6 +888,9 @@ int hists__browse(struct hists *self, const char *helpline, const char *input_na
 		dso = browser->selection->map ? browser->selection->map->dso : NULL;
 
 		if (es.reason == NEWT_EXIT_HOTKEY) {
+			if (es.u.key == NEWT_KEY_F1)
+				goto do_help;
+
 			switch (toupper(es.u.key)) {
 			case 'A':
 				goto do_annotate;
@@ -849,6 +898,17 @@ int hists__browse(struct hists *self, const char *helpline, const char *input_na
 				goto zoom_dso;
 			case 'T':
 				goto zoom_thread;
+			case 'H':
+			case '?':
+do_help:
+				ui__help_window("->        Zoom into DSO/Threads & Annotate current symbol\n"
+						"<-        Zoom out\n"
+						"a         Annotate current symbol\n"
+						"h/?/F1    Show this window\n"
+						"d         Zoom into current DSO\n"
+						"t         Zoom into current Thread\n"
+						"q/CTRL+C  Exit browser");
+				continue;
 			default:;
 			}
 			if (toupper(es.u.key) == 'Q' ||
-- 
cgit v1.2.3


From e4af4268a34d8cd28c46a03161fc017cbd2db887 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Fri, 14 May 2010 14:41:14 -0700
Subject: x86, mrst, pci: return 0 for non-present pci bars

Moorestown PCI code has special handling of devices with fixed BARs. In
case of BAR sizing writes, we need to update the fake PCI MMCFG space with real
size decode value.

When a BAR is not present, we need to return 0 instead of ~0. ~0 will be
treated as device error per bugzilla 12006.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
LKML-Reference: <1273873281-17489-2-git-send-email-jacob.jun.pan@linux.intel.com>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/pci/mrst.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 8bf2fcb88d0..d5c7aefe56f 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -109,7 +109,7 @@ static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn,
 			decode++;
 			decode = ~(decode - 1);
 		} else {
-			decode = ~0;
+			decode = 0;
 		}
 
 		/*
-- 
cgit v1.2.3


From fea24e28c663e62663097f0ed3b8ff1f9a87f15e Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Fri, 14 May 2010 14:41:20 -0700
Subject: x86, mrst: add nop functions to x86_init mpparse functions

Moorestown does not have BIOS provided MP tables, we can save some time
by avoiding scaning of these tables. e.g.
[    0.000000] Scan SMP from c0000000 for 1024 bytes.
[    0.000000] Scan SMP from c009fc00 for 1024 bytes.
[    0.000000] Scan SMP from c00f0000 for 65536 bytes.
[    0.000000] Scan SMP from c00bfff0 for 1024 bytes.

Searching EBDA with the base at 0x40E will also result in random pointer
deferencing within 1MB. This can be a problem in Lincroft if the pointer
hits VGA area and VGA mode is not enabled.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
LKML-Reference: <1273873281-17489-8-git-send-email-jacob.jun.pan@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/mrst.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 0aad8670858..e796448f0eb 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -237,4 +237,9 @@ void __init x86_mrst_early_setup(void)
 	x86_init.pci.fixup_irqs = x86_init_noop;
 
 	legacy_pic = &null_legacy_pic;
+
+	/* Avoid searching for BIOS MP tables */
+	x86_init.mpparse.find_smp_config = x86_init_noop;
+	x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+
 }
-- 
cgit v1.2.3


From f3d46f9d3194e0329216002a8724d4c0957abc79 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Mon, 17 May 2010 14:33:53 +1000
Subject: atomic_t: Cast to volatile when accessing atomic variables

In preparation for removing volatile from the atomic_t definition, this
patch adds a volatile cast to all the atomic read functions.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/atomic.h    | 4 ++--
 arch/arm/include/asm/atomic.h      | 2 +-
 arch/avr32/include/asm/atomic.h    | 2 +-
 arch/cris/include/asm/atomic.h     | 2 +-
 arch/frv/include/asm/atomic.h      | 2 +-
 arch/h8300/include/asm/atomic.h    | 2 +-
 arch/ia64/include/asm/atomic.h     | 4 ++--
 arch/m32r/include/asm/atomic.h     | 2 +-
 arch/m68k/include/asm/atomic_mm.h  | 2 +-
 arch/m68k/include/asm/atomic_no.h  | 2 +-
 arch/mips/include/asm/atomic.h     | 4 ++--
 arch/mn10300/include/asm/atomic.h  | 2 +-
 arch/parisc/include/asm/atomic.h   | 4 ++--
 arch/sh/include/asm/atomic.h       | 2 +-
 arch/sparc/include/asm/atomic_32.h | 2 +-
 arch/sparc/include/asm/atomic_64.h | 4 ++--
 arch/x86/include/asm/atomic.h      | 2 +-
 arch/x86/include/asm/atomic64_64.h | 2 +-
 arch/xtensa/include/asm/atomic.h   | 2 +-
 include/asm-generic/atomic.h       | 2 +-
 20 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index 610dff44d94..e756d04b6cd 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -17,8 +17,8 @@
 #define ATOMIC_INIT(i)		( (atomic_t) { (i) } )
 #define ATOMIC64_INIT(i)	( (atomic64_t) { (i) } )
 
-#define atomic_read(v)		((v)->counter + 0)
-#define atomic64_read(v)	((v)->counter + 0)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
+#define atomic64_read(v)	(*(volatile long *)&(v)->counter)
 
 #define atomic_set(v,i)		((v)->counter = (i))
 #define atomic64_set(v,i)	((v)->counter = (i))
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index e8ddec2cb15..a0162fa9456 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -24,7 +24,7 @@
  * strex/ldrex monitor on some implementations. The reason we can use it for
  * atomic_set() is the clrex or dummy strex done on every exception return.
  */
-#define atomic_read(v)	((v)->counter)
+#define atomic_read(v)	(*(volatile int *)&(v)->counter)
 #define atomic_set(v,i)	(((v)->counter) = (i))
 
 #if __LINUX_ARM_ARCH__ >= 6
diff --git a/arch/avr32/include/asm/atomic.h b/arch/avr32/include/asm/atomic.h
index b131c27ddf5..bbce6a1c6bb 100644
--- a/arch/avr32/include/asm/atomic.h
+++ b/arch/avr32/include/asm/atomic.h
@@ -19,7 +19,7 @@
 
 #define ATOMIC_INIT(i)  { (i) }
 
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 #define atomic_set(v, i)	(((v)->counter) = i)
 
 /*
diff --git a/arch/cris/include/asm/atomic.h b/arch/cris/include/asm/atomic.h
index a6aca819e9f..88dc9b9c4ba 100644
--- a/arch/cris/include/asm/atomic.h
+++ b/arch/cris/include/asm/atomic.h
@@ -15,7 +15,7 @@
 
 #define ATOMIC_INIT(i)  { (i) }
 
-#define atomic_read(v) ((v)->counter)
+#define atomic_read(v) (*(volatile int *)&(v)->counter)
 #define atomic_set(v,i) (((v)->counter) = (i))
 
 /* These should be written in asm but we do it in C for now. */
diff --git a/arch/frv/include/asm/atomic.h b/arch/frv/include/asm/atomic.h
index 00a57af79af..fae32c7fdcb 100644
--- a/arch/frv/include/asm/atomic.h
+++ b/arch/frv/include/asm/atomic.h
@@ -36,7 +36,7 @@
 #define smp_mb__after_atomic_inc()	barrier()
 
 #define ATOMIC_INIT(i)		{ (i) }
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 #define atomic_set(v, i)	(((v)->counter) = (i))
 
 #ifndef CONFIG_FRV_OUTOFLINE_ATOMIC_OPS
diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h
index 33c8c0fa958..e936804b750 100644
--- a/arch/h8300/include/asm/atomic.h
+++ b/arch/h8300/include/asm/atomic.h
@@ -10,7 +10,7 @@
 
 #define ATOMIC_INIT(i)	{ (i) }
 
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 #define atomic_set(v, i)	(((v)->counter) = i)
 
 #include <asm/system.h>
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index 88405cb0832..4e1948447a0 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -21,8 +21,8 @@
 #define ATOMIC_INIT(i)		((atomic_t) { (i) })
 #define ATOMIC64_INIT(i)	((atomic64_t) { (i) })
 
-#define atomic_read(v)		((v)->counter)
-#define atomic64_read(v)	((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
+#define atomic64_read(v)	(*(volatile long *)&(v)->counter)
 
 #define atomic_set(v,i)		(((v)->counter) = (i))
 #define atomic64_set(v,i)	(((v)->counter) = (i))
diff --git a/arch/m32r/include/asm/atomic.h b/arch/m32r/include/asm/atomic.h
index 63f0cf0f50d..d44a51e5271 100644
--- a/arch/m32r/include/asm/atomic.h
+++ b/arch/m32r/include/asm/atomic.h
@@ -26,7 +26,7 @@
  *
  * Atomically reads the value of @v.
  */
-#define atomic_read(v)	((v)->counter)
+#define atomic_read(v)	(*(volatile int *)&(v)->counter)
 
 /**
  * atomic_set - set atomic variable
diff --git a/arch/m68k/include/asm/atomic_mm.h b/arch/m68k/include/asm/atomic_mm.h
index d9d2ed64743..6a223b3f7e7 100644
--- a/arch/m68k/include/asm/atomic_mm.h
+++ b/arch/m68k/include/asm/atomic_mm.h
@@ -15,7 +15,7 @@
 
 #define ATOMIC_INIT(i)	{ (i) }
 
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 #define atomic_set(v, i)	(((v)->counter) = i)
 
 static inline void atomic_add(int i, atomic_t *v)
diff --git a/arch/m68k/include/asm/atomic_no.h b/arch/m68k/include/asm/atomic_no.h
index 5674cb9449b..289310c63a8 100644
--- a/arch/m68k/include/asm/atomic_no.h
+++ b/arch/m68k/include/asm/atomic_no.h
@@ -15,7 +15,7 @@
 
 #define ATOMIC_INIT(i)	{ (i) }
 
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 #define atomic_set(v, i)	(((v)->counter) = i)
 
 static __inline__ void atomic_add(int i, atomic_t *v)
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index 519197ede08..59dc0c7ef73 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -29,7 +29,7 @@
  *
  * Atomically reads the value of @v.
  */
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 
 /*
  * atomic_set - set atomic variable
@@ -410,7 +410,7 @@ static __inline__ int atomic_add_unless(atomic_t *v, int a, int u)
  * @v: pointer of type atomic64_t
  *
  */
-#define atomic64_read(v)	((v)->counter)
+#define atomic64_read(v)	(*(volatile long *)&(v)->counter)
 
 /*
  * atomic64_set - set atomic variable
diff --git a/arch/mn10300/include/asm/atomic.h b/arch/mn10300/include/asm/atomic.h
index 5bf5be9566d..e41222d6c2f 100644
--- a/arch/mn10300/include/asm/atomic.h
+++ b/arch/mn10300/include/asm/atomic.h
@@ -31,7 +31,7 @@
  * Atomically reads the value of @v.  Note that the guaranteed
  * useful range of an atomic_t is only 24 bits.
  */
-#define atomic_read(v)	((v)->counter)
+#define atomic_read(v)	(*(volatile int *)&(v)->counter)
 
 /**
  * atomic_set - set atomic variable
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 716634d1f54..f81955934ae 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -189,7 +189,7 @@ static __inline__ void atomic_set(atomic_t *v, int i)
 
 static __inline__ int atomic_read(const atomic_t *v)
 {
-	return v->counter;
+	return (*(volatile int *)&(v)->counter);
 }
 
 /* exported interface */
@@ -286,7 +286,7 @@ atomic64_set(atomic64_t *v, s64 i)
 static __inline__ s64
 atomic64_read(const atomic64_t *v)
 {
-	return v->counter;
+	return (*(volatile long *)&(v)->counter);
 }
 
 #define atomic64_add(i,v)	((void)(__atomic64_add_return( ((s64)(i)),(v))))
diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h
index 275a448ae8c..c7983124d99 100644
--- a/arch/sh/include/asm/atomic.h
+++ b/arch/sh/include/asm/atomic.h
@@ -13,7 +13,7 @@
 
 #define ATOMIC_INIT(i)	( (atomic_t) { (i) } )
 
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 #define atomic_set(v,i)		((v)->counter = (i))
 
 #if defined(CONFIG_GUSA_RB)
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index f0d343c3b95..7ae128b19d3 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -25,7 +25,7 @@ extern int atomic_cmpxchg(atomic_t *, int, int);
 extern int atomic_add_unless(atomic_t *, int, int);
 extern void atomic_set(atomic_t *, int);
 
-#define atomic_read(v)          ((v)->counter)
+#define atomic_read(v)          (*(volatile int *)&(v)->counter)
 
 #define atomic_add(i, v)	((void)__atomic_add_return( (int)(i), (v)))
 #define atomic_sub(i, v)	((void)__atomic_add_return(-(int)(i), (v)))
diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index f2e48009989..2050ca02c42 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -13,8 +13,8 @@
 #define ATOMIC_INIT(i)		{ (i) }
 #define ATOMIC64_INIT(i)	{ (i) }
 
-#define atomic_read(v)		((v)->counter)
-#define atomic64_read(v)	((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
+#define atomic64_read(v)	(*(volatile long *)&(v)->counter)
 
 #define atomic_set(v, i)	(((v)->counter) = i)
 #define atomic64_set(v, i)	(((v)->counter) = i)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 8f8217b9bda..37b39d27abe 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -22,7 +22,7 @@
  */
 static inline int atomic_read(const atomic_t *v)
 {
-	return v->counter;
+	return (*(volatile int *)&(v)->counter);
 }
 
 /**
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 51c5b405692..b014e235ea8 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -18,7 +18,7 @@
  */
 static inline long atomic64_read(const atomic64_t *v)
 {
-	return v->counter;
+	return (*(volatile long *)&(v)->counter);
 }
 
 /**
diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h
index 22d6dde4261..a96a0619d0b 100644
--- a/arch/xtensa/include/asm/atomic.h
+++ b/arch/xtensa/include/asm/atomic.h
@@ -46,7 +46,7 @@
  *
  * Atomically reads the value of @v.
  */
-#define atomic_read(v)		((v)->counter)
+#define atomic_read(v)		(*(volatile int *)&(v)->counter)
 
 /**
  * atomic_set - set atomic variable
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index c99c64dc5f3..c33749f95b3 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -33,7 +33,7 @@
  * Atomically reads the value of @v.  Note that the guaranteed
  * useful range of an atomic_t is only 24 bits.
  */
-#define atomic_read(v)	((v)->counter)
+#define atomic_read(v)	(*(volatile int *)&(v)->counter)
 
 /**
  * atomic_set - set atomic variable
-- 
cgit v1.2.3


From 81880d603d00c645e0890d0a44d50711c503b72b Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Mon, 17 May 2010 14:34:57 +1000
Subject: atomic_t: Remove volatile from atomic_t definition

When looking at a performance problem on PowerPC, I noticed some awful code
generation:

c00000000051fc98:       3b 60 00 01     li      r27,1
...
c00000000051fca0:       3b 80 00 00     li      r28,0
...
c00000000051fcdc:       93 61 00 70     stw     r27,112(r1)
c00000000051fce0:       93 81 00 74     stw     r28,116(r1)
c00000000051fce4:       81 21 00 70     lwz     r9,112(r1)
c00000000051fce8:       80 01 00 74     lwz     r0,116(r1)
c00000000051fcec:       7d 29 07 b4     extsw   r9,r9
c00000000051fcf0:       7c 00 07 b4     extsw   r0,r0

c00000000051fcf4:       7c 20 04 ac     lwsync
c00000000051fcf8:       7d 60 f8 28     lwarx   r11,0,r31
c00000000051fcfc:       7c 0b 48 00     cmpw    r11,r9
c00000000051fd00:       40 c2 00 10     bne-    c00000000051fd10
c00000000051fd04:       7c 00 f9 2d     stwcx.  r0,0,r31
c00000000051fd08:       40 c2 ff f0     bne+    c00000000051fcf8
c00000000051fd0c:       4c 00 01 2c     isync

We create two constants, write them out to the stack, read them straight back
in and sign extend them. What a mess.

It turns out this bad code is a result of us defining atomic_t as a
volatile int.

We removed the volatile attribute from the powerpc atomic_t definition years
ago, but commit ea435467500612636f8f4fb639ff6e76b2496e4b (atomic_t: unify all
arch definitions) added it back in.

To dig up an old quote from Linus:

> The fact is, volatile on data structures is a bug. It's a wart in the C
> language. It shouldn't be used.
>
> Volatile accesses in *code* can be ok, and if we have "atomic_read()"
> expand to a "*(volatile int *)&(x)->value", then I'd be ok with that.
>
> But marking data structures volatile just makes the compiler screw up
> totally, and makes code for initialization sequences etc much worse.

And screw up it does :)

With the volatile removed, we see much more reasonable code generation:

c00000000051f5b8:       3b 60 00 01     li      r27,1
...
c00000000051f5c0:       3b 80 00 00     li      r28,0
...

c00000000051fc7c:       7c 20 04 ac     lwsync
c00000000051fc80:       7c 00 f8 28     lwarx   r0,0,r31
c00000000051fc84:       7c 00 d8 00     cmpw    r0,r27
c00000000051fc88:       40 c2 00 10     bne-    c00000000051fc98
c00000000051fc8c:       7f 80 f9 2d     stwcx.  r28,0,r31
c00000000051fc90:       40 c2 ff f0     bne+    c00000000051fc80
c00000000051fc94:       4c 00 01 2c     isync

Six instructions less.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/types.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/types.h b/include/linux/types.h
index c42724f8c80..23d237a075e 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -188,12 +188,12 @@ typedef u32 phys_addr_t;
 typedef phys_addr_t resource_size_t;
 
 typedef struct {
-	volatile int counter;
+	int counter;
 } atomic_t;
 
 #ifdef CONFIG_64BIT
 typedef struct {
-	volatile long counter;
+	long counter;
 } atomic64_t;
 #endif
 
-- 
cgit v1.2.3


From 6ba85cea872954a36d79e46bf6a9c6ea92794f01 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 May 2010 12:16:48 -0300
Subject: perf options: Introduce OPT_U64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We have things like user_interval (-c/--count) in 'perf record' that
needs this.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/parse-options.c | 18 ++++++++++++++++++
 tools/perf/util/parse-options.h |  2 ++
 2 files changed, 20 insertions(+)

diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c
index ed887642460..b05d51df2ce 100644
--- a/tools/perf/util/parse-options.c
+++ b/tools/perf/util/parse-options.c
@@ -60,6 +60,7 @@ static int get_value(struct parse_opt_ctx_t *p,
 		case OPTION_STRING:
 		case OPTION_INTEGER:
 		case OPTION_LONG:
+		case OPTION_U64:
 		default:
 			break;
 		}
@@ -141,6 +142,22 @@ static int get_value(struct parse_opt_ctx_t *p,
 			return opterror(opt, "expects a numerical value", flags);
 		return 0;
 
+	case OPTION_U64:
+		if (unset) {
+			*(u64 *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(u64 *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		*(u64 *)opt->value = strtoull(arg, (char **)&s, 10);
+		if (*s)
+			return opterror(opt, "expects a numerical value", flags);
+		return 0;
+
 	case OPTION_END:
 	case OPTION_ARGUMENT:
 	case OPTION_GROUP:
@@ -487,6 +504,7 @@ int usage_with_options_internal(const char * const *usagestr,
 		case OPTION_SET_INT:
 		case OPTION_SET_PTR:
 		case OPTION_LONG:
+		case OPTION_U64:
 			break;
 		}
 
diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h
index b2da725f102..e301e96b9d1 100644
--- a/tools/perf/util/parse-options.h
+++ b/tools/perf/util/parse-options.h
@@ -17,6 +17,7 @@ enum parse_opt_type {
 	OPTION_INTEGER,
 	OPTION_LONG,
 	OPTION_CALLBACK,
+	OPTION_U64,
 };
 
 enum parse_opt_flags {
@@ -101,6 +102,7 @@ struct option {
 #define OPT_SET_PTR(s, l, v, h, p)  { .type = OPTION_SET_PTR, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (p) }
 #define OPT_INTEGER(s, l, v, h)     { .type = OPTION_INTEGER, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
 #define OPT_LONG(s, l, v, h)        { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
+#define OPT_U64(s, l, v, h)         { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
 #define OPT_STRING(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h) }
 #define OPT_DATE(s, l, v, h) \
 	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb }
-- 
cgit v1.2.3


From 3de29cab1f8d62db557a4afed0fb17eebfe64438 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 17 May 2010 12:20:43 -0300
Subject: perf record: Fix bug mismatch with -c option definition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The -c option defines the user requested sampling period. It was implemented
using an unsigned int variable but the type of the option was OPT_LONG. Thus,
the option parser was overwriting memory belonging to other variables, namely
the mmap_pages leading to a zero page sampling buffer. The bug was exposed only
when compiling at -O0, probably because the compiler was padding variables at
higher optimization levels.

This patch fixes this problem by declaring user_interval as u64. This also
avoids wrap-around issues for large period on 32-bit systems.

Commiter note:

Made it use OPT_U64(user_interval) after implementing OPT_U64 in the
previous patch.

Cc: David S. Miller <davem@davemloft.net>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <4bf11ae9.e88cd80a.06b0.ffffa8e3@mx.google.com>
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-record.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 0f467cf7aa7..b93573c7ac0 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -33,8 +33,8 @@ enum write_mode_t {
 
 static int			*fd[MAX_NR_CPUS][MAX_COUNTERS];
 
-static unsigned int		user_interval 			= UINT_MAX;
-static long			default_interval		=      0;
+static u64			user_interval			= ULLONG_MAX;
+static u64			default_interval		=      0;
 
 static int			nr_cpus				=      0;
 static unsigned int		page_size;
@@ -268,7 +268,7 @@ static void create_counter(int counter, int cpu)
 	 * it a weak assumption overridable by the user.
 	 */
 	if (!attr->sample_period || (user_freq != UINT_MAX &&
-				     user_interval != UINT_MAX)) {
+				     user_interval != ULLONG_MAX)) {
 		if (freq) {
 			attr->sample_type	|= PERF_SAMPLE_PERIOD;
 			attr->freq		= 1;
@@ -817,8 +817,7 @@ static const struct option options[] = {
 			    "CPU to profile on"),
 	OPT_BOOLEAN('f', "force", &force,
 			"overwrite existing data file (deprecated)"),
-	OPT_LONG('c', "count", &user_interval,
-		    "event period to sample"),
+	OPT_U64('c', "count", &user_interval, "event period to sample"),
 	OPT_STRING('o', "output", &output_name, "file",
 		    "output file name"),
 	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
@@ -901,7 +900,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 	if (!event_array)
 		return -ENOMEM;
 
-	if (user_interval != UINT_MAX)
+	if (user_interval != ULLONG_MAX)
 		default_interval = user_interval;
 	if (user_freq != UINT_MAX)
 		freq = user_freq;
-- 
cgit v1.2.3


From dc4ff19341126155c5714119396efbae62ab40bf Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 May 2010 12:25:09 -0300
Subject: perf tui: Add workaround for slang < 2.1.4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Older versions of the slang library didn't used the 'const' specifier,
causing problems with modern compilers of this kind:

util/newt.c:252: error: passing argument 1 of ‘SLsmg_printf’ discards
qualifiers from pointer target type

Fix it by using some wrappers that when needed const the affected
parameters back to plain (char *).

Reported-by: Lin Ming <ming.m.lin@intel.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <20100517145421.GD29052@ghostprotocols.net>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/newt.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/tools/perf/util/newt.c b/tools/perf/util/newt.c
index 2001d26a557..ccb7c5bb269 100644
--- a/tools/perf/util/newt.c
+++ b/tools/perf/util/newt.c
@@ -14,6 +14,17 @@
 #include "sort.h"
 #include "symbol.h"
 
+#if SLANG_VERSION < 20104
+#define slsmg_printf(msg, args...) SLsmg_printf((char *)msg, ##args)
+#define slsmg_write_nstring(msg, len) SLsmg_write_nstring((char *)msg, len)
+#define sltt_set_color(obj, name, fg, bg) SLtt_set_color(obj,(char *)name,\
+							 (char *)fg, (char *)bg)
+#else
+#define slsmg_printf SLsmg_printf
+#define slsmg_write_nstring SLsmg_write_nstring
+#define sltt_set_color SLtt_set_color
+#endif
+
 struct ui_progress {
 	newtComponent form, scale;
 };
@@ -292,21 +303,21 @@ static int objdump_line__show(struct objdump_line *self, struct list_head *head,
 
 		color = ui_browser__percent_color(percent, current_entry);
 		SLsmg_set_color(color);
-		SLsmg_printf(" %7.2f ", percent);
+		slsmg_printf(" %7.2f ", percent);
 		if (!current_entry)
 			SLsmg_set_color(HE_COLORSET_CODE);
 	} else {
 		int color = ui_browser__percent_color(0, current_entry);
 		SLsmg_set_color(color);
-		SLsmg_write_nstring(" ", 9);
+		slsmg_write_nstring(" ", 9);
 	}
 
 	SLsmg_write_char(':');
-	SLsmg_write_nstring(" ", 8);
+	slsmg_write_nstring(" ", 8);
 	if (!*self->line)
-		SLsmg_write_nstring(" ", width - 18);
+		slsmg_write_nstring(" ", width - 18);
 	else
-		SLsmg_write_nstring(self->line, width - 18);
+		slsmg_write_nstring(self->line, width - 18);
 
 	return 0;
 }
@@ -1054,11 +1065,11 @@ void setup_browser(void)
 	newtInit();
 	newtCls();
 	ui_helpline__puts(" ");
-	SLtt_set_color(HE_COLORSET_TOP, NULL, c->topColorFg, c->topColorBg);
-	SLtt_set_color(HE_COLORSET_MEDIUM, NULL, c->mediumColorFg, c->mediumColorBg);
-	SLtt_set_color(HE_COLORSET_NORMAL, NULL, c->normalColorFg, c->normalColorBg);
-	SLtt_set_color(HE_COLORSET_SELECTED, NULL, c->selColorFg, c->selColorBg);
-	SLtt_set_color(HE_COLORSET_CODE, NULL, c->codeColorFg, c->codeColorBg);
+	sltt_set_color(HE_COLORSET_TOP, NULL, c->topColorFg, c->topColorBg);
+	sltt_set_color(HE_COLORSET_MEDIUM, NULL, c->mediumColorFg, c->mediumColorBg);
+	sltt_set_color(HE_COLORSET_NORMAL, NULL, c->normalColorFg, c->normalColorBg);
+	sltt_set_color(HE_COLORSET_SELECTED, NULL, c->selColorFg, c->selColorBg);
+	sltt_set_color(HE_COLORSET_CODE, NULL, c->codeColorFg, c->codeColorBg);
 }
 
 void exit_browser(bool wait_for_ok)
-- 
cgit v1.2.3


From c100edbee8dbf033ec4095a976a74c1c75c9fc1d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 May 2010 15:30:00 -0300
Subject: perf options: Introduce OPT_UINTEGER
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For unsigned int options to be parsed, next patches will make use of it.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/parse-options.c | 22 ++++++++++++++++++++--
 tools/perf/util/parse-options.h |  2 ++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c
index b05d51df2ce..36d955e4942 100644
--- a/tools/perf/util/parse-options.c
+++ b/tools/perf/util/parse-options.c
@@ -59,6 +59,7 @@ static int get_value(struct parse_opt_ctx_t *p,
 		case OPTION_GROUP:
 		case OPTION_STRING:
 		case OPTION_INTEGER:
+		case OPTION_UINTEGER:
 		case OPTION_LONG:
 		case OPTION_U64:
 		default:
@@ -126,6 +127,22 @@ static int get_value(struct parse_opt_ctx_t *p,
 			return opterror(opt, "expects a numerical value", flags);
 		return 0;
 
+	case OPTION_UINTEGER:
+		if (unset) {
+			*(unsigned int *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(unsigned int *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		*(unsigned int *)opt->value = strtol(arg, (char **)&s, 10);
+		if (*s)
+			return opterror(opt, "expects a numerical value", flags);
+		return 0;
+
 	case OPTION_LONG:
 		if (unset) {
 			*(long *)opt->value = 0;
@@ -463,7 +480,10 @@ int usage_with_options_internal(const char * const *usagestr,
 		switch (opts->type) {
 		case OPTION_ARGUMENT:
 			break;
+		case OPTION_LONG:
+		case OPTION_U64:
 		case OPTION_INTEGER:
+		case OPTION_UINTEGER:
 			if (opts->flags & PARSE_OPT_OPTARG)
 				if (opts->long_name)
 					pos += fprintf(stderr, "[=<n>]");
@@ -503,8 +523,6 @@ int usage_with_options_internal(const char * const *usagestr,
 		case OPTION_INCR:
 		case OPTION_SET_INT:
 		case OPTION_SET_PTR:
-		case OPTION_LONG:
-		case OPTION_U64:
 			break;
 		}
 
diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h
index e301e96b9d1..c6aa4eed5ed 100644
--- a/tools/perf/util/parse-options.h
+++ b/tools/perf/util/parse-options.h
@@ -18,6 +18,7 @@ enum parse_opt_type {
 	OPTION_LONG,
 	OPTION_CALLBACK,
 	OPTION_U64,
+	OPTION_UINTEGER,
 };
 
 enum parse_opt_flags {
@@ -101,6 +102,7 @@ struct option {
 #define OPT_SET_INT(s, l, v, h, i)  { .type = OPTION_SET_INT, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (i) }
 #define OPT_SET_PTR(s, l, v, h, p)  { .type = OPTION_SET_PTR, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (p) }
 #define OPT_INTEGER(s, l, v, h)     { .type = OPTION_INTEGER, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
+#define OPT_UINTEGER(s, l, v, h)    { .type = OPTION_UINTEGER, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
 #define OPT_LONG(s, l, v, h)        { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
 #define OPT_U64(s, l, v, h)         { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
 #define OPT_STRING(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h) }
-- 
cgit v1.2.3


From 1967936d688c475b85d34d84e09858cf514c893c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 May 2010 15:39:16 -0300
Subject: perf options: Check v type in OPT_U?INTEGER
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To avoid problems like the one fixed by Stephane Eranian in 3de29ca, now
we'll got this instead:

	bench/sched-messaging.c:259: error: negative width in bit-field ‘<anonymous>’
	bench/sched-messaging.c:261: error: negative width in bit-field ‘<anonymous>’

Which is rather cryptic, but is how BUILD_BUG_ON_ZERO works, so kernel
hackers should be already used to this.

With it in place found some problems, fixed by changing the affected
variables to sensible types or changed some OPT_INTEGER to OPT_UINTEGER.

Next csets will go thru converting each of the remaining OPT_ so that
review can be made easier by grouping changes per type per patch.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/bench/sched-messaging.c     | 6 ++----
 tools/perf/builtin-record.c            | 8 +++-----
 tools/perf/builtin-sched.c             | 6 +++---
 tools/perf/builtin-top.c               | 5 ++---
 tools/perf/util/include/linux/kernel.h | 2 ++
 tools/perf/util/parse-options.h        | 8 ++++++--
 6 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c
index da1b2e9f01f..d1d1b30f99c 100644
--- a/tools/perf/bench/sched-messaging.c
+++ b/tools/perf/bench/sched-messaging.c
@@ -256,10 +256,8 @@ static const struct option options[] = {
 		    "Use pipe() instead of socketpair()"),
 	OPT_BOOLEAN('t', "thread", &thread_mode,
 		    "Be multi thread instead of multi process"),
-	OPT_INTEGER('g', "group", &num_groups,
-		    "Specify number of groups"),
-	OPT_INTEGER('l', "loop", &loops,
-		    "Specify number of loops"),
+	OPT_UINTEGER('g', "group", &num_groups, "Specify number of groups"),
+	OPT_UINTEGER('l', "loop", &loops, "Specify number of loops"),
 	OPT_END()
 };
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index b93573c7ac0..cb46c7d0ea9 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -45,7 +45,7 @@ static int			output;
 static int			pipe_output			=      0;
 static const char		*output_name			= "perf.data";
 static int			group				=      0;
-static unsigned int		realtime_prio			=      0;
+static int			realtime_prio			=      0;
 static bool			raw_samples			=  false;
 static bool			system_wide			=  false;
 static int			profile_cpu			=     -1;
@@ -822,10 +822,8 @@ static const struct option options[] = {
 		    "output file name"),
 	OPT_BOOLEAN('i', "no-inherit", &no_inherit,
 		    "child tasks do not inherit counters"),
-	OPT_INTEGER('F', "freq", &user_freq,
-		    "profile at this frequency"),
-	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
-		    "number of mmap data pages"),
+	OPT_UINTEGER('F', "freq", &user_freq, "profile at this frequency"),
+	OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"),
 	OPT_BOOLEAN('g', "call-graph", &call_graph,
 		    "do call-graph (stack chain/backtrace) recording"),
 	OPT_INCR('v', "verbose", &verbose,
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index be7bc926471..c80acdf927a 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -105,7 +105,7 @@ static u64			sum_runtime;
 static u64			sum_fluct;
 static u64			run_avg;
 
-static unsigned long		replay_repeat = 10;
+static unsigned int		replay_repeat = 10;
 static unsigned long		nr_timestamps;
 static unsigned long		nr_unordered_timestamps;
 static unsigned long		nr_state_machine_bugs;
@@ -1816,8 +1816,8 @@ static const char * const replay_usage[] = {
 };
 
 static const struct option replay_options[] = {
-	OPT_INTEGER('r', "repeat", &replay_repeat,
-		    "repeat the workload replay N times (-1: infinite)"),
+	OPT_UINTEGER('r', "repeat", &replay_repeat,
+		     "repeat the workload replay N times (-1: infinite)"),
 	OPT_INCR('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index ed9b5b6905f..9f0cfa0108a 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -71,7 +71,7 @@ static int			thread_num			=      0;
 static bool			inherit				=  false;
 static int			profile_cpu			=     -1;
 static int			nr_cpus				=      0;
-static unsigned int		realtime_prio			=      0;
+static int			realtime_prio			=      0;
 static bool			group				=  false;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			=     16;
@@ -1357,8 +1357,7 @@ static const struct option options[] = {
 		   "file", "vmlinux pathname"),
 	OPT_BOOLEAN('K', "hide_kernel_symbols", &hide_kernel_symbols,
 		    "hide kernel symbols"),
-	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
-		    "number of mmap data pages"),
+	OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"),
 	OPT_INTEGER('r', "realtime", &realtime_prio,
 		    "collect data with this RT SCHED_FIFO priority"),
 	OPT_INTEGER('d', "delay", &delay_secs,
diff --git a/tools/perf/util/include/linux/kernel.h b/tools/perf/util/include/linux/kernel.h
index 388ab1bfd11..1eb804fd3fb 100644
--- a/tools/perf/util/include/linux/kernel.h
+++ b/tools/perf/util/include/linux/kernel.h
@@ -28,6 +28,8 @@
 	(type *)((char *)__mptr - offsetof(type, member)); })
 #endif
 
+#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
+
 #ifndef max
 #define max(x, y) ({				\
 	typeof(x) _max1 = (x);			\
diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h
index c6aa4eed5ed..9ca348e1063 100644
--- a/tools/perf/util/parse-options.h
+++ b/tools/perf/util/parse-options.h
@@ -1,6 +1,8 @@
 #ifndef __PERF_PARSE_OPTIONS_H
 #define __PERF_PARSE_OPTIONS_H
 
+#include <linux/kernel.h>
+
 enum parse_opt_type {
 	/* special types */
 	OPTION_END,
@@ -93,6 +95,8 @@ struct option {
 	intptr_t defval;
 };
 
+#define check_vtype(v, type) ( BUILD_BUG_ON_ZERO(!__builtin_types_compatible_p(typeof(v), type)) + v )
+
 #define OPT_END()                   { .type = OPTION_END }
 #define OPT_ARGUMENT(l, h)          { .type = OPTION_ARGUMENT, .long_name = (l), .help = (h) }
 #define OPT_GROUP(h)                { .type = OPTION_GROUP, .help = (h) }
@@ -101,8 +105,8 @@ struct option {
 #define OPT_INCR(s, l, v, h)        { .type = OPTION_INCR, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
 #define OPT_SET_INT(s, l, v, h, i)  { .type = OPTION_SET_INT, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (i) }
 #define OPT_SET_PTR(s, l, v, h, p)  { .type = OPTION_SET_PTR, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (p) }
-#define OPT_INTEGER(s, l, v, h)     { .type = OPTION_INTEGER, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
-#define OPT_UINTEGER(s, l, v, h)    { .type = OPTION_UINTEGER, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
+#define OPT_INTEGER(s, l, v, h)     { .type = OPTION_INTEGER, .short_name = (s), .long_name = (l), .value = check_vtype(v, int *), .help = (h) }
+#define OPT_UINTEGER(s, l, v, h)    { .type = OPTION_UINTEGER, .short_name = (s), .long_name = (l), .value = check_vtype(v, unsigned int *), .help = (h) }
 #define OPT_LONG(s, l, v, h)        { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
 #define OPT_U64(s, l, v, h)         { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
 #define OPT_STRING(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h) }
-- 
cgit v1.2.3


From b1f3bb494e8acddeb972331c2ac642b3c7bceeb9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Oct 2009 08:42:21 +0000
Subject: m68k: Remove BKL from rtc implementations

m68k does not support SMP. The access to the rtc is already serialized
with local_irq_save/restore which is sufficient on UP.

The open() protection in arch/m68k/mvme16x/rtc.c is not pretty but
sufficient on UP and safe w/o the BKL.

open() in arch/m68k/bvme6000/rtc.c can do with the same atomic logic
as arch/m68k/mvme16x/rtc.c

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/bvme6000/rtc.c | 29 +++++++++--------------------
 arch/m68k/mvme16x/rtc.c  | 19 +++++--------------
 2 files changed, 14 insertions(+), 34 deletions(-)

diff --git a/arch/m68k/bvme6000/rtc.c b/arch/m68k/bvme6000/rtc.c
index b46ea1714a8..cb8617bb194 100644
--- a/arch/m68k/bvme6000/rtc.c
+++ b/arch/m68k/bvme6000/rtc.c
@@ -9,7 +9,6 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/miscdevice.h>
-#include <linux/smp_lock.h>
 #include <linux/ioport.h>
 #include <linux/capability.h>
 #include <linux/fcntl.h>
@@ -35,10 +34,9 @@
 static unsigned char days_in_mo[] =
 {0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
 
-static char rtc_status;
+static atomic_t rtc_status = ATOMIC_INIT(1);
 
-static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		     unsigned long arg)
+static long rtc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	volatile RtcPtr_t rtc = (RtcPtr_t)BVME_RTC_BASE;
 	unsigned char msr;
@@ -132,29 +130,20 @@ static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 }
 
 /*
- *	We enforce only one user at a time here with the open/close.
- *	Also clear the previous interrupt data on an open, and clean
- *	up things on a close.
+ * We enforce only one user at a time here with the open/close.
  */
-
 static int rtc_open(struct inode *inode, struct file *file)
 {
-	lock_kernel();
-	if(rtc_status) {
-		unlock_kernel();
+	if (!atomic_dec_and_test(&rtc_status)) {
+		atomic_inc(&rtc_status);
 		return -EBUSY;
 	}
-
-	rtc_status = 1;
-	unlock_kernel();
 	return 0;
 }
 
 static int rtc_release(struct inode *inode, struct file *file)
 {
-	lock_kernel();
-	rtc_status = 0;
-	unlock_kernel();
+	atomic_inc(&rtc_status);
 	return 0;
 }
 
@@ -163,9 +152,9 @@ static int rtc_release(struct inode *inode, struct file *file)
  */
 
 static const struct file_operations rtc_fops = {
-	.ioctl =	rtc_ioctl,
-	.open =		rtc_open,
-	.release =	rtc_release,
+	.unlocked_ioctl	= rtc_ioctl,
+	.open		= rtc_open,
+	.release	= rtc_release,
 };
 
 static struct miscdevice rtc_dev = {
diff --git a/arch/m68k/mvme16x/rtc.c b/arch/m68k/mvme16x/rtc.c
index 8da9c250d3e..11ac6f63967 100644
--- a/arch/m68k/mvme16x/rtc.c
+++ b/arch/m68k/mvme16x/rtc.c
@@ -9,7 +9,6 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/miscdevice.h>
-#include <linux/smp_lock.h>
 #include <linux/ioport.h>
 #include <linux/capability.h>
 #include <linux/fcntl.h>
@@ -36,8 +35,7 @@ static const unsigned char days_in_mo[] =
 
 static atomic_t rtc_ready = ATOMIC_INIT(1);
 
-static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
-		     unsigned long arg)
+static long rtc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	volatile MK48T08ptr_t rtc = (MK48T08ptr_t)MVME_RTC_BASE;
 	unsigned long flags;
@@ -120,22 +118,15 @@ static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 }
 
 /*
- *	We enforce only one user at a time here with the open/close.
- *	Also clear the previous interrupt data on an open, and clean
- *	up things on a close.
+ * We enforce only one user at a time here with the open/close.
  */
-
 static int rtc_open(struct inode *inode, struct file *file)
 {
-	lock_kernel();
 	if( !atomic_dec_and_test(&rtc_ready) )
 	{
 		atomic_inc( &rtc_ready );
-		unlock_kernel();
 		return -EBUSY;
 	}
-	unlock_kernel();
-
 	return 0;
 }
 
@@ -150,9 +141,9 @@ static int rtc_release(struct inode *inode, struct file *file)
  */
 
 static const struct file_operations rtc_fops = {
-	.ioctl =	rtc_ioctl,
-	.open =		rtc_open,
-	.release =	rtc_release,
+	.unlocked_ioctl	= rtc_ioctl,
+	.open		= rtc_open,
+	.release	= rtc_release,
 };
 
 static struct miscdevice rtc_dev=
-- 
cgit v1.2.3


From 291d7e9553fef2e18825b1ef1345fbd316dff98f Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Thu, 31 Dec 2009 15:35:44 -0500
Subject: m68k: Simplify param.h by using <asm-generic/param.h>

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/include/asm/param.h | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/arch/m68k/include/asm/param.h b/arch/m68k/include/asm/param.h
index 85c41b75aa7..36265ccf5c7 100644
--- a/arch/m68k/include/asm/param.h
+++ b/arch/m68k/include/asm/param.h
@@ -1,26 +1,12 @@
 #ifndef _M68K_PARAM_H
 #define _M68K_PARAM_H
 
-#ifdef __KERNEL__
-# define HZ		CONFIG_HZ	/* Internal kernel timer frequency */
-# define USER_HZ	100		/* .. some user interfaces are in "ticks" */
-# define CLOCKS_PER_SEC	(USER_HZ)	/* like times() */
-#endif
-
-#ifndef HZ
-#define HZ 100
-#endif
-
 #ifdef __uClinux__
 #define EXEC_PAGESIZE	4096
 #else
 #define EXEC_PAGESIZE	8192
 #endif
 
-#ifndef NOGROUP
-#define NOGROUP		(-1)
-#endif
-
-#define MAXHOSTNAMELEN	64	/* max length of hostname */
+#include <asm-generic/param.h>
 
 #endif /* _M68K_PARAM_H */
-- 
cgit v1.2.3


From b9b0d8b430ebd61d32e2a9544e75a3c4b10cddcd Mon Sep 17 00:00:00 2001
From: Frans Pop <elendil@planet.nl>
Date: Sat, 6 Feb 2010 18:47:11 +0100
Subject: m68k: Remove trailing spaces in messages

Signed-off-by: Frans Pop <elendil@planet.nl>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/kernel/traps.c |  2 +-
 arch/m68k/mac/config.c   | 10 +++++-----
 arch/m68k/q40/config.c   |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index aacd6d17b83..ada4f4cca81 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -455,7 +455,7 @@ static inline void access_error040(struct frame *fp)
 
 		if (do_page_fault(&fp->ptregs, addr, errorcode)) {
 #ifdef DEBUG
-		        printk("do_page_fault() !=0 \n");
+			printk("do_page_fault() !=0\n");
 #endif
 			if (user_mode(&fp->ptregs)){
 				/* delay writebacks after signal delivery */
diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c
index 0356da9bf76..1c16b1baf8d 100644
--- a/arch/m68k/mac/config.c
+++ b/arch/m68k/mac/config.c
@@ -148,7 +148,7 @@ static void mac_cache_card_flush(int writeback)
 void __init config_mac(void)
 {
 	if (!MACH_IS_MAC)
-		printk(KERN_ERR "ERROR: no Mac, but config_mac() called!! \n");
+		printk(KERN_ERR "ERROR: no Mac, but config_mac() called!!\n");
 
 	mach_sched_init = mac_sched_init;
 	mach_init_IRQ = mac_init_IRQ;
@@ -867,7 +867,7 @@ static void __init mac_identify(void)
 	 */
 	iop_preinit();
 
-	printk(KERN_INFO "Detected Macintosh model: %d \n", model);
+	printk(KERN_INFO "Detected Macintosh model: %d\n", model);
 
 	/*
 	 * Report booter data:
@@ -878,12 +878,12 @@ static void __init mac_identify(void)
 		mac_bi_data.videoaddr, mac_bi_data.videorow,
 		mac_bi_data.videodepth, mac_bi_data.dimensions & 0xFFFF,
 		mac_bi_data.dimensions >> 16);
-	printk(KERN_DEBUG " Videological 0x%lx phys. 0x%lx, SCC at 0x%lx \n",
+	printk(KERN_DEBUG " Videological 0x%lx phys. 0x%lx, SCC at 0x%lx\n",
 		mac_bi_data.videological, mac_orig_videoaddr,
 		mac_bi_data.sccbase);
-	printk(KERN_DEBUG " Boottime: 0x%lx GMTBias: 0x%lx \n",
+	printk(KERN_DEBUG " Boottime: 0x%lx GMTBias: 0x%lx\n",
 		mac_bi_data.boottime, mac_bi_data.gmtbias);
-	printk(KERN_DEBUG " Machine ID: %ld CPUid: 0x%lx memory size: 0x%lx \n",
+	printk(KERN_DEBUG " Machine ID: %ld CPUid: 0x%lx memory size: 0x%lx\n",
 		mac_bi_data.id, mac_bi_data.cpuid, mac_bi_data.memsize);
 
 	iop_init();
diff --git a/arch/m68k/q40/config.c b/arch/m68k/q40/config.c
index 31ab3f08bbd..ad10fecec2f 100644
--- a/arch/m68k/q40/config.c
+++ b/arch/m68k/q40/config.c
@@ -126,7 +126,7 @@ static void q40_reset(void)
 {
         halted = 1;
         printk("\n\n*******************************************\n"
-		"Called q40_reset : press the RESET button!! \n"
+		"Called q40_reset : press the RESET button!!\n"
 		"*******************************************\n");
 	Q40_LED_ON();
 	while (1)
-- 
cgit v1.2.3


From 9881bbb269e8f287d0e55ae3384e48b26f1872f7 Mon Sep 17 00:00:00 2001
From: Andrea Gelmini <andrea.gelmini@gelma.net>
Date: Sat, 27 Feb 2010 17:51:33 +0100
Subject: m68k: hp300 - Checkpatch cleanup

arch/m68k/hp300/time.h:2: WARNING: space prohibited between function name and open parenthesis '('

Signed-off-by: Andrea Gelmini <andrea.gelmini@gelma.net>
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/hp300/time.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/m68k/hp300/time.h b/arch/m68k/hp300/time.h
index f5b3d098b0f..7b98242960d 100644
--- a/arch/m68k/hp300/time.h
+++ b/arch/m68k/hp300/time.h
@@ -1,4 +1,2 @@
 extern void hp300_sched_init(irq_handler_t vector);
-extern unsigned long hp300_gettimeoffset (void);
-
-
+extern unsigned long hp300_gettimeoffset(void);
-- 
cgit v1.2.3


From b52dd0077cde89111c00efc73a8db07f50ebb3e8 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Sun, 28 Feb 2010 13:06:27 +0100
Subject: m68k: Implement generic_find_next_{zero_,}le_bit()

linux-next:
fs/udf/balloc.c: In function 'udf_bitmap_new_block':
fs/udf/balloc.c:274: error: implicit declaration of function 'generic_find_next_le_bit'

Convert ext2_find_next_{zero_,}bit() into generic_find_next_{zero_,}le_bit(),
and wrap the ext2_find_next_{zero_,}bit() around the latter.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/include/asm/bitops_mm.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/m68k/include/asm/bitops_mm.h b/arch/m68k/include/asm/bitops_mm.h
index 9bde784e7ba..b4ecdaada52 100644
--- a/arch/m68k/include/asm/bitops_mm.h
+++ b/arch/m68k/include/asm/bitops_mm.h
@@ -365,6 +365,10 @@ static inline int minix_test_bit(int nr, const void *vaddr)
 #define ext2_set_bit_atomic(lock, nr, addr)	test_and_set_bit((nr) ^ 24, (unsigned long *)(addr))
 #define ext2_clear_bit(nr, addr)		__test_and_clear_bit((nr) ^ 24, (unsigned long *)(addr))
 #define ext2_clear_bit_atomic(lock, nr, addr)	test_and_clear_bit((nr) ^ 24, (unsigned long *)(addr))
+#define ext2_find_next_zero_bit(addr, size, offset) \
+	generic_find_next_zero_le_bit((unsigned long *)addr, size, offset)
+#define ext2_find_next_bit(addr, size, offset) \
+	generic_find_next_le_bit((unsigned long *)addr, size, offset)
 
 static inline int ext2_test_bit(int nr, const void *vaddr)
 {
@@ -394,10 +398,9 @@ static inline int ext2_find_first_zero_bit(const void *vaddr, unsigned size)
 	return (p - addr) * 32 + res;
 }
 
-static inline int ext2_find_next_zero_bit(const void *vaddr, unsigned size,
-					  unsigned offset)
+static inline unsigned long generic_find_next_zero_le_bit(const unsigned long *addr,
+		unsigned long size, unsigned long offset)
 {
-	const unsigned long *addr = vaddr;
 	const unsigned long *p = addr + (offset >> 5);
 	int bit = offset & 31UL, res;
 
@@ -437,10 +440,9 @@ static inline int ext2_find_first_bit(const void *vaddr, unsigned size)
 	return (p - addr) * 32 + res;
 }
 
-static inline int ext2_find_next_bit(const void *vaddr, unsigned size,
-				     unsigned offset)
+static inline unsigned long generic_find_next_le_bit(const unsigned long *addr,
+		unsigned long size, unsigned long offset)
 {
-	const unsigned long *addr = vaddr;
 	const unsigned long *p = addr + (offset >> 5);
 	int bit = offset & 31UL, res;
 
-- 
cgit v1.2.3


From 8035458fbb567ae138c77a5f710050107c6a7066 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 May 2010 15:51:10 -0300
Subject: perf options: Type check OPT_BOOLEAN and fix the offenders
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-kvm.c        | 4 ++--
 tools/perf/builtin-lock.c       | 3 +--
 tools/perf/builtin-test.c       | 2 +-
 tools/perf/perf.h               | 3 ++-
 tools/perf/util/parse-options.h | 3 ++-
 5 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index a4c7cae4502..b1c6b38567f 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -22,8 +22,8 @@
 static char			*file_name;
 static char			name_buffer[256];
 
-int				perf_host = 1;
-int				perf_guest;
+bool				perf_host = 1;
+bool				perf_guest;
 
 static const char * const kvm_usage[] = {
 	"perf kvm [<options>] {top|record|report|diff|buildid-list}",
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index e18dfdc2948..821c1586a22 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -792,8 +792,7 @@ static void print_result(void)
 	print_bad_events(bad, total);
 }
 
-static int			info_threads;
-static int			info_map;
+static bool info_threads, info_map;
 
 static void dump_threads(void)
 {
diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c
index 0339612e738..035b9fa063a 100644
--- a/tools/perf/builtin-test.c
+++ b/tools/perf/builtin-test.c
@@ -257,7 +257,7 @@ static const char * const test_usage[] = {
 };
 
 static const struct option test_options[] = {
-	OPT_BOOLEAN('v', "verbose", &verbose,
+	OPT_INTEGER('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_END()
 };
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 02821febb70..ef7aa0a0c52 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -80,6 +80,7 @@ void get_term_dimensions(struct winsize *ws);
 
 #include "../../include/linux/perf_event.h"
 #include "util/types.h"
+#include <stdbool.h>
 
 /*
  * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all
@@ -131,6 +132,6 @@ struct ip_callchain {
 	u64 ips[0];
 };
 
-extern int perf_host, perf_guest;
+extern bool perf_host, perf_guest;
 
 #endif
diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h
index 9ca348e1063..5838e2d8bbc 100644
--- a/tools/perf/util/parse-options.h
+++ b/tools/perf/util/parse-options.h
@@ -2,6 +2,7 @@
 #define __PERF_PARSE_OPTIONS_H
 
 #include <linux/kernel.h>
+#include <stdbool.h>
 
 enum parse_opt_type {
 	/* special types */
@@ -101,7 +102,7 @@ struct option {
 #define OPT_ARGUMENT(l, h)          { .type = OPTION_ARGUMENT, .long_name = (l), .help = (h) }
 #define OPT_GROUP(h)                { .type = OPTION_GROUP, .help = (h) }
 #define OPT_BIT(s, l, v, h, b)      { .type = OPTION_BIT, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (b) }
-#define OPT_BOOLEAN(s, l, v, h)     { .type = OPTION_BOOLEAN, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
+#define OPT_BOOLEAN(s, l, v, h)     { .type = OPTION_BOOLEAN, .short_name = (s), .long_name = (l), .value = check_vtype(v, bool *), .help = (h) }
 #define OPT_INCR(s, l, v, h)        { .type = OPTION_INCR, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
 #define OPT_SET_INT(s, l, v, h, i)  { .type = OPTION_SET_INT, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (i) }
 #define OPT_SET_PTR(s, l, v, h, p)  { .type = OPTION_SET_PTR, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (p) }
-- 
cgit v1.2.3


From edb7c60e27c1baff38d82440dc52eaffac9a45f4 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 May 2010 16:22:41 -0300
Subject: perf options: Type check all the remaining OPT_ variants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OPT_SET_INT was renamed to OPT_SET_UINT since the only use in these
tools is to set something that has an enum type, that is builtin
compatible with unsigned int.

Several string constifications were done to make OPT_STRING require a
const char * type.

Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/builtin-bench.c      |  4 ++--
 tools/perf/builtin-help.c       |  6 +++---
 tools/perf/builtin-kvm.c        |  2 +-
 tools/perf/builtin-report.c     |  4 ++--
 tools/perf/builtin-sched.c      |  2 +-
 tools/perf/builtin-top.c        |  2 +-
 tools/perf/util/hist.c          |  2 +-
 tools/perf/util/parse-options.c | 10 +++++-----
 tools/perf/util/parse-options.h | 16 ++++++++--------
 tools/perf/util/sort.c          |  8 ++++----
 tools/perf/util/sort.h          |  8 ++++----
 tools/perf/util/symbol.h        |  2 +-
 12 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
index 46996774e55..fcb96269852 100644
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -95,7 +95,7 @@ static void dump_suites(int subsys_index)
 	return;
 }
 
-static char *bench_format_str;
+static const char *bench_format_str;
 int bench_format = BENCH_FORMAT_DEFAULT;
 
 static const struct option bench_options[] = {
@@ -126,7 +126,7 @@ static void print_usage(void)
 	printf("\n");
 }
 
-static int bench_str2int(char *str)
+static int bench_str2int(const char *str)
 {
 	if (!str)
 		return BENCH_FORMAT_DEFAULT;
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index 81e3ecc40fc..6d5a8a7faf4 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -33,10 +33,10 @@ static bool show_all = false;
 static enum help_format help_format = HELP_FORMAT_MAN;
 static struct option builtin_help_options[] = {
 	OPT_BOOLEAN('a', "all", &show_all, "print all available commands"),
-	OPT_SET_INT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN),
-	OPT_SET_INT('w', "web", &help_format, "show manual in web browser",
+	OPT_SET_UINT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN),
+	OPT_SET_UINT('w', "web", &help_format, "show manual in web browser",
 			HELP_FORMAT_WEB),
-	OPT_SET_INT('i', "info", &help_format, "show info page",
+	OPT_SET_UINT('i', "info", &help_format, "show info page",
 			HELP_FORMAT_INFO),
 	OPT_END(),
 };
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index b1c6b38567f..34d1e853829 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -19,7 +19,7 @@
 #include <pthread.h>
 #include <math.h>
 
-static char			*file_name;
+static const char		*file_name;
 static char			name_buffer[256];
 
 bool				perf_host = 1;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 68265120ee0..1d3c1003b43 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -39,8 +39,8 @@ static bool		dont_use_callchains;
 static bool		show_threads;
 static struct perf_read_values	show_threads_values;
 
-static char		default_pretty_printing_style[] = "normal";
-static char		*pretty_printing_style = default_pretty_printing_style;
+static const char	default_pretty_printing_style[] = "normal";
+static const char	*pretty_printing_style = default_pretty_printing_style;
 
 static char		callchain_default_opt[] = "fractal,0.5";
 
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index c80acdf927a..f67bce2a83b 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -22,7 +22,7 @@
 static char			const *input_name = "perf.data";
 
 static char			default_sort_order[] = "avg, max, switch, runtime";
-static char			*sort_order = default_sort_order;
+static const char		*sort_order = default_sort_order;
 
 static int			profile_cpu = -1;
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 9f0cfa0108a..397290a0a76 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -96,7 +96,7 @@ struct source_line {
 	struct source_line	*next;
 };
 
-static char			*sym_filter			=   NULL;
+static const char		*sym_filter			=   NULL;
 struct sym_entry		*sym_filter_entry		=   NULL;
 struct sym_entry		*sym_filter_entry_sched		=   NULL;
 static int			sym_pcnt_filter			=      5;
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index f75c5f62401..9a71c94f057 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -662,7 +662,7 @@ size_t hists__fprintf(struct hists *self, struct hists *pair,
 	long displacement = 0;
 	unsigned int width;
 	const char *sep = symbol_conf.field_sep;
-	char *col_width = symbol_conf.col_width_list_str;
+	const char *col_width = symbol_conf.col_width_list_str;
 
 	init_rem_hits();
 
diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c
index 36d955e4942..99d02aa57db 100644
--- a/tools/perf/util/parse-options.c
+++ b/tools/perf/util/parse-options.c
@@ -51,7 +51,7 @@ static int get_value(struct parse_opt_ctx_t *p,
 		case OPTION_BOOLEAN:
 		case OPTION_INCR:
 		case OPTION_BIT:
-		case OPTION_SET_INT:
+		case OPTION_SET_UINT:
 		case OPTION_SET_PTR:
 			return opterror(opt, "takes no value", flags);
 		case OPTION_END:
@@ -83,8 +83,8 @@ static int get_value(struct parse_opt_ctx_t *p,
 		*(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
 		return 0;
 
-	case OPTION_SET_INT:
-		*(int *)opt->value = unset ? 0 : opt->defval;
+	case OPTION_SET_UINT:
+		*(unsigned int *)opt->value = unset ? 0 : opt->defval;
 		return 0;
 
 	case OPTION_SET_PTR:
@@ -515,13 +515,13 @@ int usage_with_options_internal(const char * const *usagestr,
 					pos += fprintf(stderr, " ...");
 			}
 			break;
-		default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */
+		default: /* OPTION_{BIT,BOOLEAN,SET_UINT,SET_PTR} */
 		case OPTION_END:
 		case OPTION_GROUP:
 		case OPTION_BIT:
 		case OPTION_BOOLEAN:
 		case OPTION_INCR:
-		case OPTION_SET_INT:
+		case OPTION_SET_UINT:
 		case OPTION_SET_PTR:
 			break;
 		}
diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h
index 5838e2d8bbc..c7d72dce54b 100644
--- a/tools/perf/util/parse-options.h
+++ b/tools/perf/util/parse-options.h
@@ -13,7 +13,7 @@ enum parse_opt_type {
 	OPTION_BIT,
 	OPTION_BOOLEAN,
 	OPTION_INCR,
-	OPTION_SET_INT,
+	OPTION_SET_UINT,
 	OPTION_SET_PTR,
 	/* options with arguments (usually) */
 	OPTION_STRING,
@@ -79,7 +79,7 @@ typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
  *
  * `defval`::
  *   default value to fill (*->value) with for PARSE_OPT_OPTARG.
- *   OPTION_{BIT,SET_INT,SET_PTR} store the {mask,integer,pointer} to put in
+ *   OPTION_{BIT,SET_UINT,SET_PTR} store the {mask,integer,pointer} to put in
  *   the value when met.
  *   CALLBACKS can use it like they want.
  */
@@ -101,16 +101,16 @@ struct option {
 #define OPT_END()                   { .type = OPTION_END }
 #define OPT_ARGUMENT(l, h)          { .type = OPTION_ARGUMENT, .long_name = (l), .help = (h) }
 #define OPT_GROUP(h)                { .type = OPTION_GROUP, .help = (h) }
-#define OPT_BIT(s, l, v, h, b)      { .type = OPTION_BIT, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (b) }
+#define OPT_BIT(s, l, v, h, b)      { .type = OPTION_BIT, .short_name = (s), .long_name = (l), .value = check_vtype(v, int *), .help = (h), .defval = (b) }
 #define OPT_BOOLEAN(s, l, v, h)     { .type = OPTION_BOOLEAN, .short_name = (s), .long_name = (l), .value = check_vtype(v, bool *), .help = (h) }
-#define OPT_INCR(s, l, v, h)        { .type = OPTION_INCR, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
-#define OPT_SET_INT(s, l, v, h, i)  { .type = OPTION_SET_INT, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (i) }
+#define OPT_INCR(s, l, v, h)        { .type = OPTION_INCR, .short_name = (s), .long_name = (l), .value = check_vtype(v, int *), .help = (h) }
+#define OPT_SET_UINT(s, l, v, h, i)  { .type = OPTION_SET_UINT, .short_name = (s), .long_name = (l), .value = check_vtype(v, unsigned int *), .help = (h), .defval = (i) }
 #define OPT_SET_PTR(s, l, v, h, p)  { .type = OPTION_SET_PTR, .short_name = (s), .long_name = (l), .value = (v), .help = (h), .defval = (p) }
 #define OPT_INTEGER(s, l, v, h)     { .type = OPTION_INTEGER, .short_name = (s), .long_name = (l), .value = check_vtype(v, int *), .help = (h) }
 #define OPT_UINTEGER(s, l, v, h)    { .type = OPTION_UINTEGER, .short_name = (s), .long_name = (l), .value = check_vtype(v, unsigned int *), .help = (h) }
-#define OPT_LONG(s, l, v, h)        { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
-#define OPT_U64(s, l, v, h)         { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = (v), .help = (h) }
-#define OPT_STRING(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h) }
+#define OPT_LONG(s, l, v, h)        { .type = OPTION_LONG, .short_name = (s), .long_name = (l), .value = check_vtype(v, long *), .help = (h) }
+#define OPT_U64(s, l, v, h)         { .type = OPTION_U64, .short_name = (s), .long_name = (l), .value = check_vtype(v, u64 *), .help = (h) }
+#define OPT_STRING(s, l, v, a, h)   { .type = OPTION_STRING,  .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h) }
 #define OPT_DATE(s, l, v, h) \
 	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb }
 #define OPT_CALLBACK(s, l, v, a, h, f) \
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index da30b305fba..2316cb5a411 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -1,10 +1,10 @@
 #include "sort.h"
 
 regex_t		parent_regex;
-char		default_parent_pattern[] = "^sys_|^do_page_fault";
-char		*parent_pattern = default_parent_pattern;
-char		default_sort_order[] = "comm,dso,symbol";
-char		*sort_order = default_sort_order;
+const char	default_parent_pattern[] = "^sys_|^do_page_fault";
+const char	*parent_pattern = default_parent_pattern;
+const char	default_sort_order[] = "comm,dso,symbol";
+const char	*sort_order = default_sort_order;
 int		sort__need_collapse = 0;
 int		sort__has_parent = 0;
 
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index eab2e0b3b74..0d61c4082f4 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -25,10 +25,10 @@
 #include "sort.h"
 
 extern regex_t parent_regex;
-extern char *sort_order;
-extern char default_parent_pattern[];
-extern char *parent_pattern;
-extern char default_sort_order[];
+extern const char *sort_order;
+extern const char default_parent_pattern[];
+extern const char *parent_pattern;
+extern const char default_sort_order[];
 extern int sort__need_collapse;
 extern int sort__has_parent;
 extern char *field_sep;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 6389d1acaf8..032469e4187 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -78,7 +78,7 @@ struct symbol_conf {
 			*default_guest_kallsyms,
 			*default_guest_modules;
 	const char	*guestmount;
-	char		*dso_list_str,
+	const char	*dso_list_str,
 			*comm_list_str,
 			*sym_list_str,
 			*col_width_list_str;
-- 
cgit v1.2.3


From f3c7f317c91e45aac0ef9d0b6474cd4637de69f0 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Wed, 14 Apr 2010 18:48:50 +0200
Subject: serial167: Kill unused variables

commits 638157bc1461f6718eeca06bedd9a09cf1f35c36 ("serial167: prepare to push
BKL down into drivers") and 4165fe4ef7305609a96c7f248cefb9c414d0ede5 ("tty:
Fix up char drivers request_room usage") removed code without removing the
corresponding variables:

| drivers/char/serial167.c: In function 'cd2401_rx_interrupt':
| drivers/char/serial167.c:630: warning: unused variable 'len'
| drivers/char/serial167.c: In function 'cy_ioctl':
| drivers/char/serial167.c:1531: warning: unused variable 'val'

Remove the variables to kill the warnings.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 drivers/char/serial167.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/char/serial167.c b/drivers/char/serial167.c
index 8dfd24721a8..78a62ebe75c 100644
--- a/drivers/char/serial167.c
+++ b/drivers/char/serial167.c
@@ -627,7 +627,6 @@ static irqreturn_t cd2401_rx_interrupt(int irq, void *dev_id)
 	char data;
 	int char_count;
 	int save_cnt;
-	int len;
 
 	/* determine the channel and change to that context */
 	channel = (u_short) (base_addr[CyLICR] >> 2);
@@ -1528,7 +1527,6 @@ static int
 cy_ioctl(struct tty_struct *tty, struct file *file,
 	 unsigned int cmd, unsigned long arg)
 {
-	unsigned long val;
 	struct cyclades_port *info = tty->driver_data;
 	int ret_val = 0;
 	void __user *argp = (void __user *)arg;
-- 
cgit v1.2.3


From adbf6e6952e80ae42a403442dcae21438cae94b3 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Fri, 23 Apr 2010 02:06:20 +1000
Subject: m68k: invoke oom-killer from page fault

As explained in commit 1c0fe6e3bd, we want to call the architecture independent
oom killer when getting an unexplained OOM from handle_mm_fault, rather than
simply killing current.

Cc: linux-m68k@lists.linux-m68k.org
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: linux-arch@vger.kernel.org
Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: David Rientjes <rientjes@google.com>
[Geert] Kill 2 introduced compiler warnings
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/mm/fault.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index d0e35cf99fc..a96394a0333 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -154,7 +154,6 @@ good_area:
 	 * the fault.
 	 */
 
- survive:
 	fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
 #ifdef DEBUG
 	printk("handle_mm_fault returns %d\n",fault);
@@ -180,15 +179,10 @@ good_area:
  */
 out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (is_global_init(current)) {
-		yield();
-		down_read(&mm->mmap_sem);
-		goto survive;
-	}
-
-	printk("VM: killing process %s\n", current->comm);
-	if (user_mode(regs))
-		do_group_exit(SIGKILL);
+	if (!user_mode(regs))
+		goto no_context;
+	pagefault_out_of_memory();
+	return 0;
 
 no_context:
 	current->thread.signo = SIGBUS;
-- 
cgit v1.2.3


From 0b7f1a7efb38b551f5948a13d0b36e876ba536db Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Wed, 28 Jan 2009 21:01:02 +0100
Subject: platform: Make platform resource input parameters const

Make the platform resource input parameters of platform_device_add_resources()
and platform_device_register_simple() const, as the resources are copied and
never modified.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/platform.c         | 4 ++--
 include/linux/platform_device.h | 6 ++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 4b4b565c835..c5fbe198fbd 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -187,7 +187,7 @@ EXPORT_SYMBOL_GPL(platform_device_alloc);
  * released.
  */
 int platform_device_add_resources(struct platform_device *pdev,
-				  struct resource *res, unsigned int num)
+				  const struct resource *res, unsigned int num)
 {
 	struct resource *r;
 
@@ -367,7 +367,7 @@ EXPORT_SYMBOL_GPL(platform_device_unregister);
  */
 struct platform_device *platform_device_register_simple(const char *name,
 							int id,
-							struct resource *res,
+							const struct resource *res,
 							unsigned int num)
 {
 	struct platform_device *pdev;
diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h
index 212da17d06a..5417944d368 100644
--- a/include/linux/platform_device.h
+++ b/include/linux/platform_device.h
@@ -44,12 +44,14 @@ extern int platform_get_irq_byname(struct platform_device *, const char *);
 extern int platform_add_devices(struct platform_device **, int);
 
 extern struct platform_device *platform_device_register_simple(const char *, int id,
-					struct resource *, unsigned int);
+					const struct resource *, unsigned int);
 extern struct platform_device *platform_device_register_data(struct device *,
 		const char *, int, const void *, size_t);
 
 extern struct platform_device *platform_device_alloc(const char *name, int id);
-extern int platform_device_add_resources(struct platform_device *pdev, struct resource *res, unsigned int num);
+extern int platform_device_add_resources(struct platform_device *pdev,
+					 const struct resource *res,
+					 unsigned int num);
 extern int platform_device_add_data(struct platform_device *pdev, const void *data, size_t size);
 extern int platform_device_add(struct platform_device *pdev);
 extern void platform_device_del(struct platform_device *pdev);
-- 
cgit v1.2.3


From bf54a2b3c0dbf76136f00ff785bf6d8f6291311d Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Tue, 18 Nov 2008 21:13:53 +0100
Subject: m68k: amiga - Zorro bus modalias support

Add Amiga Zorro bus modalias and uevent support

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 drivers/net/a2065.c             |  1 +
 drivers/net/ariadne.c           |  1 +
 drivers/net/hydra.c             |  1 +
 drivers/net/zorro8390.c         |  1 +
 drivers/scsi/zorro7xx.c         |  1 +
 drivers/video/cirrusfb.c        |  1 +
 drivers/video/fm2fb.c           |  1 +
 drivers/zorro/zorro-driver.c    | 24 ++++++++++++++++++++++++
 drivers/zorro/zorro-sysfs.c     | 11 +++++++++++
 include/linux/mod_devicetable.h |  9 +++++++++
 include/linux/zorro.h           | 13 +------------
 scripts/mod/file2alias.c        | 14 ++++++++++++++
 12 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/drivers/net/a2065.c b/drivers/net/a2065.c
index ed5e9742be2..a8f0512bad3 100644
--- a/drivers/net/a2065.c
+++ b/drivers/net/a2065.c
@@ -674,6 +674,7 @@ static struct zorro_device_id a2065_zorro_tbl[] __devinitdata = {
 	{ ZORRO_PROD_AMERISTAR_A2065 },
 	{ 0 }
 };
+MODULE_DEVICE_TABLE(zorro, a2065_zorro_tbl);
 
 static struct zorro_driver a2065_driver = {
 	.name		= "a2065",
diff --git a/drivers/net/ariadne.c b/drivers/net/ariadne.c
index fa1a2354f5f..4b30a46486e 100644
--- a/drivers/net/ariadne.c
+++ b/drivers/net/ariadne.c
@@ -145,6 +145,7 @@ static struct zorro_device_id ariadne_zorro_tbl[] __devinitdata = {
     { ZORRO_PROD_VILLAGE_TRONIC_ARIADNE },
     { 0 }
 };
+MODULE_DEVICE_TABLE(zorro, ariadne_zorro_tbl);
 
 static struct zorro_driver ariadne_driver = {
     .name	= "ariadne",
diff --git a/drivers/net/hydra.c b/drivers/net/hydra.c
index 24724b4ad70..07d8e5b634f 100644
--- a/drivers/net/hydra.c
+++ b/drivers/net/hydra.c
@@ -71,6 +71,7 @@ static struct zorro_device_id hydra_zorro_tbl[] __devinitdata = {
     { ZORRO_PROD_HYDRA_SYSTEMS_AMIGANET },
     { 0 }
 };
+MODULE_DEVICE_TABLE(zorro, hydra_zorro_tbl);
 
 static struct zorro_driver hydra_driver = {
     .name	= "hydra",
diff --git a/drivers/net/zorro8390.c b/drivers/net/zorro8390.c
index 81c753a617a..9548cbb5012 100644
--- a/drivers/net/zorro8390.c
+++ b/drivers/net/zorro8390.c
@@ -102,6 +102,7 @@ static struct zorro_device_id zorro8390_zorro_tbl[] __devinitdata = {
     { ZORRO_PROD_INDIVIDUAL_COMPUTERS_X_SURF, },
     { 0 }
 };
+MODULE_DEVICE_TABLE(zorro, zorro8390_zorro_tbl);
 
 static struct zorro_driver zorro8390_driver = {
     .name	= "zorro8390",
diff --git a/drivers/scsi/zorro7xx.c b/drivers/scsi/zorro7xx.c
index 105449c15fa..e17764d7147 100644
--- a/drivers/scsi/zorro7xx.c
+++ b/drivers/scsi/zorro7xx.c
@@ -69,6 +69,7 @@ static struct zorro_device_id zorro7xx_zorro_tbl[] __devinitdata = {
 	},
 	{ 0 }
 };
+MODULE_DEVICE_TABLE(zorro, zorro7xx_zorro_tbl);
 
 static int __devinit zorro7xx_init_one(struct zorro_dev *z,
 				       const struct zorro_device_id *ent)
diff --git a/drivers/video/cirrusfb.c b/drivers/video/cirrusfb.c
index 8d8dfda2f86..6df7c54db0a 100644
--- a/drivers/video/cirrusfb.c
+++ b/drivers/video/cirrusfb.c
@@ -299,6 +299,7 @@ static const struct zorro_device_id cirrusfb_zorro_table[] = {
 	},
 	{ 0 }
 };
+MODULE_DEVICE_TABLE(zorro, cirrusfb_zorro_table);
 
 static const struct {
 	zorro_id id2;
diff --git a/drivers/video/fm2fb.c b/drivers/video/fm2fb.c
index 6c91c61cdb6..1b0feb8e724 100644
--- a/drivers/video/fm2fb.c
+++ b/drivers/video/fm2fb.c
@@ -219,6 +219,7 @@ static struct zorro_device_id fm2fb_devices[] __devinitdata = {
 	{ ZORRO_PROD_HELFRICH_RAINBOW_II },
 	{ 0 }
 };
+MODULE_DEVICE_TABLE(zorro, fm2fb_devices);
 
 static struct zorro_driver fm2fb_driver = {
 	.name		= "fm2fb",
diff --git a/drivers/zorro/zorro-driver.c b/drivers/zorro/zorro-driver.c
index 53180a37cc9..7ee2b6e7178 100644
--- a/drivers/zorro/zorro-driver.c
+++ b/drivers/zorro/zorro-driver.c
@@ -137,10 +137,34 @@ static int zorro_bus_match(struct device *dev, struct device_driver *drv)
 	return 0;
 }
 
+static int zorro_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+#ifdef CONFIG_HOTPLUG
+	struct zorro_dev *z;
+
+	if (!dev)
+		return -ENODEV;
+
+	z = to_zorro_dev(dev);
+	if (!z)
+		return -ENODEV;
+
+	if (add_uevent_var(env, "ZORRO_ID=%08X", z->id) ||
+	    add_uevent_var(env, "ZORRO_SLOT_NAME=%s", dev_name(dev)) ||
+	    add_uevent_var(env, "ZORRO_SLOT_ADDR=%04X", z->slotaddr) ||
+	    add_uevent_var(env, "MODALIAS=" ZORRO_DEVICE_MODALIAS_FMT, z->id))
+		return -ENOMEM;
+
+	return 0;
+#else /* !CONFIG_HOTPLUG */
+	return -ENODEV;
+#endif /* !CONFIG_HOTPLUG */
+}
 
 struct bus_type zorro_bus_type = {
 	.name	= "zorro",
 	.match	= zorro_bus_match,
+	.uevent	= zorro_uevent,
 	.probe	= zorro_device_probe,
 	.remove	= zorro_device_remove,
 };
diff --git a/drivers/zorro/zorro-sysfs.c b/drivers/zorro/zorro-sysfs.c
index 1d2a772ea14..eb924e0a64c 100644
--- a/drivers/zorro/zorro-sysfs.c
+++ b/drivers/zorro/zorro-sysfs.c
@@ -77,6 +77,16 @@ static struct bin_attribute zorro_config_attr = {
 	.read = zorro_read_config,
 };
 
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct zorro_dev *z = to_zorro_dev(dev);
+
+	return sprintf(buf, ZORRO_DEVICE_MODALIAS_FMT "\n", z->id);
+}
+
+static DEVICE_ATTR(modalias, S_IRUGO, modalias_show, NULL);
+
 int zorro_create_sysfs_dev_files(struct zorro_dev *z)
 {
 	struct device *dev = &z->dev;
@@ -89,6 +99,7 @@ int zorro_create_sysfs_dev_files(struct zorro_dev *z)
 	    (error = device_create_file(dev, &dev_attr_slotaddr)) ||
 	    (error = device_create_file(dev, &dev_attr_slotsize)) ||
 	    (error = device_create_file(dev, &dev_attr_resource)) ||
+	    (error = device_create_file(dev, &dev_attr_modalias)) ||
 	    (error = sysfs_create_bin_file(&dev->kobj, &zorro_config_attr)))
 		return error;
 
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index f58e9d836f3..56fde4364e4 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -474,4 +474,13 @@ struct platform_device_id {
 			__attribute__((aligned(sizeof(kernel_ulong_t))));
 };
 
+struct zorro_device_id {
+	__u32 id;			/* Device ID or ZORRO_WILDCARD */
+	kernel_ulong_t driver_data;	/* Data private to the driver */
+};
+
+#define ZORRO_WILDCARD			(0xffffffff)	/* not official */
+
+#define ZORRO_DEVICE_MODALIAS_FMT	"zorro:i%08X"
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/include/linux/zorro.h b/include/linux/zorro.h
index 913bfc226dd..908db1b36d6 100644
--- a/include/linux/zorro.h
+++ b/include/linux/zorro.h
@@ -38,8 +38,6 @@
 typedef __u32 zorro_id;
 
 
-#define ZORRO_WILDCARD		(0xffffffff)	/* not official */
-
 /* Include the ID list */
 #include <linux/zorro_ids.h>
 
@@ -116,6 +114,7 @@ struct ConfigDev {
 
 #include <linux/init.h>
 #include <linux/ioport.h>
+#include <linux/mod_devicetable.h>
 
 #include <asm/zorro.h>
 
@@ -154,16 +153,6 @@ extern struct zorro_bus zorro_bus;	/* single Zorro bus */
 extern struct bus_type zorro_bus_type;
 
 
-    /*
-     *  Zorro device IDs
-     */
-
-struct zorro_device_id {
-	zorro_id id;			/* Device ID or ZORRO_WILDCARD */
-	unsigned long driver_data;	/* Data private to the driver */
-};
-
-
     /*
      *  Zorro device drivers
      */
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 220213e603d..df90f31d14b 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -796,6 +796,16 @@ static int do_platform_entry(const char *filename,
 	return 1;
 }
 
+/* Looks like: zorro:iN. */
+static int do_zorro_entry(const char *filename, struct zorro_device_id *id,
+			  char *alias)
+{
+	id->id = TO_NATIVE(id->id);
+	strcpy(alias, "zorro:");
+	ADD(alias, "i", id->id != ZORRO_WILDCARD, id->id);
+	return 1;
+}
+
 /* Ignore any prefix, eg. some architectures prepend _ */
 static inline int sym_is(const char *symbol, const char *name)
 {
@@ -943,6 +953,10 @@ void handle_moddevtable(struct module *mod, struct elf_info *info,
 		do_table(symval, sym->st_size,
 			 sizeof(struct platform_device_id), "platform",
 			 do_platform_entry, mod);
+	else if (sym_is(symname, "__mod_zorro_device_table"))
+		do_table(symval, sym->st_size,
+			 sizeof(struct zorro_device_id), "zorro",
+			 do_zorro_entry, mod);
 	free(zeros);
 }
 
-- 
cgit v1.2.3


From 0d305464aefff342c85b4db8b3d7a4345246e5a1 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Sun, 5 Apr 2009 12:40:41 +0200
Subject: m68k: amiga - Zorro host bridge platform device conversion

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/amiga/Makefile   |   2 +-
 arch/m68k/amiga/platform.c |  58 +++++++++++
 drivers/zorro/proc.c       |   6 +-
 drivers/zorro/zorro.c      | 243 ++++++++++++++++++++++++---------------------
 include/linux/zorro.h      |   9 --
 5 files changed, 190 insertions(+), 128 deletions(-)
 create mode 100644 arch/m68k/amiga/platform.c

diff --git a/arch/m68k/amiga/Makefile b/arch/m68k/amiga/Makefile
index 6a0d7650f98..11dd30b16b3 100644
--- a/arch/m68k/amiga/Makefile
+++ b/arch/m68k/amiga/Makefile
@@ -2,6 +2,6 @@
 # Makefile for Linux arch/m68k/amiga source directory
 #
 
-obj-y		:= config.o amiints.o cia.o chipram.o amisound.o
+obj-y		:= config.o amiints.o cia.o chipram.o amisound.o platform.o
 
 obj-$(CONFIG_AMIGA_PCMCIA)	+= pcmcia.o
diff --git a/arch/m68k/amiga/platform.c b/arch/m68k/amiga/platform.c
new file mode 100644
index 00000000000..33a7669b441
--- /dev/null
+++ b/arch/m68k/amiga/platform.c
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (C) 2007-2009 Geert Uytterhoeven
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/zorro.h>
+
+#include <asm/amigahw.h>
+
+
+#ifdef CONFIG_ZORRO
+
+static const struct resource zorro_resources[] __initconst = {
+	/* Zorro II regions (on Zorro II/III) */
+	{
+		.name	= "Zorro II exp",
+		.start	= 0x00e80000,
+		.end	= 0x00efffff,
+		.flags	= IORESOURCE_MEM,
+	}, {
+		.name	= "Zorro II mem",
+		.start	= 0x00200000,
+		.end	= 0x009fffff,
+		.flags	= IORESOURCE_MEM,
+	},
+	/* Zorro III regions (on Zorro III only) */
+	{
+		.name	= "Zorro III exp",
+		.start	= 0xff000000,
+		.end	= 0xffffffff,
+		.flags	= IORESOURCE_MEM,
+	}, {
+		.name	= "Zorro III cfg",
+		.start	= 0x40000000,
+		.end	= 0x7fffffff,
+		.flags	= IORESOURCE_MEM,
+	}
+};
+
+
+static int __init amiga_init_bus(void)
+{
+	if (!MACH_IS_AMIGA || !AMIGAHW_PRESENT(ZORRO))
+		return -ENODEV;
+
+	platform_device_register_simple("amiga-zorro", -1, zorro_resources,
+					AMIGAHW_PRESENT(ZORRO3) ? 4 : 2);
+	return 0;
+}
+
+subsys_initcall(amiga_init_bus);
+
+#endif /* CONFIG_ZORRO */
diff --git a/drivers/zorro/proc.c b/drivers/zorro/proc.c
index d47c47fc048..3c7046d7965 100644
--- a/drivers/zorro/proc.c
+++ b/drivers/zorro/proc.c
@@ -97,7 +97,7 @@ static void zorro_seq_stop(struct seq_file *m, void *v)
 
 static int zorro_seq_show(struct seq_file *m, void *v)
 {
-	u_int slot = *(loff_t *)v;
+	unsigned int slot = *(loff_t *)v;
 	struct zorro_dev *z = &zorro_autocon[slot];
 
 	seq_printf(m, "%02x\t%08x\t%08lx\t%08lx\t%02x\n", slot, z->id,
@@ -129,7 +129,7 @@ static const struct file_operations zorro_devices_proc_fops = {
 
 static struct proc_dir_entry *proc_bus_zorro_dir;
 
-static int __init zorro_proc_attach_device(u_int slot)
+static int __init zorro_proc_attach_device(unsigned int slot)
 {
 	struct proc_dir_entry *entry;
 	char name[4];
@@ -146,7 +146,7 @@ static int __init zorro_proc_attach_device(u_int slot)
 
 static int __init zorro_proc_init(void)
 {
-	u_int slot;
+	unsigned int slot;
 
 	if (MACH_IS_AMIGA && AMIGAHW_PRESENT(ZORRO)) {
 		proc_bus_zorro_dir = proc_mkdir("bus/zorro", NULL);
diff --git a/drivers/zorro/zorro.c b/drivers/zorro/zorro.c
index d45fb34e2d2..6455f3a244c 100644
--- a/drivers/zorro/zorro.c
+++ b/drivers/zorro/zorro.c
@@ -15,6 +15,8 @@
 #include <linux/zorro.h>
 #include <linux/bitops.h>
 #include <linux/string.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
 
 #include <asm/setup.h>
 #include <asm/amigahw.h>
@@ -26,24 +28,17 @@
      *  Zorro Expansion Devices
      */
 
-u_int zorro_num_autocon = 0;
+unsigned int zorro_num_autocon;
 struct zorro_dev zorro_autocon[ZORRO_NUM_AUTO];
 
 
     /*
-     *  Single Zorro bus
+     *  Zorro bus
      */
 
-struct zorro_bus zorro_bus = {\
-    .resources = {
-	/* Zorro II regions (on Zorro II/III) */
-	{ .name = "Zorro II exp", .start = 0x00e80000, .end = 0x00efffff },
-	{ .name = "Zorro II mem", .start = 0x00200000, .end = 0x009fffff },
-	/* Zorro III regions (on Zorro III only) */
-	{ .name = "Zorro III exp", .start = 0xff000000, .end = 0xffffffff },
-	{ .name = "Zorro III cfg", .start = 0x40000000, .end = 0x7fffffff }
-    },
-    .name = "Zorro bus"
+struct zorro_bus {
+	struct list_head devices;	/* list of devices on this bus */
+	struct device dev;
 };
 
 
@@ -53,18 +48,19 @@ struct zorro_bus zorro_bus = {\
 
 struct zorro_dev *zorro_find_device(zorro_id id, struct zorro_dev *from)
 {
-    struct zorro_dev *z;
+	struct zorro_dev *z;
 
-    if (!MACH_IS_AMIGA || !AMIGAHW_PRESENT(ZORRO))
-	return NULL;
+	if (!zorro_num_autocon)
+		return NULL;
 
-    for (z = from ? from+1 : &zorro_autocon[0];
-	 z < zorro_autocon+zorro_num_autocon;
-	 z++)
-	if (id == ZORRO_WILDCARD || id == z->id)
-	    return z;
-    return NULL;
+	for (z = from ? from+1 : &zorro_autocon[0];
+	     z < zorro_autocon+zorro_num_autocon;
+	     z++)
+		if (id == ZORRO_WILDCARD || id == z->id)
+			return z;
+	return NULL;
 }
+EXPORT_SYMBOL(zorro_find_device);
 
 
     /*
@@ -83,121 +79,138 @@ struct zorro_dev *zorro_find_device(zorro_id id, struct zorro_dev *from)
      */
 
 DECLARE_BITMAP(zorro_unused_z2ram, 128);
+EXPORT_SYMBOL(zorro_unused_z2ram);
 
 
 static void __init mark_region(unsigned long start, unsigned long end,
 			       int flag)
 {
-    if (flag)
-	start += Z2RAM_CHUNKMASK;
-    else
-	end += Z2RAM_CHUNKMASK;
-    start &= ~Z2RAM_CHUNKMASK;
-    end &= ~Z2RAM_CHUNKMASK;
-
-    if (end <= Z2RAM_START || start >= Z2RAM_END)
-	return;
-    start = start < Z2RAM_START ? 0x00000000 : start-Z2RAM_START;
-    end = end > Z2RAM_END ? Z2RAM_SIZE : end-Z2RAM_START;
-    while (start < end) {
-	u32 chunk = start>>Z2RAM_CHUNKSHIFT;
 	if (flag)
-	    set_bit(chunk, zorro_unused_z2ram);
+		start += Z2RAM_CHUNKMASK;
 	else
-	    clear_bit(chunk, zorro_unused_z2ram);
-	start += Z2RAM_CHUNKSIZE;
-    }
+		end += Z2RAM_CHUNKMASK;
+	start &= ~Z2RAM_CHUNKMASK;
+	end &= ~Z2RAM_CHUNKMASK;
+
+	if (end <= Z2RAM_START || start >= Z2RAM_END)
+		return;
+	start = start < Z2RAM_START ? 0x00000000 : start-Z2RAM_START;
+	end = end > Z2RAM_END ? Z2RAM_SIZE : end-Z2RAM_START;
+	while (start < end) {
+		u32 chunk = start>>Z2RAM_CHUNKSHIFT;
+		if (flag)
+			set_bit(chunk, zorro_unused_z2ram);
+		else
+			clear_bit(chunk, zorro_unused_z2ram);
+		start += Z2RAM_CHUNKSIZE;
+	}
 }
 
 
-static struct resource __init *zorro_find_parent_resource(struct zorro_dev *z)
+static struct resource __init *zorro_find_parent_resource(
+	struct platform_device *bridge, struct zorro_dev *z)
 {
-    int i;
+	int i;
 
-    for (i = 0; i < zorro_bus.num_resources; i++)
-	if (zorro_resource_start(z) >= zorro_bus.resources[i].start &&
-	    zorro_resource_end(z) <= zorro_bus.resources[i].end)
-		return &zorro_bus.resources[i];
-    return &iomem_resource;
+	for (i = 0; i < bridge->num_resources; i++) {
+		struct resource *r = &bridge->resource[i];
+		if (zorro_resource_start(z) >= r->start &&
+		    zorro_resource_end(z) <= r->end)
+			return r;
+	}
+	return &iomem_resource;
 }
 
 
-    /*
-     *  Initialization
-     */
 
-static int __init zorro_init(void)
+static int __init amiga_zorro_probe(struct platform_device *pdev)
 {
-    struct zorro_dev *z;
-    unsigned int i;
-    int error;
-
-    if (!MACH_IS_AMIGA || !AMIGAHW_PRESENT(ZORRO))
-	return 0;
-
-    pr_info("Zorro: Probing AutoConfig expansion devices: %d device%s\n",
-	   zorro_num_autocon, zorro_num_autocon == 1 ? "" : "s");
-
-    /* Initialize the Zorro bus */
-    INIT_LIST_HEAD(&zorro_bus.devices);
-    dev_set_name(&zorro_bus.dev, "zorro");
-    error = device_register(&zorro_bus.dev);
-    if (error) {
-	pr_err("Zorro: Error registering zorro_bus\n");
-	return error;
-    }
-
-    /* Request the resources */
-    zorro_bus.num_resources = AMIGAHW_PRESENT(ZORRO3) ? 4 : 2;
-    for (i = 0; i < zorro_bus.num_resources; i++)
-	request_resource(&iomem_resource, &zorro_bus.resources[i]);
-
-    /* Register all devices */
-    for (i = 0; i < zorro_num_autocon; i++) {
-	z = &zorro_autocon[i];
-	z->id = (z->rom.er_Manufacturer<<16) | (z->rom.er_Product<<8);
-	if (z->id == ZORRO_PROD_GVP_EPC_BASE) {
-	    /* GVP quirk */
-	    unsigned long magic = zorro_resource_start(z)+0x8000;
-	    z->id |= *(u16 *)ZTWO_VADDR(magic) & GVP_PRODMASK;
-	}
-	sprintf(z->name, "Zorro device %08x", z->id);
-	zorro_name_device(z);
-	z->resource.name = z->name;
-	if (request_resource(zorro_find_parent_resource(z), &z->resource))
-	    pr_err("Zorro: Address space collision on device %s %pR\n",
-		   z->name, &z->resource);
-	dev_set_name(&z->dev, "%02x", i);
-	z->dev.parent = &zorro_bus.dev;
-	z->dev.bus = &zorro_bus_type;
-	error = device_register(&z->dev);
+	struct zorro_bus *bus;
+	struct zorro_dev *z;
+	struct resource *r;
+	unsigned int i;
+	int error;
+
+	/* Initialize the Zorro bus */
+	bus = kzalloc(sizeof(*bus), GFP_KERNEL);
+	if (!bus)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&bus->devices);
+	bus->dev.parent = &pdev->dev;
+	dev_set_name(&bus->dev, "zorro");
+	error = device_register(&bus->dev);
 	if (error) {
-	    pr_err("Zorro: Error registering device %s\n", z->name);
-	    continue;
+		pr_err("Zorro: Error registering zorro_bus\n");
+		kfree(bus);
+		return error;
 	}
-	error = zorro_create_sysfs_dev_files(z);
-	if (error)
-	    dev_err(&z->dev, "Error creating sysfs files\n");
-    }
-
-    /* Mark all available Zorro II memory */
-    zorro_for_each_dev(z) {
-	if (z->rom.er_Type & ERTF_MEMLIST)
-	    mark_region(zorro_resource_start(z), zorro_resource_end(z)+1, 1);
-    }
-
-    /* Unmark all used Zorro II memory */
-    for (i = 0; i < m68k_num_memory; i++)
-	if (m68k_memory[i].addr < 16*1024*1024)
-	    mark_region(m68k_memory[i].addr,
-			m68k_memory[i].addr+m68k_memory[i].size, 0);
-
-    return 0;
+	platform_set_drvdata(pdev, bus);
+
+	/* Register all devices */
+	pr_info("Zorro: Probing AutoConfig expansion devices: %u device%s\n",
+		 zorro_num_autocon, zorro_num_autocon == 1 ? "" : "s");
+
+	for (i = 0; i < zorro_num_autocon; i++) {
+		z = &zorro_autocon[i];
+		z->id = (z->rom.er_Manufacturer<<16) | (z->rom.er_Product<<8);
+		if (z->id == ZORRO_PROD_GVP_EPC_BASE) {
+			/* GVP quirk */
+			unsigned long magic = zorro_resource_start(z)+0x8000;
+			z->id |= *(u16 *)ZTWO_VADDR(magic) & GVP_PRODMASK;
+		}
+		sprintf(z->name, "Zorro device %08x", z->id);
+		zorro_name_device(z);
+		z->resource.name = z->name;
+		r = zorro_find_parent_resource(pdev, z);
+		error = request_resource(r, &z->resource);
+		if (error)
+			dev_err(&bus->dev,
+				"Address space collision on device %s %pR\n",
+				z->name, &z->resource);
+		dev_set_name(&z->dev, "%02x", i);
+		z->dev.parent = &bus->dev;
+		z->dev.bus = &zorro_bus_type;
+		error = device_register(&z->dev);
+		if (error) {
+			dev_err(&bus->dev, "Error registering device %s\n",
+				z->name);
+			continue;
+		}
+		error = zorro_create_sysfs_dev_files(z);
+		if (error)
+			dev_err(&z->dev, "Error creating sysfs files\n");
+	}
+
+	/* Mark all available Zorro II memory */
+	zorro_for_each_dev(z) {
+		if (z->rom.er_Type & ERTF_MEMLIST)
+			mark_region(zorro_resource_start(z),
+				    zorro_resource_end(z)+1, 1);
+	}
+
+	/* Unmark all used Zorro II memory */
+	for (i = 0; i < m68k_num_memory; i++)
+		if (m68k_memory[i].addr < 16*1024*1024)
+			mark_region(m68k_memory[i].addr,
+				    m68k_memory[i].addr+m68k_memory[i].size,
+				    0);
+
+	return 0;
 }
 
-subsys_initcall(zorro_init);
+static struct platform_driver amiga_zorro_driver = {
+	.driver   = {
+		.name	= "amiga-zorro",
+		.owner	= THIS_MODULE,
+	},
+};
 
-EXPORT_SYMBOL(zorro_find_device);
-EXPORT_SYMBOL(zorro_unused_z2ram);
+static int __init amiga_zorro_init(void)
+{
+	return platform_driver_probe(&amiga_zorro_driver, amiga_zorro_probe);
+}
+
+module_init(amiga_zorro_init);
 
 MODULE_LICENSE("GPL");
diff --git a/include/linux/zorro.h b/include/linux/zorro.h
index 908db1b36d6..7bf9db525e9 100644
--- a/include/linux/zorro.h
+++ b/include/linux/zorro.h
@@ -141,15 +141,6 @@ struct zorro_dev {
      *  Zorro bus
      */
 
-struct zorro_bus {
-    struct list_head devices;		/* list of devices on this bus */
-    unsigned int num_resources;		/* number of resources */
-    struct resource resources[4];	/* address space routed to this bus */
-    struct device dev;
-    char name[10];
-};
-
-extern struct zorro_bus zorro_bus;	/* single Zorro bus */
 extern struct bus_type zorro_bus_type;
 
 
-- 
cgit v1.2.3


From fa6688e1c7e7341fb7d1ca5878a3641762e60dec Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Sun, 5 Apr 2009 12:45:56 +0200
Subject: m68k: amiga - Frame buffer platform device conversion

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/amiga/platform.c | 15 ++++++++++++++
 drivers/video/amifb.c      | 49 +++++++++++++++++++++++++++++-----------------
 2 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/arch/m68k/amiga/platform.c b/arch/m68k/amiga/platform.c
index 33a7669b441..7fa929cf9a3 100644
--- a/arch/m68k/amiga/platform.c
+++ b/arch/m68k/amiga/platform.c
@@ -56,3 +56,18 @@ static int __init amiga_init_bus(void)
 subsys_initcall(amiga_init_bus);
 
 #endif /* CONFIG_ZORRO */
+
+
+static int __init amiga_init_devices(void)
+{
+	if (!MACH_IS_AMIGA)
+		return -ENODEV;
+
+	/* video hardware */
+	if (AMIGAHW_PRESENT(AMI_VIDEO))
+		platform_device_register_simple("amiga-video", -1, NULL, 0);
+
+	return 0;
+}
+
+device_initcall(amiga_init_devices);
diff --git a/drivers/video/amifb.c b/drivers/video/amifb.c
index dca48df9844..e5d6b56d444 100644
--- a/drivers/video/amifb.c
+++ b/drivers/video/amifb.c
@@ -50,8 +50,9 @@
 #include <linux/fb.h>
 #include <linux/init.h>
 #include <linux/ioport.h>
-
+#include <linux/platform_device.h>
 #include <linux/uaccess.h>
+
 #include <asm/system.h>
 #include <asm/irq.h>
 #include <asm/amigahw.h>
@@ -1135,7 +1136,7 @@ static int amifb_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg
 	 * Interface to the low level console driver
 	 */
 
-static void amifb_deinit(void);
+static void amifb_deinit(struct platform_device *pdev);
 
 	/*
 	 * Internal routines
@@ -2246,7 +2247,7 @@ static inline void chipfree(void)
 	 * Initialisation
 	 */
 
-static int __init amifb_init(void)
+static int __init amifb_probe(struct platform_device *pdev)
 {
 	int tag, i, err = 0;
 	u_long chipptr;
@@ -2261,16 +2262,6 @@ static int __init amifb_init(void)
 	}
 	amifb_setup(option);
 #endif
-	if (!MACH_IS_AMIGA || !AMIGAHW_PRESENT(AMI_VIDEO))
-		return -ENODEV;
-
-	/*
-	 * We request all registers starting from bplpt[0]
-	 */
-	if (!request_mem_region(CUSTOM_PHYSADDR+0xe0, 0x120,
-				"amifb [Denise/Lisa]"))
-		return -EBUSY;
-
 	custom.dmacon = DMAF_ALL | DMAF_MASTER;
 
 	switch (amiga_chipset) {
@@ -2377,6 +2368,7 @@ default_chipset:
 	fb_info.fbops = &amifb_ops;
 	fb_info.par = &currentpar;
 	fb_info.flags = FBINFO_DEFAULT;
+	fb_info.device = &pdev->dev;
 
 	if (!fb_find_mode(&fb_info.var, &fb_info, mode_option, ami_modedb,
 			  NUM_TOTAL_MODES, &ami_modedb[defmode], 4)) {
@@ -2451,18 +2443,18 @@ default_chipset:
 	return 0;
 
 amifb_error:
-	amifb_deinit();
+	amifb_deinit(pdev);
 	return err;
 }
 
-static void amifb_deinit(void)
+static void amifb_deinit(struct platform_device *pdev)
 {
 	if (fb_info.cmap.len)
 		fb_dealloc_cmap(&fb_info.cmap);
+	fb_dealloc_cmap(&fb_info.cmap);
 	chipfree();
 	if (videomemory)
 		iounmap((void*)videomemory);
-	release_mem_region(CUSTOM_PHYSADDR+0xe0, 0x120);
 	custom.dmacon = DMAF_ALL | DMAF_MASTER;
 }
 
@@ -3794,14 +3786,35 @@ static void ami_rebuild_copper(void)
 	}
 }
 
-static void __exit amifb_exit(void)
+static int __exit amifb_remove(struct platform_device *pdev)
 {
 	unregister_framebuffer(&fb_info);
-	amifb_deinit();
+	amifb_deinit(pdev);
 	amifb_video_off();
+	return 0;
+}
+
+static struct platform_driver amifb_driver = {
+	.remove = __exit_p(amifb_remove),
+	.driver   = {
+		.name	= "amiga-video",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init amifb_init(void)
+{
+	return platform_driver_probe(&amifb_driver, amifb_probe);
 }
 
 module_init(amifb_init);
+
+static void __exit amifb_exit(void)
+{
+	platform_driver_unregister(&amifb_driver);
+}
+
 module_exit(amifb_exit);
 
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:amiga-video");
-- 
cgit v1.2.3


From ff2db7c5ab78817eb3c5d15dd87f18e9be726f1a Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Sun, 5 Apr 2009 12:59:54 +0200
Subject: m68k: amiga - Sound platform device conversion

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/amiga/platform.c          |  5 ++++
 sound/oss/dmasound/dmasound_paula.c | 51 ++++++++++++++++++++++---------------
 2 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/arch/m68k/amiga/platform.c b/arch/m68k/amiga/platform.c
index 7fa929cf9a3..2e308e7a094 100644
--- a/arch/m68k/amiga/platform.c
+++ b/arch/m68k/amiga/platform.c
@@ -67,6 +67,11 @@ static int __init amiga_init_devices(void)
 	if (AMIGAHW_PRESENT(AMI_VIDEO))
 		platform_device_register_simple("amiga-video", -1, NULL, 0);
 
+
+	/* sound hardware */
+	if (AMIGAHW_PRESENT(AMI_AUDIO))
+		platform_device_register_simple("amiga-audio", -1, NULL, 0);
+
 	return 0;
 }
 
diff --git a/sound/oss/dmasound/dmasound_paula.c b/sound/oss/dmasound/dmasound_paula.c
index bb14e4c67e8..87910e99213 100644
--- a/sound/oss/dmasound/dmasound_paula.c
+++ b/sound/oss/dmasound/dmasound_paula.c
@@ -21,6 +21,7 @@
 #include <linux/ioport.h>
 #include <linux/soundcard.h>
 #include <linux/interrupt.h>
+#include <linux/platform_device.h>
 
 #include <asm/uaccess.h>
 #include <asm/setup.h>
@@ -710,31 +711,41 @@ static MACHINE machAmiga = {
 /*** Config & Setup **********************************************************/
 
 
-static int __init dmasound_paula_init(void)
+static int __init amiga_audio_probe(struct platform_device *pdev)
 {
-	int err;
-
-	if (MACH_IS_AMIGA && AMIGAHW_PRESENT(AMI_AUDIO)) {
-	    if (!request_mem_region(CUSTOM_PHYSADDR+0xa0, 0x40,
-				    "dmasound [Paula]"))
-		return -EBUSY;
-	    dmasound.mach = machAmiga;
-	    dmasound.mach.default_hard = def_hard ;
-	    dmasound.mach.default_soft = def_soft ;
-	    err = dmasound_init();
-	    if (err)
-		release_mem_region(CUSTOM_PHYSADDR+0xa0, 0x40);
-	    return err;
-	} else
-	    return -ENODEV;
+	dmasound.mach = machAmiga;
+	dmasound.mach.default_hard = def_hard ;
+	dmasound.mach.default_soft = def_soft ;
+	return dmasound_init();
 }
 
-static void __exit dmasound_paula_cleanup(void)
+static int __exit amiga_audio_remove(struct platform_device *pdev)
 {
 	dmasound_deinit();
-	release_mem_region(CUSTOM_PHYSADDR+0xa0, 0x40);
+	return 0;
+}
+
+static struct platform_driver amiga_audio_driver = {
+	.remove = __exit_p(amiga_audio_remove),
+	.driver   = {
+		.name	= "amiga-audio",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init amiga_audio_init(void)
+{
+	return platform_driver_probe(&amiga_audio_driver, amiga_audio_probe);
 }
 
-module_init(dmasound_paula_init);
-module_exit(dmasound_paula_cleanup);
+module_init(amiga_audio_init);
+
+static void __exit amiga_audio_exit(void)
+{
+	platform_driver_unregister(&amiga_audio_driver);
+}
+
+module_exit(amiga_audio_exit);
+
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:amiga-audio");
-- 
cgit v1.2.3


From 92183b346f02773dae09182c65f16b013f295d80 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Sun, 5 Apr 2009 13:02:13 +0200
Subject: m68k: amiga - Floppy platform device conversion

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/amiga/platform.c |  5 +++++
 drivers/block/amiflop.c    | 47 +++++++++++++++++++++-------------------------
 2 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/arch/m68k/amiga/platform.c b/arch/m68k/amiga/platform.c
index 2e308e7a094..38f18bf1473 100644
--- a/arch/m68k/amiga/platform.c
+++ b/arch/m68k/amiga/platform.c
@@ -72,6 +72,11 @@ static int __init amiga_init_devices(void)
 	if (AMIGAHW_PRESENT(AMI_AUDIO))
 		platform_device_register_simple("amiga-audio", -1, NULL, 0);
 
+
+	/* storage interfaces */
+	if (AMIGAHW_PRESENT(AMI_FLOPPY))
+		platform_device_register_simple("amiga-floppy", -1, NULL, 0);
+
 	return 0;
 }
 
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 0182a22c423..832798aa14f 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -66,6 +66,7 @@
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/interrupt.h>
+#include <linux/platform_device.h>
 
 #include <asm/setup.h>
 #include <asm/uaccess.h>
@@ -1696,34 +1697,18 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data)
 	return get_disk(unit[drive].gendisk);
 }
 
-static int __init amiga_floppy_init(void)
+static int __init amiga_floppy_probe(struct platform_device *pdev)
 {
 	int i, ret;
 
-	if (!MACH_IS_AMIGA)
-		return -ENODEV;
-
-	if (!AMIGAHW_PRESENT(AMI_FLOPPY))
-		return -ENODEV;
-
 	if (register_blkdev(FLOPPY_MAJOR,"fd"))
 		return -EBUSY;
 
-	/*
-	 *  We request DSKPTR, DSKLEN and DSKDATA only, because the other
-	 *  floppy registers are too spreaded over the custom register space
-	 */
-	ret = -EBUSY;
-	if (!request_mem_region(CUSTOM_PHYSADDR+0x20, 8, "amiflop [Paula]")) {
-		printk("fd: cannot get floppy registers\n");
-		goto out_blkdev;
-	}
-
 	ret = -ENOMEM;
 	if ((raw_buf = (char *)amiga_chip_alloc (RAW_BUF_SIZE, "Floppy")) ==
 	    NULL) {
 		printk("fd: cannot get chip mem buffer\n");
-		goto out_memregion;
+		goto out_blkdev;
 	}
 
 	ret = -EBUSY;
@@ -1792,18 +1777,13 @@ out_irq2:
 	free_irq(IRQ_AMIGA_DSKBLK, NULL);
 out_irq:
 	amiga_chip_free(raw_buf);
-out_memregion:
-	release_mem_region(CUSTOM_PHYSADDR+0x20, 8);
 out_blkdev:
 	unregister_blkdev(FLOPPY_MAJOR,"fd");
 	return ret;
 }
 
-module_init(amiga_floppy_init);
-#ifdef MODULE
-
 #if 0 /* not safe to unload */
-void cleanup_module(void)
+static int __exit amiga_floppy_remove(struct platform_device *pdev)
 {
 	int i;
 
@@ -1820,12 +1800,25 @@ void cleanup_module(void)
 	custom.dmacon = DMAF_DISK; /* disable DMA */
 	amiga_chip_free(raw_buf);
 	blk_cleanup_queue(floppy_queue);
-	release_mem_region(CUSTOM_PHYSADDR+0x20, 8);
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 }
 #endif
 
-#else
+static struct platform_driver amiga_floppy_driver = {
+	.driver   = {
+		.name	= "amiga-floppy",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init amiga_floppy_init(void)
+{
+	return platform_driver_probe(&amiga_floppy_driver, amiga_floppy_probe);
+}
+
+module_init(amiga_floppy_init);
+
+#ifndef MODULE
 static int __init amiga_floppy_setup (char *str)
 {
 	int n;
@@ -1840,3 +1833,5 @@ static int __init amiga_floppy_setup (char *str)
 
 __setup("floppy=", amiga_floppy_setup);
 #endif
+
+MODULE_ALIAS("platform:amiga-floppy");
-- 
cgit v1.2.3


From 63aa9e7e3ab28ad5362502b1a69fae945367ad65 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 May 2010 16:42:37 -0300
Subject: perf tui: Add explicit -lslang option

At least on rawhide using -lnewt is not enough if we use SLang routines
directly, so add an explicit -lslang since we use SLang routines.

Reported-by: Ingo Molnar <mingo@elte.hu>
Tested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index a9281cca411..e2729fcfaf3 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -564,7 +564,7 @@ ifneq ($(shell sh -c "(echo '\#include <newt.h>'; echo 'int main(void) { newtIni
 else
 	# Fedora has /usr/include/slang/slang.h, but ubuntu /usr/include/slang.h
 	BASIC_CFLAGS += -I/usr/include/slang
-	EXTLIBS += -lnewt
+	EXTLIBS += -lnewt -lslang
 	LIB_OBJS += $(OUTPUT)util/newt.o
 endif
 
-- 
cgit v1.2.3


From db19272edc93661835bf6ec9736cfd0754aa3c62 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 17 May 2010 14:51:49 -0400
Subject: cifs: always revalidate hardlinked inodes when using noserverino

The old cifs_revalidate logic always revalidated hardlinked inodes.
This hack allowed CIFS to pass some connectathon tests when server inode
numbers aren't used (basic test7, in particular).

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 5b042fc4645..8e05e8a0ff8 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1528,6 +1528,11 @@ cifs_inode_needs_reval(struct inode *inode)
 	if (time_after_eq(jiffies, cifs_i->time + HZ))
 		return true;
 
+	/* hardlinked files w/ noserverino get "special" treatment */
+	if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
+	    S_ISREG(inode->i_mode) && inode->i_nlink != 1)
+		return true;
+
 	return false;
 }
 
-- 
cgit v1.2.3


From 84f30c66c3689745abbd3b9ce39816caeb9bec3b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 17 May 2010 07:18:57 -0400
Subject: cifs: don't update uniqueid in cifs_fattr_to_inode

We use this value to find an inode within the hash bucket, so we can't
change this without re-hashing the inode. For now, treat this value
as immutable.

Eventually, we should probably use an inode number change on a path
based operation to indicate that the lookup cache is invalid, but that's
a bit more code to deal with.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8e05e8a0ff8..b0ff2529cb9 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -137,7 +137,6 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 		inode->i_mode = fattr->cf_mode;
 
 	cifs_i->cifsAttrs = fattr->cf_cifsattrs;
-	cifs_i->uniqueid = fattr->cf_uniqueid;
 
 	if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
 		cifs_i->time = 0;
-- 
cgit v1.2.3


From 2f51903bc3139e25ec908f8944a0001c7b868e90 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 May 2010 17:57:59 -0300
Subject: perf symbols: symbol inconsistency message should be done only at
 verbose=1

That happened for an old perf.data file that had no fake MMAP events for
the kernel modules, but even then it should warn once for each module,
not one time for every symbol in every module not found.

Reported-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/symbol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index ecccc8df128..a06131f6259 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -525,7 +525,7 @@ static int dso__split_kallsyms(struct dso *self, struct map *map,
 				curr_map = map_groups__find_by_name(kmaps,
 							map->type, module);
 				if (curr_map == NULL) {
-					pr_err("%s/proc/{kallsyms,modules} "
+					pr_debug("%s/proc/{kallsyms,modules} "
 					         "inconsistency while looking "
 						 "for \"%s\" module!\n",
 						 machine->root_dir, module);
-- 
cgit v1.2.3


From 4065c802da7484fa36f8cdf10f18d087233ecb88 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 17 May 2010 07:18:58 -0400
Subject: cifs: fix noserverino handling when unix extensions are enabled

The uniqueid field sent by the server when unix extensions are enabled
is currently used sometimes when it shouldn't be. The readdir codepath
is correct, but most others are not. Fix it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  1 +
 fs/cifs/dir.c       |  1 +
 fs/cifs/inode.c     | 13 +++++++++++++
 3 files changed, 15 insertions(+)

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 2e07da9a46f..fb1657e0fdb 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -110,6 +110,7 @@ extern int cifs_posix_open(char *full_path, struct inode **pinode,
 				struct super_block *sb,
 				int mode, int oflags,
 				__u32 *poplock, __u16 *pnetfid, int xid);
+void cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr);
 extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
 				     FILE_UNIX_BASIC_INFO *info,
 				     struct cifs_sb_info *cifs_sb);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 86d3c0c82f2..391816b461c 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -248,6 +248,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 
 	/* get new inode and set it up */
 	if (*pinode == NULL) {
+		cifs_fill_uniqueid(sb, &fattr);
 		*pinode = cifs_iget(sb, &fattr);
 		if (!*pinode) {
 			rc = -ENOMEM;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b0ff2529cb9..62b324f26a5 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -169,6 +169,17 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	cifs_set_ops(inode, fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL);
 }
 
+void
+cifs_fill_uniqueid(struct super_block *sb, struct cifs_fattr *fattr)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+		return;
+
+	fattr->cf_uniqueid = iunique(sb, ROOT_I);
+}
+
 /* Fill a cifs_fattr struct with info from FILE_UNIX_BASIC_INFO. */
 void
 cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
@@ -322,6 +333,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 
 	if (*pinode == NULL) {
 		/* get new inode */
+		cifs_fill_uniqueid(sb, &fattr);
 		*pinode = cifs_iget(sb, &fattr);
 		if (!*pinode)
 			rc = -ENOMEM;
@@ -1197,6 +1209,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 				direntry->d_op = &cifs_dentry_ops;
 
 			cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
+			cifs_fill_uniqueid(inode->i_sb, &fattr);
 			newinode = cifs_iget(inode->i_sb, &fattr);
 			if (!newinode) {
 				kfree(pInfo);
-- 
cgit v1.2.3


From 94f3ca95787ada3d64339a4ecb2754236ab563f6 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 17 May 2010 18:18:11 -0300
Subject: perf tools: Add mode to build without newt support

make NO_NEWT=1

Will avoid building the newt (tui) support.

Suggested-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <new-submission>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index e2729fcfaf3..3d8f31ed771 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -558,6 +558,9 @@ else
 endif # PERF_HAVE_DWARF_REGS
 endif # NO_DWARF
 
+ifdef NO_NEWT
+	BASIC_CFLAGS += -DNO_NEWT_SUPPORT
+else
 ifneq ($(shell sh -c "(echo '\#include <newt.h>'; echo 'int main(void) { newtInit(); newtCls(); return newtFinished(); }') | $(CC) -x c - $(ALL_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -lnewt -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
 	msg := $(warning newt not found, disables TUI support. Please install newt-devel or libnewt-dev);
 	BASIC_CFLAGS += -DNO_NEWT_SUPPORT
@@ -567,6 +570,7 @@ else
 	EXTLIBS += -lnewt -lslang
 	LIB_OBJS += $(OUTPUT)util/newt.o
 endif
+endif # NO_NEWT
 
 ifndef NO_LIBPERL
 PERL_EMBED_LDOPTS = `perl -MExtUtils::Embed -e ldopts 2>/dev/null`
-- 
cgit v1.2.3


From c59bd5688299cddb71183e156e7a3c1409b90df2 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Mon, 17 May 2010 15:13:23 -0700
Subject: x86, hweight: Use a 32-bit popcnt for __arch_hweight32()

Use a 32-bit popcnt instruction for __arch_hweight32(), even on
x86-64.  Even though the input register will *usually* be
zero-extended due to the standard operation of the hardware, it isn't
necessarily so if the input value was the result of truncating a
64-bit operation.

Note: the POPCNT32 variant used on x86-64 has a technically
unnecessary REX prefix to make it five bytes long, the same as a CALL
instruction, therefore avoiding an unnecessary NOP.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <alpine.LFD.2.00.1005171443060.4195@i5.linux-foundation.org>
---
 arch/x86/include/asm/arch_hweight.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index d1fc3c219ae..9686c3d9ff7 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -2,13 +2,15 @@
 #define _ASM_X86_HWEIGHT_H
 
 #ifdef CONFIG_64BIT
+/* popcnt %edi, %eax -- redundant REX prefix for alignment */
+#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
 /* popcnt %rdi, %rax */
-#define POPCNT ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
+#define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
 #define REG_IN "D"
 #define REG_OUT "a"
 #else
 /* popcnt %eax, %eax */
-#define POPCNT ".byte 0xf3,0x0f,0xb8,0xc0"
+#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc0"
 #define REG_IN "a"
 #define REG_OUT "a"
 #endif
@@ -23,7 +25,7 @@ static inline unsigned int __arch_hweight32(unsigned int w)
 {
 	unsigned int res = 0;
 
-	asm (ALTERNATIVE("call __sw_hweight32", POPCNT, X86_FEATURE_POPCNT)
+	asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
 		     : "="REG_OUT (res)
 		     : REG_IN (w));
 
@@ -48,7 +50,7 @@ static inline unsigned long __arch_hweight64(__u64 w)
 	return  __arch_hweight32((u32)w) +
 		__arch_hweight32((u32)(w >> 32));
 #else
-	asm (ALTERNATIVE("call __sw_hweight64", POPCNT, X86_FEATURE_POPCNT)
+	asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
 		     : "="REG_OUT (res)
 		     : REG_IN (w));
 #endif /* CONFIG_X86_32 */
-- 
cgit v1.2.3


From 9c6f7e43b4e02c161b53e97ba913855246876c61 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 18 May 2010 00:17:44 +0200
Subject: stop_machine: Move local variable closer to the usage site in
 cpu_stop_cpu_callback()

This addresses the following compiler warning:

 kernel/stop_machine.c: In function 'cpu_stop_cpu_callback':
 kernel/stop_machine.c:297: warning: unused variable 'work'

Cc: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <tip-3fc1f1e27a5b807791d72e5d992aa33b668a6626@git.kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/stop_machine.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index ef51d1fcf5e..b4e7431e7c7 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -294,7 +294,6 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
 	unsigned int cpu = (unsigned long)hcpu;
 	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
-	struct cpu_stop_work *work;
 	struct task_struct *p;
 
 	switch (action & ~CPU_TASKS_FROZEN) {
@@ -323,6 +322,9 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
 	case CPU_DEAD:
+	{
+		struct cpu_stop_work *work;
+
 		/* kill the stopper */
 		kthread_stop(stopper->thread);
 		/* drain remaining works */
@@ -335,6 +337,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
 		put_task_struct(stopper->thread);
 		stopper->thread = NULL;
 		break;
+	}
 #endif
 	}
 
-- 
cgit v1.2.3