Merge branch 'perf' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux-2.6 into perf/core
diff --git a/tools/perf/Documentation/perf-kvm.txt b/tools/perf/Documentation/perf-kvm.txt
index 93400a0..d004e19 100644
--- a/tools/perf/Documentation/perf-kvm.txt
+++ b/tools/perf/Documentation/perf-kvm.txt
@@ -1,5 +1,5 @@
 perf-kvm(1)
-==============
+===========
 
 NAME
 ----
@@ -41,9 +41,9 @@
 OPTIONS
 -------
 --host=::
-        Collect host side perforamnce profile.
+        Collect host side performance profile.
 --guest=::
-        Collect guest side perforamnce profile.
+        Collect guest side performance profile.
 --guestmount=<path>::
 	Guest os root file system mount directory. Users mounts guest os
         root directories under <path> by a specific filesystem access method,
@@ -64,4 +64,5 @@
 
 SEE ALSO
 --------
-linkperf:perf-top[1] perf-record[1] perf-report[1] perf-diff[1] perf-buildid-list[1]
+linkperf:perf-top[1], linkperf:perf-record[1], linkperf:perf-report[1],
+linkperf:perf-diff[1], linkperf:perf-buildid-list[1]
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index db474bb..ab906cb 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -335,8 +335,9 @@
 }
 
 static struct perf_event_ops event_ops = {
-	.sample	= process_sample_event,
-	.comm	= event__process_comm,
+	.sample			= process_sample_event,
+	.comm			= event__process_comm,
+	.ordered_samples	= true,
 };
 
 static double fragmentation(unsigned long n_req, unsigned long n_alloc)
@@ -730,7 +731,6 @@
 	"record",
 	"-a",
 	"-R",
-	"-M",
 	"-f",
 	"-c", "1",
 	"-e", "kmem:kmalloc",
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 6c38e4f..ce27675 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -23,6 +23,8 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 
+static struct perf_session *session;
+
 /* based on kernel/lockdep.c */
 #define LOCKHASH_BITS		12
 #define LOCKHASH_SIZE		(1UL << LOCKHASH_BITS)
@@ -32,9 +34,6 @@
 #define __lockhashfn(key)	hash_long((unsigned long)key, LOCKHASH_BITS)
 #define lockhashentry(key)	(lockhash_table + __lockhashfn((key)))
 
-#define LOCK_STATE_UNLOCKED	0	       /* initial state */
-#define LOCK_STATE_LOCKED	1
-
 struct lock_stat {
 	struct list_head	hash_entry;
 	struct rb_node		rb;		/* used for sorting */
@@ -47,20 +46,151 @@
 	void			*addr;		/* address of lockdep_map, used as ID */
 	char			*name;		/* for strcpy(), we cannot use const */
 
-	int			state;
-	u64			prev_event_time; /* timestamp of previous event */
-
-	unsigned int		nr_acquired;
 	unsigned int		nr_acquire;
+	unsigned int		nr_acquired;
 	unsigned int		nr_contended;
 	unsigned int		nr_release;
 
+	unsigned int		nr_readlock;
+	unsigned int		nr_trylock;
 	/* these times are in nano sec. */
 	u64			wait_time_total;
 	u64			wait_time_min;
 	u64			wait_time_max;
+
+	int			discard; /* set when a broken sequence blacklists this lock */
 };
 
+/*
+ * States of lock_seq_stat
+ *
+ * UNINITIALIZED is required for detecting the first acquire event.
+ * By the nature of lock events there is no guarantee that the first
+ * event seen for a lock is acquire; it can just as well be acquired,
+ * contended or release.
+ */
+#define SEQ_STATE_UNINITIALIZED	0	/* initial state */
+#define SEQ_STATE_RELEASED	1
+#define SEQ_STATE_ACQUIRING	2
+#define SEQ_STATE_ACQUIRED	3
+#define SEQ_STATE_READ_ACQUIRED	4
+#define SEQ_STATE_CONTENDED	5
+
+/*
+ * MAX_LOCK_DEPTH
+ * Imported from include/linux/sched.h.
+ * Should this be synchronized with the kernel header?
+ */
+#define MAX_LOCK_DEPTH 48
+
+/*
+ * struct lock_seq_stat:
+ * Holds the state of one lock sequence (tracked per thread):
+ * 1) acquire -> acquired -> release
+ * 2) acquire -> contended -> acquired -> release
+ * 3) acquire (with read or try) -> release
+ * 4) Are there other patterns?
+ */
+struct lock_seq_stat {
+	struct list_head        list;
+	int			state;
+	u64			prev_event_time;
+	void                    *addr;
+
+	int                     read_count;
+};
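
To make the legal sequences above concrete, here is an illustrative transition check (a sketch for exposition, not part of the patch) matching what the report_lock_* handlers below accept; any other transition marks the lock as broken and discards it:

	/* Sketch: transitions accepted by the handlers below. */
	static int seq_state_transition_ok(int from, int to)
	{
		switch (from) {
		case SEQ_STATE_UNINITIALIZED:
		case SEQ_STATE_RELEASED:
			/* acquire: plain -> ACQUIRING, read/try -> READ_ACQUIRED */
			return to == SEQ_STATE_ACQUIRING ||
			       to == SEQ_STATE_READ_ACQUIRED;
		case SEQ_STATE_ACQUIRING:
			/* an acquire may complete or hit contention */
			return to == SEQ_STATE_ACQUIRED ||
			       to == SEQ_STATE_CONTENDED;
		case SEQ_STATE_CONTENDED:
			/* contention is resolved by acquired */
			return to == SEQ_STATE_ACQUIRED;
		case SEQ_STATE_ACQUIRED:
			return to == SEQ_STATE_RELEASED;
		case SEQ_STATE_READ_ACQUIRED:
			/* nested read acquire, or release of the last reader */
			return to == SEQ_STATE_READ_ACQUIRED ||
			       to == SEQ_STATE_RELEASED;
		default:
			return 0;
		}
	}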
+
+struct thread_stat {
+	struct rb_node		rb;
+
+	u32                     tid;
+	struct list_head        seq_list;
+};
+
+static struct rb_root		thread_stats;
+
+static struct thread_stat *thread_stat_find(u32 tid)
+{
+	struct rb_node *node;
+	struct thread_stat *st;
+
+	node = thread_stats.rb_node;
+	while (node) {
+		st = container_of(node, struct thread_stat, rb);
+		if (st->tid == tid)
+			return st;
+		else if (tid < st->tid)
+			node = node->rb_left;
+		else
+			node = node->rb_right;
+	}
+
+	return NULL;
+}
+
+static void thread_stat_insert(struct thread_stat *new)
+{
+	struct rb_node **rb = &thread_stats.rb_node;
+	struct rb_node *parent = NULL;
+	struct thread_stat *p;
+
+	while (*rb) {
+		p = container_of(*rb, struct thread_stat, rb);
+		parent = *rb;
+
+		if (new->tid < p->tid)
+			rb = &(*rb)->rb_left;
+		else if (new->tid > p->tid)
+			rb = &(*rb)->rb_right;
+		else
+			BUG_ON("inserting invalid thread_stat\n");
+	}
+
+	rb_link_node(&new->rb, parent, rb);
+	rb_insert_color(&new->rb, &thread_stats);
+}
+
+static struct thread_stat *thread_stat_findnew_after_first(u32 tid)
+{
+	struct thread_stat *st;
+
+	st = thread_stat_find(tid);
+	if (st)
+		return st;
+
+	st = zalloc(sizeof(struct thread_stat));
+	if (!st)
+		die("memory allocation failed\n");
+
+	st->tid = tid;
+	INIT_LIST_HEAD(&st->seq_list);
+
+	thread_stat_insert(st);
+
+	return st;
+}
+
+static struct thread_stat *thread_stat_findnew_first(u32 tid);
+static struct thread_stat *(*thread_stat_findnew)(u32 tid) =
+	thread_stat_findnew_first;
+
+static struct thread_stat *thread_stat_findnew_first(u32 tid)
+{
+	struct thread_stat *st;
+
+	st = zalloc(sizeof(struct thread_stat));
+	if (!st)
+		die("memory allocation failed\n");
+	st->tid = tid;
+	INIT_LIST_HEAD(&st->seq_list);
+
+	rb_link_node(&st->rb, NULL, &thread_stats.rb_node);
+	rb_insert_color(&st->rb, &thread_stats);
+
+	thread_stat_findnew = thread_stat_findnew_after_first;
+	return st;
+}
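
The two findnew variants implement one-shot initialization through a rebindable function pointer: the very first call takes the trivial empty-tree path and then redirects thread_stat_findnew to the searching variant, so the hot path never re-checks whether the tree is still empty. Call sites only ever go through the pointer:

	/* first call runs the empty-tree fast path, then rebinds */
	struct thread_stat *ts = thread_stat_findnew(thread->pid);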
+
 /* build simple key function one is bigger than two */
 #define SINGLE_KEY(member)						\
 	static int lock_stat_key_ ## member(struct lock_stat *one,	\
@@ -175,8 +305,6 @@
 		goto alloc_failed;
 	strcpy(new->name, name);
 
-	/* LOCK_STATE_UNLOCKED == 0 isn't guaranteed forever */
-	new->state = LOCK_STATE_UNLOCKED;
 	new->wait_time_min = ULLONG_MAX;
 
 	list_add(&new->hash_entry, entry);
@@ -188,8 +316,6 @@
 
 static char			const *input_name = "perf.data";
 
-static int			profile_cpu = -1;
-
 struct raw_event_sample {
 	u32			size;
 	char			data[0];
@@ -198,6 +324,7 @@
 struct trace_acquire_event {
 	void			*addr;
 	const char		*name;
+	int			flag;
 };
 
 struct trace_acquired_event {
@@ -241,120 +368,246 @@
 			      struct thread *thread);
 };
 
+static struct lock_seq_stat *get_seq(struct thread_stat *ts, void *addr)
+{
+	struct lock_seq_stat *seq;
+
+	list_for_each_entry(seq, &ts->seq_list, list) {
+		if (seq->addr == addr)
+			return seq;
+	}
+
+	seq = zalloc(sizeof(struct lock_seq_stat));
+	if (!seq)
+		die("Not enough memory\n");
+	seq->state = SEQ_STATE_UNINITIALIZED;
+	seq->addr = addr;
+
+	list_add(&seq->list, &ts->seq_list);
+	return seq;
+}
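
get_seq() walks the per-thread sequence list linearly; that stays cheap because a thread only holds a bounded number of locks at once (cf. MAX_LOCK_DEPTH above) and entries are freed as soon as their sequence completes or is found broken.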
+
+/* broken sequences seen, indexed by event type: acquire, acquired, contended, release */
+static int bad_hist[4];
+
 static void
 report_lock_acquire_event(struct trace_acquire_event *acquire_event,
 			struct event *__event __used,
 			int cpu __used,
-			u64 timestamp,
+			u64 timestamp __used,
 			struct thread *thread __used)
 {
-	struct lock_stat *st;
+	struct lock_stat *ls;
+	struct thread_stat *ts;
+	struct lock_seq_stat *seq;
 
-	st = lock_stat_findnew(acquire_event->addr, acquire_event->name);
+	ls = lock_stat_findnew(acquire_event->addr, acquire_event->name);
+	if (ls->discard)
+		return;
 
-	switch (st->state) {
-	case LOCK_STATE_UNLOCKED:
+	ts = thread_stat_findnew(thread->pid);
+	seq = get_seq(ts, acquire_event->addr);
+
+	switch (seq->state) {
+	case SEQ_STATE_UNINITIALIZED:
+	case SEQ_STATE_RELEASED:
+		if (!acquire_event->flag) {
+			seq->state = SEQ_STATE_ACQUIRING;
+		} else {
+			if (acquire_event->flag & 1)	/* trylock */
+				ls->nr_trylock++;
+			if (acquire_event->flag & 2)	/* read */
+				ls->nr_readlock++;
+			seq->state = SEQ_STATE_READ_ACQUIRED;
+			seq->read_count = 1;
+			ls->nr_acquired++;
+		}
 		break;
-	case LOCK_STATE_LOCKED:
+	case SEQ_STATE_READ_ACQUIRED:
+		if (acquire_event->flag & 2) {	/* nested read */
+			seq->read_count++;
+			ls->nr_acquired++;
+			goto end;
+		} else {
+			goto broken;
+		}
+		break;
+	case SEQ_STATE_ACQUIRED:
+	case SEQ_STATE_ACQUIRING:
+	case SEQ_STATE_CONTENDED:
+broken:
+		/* broken lock sequence, discard it */
+		ls->discard = 1;
+		bad_hist[0]++;
+		list_del(&seq->list);
+		free(seq);
+		goto end;
 		break;
 	default:
-		BUG_ON(1);
+		BUG_ON("Unknown state of lock sequence found!\n");
 		break;
 	}
 
-	st->prev_event_time = timestamp;
+	ls->nr_acquire++;
+	seq->prev_event_time = timestamp;
+end:
+	return;
 }
 
 static void
 report_lock_acquired_event(struct trace_acquired_event *acquired_event,
 			 struct event *__event __used,
 			 int cpu __used,
-			 u64 timestamp,
+			 u64 timestamp __used,
 			 struct thread *thread __used)
 {
-	struct lock_stat *st;
+	struct lock_stat *ls;
+	struct thread_stat *ts;
+	struct lock_seq_stat *seq;
+	u64 contended_term;
 
-	st = lock_stat_findnew(acquired_event->addr, acquired_event->name);
+	ls = lock_stat_findnew(acquired_event->addr, acquired_event->name);
+	if (ls->discard)
+		return;
 
-	switch (st->state) {
-	case LOCK_STATE_UNLOCKED:
-		st->state = LOCK_STATE_LOCKED;
-		st->nr_acquired++;
+	ts = thread_stat_findnew(thread->pid);
+	seq = get_seq(ts, acquired_event->addr);
+
+	switch (seq->state) {
+	case SEQ_STATE_UNINITIALIZED:
+		/* orphan event, do nothing */
+		return;
+	case SEQ_STATE_ACQUIRING:
 		break;
-	case LOCK_STATE_LOCKED:
+	case SEQ_STATE_CONTENDED:
+		contended_term = timestamp - seq->prev_event_time;
+		ls->wait_time_total += contended_term;
+
+		if (contended_term < ls->wait_time_min)
+			ls->wait_time_min = contended_term;
+		else if (ls->wait_time_max < contended_term)
+			ls->wait_time_max = contended_term;
 		break;
+	case SEQ_STATE_RELEASED:
+	case SEQ_STATE_ACQUIRED:
+	case SEQ_STATE_READ_ACQUIRED:
+		/* broken lock sequence, discard it */
+		ls->discard = 1;
+		bad_hist[1]++;
+		list_del(&seq->list);
+		free(seq);
+		goto end;
+		break;
+
 	default:
-		BUG_ON(1);
+		BUG_ON("Unknown state of lock sequence found!\n");
 		break;
 	}
 
-	st->prev_event_time = timestamp;
+	seq->state = SEQ_STATE_ACQUIRED;
+	ls->nr_acquired++;
+	seq->prev_event_time = timestamp;
+end:
+	return;
 }
 
 static void
 report_lock_contended_event(struct trace_contended_event *contended_event,
 			  struct event *__event __used,
 			  int cpu __used,
-			  u64 timestamp,
+			  u64 timestamp __used,
 			  struct thread *thread __used)
 {
-	struct lock_stat *st;
+	struct lock_stat *ls;
+	struct thread_stat *ts;
+	struct lock_seq_stat *seq;
 
-	st = lock_stat_findnew(contended_event->addr, contended_event->name);
+	ls = lock_stat_findnew(contended_event->addr, contended_event->name);
+	if (ls->discard)
+		return;
 
-	switch (st->state) {
-	case LOCK_STATE_UNLOCKED:
+	ts = thread_stat_findnew(thread->pid);
+	seq = get_seq(ts, contended_event->addr);
+
+	switch (seq->state) {
+	case SEQ_STATE_UNINITIALIZED:
+		/* orphan event, do nothing */
+		return;
+	case SEQ_STATE_ACQUIRING:
 		break;
-	case LOCK_STATE_LOCKED:
-		st->nr_contended++;
+	case SEQ_STATE_RELEASED:
+	case SEQ_STATE_ACQUIRED:
+	case SEQ_STATE_READ_ACQUIRED:
+	case SEQ_STATE_CONTENDED:
+		/* broken lock sequence, discard it */
+		ls->discard = 1;
+		bad_hist[2]++;
+		list_del(&seq->list);
+		free(seq);
+		goto end;
 		break;
 	default:
-		BUG_ON(1);
+		BUG_ON("Unknown state of lock sequence found!\n");
 		break;
 	}
 
-	st->prev_event_time = timestamp;
+	seq->state = SEQ_STATE_CONTENDED;
+	ls->nr_contended++;
+	seq->prev_event_time = timestamp;
+end:
+	return;
 }
 
 static void
 report_lock_release_event(struct trace_release_event *release_event,
 			struct event *__event __used,
 			int cpu __used,
-			u64 timestamp,
+			u64 timestamp __used,
 			struct thread *thread __used)
 {
-	struct lock_stat *st;
-	u64 hold_time;
+	struct lock_stat *ls;
+	struct thread_stat *ts;
+	struct lock_seq_stat *seq;
 
-	st = lock_stat_findnew(release_event->addr, release_event->name);
+	ls = lock_stat_findnew(release_event->addr, release_event->name);
+	if (ls->discard)
+		return;
 
-	switch (st->state) {
-	case LOCK_STATE_UNLOCKED:
+	ts = thread_stat_findnew(thread->pid);
+	seq = get_seq(ts, release_event->addr);
+
+	switch (seq->state) {
+	case SEQ_STATE_UNINITIALIZED:
+		goto end;
 		break;
-	case LOCK_STATE_LOCKED:
-		st->state = LOCK_STATE_UNLOCKED;
-		hold_time = timestamp - st->prev_event_time;
-
-		if (timestamp < st->prev_event_time) {
-			/* terribly, this can happen... */
+	case SEQ_STATE_ACQUIRED:
+		break;
+	case SEQ_STATE_READ_ACQUIRED:
+		seq->read_count--;
+		BUG_ON(seq->read_count < 0);
+		if (seq->read_count) {	/* other readers still hold it */
+			ls->nr_release++;
 			goto end;
 		}
-
-		if (st->wait_time_min > hold_time)
-			st->wait_time_min = hold_time;
-		if (st->wait_time_max < hold_time)
-			st->wait_time_max = hold_time;
-		st->wait_time_total += hold_time;
-
-		st->nr_release++;
+		break;
+	case SEQ_STATE_ACQUIRING:
+	case SEQ_STATE_CONTENDED:
+	case SEQ_STATE_RELEASED:
+		/* broken lock sequence, discard it */
+		ls->discard = 1;
+		bad_hist[3]++;
+		goto free_seq;
 		break;
 	default:
-		BUG_ON(1);
+		BUG_ON("Unknown state of lock sequence found!\n");
 		break;
 	}
 
+	ls->nr_release++;
+free_seq:
+	list_del(&seq->list);
+	free(seq);
 end:
-	st->prev_event_time = timestamp;
+	return;
 }
 
 /* lock oriented handlers */
@@ -381,6 +634,7 @@
 	tmp = raw_field_value(event, "lockdep_addr", data);
 	memcpy(&acquire_event.addr, &tmp, sizeof(void *));
 	acquire_event.name = (char *)raw_field_ptr(event, "name", data);
+	acquire_event.flag = (int)raw_field_value(event, "flag", data);
 
 	if (trace_handler->acquire_event)
 		trace_handler->acquire_event(&acquire_event, event, cpu, timestamp, thread);
@@ -441,8 +695,7 @@
 }
 
 static void
-process_raw_event(void *data, int cpu,
-		  u64 timestamp, struct thread *thread)
+process_raw_event(void *data, int cpu, u64 timestamp, struct thread *thread)
 {
 	struct event *event;
 	int type;
@@ -460,182 +713,13 @@
 		process_lock_release_event(data, event, cpu, timestamp, thread);
 }
 
-struct raw_event_queue {
-	u64			timestamp;
-	int			cpu;
-	void			*data;
-	struct thread		*thread;
-	struct list_head	list;
-};
-
-static LIST_HEAD(raw_event_head);
-
-#define FLUSH_PERIOD	(5 * NSEC_PER_SEC)
-
-static u64 flush_limit = ULLONG_MAX;
-static u64 last_flush = 0;
-struct raw_event_queue *last_inserted;
-
-static void flush_raw_event_queue(u64 limit)
-{
-	struct raw_event_queue *tmp, *iter;
-
-	list_for_each_entry_safe(iter, tmp, &raw_event_head, list) {
-		if (iter->timestamp > limit)
-			return;
-
-		if (iter == last_inserted)
-			last_inserted = NULL;
-
-		process_raw_event(iter->data, iter->cpu, iter->timestamp,
-				  iter->thread);
-
-		last_flush = iter->timestamp;
-		list_del(&iter->list);
-		free(iter->data);
-		free(iter);
-	}
-}
-
-static void __queue_raw_event_end(struct raw_event_queue *new)
-{
-	struct raw_event_queue *iter;
-
-	list_for_each_entry_reverse(iter, &raw_event_head, list) {
-		if (iter->timestamp < new->timestamp) {
-			list_add(&new->list, &iter->list);
-			return;
-		}
-	}
-
-	list_add(&new->list, &raw_event_head);
-}
-
-static void __queue_raw_event_before(struct raw_event_queue *new,
-				     struct raw_event_queue *iter)
-{
-	list_for_each_entry_continue_reverse(iter, &raw_event_head, list) {
-		if (iter->timestamp < new->timestamp) {
-			list_add(&new->list, &iter->list);
-			return;
-		}
-	}
-
-	list_add(&new->list, &raw_event_head);
-}
-
-static void __queue_raw_event_after(struct raw_event_queue *new,
-				     struct raw_event_queue *iter)
-{
-	list_for_each_entry_continue(iter, &raw_event_head, list) {
-		if (iter->timestamp > new->timestamp) {
-			list_add_tail(&new->list, &iter->list);
-			return;
-		}
-	}
-	list_add_tail(&new->list, &raw_event_head);
-}
-
-/* The queue is ordered by time */
-static void __queue_raw_event(struct raw_event_queue *new)
-{
-	if (!last_inserted) {
-		__queue_raw_event_end(new);
-		return;
-	}
-
-	/*
-	 * Most of the time the current event has a timestamp
-	 * very close to the last event inserted, unless we just switched
-	 * to another event buffer. Having a sorting based on a list and
-	 * on the last inserted event that is close to the current one is
-	 * probably more efficient than an rbtree based sorting.
-	 */
-	if (last_inserted->timestamp >= new->timestamp)
-		__queue_raw_event_before(new, last_inserted);
-	else
-		__queue_raw_event_after(new, last_inserted);
-}
-
-static void queue_raw_event(void *data, int raw_size, int cpu,
-			    u64 timestamp, struct thread *thread)
-{
-	struct raw_event_queue *new;
-
-	if (flush_limit == ULLONG_MAX)
-		flush_limit = timestamp + FLUSH_PERIOD;
-
-	if (timestamp < last_flush) {
-		printf("Warning: Timestamp below last timeslice flush\n");
-		return;
-	}
-
-	new = malloc(sizeof(*new));
-	if (!new)
-		die("Not enough memory\n");
-
-	new->timestamp = timestamp;
-	new->cpu = cpu;
-	new->thread = thread;
-
-	new->data = malloc(raw_size);
-	if (!new->data)
-		die("Not enough memory\n");
-
-	memcpy(new->data, data, raw_size);
-
-	__queue_raw_event(new);
-	last_inserted = new;
-
-	/*
-	 * We want to have a slice of events covering 2 * FLUSH_PERIOD
-	 * If FLUSH_PERIOD is big enough, it ensures every events that occured
-	 * in the first half of the timeslice have all been buffered and there
-	 * are none remaining (we need that because of the weakly ordered
-	 * event recording we have). Then once we reach the 2 * FLUSH_PERIOD
-	 * timeslice, we flush the first half to be gentle with the memory
-	 * (the second half can still get new events in the middle, so wait
-	 * another period to flush it)
-	 */
-	if (new->timestamp > flush_limit &&
-		new->timestamp - flush_limit > FLUSH_PERIOD) {
-		flush_limit += FLUSH_PERIOD;
-		flush_raw_event_queue(flush_limit);
-	}
-}
-
-static int process_sample_event(event_t *event, struct perf_session *session)
-{
-	struct thread *thread;
-	struct sample_data data;
-
-	bzero(&data, sizeof(struct sample_data));
-	event__parse_sample(event, session->sample_type, &data);
-	thread = perf_session__findnew(session, data.pid);
-
-	if (thread == NULL) {
-		pr_debug("problem processing %d event, skipping it.\n",
-			 event->header.type);
-		return -1;
-	}
-
-	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
-
-	if (profile_cpu != -1 && profile_cpu != (int) data.cpu)
-		return 0;
-
-	queue_raw_event(data.raw_data, data.raw_size, data.cpu, data.time, thread);
-
-	return 0;
-}
-
 /* TODO: various way to print, coloring, nano or milli sec */
 static void print_result(void)
 {
 	struct lock_stat *st;
 	char cut_name[20];
+	int bad, total;
 
-	printf("%18s ", "ID");
 	printf("%20s ", "Name");
 	printf("%10s ", "acquired");
 	printf("%10s ", "contended");
@@ -646,11 +730,15 @@
 
 	printf("\n\n");
 
+	bad = total = 0;
 	while ((st = pop_from_result())) {
+		total++;
+		if (st->discard) {
+			bad++;
+			continue;
+		}
 		bzero(cut_name, 20);
 
-		printf("%p ", st->addr);
-
 		if (strlen(st->name) < 16) {
 			/* output raw name */
 			printf("%20s ", st->name);
@@ -673,6 +761,21 @@
 		       0 : st->wait_time_min);
 		printf("\n");
 	}
+
+	{
+		/* Output for debug, this has to be removed */
+		int i;
+		const char *name[4] =
+			{ "acquire", "acquired", "contended", "release" };
+
+		printf("\n=== output for debug ===\n\n");
+		printf("bad:%d, total:%d\n", bad, total);
+		printf("bad rate:%f\n", (double)bad / (double)total);
+
+		printf("histogram of events that caused broken sequences\n");
+		for (i = 0; i < 4; i++)
+			printf(" %10s: %d\n", name[i], bad_hist[i]);
+	}
 }
 
 static void dump_map(void)
@@ -687,13 +790,32 @@
 	}
 }
 
+static int process_sample_event(event_t *self, struct perf_session *s)
+{
+	struct sample_data data;
+	struct thread *thread;
+
+	bzero(&data, sizeof(data));
+	event__parse_sample(self, s->sample_type, &data);
+
+	thread = perf_session__findnew(s, data.tid);
+	if (thread == NULL) {
+		pr_debug("problem processing %d event, skipping it.\n",
+			self->header.type);
+		return -1;
+	}
+
+	process_raw_event(data.raw_data, data.cpu, data.time, thread);
+
+	return 0;
+}
+
 static struct perf_event_ops eops = {
 	.sample			= process_sample_event,
 	.comm			= event__process_comm,
+	.ordered_samples	= true,
 };
 
-static struct perf_session *session;
-
 static int read_events(void)
 {
 	session = perf_session__new(input_name, O_RDONLY, 0);
@@ -720,7 +842,6 @@
 	setup_pager();
 	select_key();
 	read_events();
-	flush_raw_event_queue(ULLONG_MAX);
 	sort_result();
 	print_result();
 }
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 09ddc8e..94453f1 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1651,9 +1651,10 @@
 }
 
 static struct perf_event_ops event_ops = {
-	.sample	= process_sample_event,
-	.comm	= event__process_comm,
-	.lost	= process_lost_event,
+	.sample			= process_sample_event,
+	.comm			= event__process_comm,
+	.lost			= process_lost_event,
+	.ordered_samples	= true,
 };
 
 static int read_events(void)
@@ -1850,7 +1851,6 @@
 	"record",
 	"-a",
 	"-R",
-	"-M",
 	"-f",
 	"-m", "1024",
 	"-c", "1",
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 96f4a09..c35aa44 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -143,9 +143,6 @@
 static struct power_event    *power_events;
 static struct wake_event     *wake_events;
 
-struct sample_wrapper *all_samples;
-
-
 struct process_filter;
 struct process_filter {
 	char			*name;
@@ -566,88 +563,6 @@
 	}
 }
 
-static u64 sample_time(event_t *event, const struct perf_session *session)
-{
-	int cursor;
-
-	cursor = 0;
-	if (session->sample_type & PERF_SAMPLE_IP)
-		cursor++;
-	if (session->sample_type & PERF_SAMPLE_TID)
-		cursor++;
-	if (session->sample_type & PERF_SAMPLE_TIME)
-		return event->sample.array[cursor];
-	return 0;
-}
-
-
-/*
- * We first queue all events, sorted backwards by insertion.
- * The order will get flipped later.
- */
-static int queue_sample_event(event_t *event, struct perf_session *session)
-{
-	struct sample_wrapper *copy, *prev;
-	int size;
-
-	size = event->sample.header.size + sizeof(struct sample_wrapper) + 8;
-
-	copy = malloc(size);
-	if (!copy)
-		return 1;
-
-	memset(copy, 0, size);
-
-	copy->next = NULL;
-	copy->timestamp = sample_time(event, session);
-
-	memcpy(&copy->data, event, event->sample.header.size);
-
-	/* insert in the right place in the list */
-
-	if (!all_samples) {
-		/* first sample ever */
-		all_samples = copy;
-		return 0;
-	}
-
-	if (all_samples->timestamp < copy->timestamp) {
-		/* insert at the head of the list */
-		copy->next = all_samples;
-		all_samples = copy;
-		return 0;
-	}
-
-	prev = all_samples;
-	while (prev->next) {
-		if (prev->next->timestamp < copy->timestamp) {
-			copy->next = prev->next;
-			prev->next = copy;
-			return 0;
-		}
-		prev = prev->next;
-	}
-	/* insert at the end of the list */
-	prev->next = copy;
-
-	return 0;
-}
-
-static void sort_queued_samples(void)
-{
-	struct sample_wrapper *cursor, *next;
-
-	cursor = all_samples;
-	all_samples = NULL;
-
-	while (cursor) {
-		next = cursor->next;
-		cursor->next = all_samples;
-		all_samples = cursor;
-		cursor = next;
-	}
-}
-
 /*
  * Sort the pid datastructure
  */
@@ -1011,26 +926,12 @@
 	svg_close();
 }
 
-static void process_samples(struct perf_session *session)
-{
-	struct sample_wrapper *cursor;
-	event_t *event;
-
-	sort_queued_samples();
-
-	cursor = all_samples;
-	while (cursor) {
-		event = (void *)&cursor->data;
-		cursor = cursor->next;
-		process_sample_event(event, session);
-	}
-}
-
 static struct perf_event_ops event_ops = {
-	.comm	= process_comm_event,
-	.fork	= process_fork_event,
-	.exit	= process_exit_event,
-	.sample	= queue_sample_event,
+	.comm			= process_comm_event,
+	.fork			= process_fork_event,
+	.exit			= process_exit_event,
+	.sample			= process_sample_event,
+	.ordered_samples	= true,
 };
 
 static int __cmd_timechart(void)
@@ -1048,8 +949,6 @@
 	if (ret)
 		goto out_delete;
 
-	process_samples(session);
-
 	end_sample_processing();
 
 	sort_pids();
@@ -1072,7 +971,6 @@
 	"record",
 	"-a",
 	"-R",
-	"-M",
 	"-f",
 	"-c", "1",
 	"-e", "power:power_start",
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 2eefb33..77f556f 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -11,6 +11,8 @@
 
 static char const		*script_name;
 static char const		*generate_script_lang;
+static bool			debug_ordering;
+static u64			last_timestamp;
 
 static int default_start_script(const char *script __unused,
 				int argc __unused,
@@ -87,6 +89,14 @@
 	}
 
 	if (session->sample_type & PERF_SAMPLE_RAW) {
+		if (debug_ordering) {
+			if (data.time < last_timestamp) {
+				pr_err("Samples misordered, previous: %llu "
+					"this: %llu\n", last_timestamp,
+					data.time);
+			}
+			last_timestamp = data.time;
+		}
 		/*
 		 * FIXME: better resolve from pid from the struct trace_entry
 		 * field, although it should be the same than this perf
@@ -108,6 +118,7 @@
 	.event_type = event__process_event_type,
 	.tracing_data = event__process_tracing_data,
 	.build_id = event__process_build_id,
+	.ordered_samples = true,
 };
 
 extern volatile int session_done;
@@ -531,6 +542,8 @@
 		   "generate perf-trace.xx script in specified language"),
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
+	OPT_BOOLEAN('d', "debug-ordering", &debug_ordering,
+		   "check that samples time ordering is monotonic"),
 
 	OPT_END()
 };
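
The new flag gives a quick way to validate the ordered-samples machinery: running e.g. `perf trace -d` over a data file prints a pr_err for every sample whose timestamp is lower than its predecessor's, so a correctly sorted stream produces no output.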
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 3b4ec67..600d327 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -418,12 +418,6 @@
 	u64 id;
 	int fd;
 
-	attr->sample_type |= PERF_SAMPLE_RAW;
-	attr->sample_type |= PERF_SAMPLE_TIME;
-	attr->sample_type |= PERF_SAMPLE_CPU;
-
-	attr->sample_period = 1;
-
 	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path,
 		 sys_name, evt_name);
 
@@ -442,6 +436,13 @@
 	attr->type = PERF_TYPE_TRACEPOINT;
 	*strp = evt_name + evt_length;
 
+	attr->sample_type |= PERF_SAMPLE_RAW;
+	attr->sample_type |= PERF_SAMPLE_TIME;
+	attr->sample_type |= PERF_SAMPLE_CPU;
+
+	attr->sample_period = 1;
+
 	return EVT_HANDLED;
 }
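
Setting the PERF_SAMPLE_RAW/TIME/CPU bits and the sample period of 1 after the id lookup, rather than before it, means the attribute is only modified once the tracepoint has actually resolved; the early-failure paths in between now leave attr untouched.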
 
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 7d88ae5..b7aade2 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -98,6 +98,8 @@
 	self->cwdlen = 0;
 	self->unknown_events = 0;
 	self->kerninfo_root = RB_ROOT;
+	self->ordered_samples.flush_limit = ULLONG_MAX;
+	INIT_LIST_HEAD(&self->ordered_samples.samples_head);
 
 	if (mode == O_RDONLY) {
 		if (perf_session__open(self, force) < 0)
@@ -351,6 +353,178 @@
 	[PERF_RECORD_HEADER_MAX]    = NULL,
 };
 
+struct sample_queue {
+	u64			timestamp;
+	struct sample_event	*event;
+	struct list_head	list;
+};
+
+#define FLUSH_PERIOD	(2 * NSEC_PER_SEC)
+
+static void flush_sample_queue(struct perf_session *s,
+			       struct perf_event_ops *ops)
+{
+	struct list_head *head = &s->ordered_samples.samples_head;
+	u64 limit = s->ordered_samples.flush_limit;
+	struct sample_queue *tmp, *iter;
+
+	if (!ops->ordered_samples)
+		return;
+
+	list_for_each_entry_safe(iter, tmp, head, list) {
+		if (iter->timestamp > limit)
+			return;
+
+		if (iter == s->ordered_samples.last_inserted)
+			s->ordered_samples.last_inserted = NULL;
+
+		ops->sample((event_t *)iter->event, s);
+
+		s->ordered_samples.last_flush = iter->timestamp;
+		list_del(&iter->list);
+		free(iter->event);
+		free(iter);
+	}
+}
+
+static void __queue_sample_end(struct sample_queue *new, struct list_head *head)
+{
+	struct sample_queue *iter;
+
+	list_for_each_entry_reverse(iter, head, list) {
+		if (iter->timestamp < new->timestamp) {
+			list_add(&new->list, &iter->list);
+			return;
+		}
+	}
+
+	list_add(&new->list, head);
+}
+
+static void __queue_sample_before(struct sample_queue *new,
+				  struct sample_queue *iter,
+				  struct list_head *head)
+{
+	list_for_each_entry_continue_reverse(iter, head, list) {
+		if (iter->timestamp < new->timestamp) {
+			list_add(&new->list, &iter->list);
+			return;
+		}
+	}
+
+	list_add(&new->list, head);
+}
+
+static void __queue_sample_after(struct sample_queue *new,
+				 struct sample_queue *iter,
+				 struct list_head *head)
+{
+	list_for_each_entry_continue(iter, head, list) {
+		if (iter->timestamp > new->timestamp) {
+			list_add_tail(&new->list, &iter->list);
+			return;
+		}
+	}
+	list_add_tail(&new->list, head);
+}
+
+/* The queue is ordered by time */
+static void __queue_sample_event(struct sample_queue *new,
+				 struct perf_session *s)
+{
+	struct sample_queue *last_inserted = s->ordered_samples.last_inserted;
+	struct list_head *head = &s->ordered_samples.samples_head;
+
+	if (!last_inserted) {
+		__queue_sample_end(new, head);
+		return;
+	}
+
+	/*
+	 * Most of the time the current event has a timestamp very close
+	 * to the last event inserted, unless we just switched to another
+	 * event buffer. So a list-based insertion sort that starts from
+	 * the last inserted event is probably more efficient than an
+	 * rbtree-based sort.
+	 */
+	if (last_inserted->timestamp >= new->timestamp)
+		__queue_sample_before(new, last_inserted, head);
+	else
+		__queue_sample_after(new, last_inserted, head);
+}
+
+static int queue_sample_event(event_t *event, struct sample_data *data,
+			      struct perf_session *s,
+			      struct perf_event_ops *ops)
+{
+	u64 timestamp = data->time;
+	struct sample_queue *new;
+	u64 flush_limit;
+
+	if (s->ordered_samples.flush_limit == ULLONG_MAX)
+		s->ordered_samples.flush_limit = timestamp + FLUSH_PERIOD;
+
+	if (timestamp < s->ordered_samples.last_flush) {
+		printf("Warning: Timestamp below last timeslice flush\n");
+		return -EINVAL;
+	}
+
+	new = malloc(sizeof(*new));
+	if (!new)
+		return -ENOMEM;
+
+	new->timestamp = timestamp;
+
+	new->event = malloc(event->header.size);
+	if (!new->event) {
+		free(new);
+		return -ENOMEM;
+	}
+
+	memcpy(new->event, event, event->header.size);
+
+	__queue_sample_event(new, s);
+	s->ordered_samples.last_inserted = new;
+
+	/*
+	 * We want to keep a slice of events covering 2 * FLUSH_PERIOD.
+	 * If FLUSH_PERIOD is big enough, it ensures that every event that
+	 * occurred in the first half of the timeslice has been buffered
+	 * and that none remain (we need this because of the weakly
+	 * ordered event recording we have). Once we reach the
+	 * 2 * FLUSH_PERIOD timeslice, we flush the first half to be
+	 * gentle with the memory (the second half can still receive new
+	 * events in the middle, so wait another period before flushing it).
+	 */
+	flush_limit = s->ordered_samples.flush_limit;
+
+	if (new->timestamp > flush_limit &&
+		new->timestamp - flush_limit > FLUSH_PERIOD) {
+		s->ordered_samples.flush_limit += FLUSH_PERIOD;
+		flush_sample_queue(s, ops);
+	}
+
+	return 0;
+}
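
Concretely, with a FLUSH_PERIOD of 2 seconds and a first sample at time t0: flush_limit starts at t0 + 2s and the queue simply grows until a sample newer than t0 + 4s arrives; at that point flush_limit advances to t0 + 4s and every queued sample with a timestamp at or below it is handed to ops->sample(), oldest first. Each later flush advances the limit by another period, and samples arriving with a timestamp below last_flush are rejected with the warning above.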
+
+static int perf_session__process_sample(event_t *event, struct perf_session *s,
+					struct perf_event_ops *ops)
+{
+	struct sample_data data;
+
+	if (!ops->ordered_samples)
+		return ops->sample(event, s);
+
+	bzero(&data, sizeof(struct sample_data));
+	event__parse_sample(event, s->sample_type, &data);
+
+	queue_sample_event(event, &data, s, ops);
+
+	return 0;
+}
+
 static int perf_session__process_event(struct perf_session *self,
 				       event_t *event,
 				       struct perf_event_ops *ops,
@@ -371,7 +545,7 @@
 
 	switch (event->header.type) {
 	case PERF_RECORD_SAMPLE:
-		return ops->sample(event, self);
+		return perf_session__process_sample(event, self, ops);
 	case PERF_RECORD_MMAP:
 		return ops->mmap(event, self);
 	case PERF_RECORD_COMM:
@@ -611,6 +785,9 @@
 		goto more;
 done:
 	err = 0;
+	/* do the final flush for ordered samples */
+	self->ordered_samples.flush_limit = ULLONG_MAX;
+	flush_sample_queue(self, ops);
 out_err:
 	ui_progress__delete(progress);
 	return err;
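
The overall session-side flow added by this patch, as a sketch:

	/*
	 * perf_session__process_event()
	 *   case PERF_RECORD_SAMPLE:
	 *     perf_session__process_sample()
	 *       ops->ordered_samples == false: call ops->sample() directly
	 *       ops->ordered_samples == true:  queue_sample_event()
	 *         __queue_sample_event()   insert sorted by timestamp,
	 *                                  searching from last_inserted
	 *         flush_sample_queue()     deliver events <= flush_limit,
	 *                                  oldest first, every FLUSH_PERIOD
	 * At end of file:
	 *   flush_limit = ULLONG_MAX;
	 *   flush_sample_queue();          drain whatever is left
	 */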
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 5e47c87..796e229 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -8,9 +8,17 @@
 #include <linux/rbtree.h>
 #include "../../../include/linux/perf_event.h"
 
+struct sample_queue;
 struct ip_callchain;
 struct thread;
 
+struct ordered_samples {
+	u64			last_flush;
+	u64			flush_limit;
+	struct list_head	samples_head;
+	struct sample_queue	*last_inserted;
+};
+
 struct perf_session {
 	struct perf_header	header;
 	unsigned long		size;
@@ -28,6 +36,7 @@
 	bool			fd_pipe;
 	int			cwdlen;
 	char			*cwd;
+	struct ordered_samples	ordered_samples;
 	char filename[0];
 };
 
@@ -47,6 +56,7 @@
 		 event_type,
 		 tracing_data,
 		 build_id;
+	bool	ordered_samples;
 };
 
 struct perf_session *perf_session__new(const char *filename, int mode, bool force);