--- linux/net/core/pktgen.c.orig
+++ linux/net/core/pktgen.c
@@ -268,7 +268,7 @@ static struct net_device *setup_inject(s
 	if (strlen(info->src_min) == 0) {
 		struct in_device *in_dev;
 
-		rcu_read_lock();
+		rcu_read_lock_down_read(&rtnl_sem);
 		in_dev = __in_dev_get(odev);
 		if (in_dev) {
 			if (in_dev->ifa_list) {
@@ -276,7 +276,7 @@ static struct net_device *setup_inject(s
 				info->saddr_max = info->saddr_min;
 			}
 		}
-		rcu_read_unlock();
+		rcu_read_unlock_up_read(&rtnl_sem);
 	}
 	else {
 		info->saddr_min = in_aton(info->src_min);
--- linux/net/core/netfilter.c.orig
+++ linux/net/core/netfilter.c
@@ -47,7 +47,7 @@ static DECLARE_MUTEX(nf_sockopt_mutex);
 
 struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
 static LIST_HEAD(nf_sockopts);
-static spinlock_t nf_hook_lock = SPIN_LOCK_UNLOCKED;
+static rwlock_t nf_hook_lock = RW_LOCK_UNLOCKED;
 
 /* 
  * A queue handler may be registered for each protocol.  Each is protected by
@@ -64,13 +64,13 @@ int nf_register_hook(struct nf_hook_ops 
 {
 	struct list_head *i;
 
-	spin_lock_bh(&nf_hook_lock);
+	write_lock_bh(&nf_hook_lock);
 	list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
 		if (reg->priority < ((struct nf_hook_ops *)i)->priority)
 			break;
 	}
 	list_add_rcu(&reg->list, i->prev);
-	spin_unlock_bh(&nf_hook_lock);
+	write_unlock_bh(&nf_hook_lock);
 
 	synchronize_net();
 	return 0;
@@ -78,9 +78,9 @@ int nf_register_hook(struct nf_hook_ops 
 
 void nf_unregister_hook(struct nf_hook_ops *reg)
 {
-	spin_lock_bh(&nf_hook_lock);
+	write_lock_bh(&nf_hook_lock);
 	list_del_rcu(&reg->list);
-	spin_unlock_bh(&nf_hook_lock);
+	write_unlock_bh(&nf_hook_lock);
 
 	synchronize_net();
 }
@@ -504,8 +504,15 @@ int nf_hook_slow(int pf, unsigned int ho
 	unsigned int verdict;
 	int ret = 0;
 
+	/*
+	 * PREEMPT_RT semantics: read-locks of different types
+	 * don't nest that easily:
+	 */
+//	rcu_read_lock_read(&ptype_lock);
+
 	/* We may already have this, but read-locks nest anyway */
-	rcu_read_lock();
+	// FIXME, HACK: complex locking dependencies here ...
+//	rcu_read_lock_read(&nf_hook_lock);
 
 #ifdef CONFIG_NETFILTER_DEBUG
 	if (skb->nf_debug & (1 << hook)) {
@@ -536,7 +543,9 @@ int nf_hook_slow(int pf, unsigned int ho
 		break;
 	}
 
-	rcu_read_unlock();
+//	rcu_read_unlock_read(&nf_hook_lock);
+//	rcu_read_unlock_read(&ptype_lock);
+
 	return ret;
 }
 
@@ -546,7 +555,8 @@ void nf_reinject(struct sk_buff *skb, st
 	struct list_head *elem = &info->elem->list;
 	struct list_head *i;
 
-	rcu_read_lock();
+//	rcu_read_lock_read(&ptype_lock);
+//	rcu_read_lock_read(&nf_hook_lock);
 
 	/* Release those devices we held, or Alexey will kill me. */
 	if (info->indev) dev_put(info->indev);
@@ -600,7 +610,8 @@ void nf_reinject(struct sk_buff *skb, st
 			goto next_hook;
 		break;
 	}
-	rcu_read_unlock();
+//	rcu_read_unlock_read(&nf_hook_lock);
+//	rcu_read_unlock_read(&ptype_lock);
 
 	if (verdict == NF_DROP)
 		kfree_skb(skb);
@@ -744,7 +755,7 @@ EXPORT_SYMBOL(skb_ip_make_writable);
 
 static nf_logfn *nf_logging[NPROTO]; /* = NULL */
 static int reported = 0;
-static spinlock_t nf_log_lock = SPIN_LOCK_UNLOCKED;
+static rwlock_t nf_log_lock = RW_LOCK_UNLOCKED;
 
 int nf_log_register(int pf, nf_logfn *logfn)
 {
@@ -752,21 +763,21 @@ int nf_log_register(int pf, nf_logfn *lo
 
 	/* Any setup of logging members must be done before
 	 * substituting pointer. */
-	spin_lock(&nf_log_lock);
+	write_lock(&nf_log_lock);
 	if (!nf_logging[pf]) {
 		rcu_assign_pointer(nf_logging[pf], logfn);
 		ret = 0;
 	}
-	spin_unlock(&nf_log_lock);
+	write_unlock(&nf_log_lock);
 	return ret;
 }		
 
 void nf_log_unregister(int pf, nf_logfn *logfn)
 {
-	spin_lock(&nf_log_lock);
+	write_lock(&nf_log_lock);
 	if (nf_logging[pf] == logfn)
 		nf_logging[pf] = NULL;
-	spin_unlock(&nf_log_lock);
+	write_unlock(&nf_log_lock);
 
 	/* Give time to concurrent readers. */
 	synchronize_net();
@@ -783,7 +794,7 @@ void nf_log_packet(int pf,
 	char prefix[NF_LOG_PREFIXLEN];
 	nf_logfn *logfn;
 	
-	rcu_read_lock();
+	rcu_read_lock_read(&nf_log_lock);
 	logfn = rcu_dereference(nf_logging[pf]);
 	if (logfn) {
 		va_start(args, fmt);
@@ -796,7 +807,7 @@ void nf_log_packet(int pf,
 		       "no backend logging module loaded in!\n");
 		reported++;
 	}
-	rcu_read_unlock();
+	rcu_read_unlock_read(&nf_log_lock);
 }
 EXPORT_SYMBOL(nf_log_register);
 EXPORT_SYMBOL(nf_log_unregister);
--- linux/net/core/netpoll.c.orig
+++ linux/net/core/netpoll.c
@@ -80,7 +80,9 @@ void netpoll_poll(struct netpoll *np)
 		return;
 
 	/* Process pending work on NIC */
+	WARN_ON_RT(irqs_disabled());
 	np->dev->poll_controller(np->dev);
+	WARN_ON_RT(irqs_disabled());
 
 	/* If scheduling is stopped, tickle NAPI bits */
 	spin_lock_irqsave(&netpoll_poll_lock, flags);
@@ -119,25 +121,28 @@ static void refill_skbs(void)
 
 static void zap_completion_queue(void)
 {
-	unsigned long flags;
 	struct softnet_data *sd = &get_cpu_var(softnet_data);
+	struct sk_buff *clist = NULL;
+	unsigned long flags;
 
 	if (sd->completion_queue) {
-		struct sk_buff *clist;
-
 		local_irq_save(flags);
 		clist = sd->completion_queue;
 		sd->completion_queue = NULL;
 		local_irq_restore(flags);
-
-		while (clist != NULL) {
-			struct sk_buff *skb = clist;
-			clist = clist->next;
-			__kfree_skb(skb);
-		}
 	}
 
+	/*
+	 * Took the list private, can drop our softnet
+	 * reference:
+	 */
 	put_cpu_var(softnet_data);
+
+	while (clist != NULL) {
+		struct sk_buff *skb = clist;
+		clist = clist->next;
+		__kfree_skb(skb);
+	}
 }
 
 static struct sk_buff * find_skb(struct netpoll *np, int len, int reserve)
@@ -189,7 +194,7 @@ repeat:
 	}
 
 	spin_lock(&np->dev->xmit_lock);
-	np->dev->xmit_lock_owner = smp_processor_id();
+	np->dev->xmit_lock_owner = _smp_processor_id();
 
 	/*
 	 * network drivers do not expect to be called if the queue is
@@ -608,18 +613,18 @@ int netpoll_setup(struct netpoll *np)
 		memcpy(np->local_mac, ndev->dev_addr, 6);
 
 	if (!np->local_ip) {
-		rcu_read_lock();
+		rcu_read_lock_down_read(&rtnl_sem);
 		in_dev = __in_dev_get(ndev);
 
 		if (!in_dev || !in_dev->ifa_list) {
-			rcu_read_unlock();
+			rcu_read_unlock_up_read(&rtnl_sem);
 			printk(KERN_ERR "%s: no IP address for %s, aborting\n",
 			       np->name, np->dev_name);
 			goto release;
 		}
 
 		np->local_ip = ntohl(in_dev->ifa_list->ifa_local);
-		rcu_read_unlock();
+		rcu_read_unlock_up_read(&rtnl_sem);
 		printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n",
 		       np->name, HIPQUAD(np->local_ip));
 	}
--- linux/net/core/dev.c.orig
+++ linux/net/core/dev.c
@@ -154,7 +154,7 @@
  *		86DD	IPv6
  */
 
-static spinlock_t ptype_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_RWLOCK(ptype_lock);
 static struct list_head ptype_base[16];	/* 16 way hashed list */
 static struct list_head ptype_all;		/* Taps */
 
@@ -272,7 +272,7 @@ void dev_add_pack(struct packet_type *pt
 {
 	int hash;
 
-	spin_lock_bh(&ptype_lock);
+	write_lock_bh(&ptype_lock);
 	if (pt->type == htons(ETH_P_ALL)) {
 		netdev_nit++;
 		list_add_rcu(&pt->list, &ptype_all);
@@ -280,7 +280,7 @@ void dev_add_pack(struct packet_type *pt
 		hash = ntohs(pt->type) & 15;
 		list_add_rcu(&pt->list, &ptype_base[hash]);
 	}
-	spin_unlock_bh(&ptype_lock);
+	write_unlock_bh(&ptype_lock);
 }
 
 extern void linkwatch_run_queue(void);
@@ -305,7 +305,7 @@ void __dev_remove_pack(struct packet_typ
 	struct list_head *head;
 	struct packet_type *pt1;
 
-	spin_lock_bh(&ptype_lock);
+	write_lock_bh(&ptype_lock);
 
 	if (pt->type == htons(ETH_P_ALL)) {
 		netdev_nit--;
@@ -322,7 +322,7 @@ void __dev_remove_pack(struct packet_typ
 
 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 out:
-	spin_unlock_bh(&ptype_lock);
+	write_unlock_bh(&ptype_lock);
 }
 /**
  *	dev_remove_pack	 - remove packet handler
@@ -1034,7 +1034,7 @@ void dev_queue_xmit_nit(struct sk_buff *
 	struct packet_type *ptype;
 	net_timestamp(&skb->stamp);
 
-	rcu_read_lock();
+//	rcu_read_lock_read(&ptype_lock);
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 		/* Never send packets back to the socket
 		 * they originated from - MvS (miquels@drinkel.ow.org)
@@ -1066,7 +1066,7 @@ void dev_queue_xmit_nit(struct sk_buff *
 			ptype->func(skb2, skb->dev, ptype);
 		}
 	}
-	rcu_read_unlock();
+//	rcu_read_unlock_read(&ptype_lock);
 }
 
 /*
@@ -1228,6 +1228,8 @@ int dev_queue_xmit(struct sk_buff *skb)
 	struct Qdisc *q;
 	int rc = -ENOMEM;
 
+//	rcu_read_lock_read(&ptype_lock); // for PREEMPT_RT
+
 	if (skb_shinfo(skb)->frag_list &&
 	    !(dev->features & NETIF_F_FRAGLIST) &&
 	    __skb_linearize(skb, GFP_ATOMIC))
@@ -1299,10 +1301,16 @@ int dev_queue_xmit(struct sk_buff *skb)
 	   Either shot noqueue qdisc, it is even simpler 8)
 	 */
 	if (dev->flags & IFF_UP) {
-		int cpu = smp_processor_id(); /* ok because BHs are off */
+		int cpu = _smp_processor_id(); /* ok because BHs are off */
 
+		/*
+		 * No need to check for recursion with threaded interrupts:
+		 */
+#ifdef CONFIG_PREEMPT_RT
+		if (1) {
+#else
 		if (dev->xmit_lock_owner != cpu) {
-
+#endif
 			HARD_TX_LOCK(dev, cpu);
 
 			if (!netif_queue_stopped(dev)) {
@@ -1333,9 +1341,11 @@ int dev_queue_xmit(struct sk_buff *skb)
 
 out_kfree_skb:
 	kfree_skb(skb);
+//	rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT
 	return rc;
 out:
 	local_bh_enable();
+//	rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT
 	return rc;
 }
 
@@ -1515,6 +1525,7 @@ static void net_tx_action(struct softirq
 {
 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
 
+//	rcu_read_lock_read(&ptype_lock); // for PREEMPT_RT
 	if (sd->completion_queue) {
 		struct sk_buff *clist;
 
@@ -1529,6 +1540,13 @@ static void net_tx_action(struct softirq
 
 			BUG_TRAP(!atomic_read(&skb->users));
 			__kfree_skb(skb);
+			/*
+			 * Safe to reschedule - the list is private
+			 * at this point.
+			 */
+//			rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT
+			cond_resched_all();
+//			rcu_read_lock_read(&ptype_lock); // for PREEMPT_RT
 		}
 	}
 
@@ -1551,10 +1569,17 @@ static void net_tx_action(struct softirq
 				qdisc_run(dev);
 				spin_unlock(&dev->queue_lock);
 			} else {
-				netif_schedule(dev);
+				/*
+				 * Don't re-kick the queue here, it will cause
+				 * excessive scheduling of ksoftirqd due
+				 * to retry. When the queue is released
+				 * it will be completed anyway.
+				 */
+//				netif_schedule(dev);
 			}
 		}
 	}
+//	rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT
 }
 
 static __inline__ int deliver_skb(struct sk_buff *skb,
@@ -1653,7 +1678,7 @@ int netif_receive_skb(struct sk_buff *sk
 
 	pt_prev = NULL;
 
-	rcu_read_lock();
+//	rcu_read_lock_read(&ptype_lock);
 
 #ifdef CONFIG_NET_CLS_ACT
 	if (skb->tc_verd & TC_NCLS) {
@@ -1715,7 +1740,7 @@ ncls:
 	}
 
 out:
-	rcu_read_unlock();
+//	rcu_read_unlock_read(&ptype_lock);
 	return ret;
 }
 
@@ -1769,11 +1794,12 @@ job_done:
 
 static void net_rx_action(struct softirq_action *h)
 {
-	struct softnet_data *queue = &__get_cpu_var(softnet_data);
+	struct softnet_data *queue;
 	unsigned long start_time = jiffies;
 	int budget = netdev_max_backlog;
 
 	local_irq_disable();
+	queue = &__get_cpu_var(softnet_data);
 
 	while (!list_empty(&queue->poll_list)) {
 		struct net_device *dev;
@@ -1783,10 +1809,16 @@ static void net_rx_action(struct softirq
 
 		local_irq_enable();
 
+		if (unlikely(cond_resched_all())) {
+			local_irq_disable();
+			continue;
+		}
 		dev = list_entry(queue->poll_list.next,
 				 struct net_device, poll_list);
 
+//		rcu_read_lock_read(&ptype_lock);
 		if (dev->quota <= 0 || dev->poll(dev, &budget)) {
+//			rcu_read_unlock_read(&ptype_lock);
 			local_irq_disable();
 			list_del(&dev->poll_list);
 			list_add_tail(&dev->poll_list, &queue->poll_list);
@@ -1796,6 +1828,7 @@ static void net_rx_action(struct softirq
 				dev->quota = dev->weight;
 		} else {
 			dev_put(dev);
+//			rcu_read_unlock_read(&ptype_lock);
 			local_irq_disable();
 		}
 
@@ -1808,8 +1841,10 @@ out:
 	return;
 
 softnet_break:
+	preempt_disable();
 	__get_cpu_var(netdev_rx_stat).time_squeeze++;
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	preempt_enable();
 	goto out;
 }
 
--- linux/net/core/rtnetlink.c.orig
+++ linux/net/core/rtnetlink.c
@@ -51,7 +51,7 @@
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 
-DECLARE_MUTEX(rtnl_sem);
+DECLARE_RWSEM(rtnl_sem);
 
 void rtnl_lock(void)
 {
@@ -608,7 +608,7 @@ static void rtnetlink_rcv(struct sock *s
 			kfree_skb(skb);
 		}
 
-		up(&rtnl_sem);
+		up_write(&rtnl_sem);
 
 		netdev_run_todo();
 	} while (rtnl && rtnl->sk_receive_queue.qlen);
--- linux/net/bridge/br_device.c.orig
+++ linux/net/bridge/br_device.c
@@ -40,7 +40,7 @@ int br_dev_xmit(struct sk_buff *skb, str
 	skb->mac.raw = skb->data;
 	skb_pull(skb, ETH_HLEN);
 
-	rcu_read_lock();
+	rcu_read_lock_spin(&br->hash_lock);
 	if (dest[0] & 1) 
 		br_flood_deliver(br, skb, 0);
 	else if ((dst = __br_fdb_get(br, dest)) != NULL)
@@ -48,7 +48,7 @@ int br_dev_xmit(struct sk_buff *skb, str
 	else
 		br_flood_deliver(br, skb, 0);
 
-	rcu_read_unlock();
+	rcu_read_unlock_spin(&br->hash_lock);
 	return 0;
 }
 
--- linux/net/bridge/br_ioctl.c.orig
+++ linux/net/bridge/br_ioctl.c
@@ -122,7 +122,7 @@ static int old_dev_ioctl(struct net_devi
 		struct __bridge_info b;
 
 		memset(&b, 0, sizeof(struct __bridge_info));
-		rcu_read_lock();
+		rcu_read_lock_spin(&br->hash_lock);
 		memcpy(&b.designated_root, &br->designated_root, 8);
 		memcpy(&b.bridge_id, &br->bridge_id, 8);
 		b.root_path_cost = br->root_path_cost;
@@ -141,7 +141,7 @@ static int old_dev_ioctl(struct net_devi
 		b.tcn_timer_value = br_timer_value(&br->tcn_timer);
 		b.topology_change_timer_value = br_timer_value(&br->topology_change_timer);
 		b.gc_timer_value = br_timer_value(&br->gc_timer);
-	        rcu_read_unlock();
+		rcu_read_unlock_spin(&br->hash_lock);
 
 		if (copy_to_user((void __user *)args[1], &b, sizeof(b)))
 			return -EFAULT;
@@ -219,9 +219,9 @@ static int old_dev_ioctl(struct net_devi
 		struct __port_info p;
 		struct net_bridge_port *pt;
 
-		rcu_read_lock();
+		rcu_read_lock_spin(&br->lock);
 		if ((pt = br_get_port(br, args[2])) == NULL) {
-			rcu_read_unlock();
+			rcu_read_unlock_spin(&br->lock);
 			return -EINVAL;
 		}
 
@@ -239,7 +239,7 @@ static int old_dev_ioctl(struct net_devi
 		p.forward_delay_timer_value = br_timer_value(&pt->forward_delay_timer);
 		p.hold_timer_value = br_timer_value(&pt->hold_timer);
 
-		rcu_read_unlock();
+		rcu_read_unlock_spin(&br->lock);
 
 		if (copy_to_user((void __user *)args[1], &p, sizeof(p)))
 			return -EFAULT;
--- linux/net/bridge/br_fdb.c.orig
+++ linux/net/bridge/br_fdb.c
@@ -211,11 +211,11 @@ struct net_bridge_fdb_entry *br_fdb_get(
 {
 	struct net_bridge_fdb_entry *fdb;
 
-	rcu_read_lock();
+	rcu_read_lock_spin(&br->hash_lock);
 	fdb = __br_fdb_get(br, addr);
 	if (fdb) 
 		atomic_inc(&fdb->use_count);
-	rcu_read_unlock();
+	rcu_read_unlock_spin(&br->hash_lock);
 	return fdb;
 }
 
@@ -247,7 +247,7 @@ int br_fdb_fillbuf(struct net_bridge *br
 
 	memset(buf, 0, maxnum*sizeof(struct __fdb_entry));
 
-	rcu_read_lock();
+	rcu_read_lock_spin(&br->hash_lock);
 	for (i = 0; i < BR_HASH_SIZE; i++) {
 		hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) {
 			if (num >= maxnum)
@@ -273,7 +273,7 @@ int br_fdb_fillbuf(struct net_bridge *br
 	}
 
  out:
-	rcu_read_unlock();
+	rcu_read_unlock_spin(&br->hash_lock);
 
 	return num;
 }
--- linux/net/sched/sch_generic.c.orig
+++ linux/net/sched/sch_generic.c
@@ -14,6 +14,7 @@
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <linux/bitops.h>
+#include <linux/kallsyms.h>
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -95,6 +96,7 @@ int qdisc_restart(struct net_device *dev
 	struct Qdisc *q = dev->qdisc;
 	struct sk_buff *skb;
 
+//	rcu_read_lock_read(&ptype_lock); // for PREEMPT_RT
 	/* Dequeue packet */
 	if ((skb = q->dequeue(q)) != NULL) {
 		unsigned nolock = (dev->features & NETIF_F_LLTX);
@@ -108,6 +110,10 @@ int qdisc_restart(struct net_device *dev
 		 * will be requeued.
 		 */
 		if (!nolock) {
+#ifdef CONFIG_PREEMPT_RT
+			spin_lock(&dev->xmit_lock);
+			dev->xmit_lock_owner = _smp_processor_id();
+#else
 			if (!spin_trylock(&dev->xmit_lock)) {
 			collision:
 				/* So, someone grabbed the driver. */
@@ -117,17 +123,19 @@ int qdisc_restart(struct net_device *dev
 				   it by checking xmit owner and drop the
 				   packet when deadloop is detected.
 				*/
-				if (dev->xmit_lock_owner == smp_processor_id()) {
+				if (dev->xmit_lock_owner == _smp_processor_id()) {
 					kfree_skb(skb);
 					if (net_ratelimit())
 						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
+//					rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT
 					return -1;
 				}
 				__get_cpu_var(netdev_rx_stat).cpu_collision++;
 				goto requeue;
 			}
 			/* Remember that the driver is grabbed by us. */
-			dev->xmit_lock_owner = smp_processor_id();
+			dev->xmit_lock_owner = _smp_processor_id();
+#endif
 		}
 		
 		{
@@ -139,18 +147,34 @@ int qdisc_restart(struct net_device *dev
 				if (netdev_nit)
 					dev_queue_xmit_nit(skb, dev);
 
+				WARN_ON_RT(irqs_disabled());
 				ret = dev->hard_start_xmit(skb, dev);
+#ifdef CONFIG_PREEMPT_RT
+				if (irqs_disabled()) {
+					if (printk_ratelimit())
+						print_symbol("network driver disabled interrupts: %s\n", (unsigned long)dev->hard_start_xmit);
+					local_irq_enable();
+				}
+#endif
 				if (ret == NETDEV_TX_OK) { 
 					if (!nolock) {
 						dev->xmit_lock_owner = -1;
 						spin_unlock(&dev->xmit_lock);
 					}
 					spin_lock(&dev->queue_lock);
+//					rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT
 					return -1;
 				}
 				if (ret == NETDEV_TX_LOCKED && nolock) {
 					spin_lock(&dev->queue_lock);
+#ifdef CONFIG_PREEMPT_RT
+					preempt_disable();
+					__get_cpu_var(netdev_rx_stat).cpu_collision++;
+					preempt_enable();
+					goto requeue;
+#else
 					goto collision; 
+#endif
 				}
 			}
 
@@ -177,8 +201,10 @@ int qdisc_restart(struct net_device *dev
 requeue:
 		q->ops->requeue(skb, q);
 		netif_schedule(dev);
+//		rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT
 		return 1;
 	}
+//	rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT
 	return q->q.qlen;
 }
 
--- linux/net/ipv6/protocol.c.orig
+++ linux/net/ipv6/protocol.c
@@ -40,14 +40,14 @@
 #include <net/protocol.h>
 
 struct inet6_protocol *inet6_protos[MAX_INET_PROTOS];
-static spinlock_t inet6_proto_lock = SPIN_LOCK_UNLOCKED;
+rwlock_t inet6_proto_lock = RW_LOCK_UNLOCKED;
 
 
 int inet6_add_protocol(struct inet6_protocol *prot, unsigned char protocol)
 {
 	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
 
-	spin_lock_bh(&inet6_proto_lock);
+	write_lock_bh(&inet6_proto_lock);
 
 	if (inet6_protos[hash]) {
 		ret = -1;
@@ -56,7 +56,7 @@ int inet6_add_protocol(struct inet6_prot
 		ret = 0;
 	}
 
-	spin_unlock_bh(&inet6_proto_lock);
+	write_unlock_bh(&inet6_proto_lock);
 
 	return ret;
 }
@@ -69,7 +69,7 @@ int inet6_del_protocol(struct inet6_prot
 {
 	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
 
-	spin_lock_bh(&inet6_proto_lock);
+	write_lock_bh(&inet6_proto_lock);
 
 	if (inet6_protos[hash] != prot) {
 		ret = -1;
@@ -78,7 +78,7 @@ int inet6_del_protocol(struct inet6_prot
 		ret = 0;
 	}
 
-	spin_unlock_bh(&inet6_proto_lock);
+	write_unlock_bh(&inet6_proto_lock);
 
 	synchronize_net();
 
--- linux/net/ipv6/af_inet6.c.orig
+++ linux/net/ipv6/af_inet6.c
@@ -94,7 +94,7 @@ atomic_t inet6_sock_nr;
  * build a new socket.
  */
 static struct list_head inetsw6[SOCK_MAX];
-static spinlock_t inetsw6_lock = SPIN_LOCK_UNLOCKED;
+static rwlock_t inetsw6_lock = RW_LOCK_UNLOCKED;
 
 static void inet6_sock_destruct(struct sock *sk)
 {
@@ -127,7 +127,7 @@ static int inet6_create(struct socket *s
 
 	/* Look for the requested type/protocol pair. */
 	answer = NULL;
-	rcu_read_lock();
+	rcu_read_lock_read(&inetsw6_lock);
 	list_for_each_rcu(p, &inetsw6[sock->type]) {
 		answer = list_entry(p, struct inet_protosw, list);
 
@@ -162,7 +162,7 @@ static int inet6_create(struct socket *s
 	answer_prot = answer->prot;
 	answer_no_check = answer->no_check;
 	answer_flags = answer->flags;
-	rcu_read_unlock();
+	rcu_read_unlock_read(&inetsw6_lock);
 
 	BUG_TRAP(answer_prot->slab != NULL);
 
@@ -242,7 +242,7 @@ static int inet6_create(struct socket *s
 out:
 	return rc;
 out_rcu_unlock:
-	rcu_read_unlock();
+	rcu_read_unlock_read(&inetsw6_lock);
 	goto out;
 }
 
@@ -564,7 +564,7 @@ inet6_register_protosw(struct inet_proto
 	int protocol = p->protocol;
 	struct list_head *last_perm;
 
-	spin_lock_bh(&inetsw6_lock);
+	write_lock_bh(&inetsw6_lock);
 
 	if (p->type >= SOCK_MAX)
 		goto out_illegal;
@@ -595,7 +595,7 @@ inet6_register_protosw(struct inet_proto
 	 */
 	list_add_rcu(&p->list, last_perm);
 out:
-	spin_unlock_bh(&inetsw6_lock);
+	write_unlock_bh(&inetsw6_lock);
 	return;
 
 out_permanent:
@@ -618,9 +618,9 @@ inet6_unregister_protosw(struct inet_pro
 		       "Attempt to unregister permanent protocol %d.\n",
 		       p->protocol);
 	} else {
-		spin_lock_bh(&inetsw6_lock);
+		write_lock_bh(&inetsw6_lock);
 		list_del_rcu(&p->list);
-		spin_unlock_bh(&inetsw6_lock);
+		write_unlock_bh(&inetsw6_lock);
 
 		synchronize_net();
 	}
--- linux/net/ipv6/ndisc.c.orig
+++ linux/net/ipv6/ndisc.c
@@ -289,17 +289,17 @@ static int ndisc_constructor(struct neig
 	struct neigh_parms *parms;
 	int is_multicast = ipv6_addr_is_multicast(addr);
 
-	rcu_read_lock();
+	rcu_read_lock_read(&addrconf_lock);
 	in6_dev = in6_dev_get(dev);
 	if (in6_dev == NULL) {
-		rcu_read_unlock();
+		rcu_read_unlock_read(&addrconf_lock);
 		return -EINVAL;
 	}
 
 	parms = in6_dev->nd_parms;
 	__neigh_parms_put(neigh->parms);
 	neigh->parms = neigh_parms_clone(parms);
-	rcu_read_unlock();
+	rcu_read_unlock_read(&addrconf_lock);
 
 	neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST;
 	if (dev->hard_header == NULL) {
--- linux/net/ipv6/icmp.c.orig
+++ linux/net/ipv6/icmp.c
@@ -537,11 +537,11 @@ static void icmpv6_notify(struct sk_buff
 
 	hash = nexthdr & (MAX_INET_PROTOS - 1);
 
-	rcu_read_lock();
+	rcu_read_lock_read(&inet6_proto_lock);
 	ipprot = rcu_dereference(inet6_protos[hash]);
 	if (ipprot && ipprot->err_handler)
 		ipprot->err_handler(skb, NULL, type, code, inner_offset, info);
-	rcu_read_unlock();
+	rcu_read_unlock_read(&inet6_proto_lock);
 
 	read_lock(&raw_v6_lock);
 	if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) {
--- linux/net/ipv6/ip6_input.c.orig
+++ linux/net/ipv6/ip6_input.c
@@ -156,7 +156,7 @@ static inline int ip6_input_finish(struc
 		skb->h.raw += (skb->h.raw[1]+1)<<3;
 	}
 
-	rcu_read_lock();
+	rcu_read_lock_read(&raw_v6_lock);
 resubmit:
 	if (!pskb_pull(skb, skb->h.raw - skb->data))
 		goto discard;
@@ -205,12 +205,12 @@ resubmit:
 			kfree_skb(skb);
 		}
 	}
-	rcu_read_unlock();
+	rcu_read_unlock_read(&raw_v6_lock);
 	return 0;
 
 discard:
 	IP6_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
-	rcu_read_unlock();
+	rcu_read_unlock_read(&raw_v6_lock);
 	kfree_skb(skb);
 	return 0;
 }
--- linux/net/packet/af_packet.c.orig
+++ linux/net/packet/af_packet.c
@@ -393,7 +393,6 @@ static int packet_sendmsg_spkt(struct ki
 	/*
 	 *	Now send it
 	 */
-
 	dev_queue_xmit(skb);
 	dev_put(dev);
 	return(len);
--- linux/net/sunrpc/sched.c.orig
+++ linux/net/sunrpc/sched.c
@@ -959,8 +959,6 @@ void rpc_killall_tasks(struct rpc_clnt *
 	spin_unlock(&rpc_sched_lock);
 }
 
-static DECLARE_MUTEX_LOCKED(rpciod_running);
-
 static void rpciod_killall(void)
 {
 	unsigned long flags;
--- linux/net/sunrpc/clnt.c.orig
+++ linux/net/sunrpc/clnt.c
@@ -231,7 +231,12 @@ rpc_shutdown_client(struct rpc_clnt *cln
 		clnt->cl_oneshot = 0;
 		clnt->cl_dead = 0;
 		rpc_killall_tasks(clnt);
-		sleep_on_timeout(&destroy_wait, 1*HZ);
+		/*
+		 * The condition is what ENDS the wait: sleep until no
+		 * users remain, or until the 1s timeout expires.
+		 */
+		wait_event_timeout(destroy_wait,
+			atomic_read(&clnt->cl_users) <= 0, 1*HZ);
 	}
 
 	if (atomic_read(&clnt->cl_users) < 0) {
--- linux/net/ipv4/devinet.c.orig
+++ linux/net/ipv4/devinet.c
@@ -214,16 +214,16 @@ static void inetdev_destroy(struct in_de
 
 int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
 {
-	rcu_read_lock();
+	rcu_read_lock_down_read(&rtnl_sem);
 	for_primary_ifa(in_dev) {
 		if (inet_ifa_match(a, ifa)) {
 			if (!b || inet_ifa_match(b, ifa)) {
-				rcu_read_unlock();
+				rcu_read_unlock_up_read(&rtnl_sem);
 				return 1;
 			}
 		}
 	} endfor_ifa(in_dev);
-	rcu_read_unlock();
+	rcu_read_unlock_up_read(&rtnl_sem);
 	return 0;
 }
 
@@ -772,7 +772,7 @@ u32 inet_select_addr(const struct net_de
 	u32 addr = 0;
 	struct in_device *in_dev;
 
-	rcu_read_lock();
+	rcu_read_lock_down_read(&rtnl_sem);
 	in_dev = __in_dev_get(dev);
 	if (!in_dev)
 		goto no_in_dev;
@@ -788,7 +788,7 @@ u32 inet_select_addr(const struct net_de
 			addr = ifa->ifa_local;
 	} endfor_ifa(in_dev);
 no_in_dev:
-	rcu_read_unlock();
+	rcu_read_unlock_up_read(&rtnl_sem);
 
 	if (addr)
 		goto out;
@@ -798,7 +798,7 @@ no_in_dev:
 	   in dev_base list.
 	 */
 	read_lock(&dev_base_lock);
-	rcu_read_lock();
+	rcu_read_lock_down_read(&rtnl_sem);
 	for (dev = dev_base; dev; dev = dev->next) {
 		if ((in_dev = __in_dev_get(dev)) == NULL)
 			continue;
@@ -812,8 +812,8 @@ no_in_dev:
 		} endfor_ifa(in_dev);
 	}
 out_unlock_both:
+	rcu_read_unlock_up_read(&rtnl_sem);
 	read_unlock(&dev_base_lock);
-	rcu_read_unlock();
 out:
 	return addr;
 }
@@ -868,16 +868,16 @@ u32 inet_confirm_addr(const struct net_d
 	struct in_device *in_dev;
 
 	if (dev) {
-		rcu_read_lock();
+		rcu_read_lock_down_read(&rtnl_sem);
 		if ((in_dev = __in_dev_get(dev)))
 			addr = confirm_addr_indev(in_dev, dst, local, scope);
-		rcu_read_unlock();
+		rcu_read_unlock_up_read(&rtnl_sem);
 
 		return addr;
 	}
 
 	read_lock(&dev_base_lock);
-	rcu_read_lock();
+	rcu_read_lock_down_read(&rtnl_sem);
 	for (dev = dev_base; dev; dev = dev->next) {
 		if ((in_dev = __in_dev_get(dev))) {
 			addr = confirm_addr_indev(in_dev, dst, local, scope);
@@ -885,7 +885,7 @@ u32 inet_confirm_addr(const struct net_d
 				break;
 		}
 	}
-	rcu_read_unlock();
+	rcu_read_unlock_up_read(&rtnl_sem);
 	read_unlock(&dev_base_lock);
 
 	return addr;
@@ -1054,9 +1054,9 @@ static int inet_dump_ifaddr(struct sk_bu
 			continue;
 		if (idx > s_idx)
 			s_ip_idx = 0;
-		rcu_read_lock();
+		rcu_read_lock_down_read(&rtnl_sem);
 		if ((in_dev = __in_dev_get(dev)) == NULL) {
-			rcu_read_unlock();
+			rcu_read_unlock_up_read(&rtnl_sem);
 			continue;
 		}
 
@@ -1067,11 +1067,11 @@ static int inet_dump_ifaddr(struct sk_bu
 			if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
 					     cb->nlh->nlmsg_seq,
 					     RTM_NEWADDR) <= 0) {
-				rcu_read_unlock();
+				rcu_read_unlock_up_read(&rtnl_sem);
 				goto done;
 			}
 		}
-		rcu_read_unlock();
+		rcu_read_unlock_up_read(&rtnl_sem);
 	}
 
 done:
@@ -1125,11 +1125,11 @@ void inet_forward_change(void)
 	read_lock(&dev_base_lock);
 	for (dev = dev_base; dev; dev = dev->next) {
 		struct in_device *in_dev;
-		rcu_read_lock();
+		rcu_read_lock_down_read(&rtnl_sem);
 		in_dev = __in_dev_get(dev);
 		if (in_dev)
 			in_dev->cnf.forwarding = on;
-		rcu_read_unlock();
+		rcu_read_unlock_up_read(&rtnl_sem);
 	}
 	read_unlock(&dev_base_lock);
 
--- linux/net/ipv4/arp.c.orig
+++ linux/net/ipv4/arp.c
@@ -237,17 +237,17 @@ static int arp_constructor(struct neighb
 
 	neigh->type = inet_addr_type(addr);
 
-	rcu_read_lock();
+	rcu_read_lock_down_read(&rtnl_sem);
 	in_dev = rcu_dereference(__in_dev_get(dev));
 	if (in_dev == NULL) {
-		rcu_read_unlock();
+		rcu_read_unlock_up_read(&rtnl_sem);
 		return -EINVAL;
 	}
 
 	parms = in_dev->arp_parms;
 	__neigh_parms_put(neigh->parms);
 	neigh->parms = neigh_parms_clone(parms);
-	rcu_read_unlock();
+	rcu_read_unlock_up_read(&rtnl_sem);
 
 	if (dev->hard_header == NULL) {
 		neigh->nud_state = NUD_NOARP;
--- linux/net/ipv4/ip_input.c.orig
+++ linux/net/ipv4/ip_input.c
@@ -213,7 +213,7 @@ static inline int ip_local_deliver_finis
         /* Point into the IP datagram, just past the header. */
         skb->h.raw = skb->data;
 
-	rcu_read_lock();
+	rcu_read_lock_read(&inet_proto_lock);
 	{
 		/* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
 		int protocol = skb->nh.iph->protocol;
@@ -258,7 +258,7 @@ static inline int ip_local_deliver_finis
 		}
 	}
  out:
-	rcu_read_unlock();
+	rcu_read_unlock_read(&inet_proto_lock);
 
 	return 0;
 }
--- linux/net/ipv4/tcp_timer.c.orig
+++ linux/net/ipv4/tcp_timer.c
@@ -210,6 +210,7 @@ static void tcp_delack_timer(unsigned lo
 	struct sock *sk = (struct sock*)data;
 	struct tcp_opt *tp = tcp_sk(sk);
 
+//	rcu_read_lock_read(&ptype_lock);
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
 		/* Try again later. */
@@ -263,6 +264,7 @@ out:
 		sk_stream_mem_reclaim(sk);
 out_unlock:
 	bh_unlock_sock(sk);
+//	rcu_read_unlock_read(&ptype_lock);
 	sock_put(sk);
 }
 
@@ -421,6 +423,7 @@ static void tcp_write_timer(unsigned lon
 	struct tcp_opt *tp = tcp_sk(sk);
 	int event;
 
+//	rcu_read_lock_read(&ptype_lock);
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
 		/* Try again later */
@@ -452,6 +455,7 @@ static void tcp_write_timer(unsigned lon
 out:
 	sk_stream_mem_reclaim(sk);
 out_unlock:
+//	rcu_read_unlock_read(&ptype_lock);
 	bh_unlock_sock(sk);
 	sock_put(sk);
 }
@@ -577,6 +581,7 @@ static void tcp_keepalive_timer (unsigne
 	__u32 elapsed;
 
 	/* Only process if socket is not in use. */
+//	rcu_read_lock_read(&ptype_lock);
 	bh_lock_sock(sk);
 	if (sock_owned_by_user(sk)) {
 		/* Try again later. */ 
@@ -646,6 +651,7 @@ death:	
 
 out:
 	bh_unlock_sock(sk);
+//	rcu_read_unlock_read(&ptype_lock);
 	sock_put(sk);
 }
 
--- linux/net/ipv4/fib_frontend.c.orig
+++ linux/net/ipv4/fib_frontend.c
@@ -172,13 +172,13 @@ int fib_validate_source(u32 src, u32 dst
 	int ret;
 
 	no_addr = rpf = 0;
-	rcu_read_lock();
+	rcu_read_lock_down_read(&rtnl_sem);
 	in_dev = __in_dev_get(dev);
 	if (in_dev) {
 		no_addr = in_dev->ifa_list == NULL;
 		rpf = IN_DEV_RPFILTER(in_dev);
 	}
-	rcu_read_unlock();
+	rcu_read_unlock_up_read(&rtnl_sem);
 
 	if (in_dev == NULL)
 		goto e_inval;
--- linux/net/ipv4/netfilter/ip_tables.c.orig
+++ linux/net/ipv4/netfilter/ip_tables.c
@@ -110,7 +110,11 @@ struct ipt_table_info
 static LIST_HEAD(ipt_target);
 static LIST_HEAD(ipt_match);
 static LIST_HEAD(ipt_tables);
-#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
+/*
+ * Atomic adds: on PREEMPT_RT the same table might be used on two CPUs at once.
+ * NOTE(review): the casts assume bcnt/pcnt are laid out like atomic_t - confirm.
+ */
+#define ADD_COUNTER(c,b,p) do { atomic_add((b), (atomic_t *)(&(c).bcnt)); atomic_add((p), (atomic_t *)(&(c).pcnt)); } while(0)
 
 #ifdef CONFIG_SMP
 #define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
@@ -289,8 +293,17 @@ ipt_do_table(struct sk_buff **pskb,
 
 	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
+	/*
+	 * On a PREEMPT_RT kernel the task could be scheduled
+	 * off, so smp_processor_id() is not safe. So we take
+	 * the current value of the CPU and use that table. We
+	 * only update the counters while read-locking the table
+	 * and don't change the rules, so the possibility of the
+	 * same table being used by two tasks at once is not a
+	 * problem.
+	 */
 	table_base = (void *)table->private->entries
-		+ TABLE_OFFSET(table->private, smp_processor_id());
+		+ TABLE_OFFSET(table->private, _smp_processor_id());
 	e = get_entry(table_base, table->private->hook_entry[hook]);
 
 #ifdef CONFIG_NETFILTER_DEBUG
@@ -298,7 +311,7 @@ ipt_do_table(struct sk_buff **pskb,
 	if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac
 	    && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) {
 		printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n",
-		       smp_processor_id(),
+		       _smp_processor_id(),
 		       table->name,
 		       &((struct ipt_entry *)table_base)->comefrom,
 		       ((struct ipt_entry *)table_base)->comefrom);
--- linux/net/ipv4/protocol.c.orig
+++ linux/net/ipv4/protocol.c
@@ -49,7 +49,7 @@
 #include <linux/igmp.h>
 
 struct net_protocol *inet_protos[MAX_INET_PROTOS];
-static spinlock_t inet_proto_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_RWLOCK(inet_proto_lock);
 
 /*
  *	Add a protocol handler to the hash tables
@@ -61,14 +61,14 @@ int inet_add_protocol(struct net_protoco
 
 	hash = protocol & (MAX_INET_PROTOS - 1);
 
-	spin_lock_bh(&inet_proto_lock);
+	write_lock_bh(&inet_proto_lock);
 	if (inet_protos[hash]) {
 		ret = -1;
 	} else {
 		inet_protos[hash] = prot;
 		ret = 0;
 	}
-	spin_unlock_bh(&inet_proto_lock);
+	write_unlock_bh(&inet_proto_lock);
 
 	return ret;
 }
@@ -83,14 +83,14 @@ int inet_del_protocol(struct net_protoco
 
 	hash = protocol & (MAX_INET_PROTOS - 1);
 
-	spin_lock_bh(&inet_proto_lock);
+	write_lock_bh(&inet_proto_lock);
 	if (inet_protos[hash] == prot) {
 		inet_protos[hash] = NULL;
 		ret = 0;
 	} else {
 		ret = -1;
 	}
-	spin_unlock_bh(&inet_proto_lock);
+	write_unlock_bh(&inet_proto_lock);
 
 	synchronize_net();
 
--- linux/net/ipv4/route.c.orig
+++ linux/net/ipv4/route.c
@@ -196,7 +196,7 @@ __u8 ip_tos2prio[16] = {
 
 struct rt_hash_bucket {
 	struct rtable	*chain;
-	spinlock_t	lock;
+	rwlock_t	lock;
 } __attribute__((__aligned__(8)));
 
 static struct rt_hash_bucket 	*rt_hash_table;
@@ -226,11 +226,11 @@ static struct rtable *rt_cache_get_first
 	struct rt_cache_iter_state *st = seq->private;
 
 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
-		rcu_read_lock_bh();
+		rcu_read_lock_bh_read(&rt_hash_table[st->bucket].lock);
 		r = rt_hash_table[st->bucket].chain;
 		if (r)
 			break;
-		rcu_read_unlock_bh();
+		rcu_read_unlock_bh_read(&rt_hash_table[st->bucket].lock);
 	}
 	return r;
 }
@@ -241,10 +241,10 @@ static struct rtable *rt_cache_get_next(
 
 	r = r->u.rt_next;
 	while (!r) {
-		rcu_read_unlock_bh();
+		rcu_read_unlock_bh_read(&rt_hash_table[st->bucket].lock);
 		if (--st->bucket < 0)
 			break;
-		rcu_read_lock_bh();
+		rcu_read_lock_bh_read(&rt_hash_table[st->bucket].lock);
 		r = rt_hash_table[st->bucket].chain;
 	}
 	return r;
@@ -279,8 +279,10 @@ static void *rt_cache_seq_next(struct se
 
 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 {
+	struct rt_cache_iter_state *st = seq->private;
+
 	if (v && v != SEQ_START_TOKEN)
-		rcu_read_unlock_bh();
+		rcu_read_unlock_bh_read(&rt_hash_table[st->bucket].lock);
 }
 
 static int rt_cache_seq_show(struct seq_file *seq, void *v)
@@ -527,7 +529,7 @@ static void rt_check_expire(unsigned lon
 		i = (i + 1) & rt_hash_mask;
 		rthp = &rt_hash_table[i].chain;
 
-		spin_lock(&rt_hash_table[i].lock);
+		write_lock(&rt_hash_table[i].lock);
 		while ((rth = *rthp) != NULL) {
 			if (rth->u.dst.expires) {
 				/* Entry is expired even if it is in use */
@@ -546,7 +548,7 @@ static void rt_check_expire(unsigned lon
 			*rthp = rth->u.rt_next;
 			rt_free(rth);
 		}
-		spin_unlock(&rt_hash_table[i].lock);
+		write_unlock(&rt_hash_table[i].lock);
 
 		/* Fallback loop breaker. */
 		if (time_after(jiffies, now))
@@ -569,11 +571,12 @@ static void rt_run_flush(unsigned long d
 	get_random_bytes(&rt_hash_rnd, 4);
 
 	for (i = rt_hash_mask; i >= 0; i--) {
-		spin_lock_bh(&rt_hash_table[i].lock);
+		write_lock_bh(&rt_hash_table[i].lock);
 		rth = rt_hash_table[i].chain;
 		if (rth)
 			rt_hash_table[i].chain = NULL;
-		spin_unlock_bh(&rt_hash_table[i].lock);
+		write_unlock_bh(&rt_hash_table[i].lock);
+		cond_resched_all();
 
 		for (; rth; rth = next) {
 			next = rth->u.rt_next;
@@ -582,7 +585,7 @@ static void rt_run_flush(unsigned long d
 	}
 }
 
-static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(rt_flush_lock);
 
 void rt_cache_flush(int delay)
 {
@@ -703,7 +706,7 @@ static int rt_garbage_collect(void)
 
 			k = (k + 1) & rt_hash_mask;
 			rthp = &rt_hash_table[k].chain;
-			spin_lock_bh(&rt_hash_table[k].lock);
+			write_lock_bh(&rt_hash_table[k].lock);
 			while ((rth = *rthp) != NULL) {
 				if (!rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
@@ -714,7 +717,7 @@ static int rt_garbage_collect(void)
 				rt_free(rth);
 				goal--;
 			}
-			spin_unlock_bh(&rt_hash_table[k].lock);
+			write_unlock_bh(&rt_hash_table[k].lock);
 			if (goal <= 0)
 				break;
 		}
@@ -791,7 +794,7 @@ restart:
 
 	rthp = &rt_hash_table[hash].chain;
 
-	spin_lock_bh(&rt_hash_table[hash].lock);
+	write_lock_bh(&rt_hash_table[hash].lock);
 	while ((rth = *rthp) != NULL) {
 		if (compare_keys(&rth->fl, &rt->fl)) {
 			/* Put it first */
@@ -812,7 +815,7 @@ restart:
 			rth->u.dst.__use++;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.lastuse = now;
-			spin_unlock_bh(&rt_hash_table[hash].lock);
+			write_unlock_bh(&rt_hash_table[hash].lock);
 
 			rt_drop(rt);
 			*rp = rth;
@@ -853,7 +856,7 @@ restart:
 	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 		int err = arp_bind_neighbour(&rt->u.dst);
 		if (err) {
-			spin_unlock_bh(&rt_hash_table[hash].lock);
+			write_unlock_bh(&rt_hash_table[hash].lock);
 
 			if (err != -ENOBUFS) {
 				rt_drop(rt);
@@ -894,14 +897,14 @@ restart:
 	}
 #endif
 	rt_hash_table[hash].chain = rt;
-	spin_unlock_bh(&rt_hash_table[hash].lock);
+	write_unlock_bh(&rt_hash_table[hash].lock);
 	*rp = rt;
 	return 0;
 }
 
 void rt_bind_peer(struct rtable *rt, int create)
 {
-	static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(rt_peer_lock);
 	struct inet_peer *peer;
 
 	peer = inet_getpeer(rt->rt_dst, create);
@@ -925,7 +928,7 @@ void rt_bind_peer(struct rtable *rt, int
  */
 static void ip_select_fb_ident(struct iphdr *iph)
 {
-	static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(ip_fb_id_lock);
 	static u32 ip_fallback_id;
 	u32 salt;
 
@@ -961,7 +964,7 @@ static void rt_del(unsigned hash, struct
 {
 	struct rtable **rthp;
 
-	spin_lock_bh(&rt_hash_table[hash].lock);
+	write_lock_bh(&rt_hash_table[hash].lock);
 	ip_rt_put(rt);
 	for (rthp = &rt_hash_table[hash].chain; *rthp;
 	     rthp = &(*rthp)->u.rt_next)
@@ -970,7 +973,7 @@ static void rt_del(unsigned hash, struct
 			rt_free(rt);
 			break;
 		}
-	spin_unlock_bh(&rt_hash_table[hash].lock);
+	write_unlock_bh(&rt_hash_table[hash].lock);
 }
 
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -1009,7 +1012,7 @@ void ip_rt_redirect(u32 old_gw, u32 dadd
 
 			rthp=&rt_hash_table[hash].chain;
 
-			rcu_read_lock();
+			rcu_read_lock_read(&rt_hash_table[hash].lock);
 			while ((rth = rcu_dereference(*rthp)) != NULL) {
 				struct rtable *rt;
 
@@ -1030,7 +1033,7 @@ void ip_rt_redirect(u32 old_gw, u32 dadd
 					break;
 
 				dst_hold(&rth->u.dst);
-				rcu_read_unlock();
+				rcu_read_unlock_read(&rt_hash_table[hash].lock);
 
 				rt = dst_alloc(&ipv4_dst_ops);
 				if (rt == NULL) {
@@ -1082,7 +1085,7 @@ void ip_rt_redirect(u32 old_gw, u32 dadd
 					ip_rt_put(rt);
 				goto do_next;
 			}
-			rcu_read_unlock();
+			rcu_read_unlock_read(&rt_hash_table[hash].lock);
 		do_next:
 			;
 		}
@@ -1263,7 +1266,7 @@ unsigned short ip_rt_frag_needed(struct 
 	for (i = 0; i < 2; i++) {
 		unsigned hash = rt_hash_code(daddr, skeys[i], tos);
 
-		rcu_read_lock();
+		rcu_read_lock_read(&rt_hash_table[hash].lock);
 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 		     rth = rcu_dereference(rth->u.rt_next)) {
 			if (rth->fl.fl4_dst == daddr &&
@@ -1301,7 +1304,7 @@ unsigned short ip_rt_frag_needed(struct 
 				}
 			}
 		}
-		rcu_read_unlock();
+		rcu_read_unlock_read(&rt_hash_table[hash].lock);
 	}
 	return est_mtu ? : new_mtu;
 }
@@ -1823,7 +1826,7 @@ int ip_route_input(struct sk_buff *skb, 
 	tos &= IPTOS_RT_MASK;
 	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
 
-	rcu_read_lock();
+	rcu_read_lock_read(&rt_hash_table[hash].lock);
 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 	     rth = rcu_dereference(rth->u.rt_next)) {
 		if (rth->fl.fl4_dst == daddr &&
@@ -1838,13 +1841,13 @@ int ip_route_input(struct sk_buff *skb, 
 			dst_hold(&rth->u.dst);
 			rth->u.dst.__use++;
 			RT_CACHE_STAT_INC(in_hit);
-			rcu_read_unlock();
+			rcu_read_unlock_read(&rt_hash_table[hash].lock);
 			skb->dst = (struct dst_entry*)rth;
 			return 0;
 		}
 		RT_CACHE_STAT_INC(in_hlist_search);
 	}
-	rcu_read_unlock();
+	rcu_read_unlock_read(&rt_hash_table[hash].lock);
 
 	/* Multicast recognition logic is moved from route cache to here.
 	   The problem was that too many Ethernet cards have broken/missing
@@ -1860,7 +1863,7 @@ int ip_route_input(struct sk_buff *skb, 
 	if (MULTICAST(daddr)) {
 		struct in_device *in_dev;
 
-		rcu_read_lock();
+		rcu_read_lock_down_read(&rtnl_sem);
 		if ((in_dev = __in_dev_get(dev)) != NULL) {
 			int our = ip_check_mc(in_dev, daddr, saddr,
 				skb->nh.iph->protocol);
@@ -1869,12 +1872,12 @@ int ip_route_input(struct sk_buff *skb, 
 			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
 #endif
 			    ) {
-				rcu_read_unlock();
+				rcu_read_unlock_up_read(&rtnl_sem);
 				return ip_route_input_mc(skb, daddr, saddr,
 							 tos, dev, our);
 			}
 		}
-		rcu_read_unlock();
+		rcu_read_unlock_up_read(&rtnl_sem);
 		return -EINVAL;
 	}
 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
@@ -2184,7 +2187,7 @@ int __ip_route_output_key(struct rtable 
 
 	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
 
-	rcu_read_lock_bh();
+	rcu_read_lock_bh_read(&rt_hash_table[hash].lock);
 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 		rth = rcu_dereference(rth->u.rt_next)) {
 		if (rth->fl.fl4_dst == flp->fl4_dst &&
@@ -2200,13 +2203,13 @@ int __ip_route_output_key(struct rtable 
 			dst_hold(&rth->u.dst);
 			rth->u.dst.__use++;
 			RT_CACHE_STAT_INC(out_hit);
-			rcu_read_unlock_bh();
+			rcu_read_unlock_bh_read(&rt_hash_table[hash].lock);
 			*rp = rth;
 			return 0;
 		}
 		RT_CACHE_STAT_INC(out_hlist_search);
 	}
-	rcu_read_unlock_bh();
+	rcu_read_unlock_bh_read(&rt_hash_table[hash].lock);
 
 	return ip_route_output_slow(rp, flp);
 }
@@ -2421,7 +2424,7 @@ int ip_rt_dump(struct sk_buff *skb,  str
 		if (h < s_h) continue;
 		if (h > s_h)
 			s_idx = 0;
-		rcu_read_lock_bh();
+		rcu_read_lock_bh_read(&rt_hash_table[h].lock);
 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
 		     rt = rcu_dereference(rt->u.rt_next), idx++) {
 			if (idx < s_idx)
@@ -2431,12 +2434,12 @@ int ip_rt_dump(struct sk_buff *skb,  str
 					 cb->nlh->nlmsg_seq,
 					 RTM_NEWROUTE, 1) <= 0) {
 				dst_release(xchg(&skb->dst, NULL));
-				rcu_read_unlock_bh();
+				rcu_read_unlock_bh_read(&rt_hash_table[h].lock);
 				goto done;
 			}
 			dst_release(xchg(&skb->dst, NULL));
 		}
-		rcu_read_unlock_bh();
+		rcu_read_unlock_bh_read(&rt_hash_table[h].lock);
 	}
 
 done:
@@ -2755,7 +2758,7 @@ int __init ip_rt_init(void)
 
 	rt_hash_mask--;
 	for (i = 0; i <= rt_hash_mask; i++) {
-		spin_lock_init(&rt_hash_table[i].lock);
+		rwlock_init(&rt_hash_table[i].lock);
 		rt_hash_table[i].chain = NULL;
 	}
 
--- linux/net/ipv4/inetpeer.c.orig
+++ linux/net/ipv4/inetpeer.c
@@ -70,7 +70,7 @@
  */
 
 /* Exported for inet_getid inline function.  */
-spinlock_t inet_peer_idlock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(inet_peer_idlock);
 
 static kmem_cache_t *peer_cachep;
 
@@ -95,7 +95,7 @@ int inet_peer_maxttl = 10 * 60 * HZ;	/* 
 /* Exported for inet_putpeer inline function.  */
 struct inet_peer *inet_peer_unused_head,
 		**inet_peer_unused_tailp = &inet_peer_unused_head;
-spinlock_t inet_peer_unused_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(inet_peer_unused_lock);
 #define PEER_MAX_CLEANUP_WORK 30
 
 static void peer_check_expire(unsigned long dummy);
--- linux/net/ipv4/af_inet.c.orig
+++ linux/net/ipv4/af_inet.c
@@ -125,7 +125,7 @@ extern void ip_mc_drop_socket(struct soc
  * build a new socket.
  */
 static struct list_head inetsw[SOCK_MAX];
-static spinlock_t inetsw_lock = SPIN_LOCK_UNLOCKED;
+static rwlock_t inetsw_lock = RW_LOCK_UNLOCKED;
 
 /* New destruction routine */
 
@@ -242,7 +242,7 @@ static int inet_create(struct socket *so
 
 	/* Look for the requested type/protocol pair. */
 	answer = NULL;
-	rcu_read_lock();
+	rcu_read_lock_read(&inetsw_lock);
 	list_for_each_rcu(p, &inetsw[sock->type]) {
 		answer = list_entry(p, struct inet_protosw, list);
 
@@ -276,7 +276,7 @@ static int inet_create(struct socket *so
 	answer_prot = answer->prot;
 	answer_no_check = answer->no_check;
 	answer_flags = answer->flags;
-	rcu_read_unlock();
+	rcu_read_unlock_read(&inetsw_lock);
 
 	BUG_TRAP(answer_prot->slab != NULL);
 
@@ -345,7 +345,7 @@ static int inet_create(struct socket *so
 out:
 	return err;
 out_rcu_unlock:
-	rcu_read_unlock();
+	rcu_read_unlock_read(&inetsw_lock);
 	goto out;
 }
 
@@ -902,7 +902,7 @@ void inet_register_protosw(struct inet_p
 	int protocol = p->protocol;
 	struct list_head *last_perm;
 
-	spin_lock_bh(&inetsw_lock);
+	write_lock_bh(&inetsw_lock);
 
 	if (p->type >= SOCK_MAX)
 		goto out_illegal;
@@ -933,7 +933,7 @@ void inet_register_protosw(struct inet_p
 	 */
 	list_add_rcu(&p->list, last_perm);
 out:
-	spin_unlock_bh(&inetsw_lock);
+	write_unlock_bh(&inetsw_lock);
 
 	synchronize_net();
 
@@ -958,9 +958,9 @@ void inet_unregister_protosw(struct inet
 		       "Attempt to unregister permanent protocol %d.\n",
 		       p->protocol);
 	} else {
-		spin_lock_bh(&inetsw_lock);
+		write_lock_bh(&inetsw_lock);
 		list_del_rcu(&p->list);
-		spin_unlock_bh(&inetsw_lock);
+		write_unlock_bh(&inetsw_lock);
 
 		synchronize_net();
 	}
--- linux/net/ipv4/tcp_minisocks.c.orig
+++ linux/net/ipv4/tcp_minisocks.c
@@ -417,7 +417,7 @@ static void tcp_twkill(unsigned long);
 #define TCP_TWKILL_QUOTA	100
 
 static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
-static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(tw_death_lock);
 static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
 static void twkill_work(void *);
 static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
@@ -512,7 +512,7 @@ static void twkill_work(void *dummy)
 				continue;
 
 			while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
-				if (need_resched()) {
+				if (softirq_need_resched()) {
 					spin_unlock_bh(&tw_death_lock);
 					schedule();
 					spin_lock_bh(&tw_death_lock);
--- linux/net/ipv4/tcp_ipv4.c.orig
+++ linux/net/ipv4/tcp_ipv4.c
@@ -1015,6 +1015,7 @@ void tcp_v4_err(struct sk_buff *skb, u32
 		return;
 	}
 
+//	rcu_read_lock_read(&ptype_lock);
 	bh_lock_sock(sk);
 	/* If too many ICMPs get dropped on busy
 	 * servers this needs to be solved differently.
@@ -1132,6 +1133,7 @@ void tcp_v4_err(struct sk_buff *skb, u32
 
 out:
 	bh_unlock_sock(sk);
+//	rcu_read_unlock_read(&ptype_lock);
 	sock_put(sk);
 }
 
@@ -1789,6 +1791,7 @@ process:
 
 	skb->dev = NULL;
 
+//	rcu_read_lock_read(&ptype_lock);
 	bh_lock_sock(sk);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
@@ -1797,6 +1800,7 @@ process:
 	} else
 		sk_add_backlog(sk, skb);
 	bh_unlock_sock(sk);
+//	rcu_read_unlock_read(&ptype_lock);
 
 	sock_put(sk);
 
--- linux/net/ipv4/icmp.c.orig
+++ linux/net/ipv4/icmp.c
@@ -701,11 +701,11 @@ static void icmp_unreach(struct sk_buff 
 	}
 	read_unlock(&raw_v4_lock);
 
-	rcu_read_lock();
+	rcu_read_lock_read(&inet_proto_lock);
 	ipprot = rcu_dereference(inet_protos[hash]);
 	if (ipprot && ipprot->err_handler)
 		ipprot->err_handler(skb, info);
-	rcu_read_unlock();
+	rcu_read_unlock_read(&inet_proto_lock);
 
 out:
 	return;
@@ -883,7 +883,7 @@ static void icmp_address_reply(struct sk
 	in_dev = in_dev_get(dev);
 	if (!in_dev)
 		goto out;
-	rcu_read_lock();
+	rcu_read_lock_down_read(&rtnl_sem);
 	if (in_dev->ifa_list &&
 	    IN_DEV_LOG_MARTIANS(in_dev) &&
 	    IN_DEV_FORWARD(in_dev)) {
@@ -903,7 +903,7 @@ static void icmp_address_reply(struct sk
 			       NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src));
 		}
 	}
-	rcu_read_unlock();
+	rcu_read_unlock_up_read(&rtnl_sem);
 	in_dev_put(in_dev);
 out:;
 }
--- linux/net/ipv4/ip_output.c.orig
+++ linux/net/ipv4/ip_output.c
@@ -1302,6 +1302,7 @@ void ip_send_reply(struct sock *sk, stru
 	   Note that it uses the fact, that this function is called
 	   with locally disabled BH and that sk cannot be already spinlocked.
 	 */
+//	rcu_read_lock_read(&ptype_lock);
 	bh_lock_sock(sk);
 	inet->tos = skb->nh.iph->tos;
 	sk->sk_priority = skb->priority;
@@ -1316,6 +1317,7 @@ void ip_send_reply(struct sock *sk, stru
 	}
 
 	bh_unlock_sock(sk);
+//	rcu_read_unlock_read(&ptype_lock);
 
 	ip_rt_put(rt);
 }
--- linux/net/802/psnap.c.orig
+++ linux/net/802/psnap.c
@@ -55,7 +55,7 @@ static int snap_rcv(struct sk_buff *skb,
 		.type = __constant_htons(ETH_P_SNAP),
 	};
 
-	rcu_read_lock();
+	rcu_read_lock_spin(&snap_lock);
 	proto = find_snap_client(skb->h.raw);
 	if (proto) {
 		/* Pass the frame on. */
@@ -68,7 +68,7 @@ static int snap_rcv(struct sk_buff *skb,
 		rc = 1;
 	}
 
-	rcu_read_unlock();
+	rcu_read_unlock_spin(&snap_lock);
 	return rc;
 }
 
--- linux/sound/core/oss/pcm_oss.c.orig
+++ linux/sound/core/oss/pcm_oss.c
@@ -1918,7 +1918,7 @@ static int snd_pcm_oss_release(struct in
 	return 0;
 }
 
-static inline int _snd_pcm_oss_ioctl(struct inode *inode, struct file *file,
+static inline int snd_pcm_oss_ioctl(struct inode *inode, struct file *file,
 				     unsigned int cmd, unsigned long arg)
 {
 	snd_pcm_oss_file_t *pcm_oss_file;
@@ -2078,17 +2078,6 @@ static inline int _snd_pcm_oss_ioctl(str
 	return -EINVAL;
 }
 
-/* FIXME: need to unlock BKL to allow preemption */
-static int snd_pcm_oss_ioctl(struct inode *inode, struct file *file,
-			     unsigned int cmd, unsigned long arg)
-{
-	int err;
-	unlock_kernel();
-	err = _snd_pcm_oss_ioctl(inode, file, cmd, arg);
-	lock_kernel();
-	return err;
-}
-
 static ssize_t snd_pcm_oss_read(struct file *file, char __user *buf, size_t count, loff_t *offset)
 {
 	snd_pcm_oss_file_t *pcm_oss_file;
@@ -2119,9 +2108,7 @@ static ssize_t snd_pcm_oss_write(struct 
 	substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK];
 	if (substream == NULL)
 		return -ENXIO;
-	up(&file->f_dentry->d_inode->i_sem);
 	result = snd_pcm_oss_write1(substream, buf, count);
-	down(&file->f_dentry->d_inode->i_sem);
 #ifdef OSS_DEBUG
 	printk("pcm_oss: write %li bytes (wrote %li bytes)\n", (long)count, (long)result);
 #endif
@@ -2415,7 +2402,7 @@ static struct file_operations snd_pcm_os
 	.open =		snd_pcm_oss_open,
 	.release =	snd_pcm_oss_release,
 	.poll =		snd_pcm_oss_poll,
-	.ioctl =	snd_pcm_oss_ioctl,
+	.unlocked_ioctl = snd_pcm_oss_ioctl,
 	.mmap =		snd_pcm_oss_mmap,
 };
 
--- linux/sound/core/oss/mixer_oss.c.orig
+++ linux/sound/core/oss/mixer_oss.c
@@ -359,16 +359,10 @@ static int snd_mixer_oss_ioctl1(snd_mixe
 	return -ENXIO;
 }
 
-/* FIXME: need to unlock BKL to allow preemption */
 int snd_mixer_oss_ioctl(struct inode *inode, struct file *file,
 			unsigned int cmd, unsigned long arg)
 {
-	int err;
-	/* FIXME: need to unlock BKL to allow preemption */
-	unlock_kernel();
-	err = snd_mixer_oss_ioctl1((snd_mixer_oss_file_t *) file->private_data, cmd, arg);
-	lock_kernel();
-	return err;
+	return snd_mixer_oss_ioctl1((snd_mixer_oss_file_t *) file->private_data, cmd, arg);
 }
 
 int snd_mixer_oss_ioctl_card(snd_card_t *card, unsigned int cmd, unsigned long arg)
@@ -393,7 +387,7 @@ static struct file_operations snd_mixer_
 	.owner =	THIS_MODULE,
 	.open =		snd_mixer_oss_open,
 	.release =	snd_mixer_oss_release,
-	.ioctl =	snd_mixer_oss_ioctl,
+	.unlocked_ioctl = snd_mixer_oss_ioctl,
 };
 
 static snd_minor_t snd_mixer_oss_reg =
--- linux/sound/core/pcm_lib.c.orig
+++ linux/sound/core/pcm_lib.c
@@ -133,6 +133,7 @@ static void xrun(snd_pcm_substream_t *su
 	snd_pcm_stop(substream, SNDRV_PCM_STATE_XRUN);
 #ifdef CONFIG_SND_DEBUG
 	if (substream->pstr->xrun_debug) {
+		user_trace_stop();
 		snd_printd(KERN_DEBUG "XRUN: pcmC%dD%d%c\n",
 			   substream->pcm->card->number,
 			   substream->pcm->device,
--- linux/sound/core/control.c.orig
+++ linux/sound/core/control.c
@@ -1021,7 +1021,7 @@ static int snd_ctl_set_power_state(snd_c
 }
 #endif
 
-static inline int _snd_ctl_ioctl(struct inode *inode, struct file *file,
+static inline int snd_ctl_ioctl(struct inode *inode, struct file *file,
 				 unsigned int cmd, unsigned long arg)
 {
 	snd_ctl_file_t *ctl;
@@ -1095,17 +1095,6 @@ static inline int _snd_ctl_ioctl(struct 
 	return -ENOTTY;
 }
 
-/* FIXME: need to unlock BKL to allow preemption */
-static int snd_ctl_ioctl(struct inode *inode, struct file *file,
-			 unsigned int cmd, unsigned long arg)
-{
-	int err;
-	unlock_kernel();
-	err = _snd_ctl_ioctl(inode, file, cmd, arg);
-	lock_kernel();
-	return err;
-}
-
 static ssize_t snd_ctl_read(struct file *file, char __user *buffer, size_t count, loff_t * offset)
 {
 	snd_ctl_file_t *ctl;
@@ -1241,7 +1230,7 @@ static struct file_operations snd_ctl_f_
 	.open =		snd_ctl_open,
 	.release =	snd_ctl_release,
 	.poll =		snd_ctl_poll,
-	.ioctl =	snd_ctl_ioctl,
+	.unlocked_ioctl = snd_ctl_ioctl,
 	.fasync =	snd_ctl_fasync,
 };
 
--- linux/sound/core/seq/oss/seq_oss.c.orig
+++ linux/sound/core/seq/oss/seq_oss.c
@@ -181,14 +181,10 @@ static int
 odev_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg)
 {
 	seq_oss_devinfo_t *dp;
-	int err;
+
 	dp = file->private_data;
 	snd_assert(dp != NULL, return -EIO);
-	/* FIXME: need to unlock BKL to allow preemption */
-	unlock_kernel();
-	err = snd_seq_oss_ioctl(dp, cmd, arg);
-	lock_kernel();
-	return err;
+	return snd_seq_oss_ioctl(dp, cmd, arg);
 }
 
 
@@ -213,7 +209,7 @@ static struct file_operations seq_oss_f_
 	.open =		odev_open,
 	.release =	odev_release,
 	.poll =		odev_poll,
-	.ioctl =	odev_ioctl,
+	.unlocked_ioctl = odev_ioctl,
 };
 
 static snd_minor_t seq_oss_reg = {
--- linux/sound/core/seq/seq_clientmgr.c.orig
+++ linux/sound/core/seq/seq_clientmgr.c
@@ -2135,15 +2135,10 @@ static int snd_seq_ioctl(struct inode *i
 			 unsigned int cmd, unsigned long arg)
 {
 	client_t *client = (client_t *) file->private_data;
-	int err;
 
 	snd_assert(client != NULL, return -ENXIO);
 		
-	/* FIXME: need to unlock BKL to allow preemption */
-	unlock_kernel();
-	err = snd_seq_do_ioctl(client, cmd, (void __user *) arg);
-	lock_kernel();
-	return err;
+	return snd_seq_do_ioctl(client, cmd, (void __user *) arg);
 }
 
 
@@ -2462,7 +2457,7 @@ static struct file_operations snd_seq_f_
 	.open =		snd_seq_open,
 	.release =	snd_seq_release,
 	.poll =		snd_seq_poll,
-	.ioctl =	snd_seq_ioctl,
+	.unlocked_ioctl = snd_seq_ioctl,
 };
 
 static snd_minor_t snd_seq_reg =
--- linux/sound/core/hwdep.c.orig
+++ linux/sound/core/hwdep.c
@@ -232,7 +232,7 @@ static int snd_hwdep_dsp_load(snd_hwdep_
 	return 0;
 }
 
-static inline int _snd_hwdep_ioctl(struct inode *inode, struct file * file,
+static inline int snd_hwdep_ioctl(struct inode *inode, struct file * file,
 				   unsigned int cmd, unsigned long arg)
 {
 	snd_hwdep_t *hw = file->private_data;
@@ -252,17 +252,6 @@ static inline int _snd_hwdep_ioctl(struc
 	return -ENOTTY;
 }
 
-/* FIXME: need to unlock BKL to allow preemption */
-static int snd_hwdep_ioctl(struct inode *inode, struct file * file,
-			   unsigned int cmd, unsigned long arg)
-{
-	int err;
-	unlock_kernel();
-	err = _snd_hwdep_ioctl(inode, file, cmd, arg);
-	lock_kernel();
-	return err;
-}
-
 static int snd_hwdep_mmap(struct file * file, struct vm_area_struct * vma)
 {
 	snd_hwdep_t *hw = file->private_data;
@@ -328,7 +317,7 @@ static struct file_operations snd_hwdep_
 	.open =		snd_hwdep_open,
 	.release =	snd_hwdep_release,
 	.poll =		snd_hwdep_poll,
-	.ioctl =	snd_hwdep_ioctl,
+	.unlocked_ioctl = snd_hwdep_ioctl,
 	.mmap =		snd_hwdep_mmap,
 };
 
--- linux/sound/core/pcm_native.c.orig
+++ linux/sound/core/pcm_native.c
@@ -2644,36 +2644,26 @@ static int snd_pcm_playback_ioctl(struct
 				  unsigned int cmd, unsigned long arg)
 {
 	snd_pcm_file_t *pcm_file;
-	int err;
 
 	pcm_file = file->private_data;
 
 	if (((cmd >> 8) & 0xff) != 'A')
 		return -ENOTTY;
 
-	/* FIXME: need to unlock BKL to allow preemption */
-	unlock_kernel();
-	err = snd_pcm_playback_ioctl1(pcm_file->substream, cmd, (void __user *)arg);
-	lock_kernel();
-	return err;
+	return snd_pcm_playback_ioctl1(pcm_file->substream, cmd, (void __user *)arg);
 }
 
 static int snd_pcm_capture_ioctl(struct inode *inode, struct file *file,
 				 unsigned int cmd, unsigned long arg)
 {
 	snd_pcm_file_t *pcm_file;
-	int err;
 
 	pcm_file = file->private_data;
 
 	if (((cmd >> 8) & 0xff) != 'A')
 		return -ENOTTY;
 
-	/* FIXME: need to unlock BKL to allow preemption */
-	unlock_kernel();
-	err = snd_pcm_capture_ioctl1(pcm_file->substream, cmd, (void __user *)arg);
-	lock_kernel();
-	return err;
+	return snd_pcm_capture_ioctl1(pcm_file->substream, cmd, (void __user *)arg);
 }
 
 int snd_pcm_kernel_playback_ioctl(snd_pcm_substream_t *substream,
@@ -3318,7 +3308,7 @@ static struct file_operations snd_pcm_f_
 	.open =		snd_pcm_open,
 	.release =	snd_pcm_release,
 	.poll =		snd_pcm_playback_poll,
-	.ioctl =	snd_pcm_playback_ioctl,
+	.unlocked_ioctl = snd_pcm_playback_ioctl,
 	.mmap =		snd_pcm_mmap,
 	.fasync =	snd_pcm_fasync,
 };
@@ -3330,7 +3320,7 @@ static struct file_operations snd_pcm_f_
 	.open =		snd_pcm_open,
 	.release =	snd_pcm_release,
 	.poll =		snd_pcm_capture_poll,
-	.ioctl =	snd_pcm_capture_ioctl,
+	.unlocked_ioctl = snd_pcm_capture_ioctl,
 	.mmap =		snd_pcm_mmap,
 	.fasync =	snd_pcm_fasync,
 };
--- linux/sound/core/timer.c.orig
+++ linux/sound/core/timer.c
@@ -1657,7 +1657,7 @@ static int snd_timer_user_continue(struc
 	return (err = snd_timer_continue(tu->timeri)) < 0 ? err : 0;
 }
 
-static inline int _snd_timer_user_ioctl(struct inode *inode, struct file *file,
+static inline int snd_timer_user_ioctl(struct inode *inode, struct file *file,
 					unsigned int cmd, unsigned long arg)
 {
 	snd_timer_user_t *tu;
@@ -1705,17 +1705,6 @@ static inline int _snd_timer_user_ioctl(
 	return -ENOTTY;
 }
 
-/* FIXME: need to unlock BKL to allow preemption */
-static int snd_timer_user_ioctl(struct inode *inode, struct file * file,
-				unsigned int cmd, unsigned long arg)
-{
-	int err;
-	unlock_kernel();
-	err = _snd_timer_user_ioctl(inode, file, cmd, arg);
-	lock_kernel();
-	return err;
-}
-
 static int snd_timer_user_fasync(int fd, struct file * file, int on)
 {
 	snd_timer_user_t *tu;
@@ -1814,7 +1803,7 @@ static struct file_operations snd_timer_
 	.open =		snd_timer_user_open,
 	.release =	snd_timer_user_release,
 	.poll =		snd_timer_user_poll,
-	.ioctl =	snd_timer_user_ioctl,
+	.unlocked_ioctl = snd_timer_user_ioctl,
 	.fasync = 	snd_timer_user_fasync,
 };
 
--- linux/sound/core/info.c.orig
+++ linux/sound/core/info.c
@@ -448,7 +448,7 @@ static unsigned int snd_info_entry_poll(
 	return mask;
 }
 
-static inline int _snd_info_entry_ioctl(struct inode *inode, struct file *file,
+static inline int snd_info_entry_ioctl(struct inode *inode, struct file *file,
 					unsigned int cmd, unsigned long arg)
 {
 	snd_info_private_data_t *data;
@@ -469,17 +469,6 @@ static inline int _snd_info_entry_ioctl(
 	return -ENOTTY;
 }
 
-/* FIXME: need to unlock BKL to allow preemption */
-static int snd_info_entry_ioctl(struct inode *inode, struct file *file,
-				unsigned int cmd, unsigned long arg)
-{
-	int err;
-	unlock_kernel();
-	err = _snd_info_entry_ioctl(inode, file, cmd, arg);
-	lock_kernel();
-	return err;
-}
-
 static int snd_info_entry_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
@@ -508,7 +497,7 @@ static struct file_operations snd_info_e
 	.read =		snd_info_entry_read,
 	.write =	snd_info_entry_write,
 	.poll =		snd_info_entry_poll,
-	.ioctl =	snd_info_entry_ioctl,
+	.unlocked_ioctl = snd_info_entry_ioctl,
 	.mmap =		snd_info_entry_mmap,
 	.open =		snd_info_entry_open,
 	.release =	snd_info_entry_release,
--- linux/sound/core/rawmidi.c.orig
+++ linux/sound/core/rawmidi.c
@@ -133,7 +133,8 @@ int snd_rawmidi_drain_output(snd_rawmidi
 	err = 0;
 	runtime->drain = 1;
 	while (runtime->avail < runtime->buffer_size) {
-		timeout = interruptible_sleep_on_timeout(&runtime->sleep, 10 * HZ);
+		timeout = wait_event_interruptible_timeout(runtime->sleep,
+				runtime->avail >= runtime->buffer_size, 10 * HZ);
 		if (signal_pending(current)) {
 			err = -ERESTARTSYS;
 			break;
@@ -673,7 +674,7 @@ static int snd_rawmidi_input_status(snd_
 	return 0;
 }
 
-static inline int _snd_rawmidi_ioctl(struct inode *inode, struct file *file,
+static inline int snd_rawmidi_ioctl(struct inode *inode, struct file *file,
 				     unsigned int cmd, unsigned long arg)
 {
 	snd_rawmidi_file_t *rfile;
@@ -784,17 +785,6 @@ static inline int _snd_rawmidi_ioctl(str
 	return -ENOTTY;
 }
 
-/* FIXME: need to unlock BKL to allow preemption */
-static int snd_rawmidi_ioctl(struct inode *inode, struct file *file,
-			     unsigned int cmd, unsigned long arg)
-{
-	int err;
-	unlock_kernel();
-	err = _snd_rawmidi_ioctl(inode, file, cmd, arg);
-	lock_kernel();
-	return err;
-}
-
 int snd_rawmidi_control_ioctl(snd_card_t * card, snd_ctl_file_t * control,
 			      unsigned int cmd, unsigned long arg)
 {
@@ -1345,7 +1335,7 @@ static struct file_operations snd_rawmid
 	.open =		snd_rawmidi_open,
 	.release =	snd_rawmidi_release,
 	.poll =		snd_rawmidi_poll,
-	.ioctl =	snd_rawmidi_ioctl,
+	.unlocked_ioctl = snd_rawmidi_ioctl,
 };
 
 static snd_minor_t snd_rawmidi_reg =
--- linux/fs/xfs/linux-2.6/mutex.h.orig
+++ linux/fs/xfs/linux-2.6/mutex.h
@@ -44,8 +44,8 @@
 #define MUTEX_DEFAULT		0x0
 typedef struct semaphore	mutex_t;
 
-#define mutex_init(lock, type, name)		sema_init(lock, 1)
-#define mutex_destroy(lock)			sema_init(lock, -99)
+#define mutex_init(lock, type, name)		sema_init_nocheck(lock, 1)
+#define mutex_destroy(lock)			sema_init_nocheck(lock, -99)
 #define mutex_lock(lock, num)			down(lock)
 #define mutex_trylock(lock)			(down_trylock(lock) ? 0 : 1)
 #define mutex_unlock(lock)			up(lock)
--- linux/fs/xfs/linux-2.6/sema.h.orig
+++ linux/fs/xfs/linux-2.6/sema.h
@@ -43,9 +43,9 @@
 
 typedef struct semaphore sema_t;
 
-#define init_sema(sp, val, c, d)	sema_init(sp, val)
-#define initsema(sp, val)		sema_init(sp, val)
-#define initnsema(sp, val, name)	sema_init(sp, val)
+#define init_sema(sp, val, c, d)	sema_init_nocheck(sp, val)
+#define initsema(sp, val)		sema_init_nocheck(sp, val)
+#define initnsema(sp, val, name)	sema_init_nocheck(sp, val)
 #define psema(sp, b)			down(sp)
 #define vsema(sp)			up(sp)
 #define valusema(sp)			(atomic_read(&(sp)->count))
--- linux/fs/proc/array.c.orig
+++ linux/fs/proc/array.c
@@ -129,17 +129,19 @@ static inline char * task_name(struct ta
  */
 static const char *task_state_array[] = {
 	"R (running)",		/*  0 */
-	"S (sleeping)",		/*  1 */
-	"D (disk sleep)",	/*  2 */
-	"T (stopped)",		/*  4 */
-	"T (tracing stop)",	/*  8 */
-	"Z (zombie)",		/* 16 */
-	"X (dead)"		/* 32 */
+	"M (running-mutex)",	/*  1 */
+	"S (sleeping)",		/*  2 */
+	"D (disk sleep)",	/*  4 */
+	"T (stopped)",		/*  8 */
+	"T (tracing stop)",	/* 16 */
+	"Z (zombie)",		/* 32 */
+	"X (dead)"		/* 64 */
 };
 
 static inline const char * get_task_state(struct task_struct *tsk)
 {
 	unsigned int state = (tsk->state & (TASK_RUNNING |
+					    TASK_RUNNING_MUTEX |
 					    TASK_INTERRUPTIBLE |
 					    TASK_UNINTERRUPTIBLE |
 					    TASK_STOPPED |
--- linux/fs/proc/proc_misc.c.orig
+++ linux/fs/proc/proc_misc.c
@@ -397,6 +397,41 @@ static int show_stat(struct seq_file *p,
 		nr_running(),
 		nr_iowait());
 
+#ifdef CONFIG_PREEMPT_RT
+	{
+		unsigned long nr_uninterruptible_cpu(int cpu);
+		extern int pi_walk, pi_null, pi_prio;
+		extern int rt_overload_schedule,
+			   rt_overload_wakeup, rt_overload_pulled;
+		unsigned long rt_nr_running_cpu(int cpu);
+		extern atomic_t rt_overload;
+
+		int i;
+
+		seq_printf(p, "rt_overload_schedule: %d\n",
+					rt_overload_schedule);
+		seq_printf(p, "rt_overload_wakeup:   %d\n",
+					rt_overload_wakeup);
+		seq_printf(p, "rt_overload_pulled:   %d\n",
+					rt_overload_pulled);
+		seq_printf(p, "pi_null: %d\n", pi_null);
+		seq_printf(p, "pi_prio: %d\n", pi_prio);
+		seq_printf(p, "pi_walk: %d\n", pi_walk);
+		seq_printf(p, "nr_running(): %lu\n",
+			nr_running());
+		seq_printf(p, "nr_uninterruptible(): %lu\n",
+			nr_uninterruptible());
+		for_each_cpu(i)
+			seq_printf(p, "nr_uninterruptible(%d): %lu\n",
+				i, nr_uninterruptible_cpu(i));
+		for_each_cpu(i)
+			seq_printf(p, "rt_nr_running(%d): %lu\n",
+				i, rt_nr_running_cpu(i));
+		seq_printf(p, "rt_overload: %d\n", atomic_read(&rt_overload));
+
+	}
+#endif
+
 	return 0;
 }
 
@@ -513,6 +548,20 @@ static int execdomains_read_proc(char *p
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
+#ifdef CONFIG_LATENCY_TRACE
+extern struct seq_operations latency_trace_op;
+static int latency_trace_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &latency_trace_op);
+}
+static struct file_operations proc_latency_trace_operations = {
+	.open		= latency_trace_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+#endif
+
 #ifdef CONFIG_MAGIC_SYSRQ
 /*
  * writing 'C' to /proc/sysrq-trigger is like sysrq-C
@@ -592,6 +641,9 @@ void __init proc_misc_init(void)
 #ifdef CONFIG_SCHEDSTATS
 	create_seq_entry("schedstat", 0, &proc_schedstat_operations);
 #endif
+#ifdef CONFIG_LATENCY_TRACE
+	create_seq_entry("latency_trace", 0, &proc_latency_trace_operations);
+#endif
 #ifdef CONFIG_PROC_KCORE
 	proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL);
 	if (proc_root_kcore) {
--- linux/fs/proc/task_mmu.c.orig
+++ linux/fs/proc/task_mmu.c
@@ -125,8 +125,10 @@ static void *m_start(struct seq_file *m,
 
 	down_read(&mm->mmap_sem);
 	map = mm->mmap;
-	while (l-- && map)
+	while (l-- && map) {
 		map = map->vm_next;
+		cond_resched();
+	}
 	if (!map) {
 		up_read(&mm->mmap_sem);
 		mmput(mm);
--- linux/fs/nfsd/nfssvc.c.orig
+++ linux/fs/nfsd/nfssvc.c
@@ -281,6 +281,7 @@ out:
 	/* Release the thread */
 	svc_exit_thread(rqstp);
 
+	unlock_kernel();
 	/* Release module */
 	module_put_and_exit(0);
 }
--- linux/fs/reiser4/log.c.orig
+++ linux/fs/reiser4/log.c
@@ -231,7 +231,7 @@ lock_log(reiser4_log_file * log)
 	while (log->long_term) {
 		/* sleep on a semaphore */
 		struct __wlink link;
-		sema_init(&link.sema, 0);
+		sema_init_nocheck(&link.sema, 0);
 		list_add(&link.link, &log->wait);
 		spin_unlock(&log->lock);
 
--- linux/fs/reiser4/plugin/space/bitmap.c.orig
+++ linux/fs/reiser4/plugin/space/bitmap.c
@@ -636,7 +636,7 @@ init_bnode(struct bitmap_node *bnode,
 {
 	xmemset(bnode, 0, sizeof (struct bitmap_node));
 
-	sema_init(&bnode->sema, 1);
+	sema_init_nocheck(&bnode->sema, 1);
 	atomic_set(&bnode->loaded, 0);
 }
 
--- linux/fs/reiser4/lock.c.orig
+++ linux/fs/reiser4/lock.c
@@ -1184,7 +1184,7 @@ init_lock_stack(lock_stack * owner	/* po
 	requestors_list_clean(owner);
 	spin_stack_init(owner);
 	owner->curpri = 1;
-	sema_init(&owner->sema, 0);
+	sema_init_nocheck(&owner->sema, 0);
 }
 
 /* Initializes lock object. */
@@ -1308,7 +1308,7 @@ prepare_to_sleep(lock_stack * owner)
 
 	if (0) {
 
-	           NOTE-NIKITA: I commented call to sema_init() out hoping
+	           NOTE-NIKITA: I commented call to sema_init_nocheck() out hoping
 		   that it is the reason or thread sleeping in
 		   down(&owner->sema) without any other thread running.
 
@@ -1317,7 +1317,7 @@ prepare_to_sleep(lock_stack * owner)
 		   longterm_lock_znode() would have to iterate its loop once
 		   more.
 		spin_lock_stack(owner);
-		sema_init(&owner->sema, 0);
+		sema_init_nocheck(&owner->sema, 0);
 		spin_unlock_stack(owner);
 	}
 	*/
--- linux/fs/reiser4/init_super.c.orig
+++ linux/fs/reiser4/init_super.c
@@ -63,8 +63,8 @@ _INIT_(sinfo)
 	ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes));
 	ON_DEBUG(spin_lock_init(&sbinfo->all_guard));
 
-	sema_init(&sbinfo->delete_sema, 1);
-	sema_init(&sbinfo->flush_sema, 1);
+	sema_init_nocheck(&sbinfo->delete_sema, 1);
+	sema_init_nocheck(&sbinfo->flush_sema, 1);
 	spin_super_init(sbinfo);
 	spin_super_eflush_init(sbinfo);
 
--- linux/fs/reiser4/flush_queue.c.orig
+++ linux/fs/reiser4/flush_queue.c
@@ -108,7 +108,7 @@ init_fq(flush_queue_t * fq)
 
 	capture_list_init(ATOM_FQ_LIST(fq));
 
-	sema_init(&fq->io_sem, 0);
+	sema_init_nocheck(&fq->io_sem, 0);
 	spin_fq_init(fq);
 }
 
--- linux/fs/reiser4/search.c.orig
+++ linux/fs/reiser4/search.c
@@ -1174,7 +1174,6 @@ cbk_node_lookup(cbk_handle * h /* search
 	assert("vs-361", h->level > h->stop_level);
 
 	if (handle_eottl(h, &result)) {
-		/**/
 		assert("vs-1674", result == LOOKUP_DONE || result == LOOKUP_REST);
 		return result;
 	}
@@ -1241,7 +1240,7 @@ cbk_cache_scan_slots(cbk_handle * h /* c
 	 *
 	 */
 
-	rcu_read_lock();
+	rcu_read_lock_nort();
 	read_lock_cbk_cache(cache);
 	slot = cbk_cache_list_prev(cbk_cache_list_front(&cache->lru));
 	while (1) {
@@ -1278,7 +1277,7 @@ cbk_cache_scan_slots(cbk_handle * h /* c
 	if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP)))
 		result = -ENOENT;
 
-	rcu_read_unlock();
+	rcu_read_unlock_nort();
 
 	if (result != 0) {
 		h->result = CBK_COORD_NOTFOUND;
--- linux/fs/reiser4/vfs_ops.c.orig
+++ linux/fs/reiser4/vfs_ops.c
@@ -433,7 +433,7 @@ init_once(void *obj /* pointer to new in
 		inode_init_once(&info->vfs_inode);
 		readdir_list_init(get_readdir_list(&info->vfs_inode));
 		init_rwsem(&info->p.coc_sem);
-		sema_init(&info->p.loading, 1);
+		sema_init_nocheck(&info->p.loading, 1);
 		ON_DEBUG(info->p.nr_jnodes = 0);
 		INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p), GFP_ATOMIC);
 		ON_DEBUG(info->p.captured_eflushed = 0);
--- linux/fs/reiser4/txnmgr.c.orig
+++ linux/fs/reiser4/txnmgr.c
@@ -366,7 +366,7 @@ txnmgr_init(txn_mgr * mgr)
 	atom_list_init(&mgr->atoms_list);
 	spin_txnmgr_init(mgr);
 
-	sema_init(&mgr->commit_semaphore, 1);
+	sema_init_nocheck(&mgr->commit_semaphore, 1);
 }
 
 /* Free transaction manager. */
--- linux/fs/reiser4/entd.c.orig
+++ linux/fs/reiser4/entd.c
@@ -312,7 +312,7 @@ void write_page_by_ent (struct page * pa
 		spin_unlock(&ent->guard);
 		return;
 	}
-	sema_init(&rq.sem, 0);
+	sema_init_nocheck(&rq.sem, 0);
 	wbq_list_push_back(&ent->wbq_list, &rq);
 	ent->nr_synchronous_requests ++;
 	spin_unlock(&ent->guard);
--- linux/fs/jbd/commit.c.orig
+++ linux/fs/jbd/commit.c
@@ -333,7 +333,7 @@ write_out_data:
 			jbd_unlock_bh_state(bh);
 		}
 		put_bh(bh);
-		cond_resched_lock(&journal->j_list_lock);
+//		cond_resched_lock(&journal->j_list_lock);
 	}
 	spin_unlock(&journal->j_list_lock);
 
--- linux/fs/pipe.c.orig
+++ linux/fs/pipe.c
@@ -160,8 +160,14 @@ pipe_readv(struct file *filp, const stru
 		wake_up_interruptible(PIPE_WAIT(*inode));
 		kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
 	}
+	/*
+	 * Hack: we turn off atime updates for -RT kernels.
+	 * Who uses them on pipes anyway?
+	 */
+#ifndef CONFIG_PREEMPT_RT
 	if (ret > 0)
 		file_accessed(filp);
+#endif
 	return ret;
 }
 
@@ -254,8 +260,14 @@ pipe_writev(struct file *filp, const str
 		wake_up_interruptible(PIPE_WAIT(*inode));
 		kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
 	}
+	/*
+	 * Hack: we turn off mtime/ctime updates for -RT kernels.
+	 * Who uses them on pipes anyway?
+	 */
+#ifndef CONFIG_PREEMPT_RT
 	if (ret > 0)
 		inode_update_time(inode, 1);	/* mtime and ctime */
+#endif
 	return ret;
 }
 
--- linux/fs/lockd/svc.c.orig
+++ linux/fs/lockd/svc.c
@@ -49,7 +49,7 @@ static pid_t			nlmsvc_pid;
 int				nlmsvc_grace_period;
 unsigned long			nlmsvc_timeout;
 
-static DECLARE_MUTEX_LOCKED(lockd_start);
+static DECLARE_WAIT_QUEUE_HEAD(lockd_start);
 static DECLARE_WAIT_QUEUE_HEAD(lockd_exit);
 
 /*
@@ -112,7 +112,7 @@ lockd(struct svc_rqst *rqstp)
 	 * Let our maker know we're running.
 	 */
 	nlmsvc_pid = current->pid;
-	up(&lockd_start);
+	wake_up(&lockd_start);
 
 	daemonize("lockd");
 
@@ -233,6 +233,7 @@ lockd_up(void)
 		printk(KERN_WARNING
 			"lockd_up: no pid, %d users??\n", nlmsvc_users);
 
+
 	error = -ENOMEM;
 	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE);
 	if (!serv) {
@@ -261,8 +262,15 @@ lockd_up(void)
 			"lockd_up: create thread failed, error=%d\n", error);
 		goto destroy_and_out;
 	}
-	down(&lockd_start);
-
+	/*
+	 * Wait (at most HZ jiffies) for lockd to set nlmsvc_pid; we hold
+	 * the lockd semaphore, so we can't wait around forever ...
+	 */
+	if (wait_event_interruptible_timeout(lockd_start,
+					     nlmsvc_pid != 0, HZ) <= 0) {
+		printk(KERN_WARNING
+			"lockd_up: lockd failed to start\n");
+	}
 	/*
 	 * Note: svc_serv structures have an initial use count of 1,
 	 * so we exit through here on both success and failure.
@@ -302,16 +310,12 @@ lockd_down(void)
 	 * Wait for the lockd process to exit, but since we're holding
 	 * the lockd semaphore, we can't wait around forever ...
 	 */
-	clear_thread_flag(TIF_SIGPENDING);
-	interruptible_sleep_on_timeout(&lockd_exit, HZ);
-	if (nlmsvc_pid) {
+	if (wait_event_interruptible_timeout(lockd_exit, 
+					     nlmsvc_pid == 0, HZ) <= 0) {
 		printk(KERN_WARNING 
 			"lockd_down: lockd failed to exit, clearing pid\n");
 		nlmsvc_pid = 0;
 	}
-	spin_lock_irq(&current->sighand->siglock);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
 out:
 	up(&nlmsvc_sema);
 }
--- linux/fs/fcntl.c.orig
+++ linux/fs/fcntl.c
@@ -473,7 +473,8 @@ static void send_sigio_to_task(struct ta
 				break;
 		/* fall-through: fall back on the old plain SIGIO signal */
 		case 0:
-			send_group_sig_info(SIGIO, SEND_SIG_PRIV, p);
+			// we hold the tasklist lock already:
+			group_send_sig_info(SIGIO, SEND_SIG_PRIV, p);
 	}
 }
 
@@ -507,7 +508,7 @@ static void send_sigurg_to_task(struct t
                                 struct fown_struct *fown)
 {
 	if (sigio_perm(p, fown, SIGURG))
-		send_group_sig_info(SIGURG, SEND_SIG_PRIV, p);
+		group_send_sig_info(SIGURG, SEND_SIG_PRIV, p);
 }
 
 int send_sigurg(struct fown_struct *fown)
--- linux/fs/exec.c.orig
+++ linux/fs/exec.c
@@ -550,11 +550,16 @@ static int exec_mmap(struct mm_struct *m
 	mm_release(tsk, old_mm);
 
 	task_lock(tsk);
+
+	local_irq_disable(); // FIXME
 	active_mm = tsk->active_mm;
+	activate_mm(active_mm, mm);
 	tsk->mm = mm;
 	tsk->active_mm = mm;
-	activate_mm(active_mm, mm);
+	local_irq_enable();
+
 	task_unlock(tsk);
+
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
 		if (active_mm != old_mm) BUG();
--- linux/fs/aio.c.orig
+++ linux/fs/aio.c
@@ -573,9 +573,11 @@ void use_mm(struct mm_struct *mm)
 	tsk->flags |= PF_BORROWED_MM;
 	active_mm = tsk->active_mm;
 	atomic_inc(&mm->mm_count);
+	local_irq_disable(); // FIXME
+	activate_mm(active_mm, mm);
 	tsk->mm = mm;
 	tsk->active_mm = mm;
-	activate_mm(active_mm, mm);
+	local_irq_enable();
 	task_unlock(tsk);
 
 	mmdrop(active_mm);
--- linux/fs/dcache.c.orig
+++ linux/fs/dcache.c
@@ -37,8 +37,8 @@
 
 int sysctl_vfs_cache_pressure = 100;
 
-spinlock_t dcache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-seqlock_t rename_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
+DEFINE_SPINLOCK(dcache_lock);
+DECLARE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(dcache_lock);
 
@@ -997,7 +997,7 @@ struct dentry * __d_lookup(struct dentry
 	struct dentry *found = NULL;
 	struct hlist_node *node;
 
-	rcu_read_lock();
+	rcu_read_lock_spin(&dcache_lock);
 	
 	hlist_for_each_rcu(node, head) {
 		struct dentry *dentry; 
@@ -1044,7 +1044,7 @@ struct dentry * __d_lookup(struct dentry
 next:
 		spin_unlock(&dentry->d_lock);
  	}
- 	rcu_read_unlock();
+ 	rcu_read_unlock_spin(&dcache_lock);
 
  	return found;
 }
@@ -1479,17 +1479,25 @@ int is_subdir(struct dentry * new_dentry
 {
 	int result;
 	struct dentry * saved = new_dentry;
+#ifndef CONFIG_PREEMPT_RT
 	unsigned long seq;
+#endif
 
 	result = 0;
 	/* need rcu_readlock to protect against the d_parent trashing due to
 	 * d_move
 	 */
+#ifdef CONFIG_PREEMPT_RT
+	write_seqlock(&rename_lock);
+#else
 	rcu_read_lock();
+#endif
         do {
 		/* for restarting inner loop in case of seq retry */
 		new_dentry = saved;
+#ifndef CONFIG_PREEMPT_RT
 		seq = read_seqbegin(&rename_lock);
+#endif
 		for (;;) {
 			if (new_dentry != old_dentry) {
 				struct dentry * parent = new_dentry->d_parent;
@@ -1501,8 +1509,13 @@ int is_subdir(struct dentry * new_dentry
 			result = 1;
 			break;
 		}
+#ifdef CONFIG_PREEMPT_RT
+	} while (0);
+	write_sequnlock(&rename_lock);
+#else
 	} while (read_seqretry(&rename_lock, seq));
 	rcu_read_unlock();
+#endif
 
 	return result;
 }
--- linux/fs/ioctl.c.orig
+++ linux/fs/ioctl.c
@@ -93,10 +93,8 @@ asmlinkage long sys_ioctl(unsigned int f
 			int block;
 			int res;
 
-			if (!S_ISREG(inode->i_mode)) {
-				error = -ENOTTY;
-				goto done;
-			}
+			if (!S_ISREG(inode->i_mode))
+				break;
 			/* do we support this mess? */
 			if (!mapping->a_ops->bmap) {
 				error = -EINVAL;
@@ -116,19 +114,15 @@ asmlinkage long sys_ioctl(unsigned int f
 			goto done;
 		}
 	case FIGETBSZ:
-		if (!S_ISREG(inode->i_mode)) {
-			error = -ENOTTY;
-			goto done;
-		}
+		if (!S_ISREG(inode->i_mode))
+			break;
 		error = -EBADF;
 		if (inode->i_sb)
 			error = put_user(inode->i_sb->s_blocksize, p);
 		goto done;
 	case FIONREAD:
-		if (!S_ISREG(inode->i_mode)) {
-			error = -ENOTTY;
-			goto done;
-		}
+		if (!S_ISREG(inode->i_mode))
+			break;
 		error = put_user(i_size_read(inode) - filp->f_pos, p);
 		goto done;
 	}
--- linux/mm/slab.c.orig
+++ linux/mm/slab.c
@@ -560,9 +560,9 @@ static inline void ** ac_entry(struct ar
 	return (void**)(ac+1);
 }
 
-static inline struct array_cache *ac_data(kmem_cache_t *cachep)
+static inline struct array_cache *ac_data(kmem_cache_t *cachep, int cpu)
 {
-	return cachep->array[smp_processor_id()];
+	return cachep->array[cpu];
 }
 
 static kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags)
@@ -802,21 +802,22 @@ void __init kmem_cache_init(void)
 	/* 4) Replace the bootstrap head arrays */
 	{
 		void * ptr;
+		int cpu = smp_processor_id();
 		
 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
-		local_irq_disable();
-		BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
-		memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
-		cache_cache.array[smp_processor_id()] = ptr;
-		local_irq_enable();
+		local_irq_disable_nort();
+		BUG_ON(ac_data(&cache_cache, cpu) != &initarray_cache.cache);
+		memcpy(ptr, ac_data(&cache_cache, cpu), sizeof(struct arraycache_init));
+		cache_cache.array[cpu] = ptr;
+		local_irq_enable_nort();
 	
 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
-		local_irq_disable();
-		BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
-		memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
+		local_irq_disable_nort();
+		BUG_ON(ac_data(malloc_sizes[0].cs_cachep, cpu) != &initarray_generic.cache);
+		memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep, cpu),
 				sizeof(struct arraycache_init));
-		malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
-		local_irq_enable();
+		malloc_sizes[0].cs_cachep->array[cpu] = ptr;
+		local_irq_enable_nort();
 	}
 
 	/* 5) resize the head arrays to their final sizes */
@@ -1174,6 +1175,7 @@ kmem_cache_create (const char *name, siz
 {
 	size_t left_over, slab_size;
 	kmem_cache_t *cachep = NULL;
+	int cpu = _smp_processor_id();
 
 	/*
 	 * Sanity checks... these are all serious usage bugs.
@@ -1400,16 +1402,16 @@ next:
 			 * the cache that's used by kmalloc(24), otherwise
 			 * the creation of further caches will BUG().
 			 */
-			cachep->array[smp_processor_id()] = &initarray_generic.cache;
+			cachep->array[cpu] = &initarray_generic.cache;
 			g_cpucache_up = PARTIAL;
 		} else {
-			cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL);
+			cachep->array[cpu] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL);
 		}
-		BUG_ON(!ac_data(cachep));
-		ac_data(cachep)->avail = 0;
-		ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
-		ac_data(cachep)->batchcount = 1;
-		ac_data(cachep)->touched = 0;
+		BUG_ON(!ac_data(cachep, cpu));
+		ac_data(cachep, cpu)->avail = 0;
+		ac_data(cachep, cpu)->limit = BOOT_CPUCACHE_ENTRIES;
+		ac_data(cachep, cpu)->batchcount = 1;
+		ac_data(cachep, cpu)->touched = 0;
 		cachep->batchcount = 1;
 		cachep->limit = BOOT_CPUCACHE_ENTRIES;
 		cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
@@ -1463,7 +1465,9 @@ EXPORT_SYMBOL(kmem_cache_create);
 #if DEBUG
 static void check_irq_off(void)
 {
+#ifndef CONFIG_PREEMPT_RT
 	BUG_ON(!irqs_disabled());
+#endif
 }
 
 static void check_irq_on(void)
@@ -1505,22 +1509,39 @@ static void smp_call_function_all_cpus(v
 static void drain_array_locked(kmem_cache_t* cachep,
 				struct array_cache *ac, int force);
 
-static void do_drain(void *arg)
+static void do_drain_cpu(kmem_cache_t *cachep, int cpu)
 {
-	kmem_cache_t *cachep = (kmem_cache_t*)arg;
 	struct array_cache *ac;
 
 	check_irq_off();
-	ac = ac_data(cachep);
+
 	spin_lock(&cachep->spinlock);
+	ac = ac_data(cachep, cpu);
 	free_block(cachep, &ac_entry(ac)[0], ac->avail);
-	spin_unlock(&cachep->spinlock);
 	ac->avail = 0;
+	spin_unlock(&cachep->spinlock);
+}
+
+#ifndef CONFIG_PREEMPT_RT
+/*
+ * Executes in an IRQ context:
+ */
+static void do_drain(void *arg)
+{
+	do_drain_cpu((kmem_cache_t*)arg, smp_processor_id());
 }
+#endif
 
 static void drain_cpu_caches(kmem_cache_t *cachep)
 {
+#ifndef CONFIG_PREEMPT_RT
 	smp_call_function_all_cpus(do_drain, cachep);
+#else
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		do_drain_cpu(cachep, cpu);
+#endif
 	check_irq_on();
 	spin_lock_irq(&cachep->spinlock);
 	if (cachep->lists.shared)
@@ -1789,7 +1810,7 @@ static int cache_grow (kmem_cache_t * ca
 	spin_unlock(&cachep->spinlock);
 
 	if (local_flags & __GFP_WAIT)
-		local_irq_enable();
+		local_irq_enable_nort();
 
 	/*
 	 * The test for missing atomic flag is performed here, rather than
@@ -1813,7 +1834,7 @@ static int cache_grow (kmem_cache_t * ca
 	cache_init_objs(cachep, slabp, ctor_flags);
 
 	if (local_flags & __GFP_WAIT)
-		local_irq_disable();
+		local_irq_disable_nort();
 	check_irq_off();
 	spin_lock(&cachep->spinlock);
 
@@ -1827,7 +1848,7 @@ opps1:
 	kmem_freepages(cachep, objp);
 failed:
 	if (local_flags & __GFP_WAIT)
-		local_irq_disable();
+		local_irq_disable_nort();
 	return 0;
 }
 
@@ -1953,14 +1974,14 @@ bad:
 #define check_slabp(x,y) do { } while(0)
 #endif
 
-static void* cache_alloc_refill(kmem_cache_t* cachep, int flags)
+static void* cache_alloc_refill(kmem_cache_t* cachep, int flags, int cpu)
 {
 	int batchcount;
 	struct kmem_list3 *l3;
 	struct array_cache *ac;
 
 	check_irq_off();
-	ac = ac_data(cachep);
+	ac = ac_data(cachep, cpu);
 retry:
 	batchcount = ac->batchcount;
 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -1973,7 +1994,7 @@ retry:
 	l3 = list3_data(cachep);
 
 	BUG_ON(ac->avail > 0);
-	spin_lock(&cachep->spinlock);
+	spin_lock_nort(&cachep->spinlock);
 	if (l3->shared) {
 		struct array_cache *shared_array = l3->shared;
 		if (shared_array->avail) {
@@ -2031,14 +2052,17 @@ retry:
 must_grow:
 	l3->free_objects -= ac->avail;
 alloc_done:
-	spin_unlock(&cachep->spinlock);
+	spin_unlock_nort(&cachep->spinlock);
 
 	if (unlikely(!ac->avail)) {
 		int x;
+		spin_unlock_rt(&cachep->spinlock);
 		x = cache_grow(cachep, flags, -1);
-		
+
+		spin_lock_rt(&cachep->spinlock);
 		// cache_grow can reenable interrupts, then ac could change.
-		ac = ac_data(cachep);
+		cpu = smp_processor_id_rt(cpu);
+		ac = ac_data(cachep, cpu);
 		if (!x && ac->avail == 0)	// no objects in sight? abort
 			return NULL;
 
@@ -2107,23 +2131,26 @@ cache_alloc_debugcheck_after(kmem_cache_
 
 static inline void * __cache_alloc (kmem_cache_t *cachep, int flags)
 {
+	int cpu = _smp_processor_id();
 	unsigned long save_flags;
 	void* objp;
 	struct array_cache *ac;
 
 	cache_alloc_debugcheck_before(cachep, flags);
 
-	local_irq_save(save_flags);
-	ac = ac_data(cachep);
+	local_irq_save_nort(save_flags);
+	spin_lock_rt(&cachep->spinlock);
+	ac = ac_data(cachep, cpu);
 	if (likely(ac->avail)) {
 		STATS_INC_ALLOCHIT(cachep);
 		ac->touched = 1;
 		objp = ac_entry(ac)[--ac->avail];
 	} else {
 		STATS_INC_ALLOCMISS(cachep);
-		objp = cache_alloc_refill(cachep, flags);
+		objp = cache_alloc_refill(cachep, flags, cpu);
 	}
-	local_irq_restore(save_flags);
+	spin_unlock_rt(&cachep->spinlock);
+	local_irq_restore_nort(save_flags);
 	objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0));
 	return objp;
 }
@@ -2193,7 +2220,7 @@ static void cache_flusharray (kmem_cache
 	BUG_ON(!batchcount || batchcount > ac->avail);
 #endif
 	check_irq_off();
-	spin_lock(&cachep->spinlock);
+	spin_lock_nort(&cachep->spinlock);
 	if (cachep->lists.shared) {
 		struct array_cache *shared_array = cachep->lists.shared;
 		int max = shared_array->limit-shared_array->avail;
@@ -2228,7 +2255,7 @@ free_done:
 		STATS_SET_FREEABLE(cachep, i);
 	}
 #endif
-	spin_unlock(&cachep->spinlock);
+	spin_unlock_nort(&cachep->spinlock);
 	ac->avail -= batchcount;
 	memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount],
 			sizeof(void*)*ac->avail);
@@ -2243,20 +2270,22 @@ free_done:
  */
 static inline void __cache_free (kmem_cache_t *cachep, void* objp)
 {
-	struct array_cache *ac = ac_data(cachep);
+	int cpu = _smp_processor_id();
+	struct array_cache *ac = ac_data(cachep, cpu);
 
 	check_irq_off();
 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
 
+	spin_lock_rt(&cachep->spinlock);
 	if (likely(ac->avail < ac->limit)) {
 		STATS_INC_FREEHIT(cachep);
 		ac_entry(ac)[ac->avail++] = objp;
-		return;
 	} else {
 		STATS_INC_FREEMISS(cachep);
 		cache_flusharray(cachep, ac);
 		ac_entry(ac)[ac->avail++] = objp;
 	}
+	spin_unlock_rt(&cachep->spinlock);
 }
 
 /**
@@ -2358,12 +2387,12 @@ void *kmem_cache_alloc_node(kmem_cache_t
 		}
 		spin_unlock_irq(&cachep->spinlock);
 
-		local_irq_disable();
+		local_irq_disable_nort();
 		if (!cache_grow(cachep, GFP_KERNEL, nodeid)) {
-			local_irq_enable();
+			local_irq_enable_nort();
 			return NULL;
 		}
-		local_irq_enable();
+		local_irq_enable_nort();
 	}
 got_slabp:
 	/* found one: allocate object */
@@ -2503,9 +2532,9 @@ void kmem_cache_free (kmem_cache_t *cach
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 	__cache_free(cachep, objp);
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 }
 
 EXPORT_SYMBOL(kmem_cache_free);
@@ -2545,11 +2574,11 @@ void kfree (const void *objp)
 
 	if (!objp)
 		return;
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 	kfree_debugcheck(objp);
 	c = GET_PAGE_CACHE(virt_to_page(objp));
 	__cache_free(c, (void*)objp);
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 }
 
 EXPORT_SYMBOL(kfree);
@@ -2590,13 +2619,17 @@ struct ccupdate_struct {
 	struct array_cache *new[NR_CPUS];
 };
 
+/*
+ * Executes in IRQ context:
+ */
 static void do_ccupdate_local(void *info)
 {
 	struct ccupdate_struct *new = (struct ccupdate_struct *)info;
 	struct array_cache *old;
 
+//	WARN_ON(!in_interrupt());
 	check_irq_off();
-	old = ac_data(new->cachep);
+	old = ac_data(new->cachep, smp_processor_id());
 	
 	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
 	new->new[smp_processor_id()] = old;
@@ -2704,6 +2737,10 @@ static void enable_cpucache (kmem_cache_
 	if (limit > 32)
 		limit = 32;
 #endif
+#ifdef CONFIG_PREEMPT
+	if (limit > 16)
+		limit = 16;
+#endif
 	err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared);
 	if (err)
 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
@@ -2743,11 +2780,12 @@ static void drain_array_locked(kmem_cach
  */
 static void cache_reap(void *unused)
 {
+	int cpu = _smp_processor_id();
 	struct list_head *walk;
 
 	if (down_trylock(&cache_chain_sem)) {
 		/* Give up. Setup the next iteration. */
-		schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
+		schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + cpu);
 		return;
 	}
 
@@ -2766,7 +2804,7 @@ static void cache_reap(void *unused)
 
 		spin_lock_irq(&searchp->spinlock);
 
-		drain_array_locked(searchp, ac_data(searchp), 0);
+		drain_array_locked(searchp, ac_data(searchp, cpu), 0);
 
 		if(time_after(searchp->lists.next_reap, jiffies))
 			goto next_unlock;
@@ -2810,7 +2848,7 @@ next:
 	check_irq_on();
 	up(&cache_chain_sem);
 	/* Setup the next iteration */
-	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
+	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC+cpu);
 }
 
 #ifdef CONFIG_PROC_FS
@@ -3031,10 +3069,10 @@ unsigned int ksize(const void *objp)
 	unsigned int size = 0;
 
 	if (likely(objp != NULL)) {
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		c = GET_PAGE_CACHE(virt_to_page(objp));
 		size = kmem_cache_size(c);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	}
 
 	return size;
--- linux/mm/highmem.c.orig
+++ linux/mm/highmem.c
@@ -240,11 +240,11 @@ static void bounce_copy_vec(struct bio_v
 	unsigned long flags;
 	unsigned char *vto;
 
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 	vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
 	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
 	kunmap_atomic(vto, KM_BOUNCE_READ);
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 }
 
 #else /* CONFIG_HIGHMEM */
--- linux/mm/page_alloc.c.orig
+++ linux/mm/page_alloc.c
@@ -442,6 +442,7 @@ static struct page *__rmqueue(struct zon
 	return NULL;
 }
 
+#if !defined(CONFIG_PREEMPT_RT)
 /* 
  * Obtain a specified number of elements from the buddy allocator, all under
  * a single hold of the lock, for efficiency.  Add them to the supplied list.
@@ -466,6 +467,7 @@ static int rmqueue_bulk(struct zone *zon
 	spin_unlock_irqrestore(&zone->lock, flags);
 	return allocated;
 }
+#endif
 
 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 static void __drain_pages(unsigned int cpu)
@@ -550,6 +552,7 @@ static void zone_statistics(struct zonel
 #endif
 }
 
+#if !defined(CONFIG_PREEMPT_RT)
 /*
  * Free a 0-order page
  */
@@ -577,15 +580,32 @@ static void fastcall free_hot_cold_page(
 	local_irq_restore(flags);
 	put_cpu();
 }
+#endif
 
+/*
+ * On PREEMPT_RT we use a simple solution for the time being,
+ * per-CPU allocation is disabled.
+ */
 void fastcall free_hot_page(struct page *page)
 {
+#if defined(CONFIG_PREEMPT_RT)
+	if (PageAnon(page))
+		page->mapping = NULL;
+	__free_pages_ok(page, 0);
+#else
 	free_hot_cold_page(page, 0);
+#endif
 }
 	
 void fastcall free_cold_page(struct page *page)
 {
+#ifdef CONFIG_PREEMPT_RT
+	if (PageAnon(page))
+		page->mapping = NULL;
+	__free_pages_ok(page, 0);
+#else
 	free_hot_cold_page(page, 1);
+#endif
 }
 
 static inline struct list_head *get_per_thread_pages(void)
@@ -690,6 +710,7 @@ buffered_rmqueue(struct zone *zone, int 
 {
 	unsigned long flags;
 	struct page *page = NULL;
+#if !defined(CONFIG_PREEMPT_RT)
 	int cold = !!(gfp_flags & __GFP_COLD);
 
 	if (order == 0) {
@@ -708,6 +729,7 @@ buffered_rmqueue(struct zone *zone, int 
 		local_irq_restore(flags);
 		put_cpu();
 	}
+#endif
 
 	if (page == NULL) {
 		spin_lock_irqsave(&zone->lock, flags);
@@ -963,8 +985,15 @@ void __pagevec_free(struct pagevec *pvec
 {
 	int i = pagevec_count(pvec);
 
-	while (--i >= 0)
+	while (--i >= 0) {
+#if defined(CONFIG_PREEMPT_RT)
+		if (PageAnon(pvec->pages[i]))
+			pvec->pages[i]->mapping = NULL;
+		__free_pages_ok(pvec->pages[i], 0);
+#else
 		free_hot_cold_page(pvec->pages[i], pvec->cold);
+#endif
+	}
 }
 
 fastcall void __free_pages(struct page *page, unsigned int order)
--- linux/mm/swap.c.orig
+++ linux/mm/swap.c
@@ -136,39 +136,45 @@ EXPORT_SYMBOL(mark_page_accessed);
  * lru_cache_add: add a page to the page lists
  * @page: the page to add
  */
-static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
-static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
+static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_add_pvecs) = { 0, };
+static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_add_active_pvecs) = { 0, };
 
 void fastcall lru_cache_add(struct page *page)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+	int cpu = _smp_processor_id();
+	struct pagevec *pvec = &get_cpu_var_locked(lru_add_pvecs, cpu);
 
 	page_cache_get(page);
 	if (!pagevec_add(pvec, page))
 		__pagevec_lru_add(pvec);
-	put_cpu_var(lru_add_pvecs);
+	put_cpu_var_locked(lru_add_pvecs, cpu);
 }
 
 void fastcall lru_cache_add_active(struct page *page)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
+	int cpu = _smp_processor_id();
+	struct pagevec *pvec = &get_cpu_var_locked(lru_add_active_pvecs, cpu);
 
 	page_cache_get(page);
 	if (!pagevec_add(pvec, page))
 		__pagevec_lru_add_active(pvec);
-	put_cpu_var(lru_add_active_pvecs);
+	put_cpu_var_locked(lru_add_active_pvecs, cpu);
 }
 
 void lru_add_drain(void)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+	int cpu = _smp_processor_id();
+	struct pagevec *pvec;
 
+	pvec = &get_cpu_var_locked(lru_add_pvecs, cpu);
 	if (pagevec_count(pvec))
 		__pagevec_lru_add(pvec);
-	pvec = &__get_cpu_var(lru_add_active_pvecs);
+	put_cpu_var_locked(lru_add_pvecs, cpu);
+
+	pvec = &get_cpu_var_locked(lru_add_active_pvecs, cpu);
 	if (pagevec_count(pvec))
 		__pagevec_lru_add_active(pvec);
-	put_cpu_var(lru_add_pvecs);
+	put_cpu_var_locked(lru_add_active_pvecs, cpu);
 }
 
 /*
--- linux/mm/rmap.c.orig
+++ linux/mm/rmap.c
@@ -190,8 +190,8 @@ void __init anon_vma_init(void)
  */
 static struct anon_vma *page_lock_anon_vma(struct page *page)
 {
-	struct anon_vma *anon_vma = NULL;
 	unsigned long anon_mapping;
+	struct anon_vma *anon_vma;
 
 	rcu_read_lock();
 	anon_mapping = (unsigned long) page->mapping;
@@ -201,10 +201,13 @@ static struct anon_vma *page_lock_anon_v
 		goto out;
 
 	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
+	rcu_read_unlock(); // FIXME: hack
 	spin_lock(&anon_vma->lock);
+
+	return anon_vma;
 out:
 	rcu_read_unlock();
-	return anon_vma;
+	return NULL;
 }
 
 /*
--- linux/mm/mmap.c.orig
+++ linux/mm/mmap.c
@@ -1512,7 +1512,7 @@ static void free_pgtables(struct mmu_gat
 	unsigned long first = start & PGDIR_MASK;
 	unsigned long last = end + PGDIR_SIZE - 1;
 	unsigned long start_pml4_index, start_pgd_index;
-	struct mm_struct *mm = tlb->mm;
+	struct mm_struct *mm = tlb_mm(tlb);
 
 	if (!prev) {
 		prev = mm->mmap;
--- linux/mm/memory.c.orig
+++ linux/mm/memory.c
@@ -115,7 +115,7 @@ static inline void free_one_pmd(struct m
 	page = pmd_page(*dir);
 	pmd_clear(dir);
 	dec_page_state(nr_page_table_pages);
-	tlb->mm->nr_ptes--;
+	tlb_mm(tlb)->nr_ptes--;
 	pte_free_tlb(tlb, page);
 }
 
@@ -183,7 +183,7 @@ void clear_page_range(struct mmu_gather 
 			unsigned long end)
 {
 	int i;
-	pml4_t *pml4 = tlb->mm->pml4;
+	pml4_t *pml4 = tlb_mm(tlb)->pml4;
 	unsigned long next;
 
 	for (i = pml4_index(addr); i <= pml4_index(end-1); i++) {
@@ -523,10 +523,10 @@ static void zap_pte_range(struct mmu_gat
 			if (pte_dirty(pte))
 				set_page_dirty(page);
 			if (PageAnon(page))
-				tlb->mm->anon_rss--;
+				tlb_mm(tlb)->anon_rss--;
 			else if (pte_young(pte))
 				mark_page_accessed(page);
-			tlb->freed++;
+			tlb_free(tlb);
 			page_remove_rmap(page);
 			tlb_remove_page(tlb, page);
 			continue;
--- linux/kernel/rt.c.orig
+++ linux/kernel/rt.c
@@ -0,0 +1,1689 @@
+/*
+ * kernel/rt.c
+ *
+ * Real-Time Preemption Support
+ *
+ * started by Ingo Molnar:
+ *
+ *  Copyright (C) 2004 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * lock debugging, locking tree, deadlock detection:
+ *
+ *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ *  Released under the General Public License (GPL).
+ *
+ * Includes portions of the generic R/W semaphore implementation from:
+ *
+ *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
+ *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
+ *  - Derived also from comments by Linus
+ */
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+
+/*
+ * This flag is good for debugging the PI code - it makes all tasks
+ * in the system fall under PI handling. Normally only SCHED_FIFO/RR
+ * tasks are PI-handled:
+ */
+//#define ALL_TASKS_PI
+
+/*
+ * We need a global lock for priority inheritance handling.
+ * This is only for the slow path, but still, we might want
+ * to optimize it later to be more scalable.
+ */
+static __cacheline_aligned_in_smp raw_spinlock_t pi_lock =
+						RAW_SPIN_LOCK_UNLOCKED;
+
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+/*
+ * We need a global lock when we walk through the multi-process
+ * lock tree...
+ */
+static raw_spinlock_t trace_lock = RAW_SPIN_LOCK_UNLOCKED;
+
+static LIST_HEAD(held_locks);
+
+/*
+ * deadlock detection flag. We turn it off when we detect
+ * the first problem because we don't want to recurse back
+ * into the tracing code when doing error printk or
+ * executing a BUG():
+ */
+static int trace_on = 1;
+
+void deadlock_trace_off(void)
+{
+	trace_on = 0;
+}
+
+#define trace_lock_irq(lock)			\
+	do {					\
+		local_irq_disable();		\
+		if (trace_on)			\
+			spin_lock(lock);	\
+	} while (0)
+
+#define trace_unlock(lock)			\
+	do {					\
+		if (trace_on)			\
+			spin_unlock(lock);	\
+	} while (0)
+
+#define trace_unlock_irq(lock)			\
+	do {					\
+		if (trace_on)			\
+			spin_unlock(lock);	\
+		local_irq_enable();		\
+		preempt_check_resched();	\
+	} while (0)
+
+#define trace_lock_irqsave(lock, flags)		\
+	do {					\
+		local_irq_save(flags);		\
+		if (trace_on)			\
+			spin_lock(lock);	\
+	} while (0)
+
+#define trace_unlock_irqrestore(lock, flags)	\
+	do {					\
+		if (trace_on)			\
+			spin_unlock(lock);	\
+		local_irq_restore(flags);	\
+		preempt_check_resched();	\
+	} while (0)
+
+#define TRACE_OFF()				\
+do {						\
+	if (trace_on) {				\
+		trace_on = 0;			\
+		console_verbose();		\
+		spin_unlock(&trace_lock);	\
+	}					\
+} while (0)
+
+#define TRACE_BUG()				\
+do {						\
+	TRACE_OFF();				\
+	BUG();					\
+} while (0)
+
+#define TRACE_WARN_ON(c)			\
+do {						\
+	if (c) {				\
+		TRACE_OFF();			\
+		WARN_ON(1);			\
+	}					\
+} while (0)
+
+#else
+# define trace_lock_irq(lock)			local_irq_disable()
+# define trace_lock_irqsave(lock, flags)	local_irq_save(flags)
+# define trace_unlock(lock)			do { } while (0)
+
+# define trace_unlock_irq(lock) \
+	do { local_irq_enable(); preempt_check_resched(); } while (0)
+
+# define trace_unlock_irqrestore(lock, flags) \
+	do { local_irq_restore(flags); preempt_check_resched(); } while (0)
+
+# define TRACE_BUG()				do { } while (0)
+# define TRACE_WARN_ON(c)			do { } while (0)
+# define TRACE_OFF()				do { } while (0)
+#endif /* CONFIG_RT_DEADLOCK_DETECT */
+
+#define TRACE_BUG_ON(c) do { if (c) TRACE_BUG(); } while (0)
+
+/*
+ * Unlock these on crash:
+ */
+void zap_rt_locks(void)
+{
+	spin_lock_init(&pi_lock);
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+	spin_lock_init(&trace_lock);
+#endif
+}
+
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+
+static void printk_task(struct task_struct *p)
+{
+	if (p)
+		printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
+	else
+		printk("<none>");
+}
+
+static void printk_task_short(struct task_struct *p)
+{
+	if (p)
+		printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
+	else
+		printk("<none>");
+}
+
+static void printk_lock(struct rt_mutex *lock, int print_owner)
+{
+	if (lock->name)
+		printk(" [%p] {%s}\n",
+			lock, lock->name);
+	else
+		printk(" [%p] {%s:%d}\n",
+			lock, lock->file, lock->line);
+
+	if (print_owner && lock->owner) {
+		printk(".. held by:  ");
+		printk_task(lock->owner);
+		printk("\n");
+	}
+	if (lock->owner) {
+		printk("... acquired at:  ");
+		print_symbol("%s\n", lock->acquire_eip);
+	}
+}
+
+static void printk_waiter(struct rt_mutex_waiter *w)
+{
+	printk("-------------------------\n");
+	printk("| waiter struct %p:\n", w);
+	printk("| w->task:\n");
+	printk_task(w->task);
+	printk("\n| lock:\n");
+	printk_lock(w->lock, 1);
+	printk("| blocked at:  ");
+	print_symbol("%s\n", w->eip);
+	printk("-------------------------\n");
+}
+
+static void show_task_locks(struct task_struct *p)
+{
+	switch (p->state) {
+	case TASK_RUNNING:		printk("R"); break;
+	case TASK_INTERRUPTIBLE:	printk("s"); break;
+	case TASK_UNINTERRUPTIBLE:	printk("D"); break;
+	case TASK_STOPPED:		printk("T"); break;
+	case EXIT_ZOMBIE:		printk("Z"); break;
+	case EXIT_DEAD:			printk("X"); break;
+	default:			printk("?"); break;
+	}
+	printk_task(p);
+	if (p->blocked_on) {
+		struct rt_mutex *lock = p->blocked_on->lock;
+
+		printk(" blocked on:");
+		printk_lock(lock, 1);
+	} else
+		printk(" (not blocked)\n");
+}
+
+/* Print all held locks - or only those owned by 'filter', if non-NULL: */
+static void show_held_locks(struct task_struct *filter)
+{
+	struct list_head *curr, *cursor = NULL;
+	struct rt_mutex *lock;
+	unsigned long flags;
+	int count = 0;
+
+	printk("\n");
+	if (filter) {
+		printk("------------------------------\n");
+		printk("| showing all locks held by: |  (");
+		printk_task_short(filter);
+		printk("):\n");
+		printk("------------------------------\n");
+	} else {
+		printk("---------------------------\n");
+		printk("| showing all locks held: |\n");
+		printk("---------------------------\n");
+	}
+
+	/*
+	 * We cannot printk with the global trace lock held, so drop
+	 * it per printout and rescan up to 'cursor' (the successor
+	 * of the last printed lock), checking all entries from there:
+	 */
+next:
+	trace_lock_irqsave(&trace_lock, flags);
+	list_for_each(curr, &held_locks) {
+		if (cursor && curr != cursor)
+			continue;
+		cursor = NULL;	/* resume point reached, check the rest */
+		lock = list_entry(curr, struct rt_mutex, held_list);
+		if (filter && (lock->owner != filter))
+			continue;
+		count++;
+		cursor = curr->next;
+		trace_unlock_irqrestore(&trace_lock, flags);
+
+		printk("\n#%03d:            ", count);
+		printk_lock(lock, filter ? 0 : 1);
+		goto next;
+	}
+	trace_unlock_irqrestore(&trace_lock, flags);
+}
+
+/* Dump the lock state of every task, followed by all held locks: */
+void show_all_locks(void)
+{
+	struct task_struct *g, *p;
+	int count = 10, unlock = 1;
+
+	printk("\nshowing all tasks:\n");
+
+	/*
+	 * Here we try to get the tasklist_lock as hard as possible,
+	 * if not successful after 2 seconds we ignore it (but keep
+	 * trying). This is to enable a debug printout even if a
+	 * tasklist_lock-holding task deadlocks or crashes.
+	 */
+retry:
+	if (!read_trylock(&tasklist_lock)) {
+		if (count == 10)
+			printk("hm, tasklist_lock locked, retrying... ");
+		if (count) {
+			count--;
+			printk(" #%d", 10-count);
+			mdelay(200);
+			goto retry;
+		}
+		printk(" ignoring it.\n");
+		unlock = 0;
+	}
+	if (unlock && count != 10)
+		printk(" locked it.\n");
+
+	do_each_thread(g, p) {
+		show_task_locks(p);
+		if (!unlock)
+			if (read_trylock(&tasklist_lock))
+				unlock = 1;
+	} while_each_thread(g, p);
+
+	show_held_locks(NULL);
+	printk("=============================================\n\n");
+
+	if (unlock)
+		read_unlock(&tasklist_lock);
+}
+
+static int check_deadlock(struct rt_mutex *lock, int recursive,
+			  unsigned long eip)
+{
+	struct rt_mutex *lockblk;
+	struct task_struct *task;
+
+	if (!trace_on)
+		return 0;
+	/*
+	 * Special-case: the BKL self-releases at schedule()
+	 * time so it can never deadlock:
+	 */
+	if (lock == &kernel_sem.lock)
+		return 0;
+	task = lock->owner;
+	if (!task)
+		return 0;
+	lockblk = NULL;
+	if (task->blocked_on)
+		lockblk = task->blocked_on->lock;
+	if (current == task) {
+		TRACE_OFF();
+		if (recursive)
+			return 1;
+		printk("\n==========================================\n");
+		printk(  "[ BUG: lock recursion deadlock detected! |\n");
+		printk(  "------------------------------------------\n");
+		printk("already locked: ");
+		printk_lock(lock, 1);
+		show_held_locks(task);
+		printk("\n-{current task's backtrace}----------------->\n");
+		dump_stack();
+		show_all_locks();
+		printk("[ turning off deadlock detection. Please report this trace. ]\n\n");
+		local_irq_disable();
+		return 0;
+	}
+	/*
+	 * Skip the BKL:
+	 */
+	if (lockblk == &kernel_sem.lock)
+		return 0;
+	if (lockblk && check_deadlock(lockblk, 1, eip)) {
+		printk("\n============================================\n");
+		printk(  "[ BUG: circular locking deadlock detected! ]\n");
+		printk(  "--------------------------------------------\n");
+		printk("%s/%d is deadlocking current task %s/%d\n\n",
+			task->comm, task->pid, current->comm, current->pid);
+		printk("\n1) %s/%d is trying to acquire this lock:\n",
+			current->comm, current->pid);
+		printk_lock(lock, 1);
+
+		printk("... trying at:   ");
+		print_symbol("%s\n", eip);
+
+		printk("\n2) %s/%d is blocked on this lock:\n",
+			task->comm, task->pid);
+		printk_lock(lockblk, 1);
+
+		show_held_locks(current);
+		show_held_locks(task);
+
+		printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
+		show_stack(task, NULL);
+		printk("\n%s/%d's [current] stackdump:\n\n",
+			current->comm, current->pid);
+		dump_stack();
+		show_all_locks();
+		printk("[ turning off deadlock detection. Please report this trace. ]\n\n");
+		local_irq_disable();
+		return 0;
+	}
+	return 0;
+}
+
+/* Report locks and PI waiters that 'task' still holds at exit time: */
+void check_no_held_locks(struct task_struct *task)
+{
+	struct list_head *curr, *next, *cursor = NULL;
+	struct rt_mutex *lock;
+	struct rt_mutex_waiter *w;
+	unsigned long flags;
+
+	if (!trace_on)
+		return;
+restart:
+	trace_lock_irqsave(&trace_lock, flags);
+	list_for_each_safe(curr, next, &held_locks) {
+		if (cursor && curr != cursor)
+			continue;
+		cursor = NULL;	/* resume point reached, check the rest */
+		lock = list_entry(curr, struct rt_mutex, held_list);
+		if (lock->owner != task)
+			continue;
+		cursor = next;
+		list_del_init(curr);
+		trace_unlock_irqrestore(&trace_lock, flags);
+
+		if (lock == &kernel_sem.lock) {
+			printk("BUG: %s/%d, BKL held at task exit time!\n",
+				current->comm, current->pid);
+			printk("BKL acquired at: ");
+			print_symbol("%s\n",
+				(unsigned long) current->last_kernel_lock);
+		} else
+			printk("BUG: %s/%d, lock held at task exit time!\n",
+				current->comm, current->pid);
+		printk_lock(lock, 1);
+		if (lock->owner != task)
+			printk("exiting task is not even the owner??\n");
+		goto restart;
+	}
+	spin_lock(&pi_lock);
+	list_for_each(curr, &task->pi_waiters) {
+		w = list_entry(curr, struct rt_mutex_waiter, pi_list);
+		TRACE_OFF();
+		spin_unlock(&pi_lock);
+		trace_unlock_irqrestore(&trace_lock, flags);
+
+		printk("hm, PI interest held at exit time? Task:\n");
+		printk_task(task);
+		printk_waiter(w);
+		return;
+	}
+	spin_unlock(&pi_lock);
+	trace_unlock_irqrestore(&trace_lock, flags);
+}
+
+#endif
+
+#if defined(ALL_TASKS_PI) && defined(CONFIG_RT_DEADLOCK_DETECT)
+
+static void
+check_pi_list_present(struct rt_mutex *lock, struct rt_mutex_waiter *waiter,
+		      struct task_struct *old_owner)
+{
+	struct rt_mutex_waiter *w;
+	struct list_head *curr;
+
+	TRACE_WARN_ON(list_empty(&waiter->pi_list));
+	TRACE_WARN_ON(lock->owner);
+
+	list_for_each(curr, &old_owner->pi_waiters) {
+		w = list_entry(curr, struct rt_mutex_waiter, pi_list);
+		if (w == waiter)
+			goto ok;
+	}
+	TRACE_WARN_ON(1);
+ok:
+}
+
+static void
+check_pi_list_empty(struct rt_mutex *lock, struct task_struct *old_owner)
+{
+	struct rt_mutex_waiter *w;
+	struct list_head *curr;
+
+	list_for_each(curr, &old_owner->pi_waiters) {
+		w = list_entry(curr, struct rt_mutex_waiter, pi_list);
+		if (w->lock == lock) {
+			TRACE_OFF();
+			printk("hm, PI interest but no waiter? Old owner:\n");
+			printk_waiter(w);
+			printk("\n");
+			TRACE_WARN_ON(1);
+			return;
+		}
+	}
+}
+
+#else
+
+static inline void
+check_pi_list_present(struct rt_mutex *lock, struct rt_mutex_waiter *waiter,
+		      struct task_struct *old_owner)
+{
+}
+
+static inline void
+check_pi_list_empty(struct rt_mutex *lock, struct task_struct *old_owner)
+{
+}
+
+#endif
+
+/*
+ * Move PI waiters of this lock to the new owner:
+ */
+static void
+change_owner(struct rt_mutex *lock, struct task_struct *old_owner,
+		   struct task_struct *new_owner)
+{
+	struct list_head *curr, *next;
+	struct rt_mutex_waiter *w;
+	int requeued = 0, sum = 0;
+
+	if (old_owner == new_owner)
+		return;
+	list_for_each_safe(curr, next, &old_owner->pi_waiters) {
+		w = list_entry(curr, struct rt_mutex_waiter, pi_list);
+		if (w->lock == lock) {
+			list_del_init(curr);
+			list_add_tail(curr, &new_owner->pi_waiters);
+			requeued++;
+		}
+		sum++;
+	}
+	trace_special(sum, requeued, 0);
+}
+
+int pi_walk, pi_null, pi_prio;
+
+static void pi_setprio(struct rt_mutex *lock, struct task_struct *p, int prio)
+{
+	if (unlikely(!p->pid)) {
+		pi_null++;
+		return;
+	}
+
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+	pi_prio++;
+	if (p->policy != SCHED_NORMAL && prio > mutex_getprio(p)) {
+		TRACE_OFF();
+
+		printk("huh? (%d->%d??)\n", p->prio, prio);
+		printk("owner:\n");
+		printk_task(p);
+		printk("\ncurrent:\n");
+		printk_task(current);
+		printk("\nlock:\n");
+		printk_lock(lock, 1);
+		dump_stack();
+		local_irq_disable();
+	}
+#endif
+	/*
+	 * If the task is blocked on some other task then boost that
+	 * other task (or tasks) too:
+	 */
+	for (;;) {
+		struct rt_mutex_waiter *w = p->blocked_on;
+		int was_rt = rt_task(p);
+
+		mutex_setprio(p, prio);
+		if (!w)
+			break;
+		/*
+		 * If the task is blocked on a lock, and we just made
+		 * it RT, then register the task in the PI list and
+		 * requeue it to the head of the wait list:
+		 */
+		lock = w->lock;
+		TRACE_BUG_ON(!lock);
+		TRACE_BUG_ON(!lock->owner);
+		if (rt_task(p) && list_empty(&w->pi_list)) {
+			TRACE_BUG_ON(was_rt);
+			list_add_tail(&w->pi_list, &lock->owner->pi_waiters);
+			list_del(&w->list);
+			list_add(&w->list, &lock->wait_list);
+		}
+		/*
+		 * If the task is blocked on a lock, and we just restored
+		 * it from RT to non-RT then unregister the task from
+		 * the PI list and requeue it to the tail of the wait
+		 * list:
+		 *
+		 * (TODO: this can be unfair to SCHED_NORMAL tasks if they
+		 *        get PI handled.)
+		 */
+		if (!rt_task(p) && !list_empty(&w->pi_list)) {
+			TRACE_BUG_ON(!was_rt);
+			list_del(&w->pi_list);
+			list_del(&w->list);
+			list_add_tail(&w->list, &lock->wait_list);
+		}
+
+		pi_walk++;
+
+		p = lock->owner;
+		TRACE_BUG_ON(!p);
+		/*
+		 * If the dependee is already higher-prio then
+		 * no need to boost it, and all further tasks down
+		 * the dependency chain are already boosted:
+		 */
+		if (p->prio <= prio)
+			break;
+	}
+}
+
+static void
+task_blocks_on_lock(struct rt_mutex_waiter *waiter, struct task_struct *task,
+		   struct rt_mutex *lock, unsigned long eip)
+{
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+	if (lock->debug)
+		check_deadlock(lock, 0, eip);
+	/* mark the current thread as blocked on the lock */
+	waiter->eip = eip;
+#endif
+	task->blocked_on = waiter;
+	waiter->lock = lock;
+	waiter->task = task;
+	INIT_LIST_HEAD(&waiter->pi_list);
+	/*
+	 * Add SCHED_NORMAL tasks to the end of the waitqueue (FIFO):
+	 */
+#ifndef ALL_TASKS_PI
+	if (!rt_task(task)) {
+		list_add_tail(&waiter->list, &lock->wait_list);
+		return;
+	}
+#endif
+	spin_lock(&pi_lock);
+	list_add_tail(&waiter->pi_list, &lock->owner->pi_waiters);
+	/*
+	 * Add RT tasks to the head:
+	 */
+	list_add(&waiter->list, &lock->wait_list);
+	/*
+	 * If the waiter has higher priority than the owner
+	 * then temporarily boost the owner:
+	 */
+	if (task->prio < lock->owner->prio)
+		pi_setprio(lock, lock->owner, task->prio);
+	spin_unlock(&pi_lock);
+}
+
+/*
+ * initialise the lock:
+ */
+static void __init_rt_mutex(struct rt_mutex *lock, int save_state, int debug,
+				char *name, char *file, int line)
+{
+	lock->owner = NULL;
+	spin_lock_init(&lock->wait_lock);
+	INIT_LIST_HEAD(&lock->wait_list);
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+	lock->save_state = save_state;
+	lock->debug = debug;
+	INIT_LIST_HEAD(&lock->held_list);
+	lock->name = name;
+	lock->file = file;
+	lock->line = line;
+#endif
+}
+
+void fastcall __init_rwsem(struct rw_semaphore *rwsem, int save_state,
+			int debug, char *name, char *file, int line)
+{
+	__init_rt_mutex(&rwsem->lock, save_state, debug, name, file, line);
+	rwsem->read_depth = 0;
+}
+EXPORT_SYMBOL(__init_rwsem);
+
+static void set_new_owner(struct rt_mutex *lock, struct task_struct *old_owner,
+			struct task_struct *new_owner, unsigned long eip)
+{
+	if (new_owner)
+		trace_special_pid(new_owner->pid, new_owner->prio, 0);
+	if (old_owner)
+		change_owner(lock, old_owner, new_owner);
+	lock->owner = new_owner;
+	lock->owner_prio = new_owner->prio;
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+	if (lock->debug) {
+		TRACE_WARN_ON(!list_empty(&lock->held_list));
+		list_add_tail(&lock->held_list, &held_locks);
+	}
+	lock->acquire_eip = eip;
+#endif
+}
+
+/*
+ * handle the lock release when processes are blocked on it that can now run
+ * - the spinlock must be held by the caller
+ */
+static inline struct task_struct * pick_new_owner(struct rt_mutex *lock,
+		struct task_struct *old_owner, int save_state,
+		unsigned long eip)
+{
+	struct rt_mutex_waiter *w, *waiter = NULL;
+	struct task_struct *new_owner;
+	struct list_head *curr;
+
+	/*
+	 * Get the highest prio one:
+	 *
+	 * (same-prio RT tasks go FIFO)
+	 */
+	list_for_each(curr, &lock->wait_list) {
+		w = list_entry(curr, struct rt_mutex_waiter, list);
+		trace_special_pid(w->task->pid, w->task->prio, 0);
+		/*
+		 * Break out upon meeting the first non-RT-prio
+		 * task - we inserted them to the tail, so if we
+	 	 * see the first one the rest is SCHED_NORMAL too:
+	 	 */
+		if (!rt_task(w->task))
+			break;
+		if (!waiter || w->task->prio <= waiter->task->prio)
+			waiter = w;
+	}
+
+	/*
+	 * If no RT waiter then pick the first one:
+	 */
+	if (!waiter)
+		waiter = list_entry(lock->wait_list.next,
+					struct rt_mutex_waiter, list);
+	trace_special_pid(waiter->task->pid, waiter->task->prio, 0);
+
+#ifdef ALL_TASKS_PI
+	check_pi_list_present(lock, waiter, old_owner);
+#endif
+	new_owner = waiter->task;
+	list_del_init(&waiter->list);
+
+	list_del_init(&waiter->pi_list);
+
+	set_new_owner(lock, old_owner, new_owner, eip);
+	/* Don't touch waiter after ->task has been NULLed */
+	mb();
+	waiter->task = NULL;
+	new_owner->blocked_on = NULL;
+	TRACE_WARN_ON(save_state != lock->save_state);
+
+	return new_owner;
+}
+
+static inline void init_lists(struct rt_mutex *lock)
+{
+	// we have to do this until the static initializers get fixed:
+	if (!lock->wait_list.prev && !lock->wait_list.next)
+		INIT_LIST_HEAD(&lock->wait_list);
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+	if (!lock->held_list.prev && !lock->held_list.next)
+		INIT_LIST_HEAD(&lock->held_list);
+#endif
+}
+
+/*
+ * lock it semaphore-style: no worries about missed wakeups.
+ */
+static void __sched __down(struct rt_mutex *lock, unsigned long eip)
+{
+	struct task_struct *task = current;
+	unsigned long flags, nosched_flag;
+	struct rt_mutex_waiter waiter;
+
+	trace_lock_irqsave(&trace_lock, flags);
+	TRACE_BUG_ON(!irqs_disabled());
+	spin_lock(&lock->wait_lock);
+
+	init_lists(lock);
+
+	if (!lock->owner) {
+		/* granted */
+		TRACE_WARN_ON(!list_empty(&lock->wait_list));
+		spin_lock(&pi_lock);
+		set_new_owner(lock, NULL, task, eip);
+		spin_unlock(&pi_lock);
+		spin_unlock(&lock->wait_lock);
+		trace_unlock_irqrestore(&trace_lock, flags);
+
+		return;
+	}
+
+	set_task_state(task, TASK_UNINTERRUPTIBLE);
+
+	task_blocks_on_lock(&waiter, task, lock, eip);
+
+	TRACE_BUG_ON(!irqs_disabled());
+	/* we don't need to touch the lock struct anymore */
+	spin_unlock(&lock->wait_lock);
+	trace_unlock_irqrestore(&trace_lock, flags);
+
+	might_sleep();
+
+	nosched_flag = current->flags & PF_NOSCHED;
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+	if (!lock->debug)
+#endif
+		current->flags &= ~PF_NOSCHED;
+
+	/* wait to be given the lock */
+	for (;;) {
+		if (!waiter.task)
+			break;
+		schedule();
+		set_task_state(task, TASK_UNINTERRUPTIBLE);
+	}
+	current->flags |= nosched_flag;
+	task->state = TASK_RUNNING;
+}
+
+/*
+ * get a write lock on the rw-semaphore
+ */
+void fastcall __sched down_write(struct rw_semaphore *rwsem)
+{
+	__down(&rwsem->lock, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(down_write);
+
+/*
+ * get a read lock on the rw-semaphore (blocks uninterruptibly via __down())
+ */
+void fastcall __sched down_read(struct rw_semaphore *rwsem)
+{
+	/*
+	 * Read locks within the self-held write lock just nest.
+	 */
+	if (rwsem->lock.owner == current) {
+		rwsem->read_depth++;
+		return;
+	}
+	__down(&rwsem->lock, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(down_read);
+
+/*
+ * lock it mutex-style: this variant is very careful not to
+ * miss any non-mutex wakeups.
+ *
+ * The wakeup side uses wake_up_process_mutex, which, combined with
+ * the xchg code of this function is a transparent sleep/wakeup
+ * mechanism nested within any existing sleep/wakeup mechanism. This
+ * enables the seamless use of arbitrary (blocking) spinlocks within
+ * sleep/wakeup event loops.
+ */
+static void __sched __down_mutex(struct rt_mutex *lock, unsigned long eip)
+{
+	unsigned long state, saved_state, nosched_flag;
+	struct task_struct *task = current;
+	struct rt_mutex_waiter waiter;
+	int got_wakeup = 0;
+
+	might_sleep();
+
+	trace_lock_irq(&trace_lock);
+	TRACE_BUG_ON(!irqs_disabled());
+	spin_lock(&lock->wait_lock);
+
+	init_lists(lock);
+
+	if (!lock->owner) {
+		/* granted */
+		TRACE_WARN_ON(!list_empty(&lock->wait_list));
+		spin_lock(&pi_lock);
+		set_new_owner(lock, NULL, task, eip);
+		spin_unlock(&pi_lock);
+		spin_unlock(&lock->wait_lock);
+		trace_unlock_irq(&trace_lock);
+
+		return;
+	}
+
+	task_blocks_on_lock(&waiter, task, lock, eip);
+
+	TRACE_BUG_ON(!irqs_disabled());
+	/*
+	 * Here we save whatever state the task was in originally,
+	 * we'll restore it at the end of the function and we'll
+	 * take any intermediate wakeup into account as well,
+	 * independently of the mutex sleep/wakeup mechanism:
+	 */
+	saved_state = xchg(&task->state, TASK_UNINTERRUPTIBLE);
+
+	/* we don't need to touch the lock struct anymore */
+	spin_unlock(&lock->wait_lock);
+	trace_unlock(&trace_lock);
+
+	nosched_flag = current->flags & PF_NOSCHED;
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+	if (!lock->debug)
+#endif
+		current->flags &= ~PF_NOSCHED;
+
+	/* wait to be given the lock */
+	for (;;) {
+		unsigned long saved_flags = current->flags & PF_NOSCHED;
+
+		if (!waiter.task)
+			break;
+		local_irq_enable();
+		current->flags &= ~PF_NOSCHED;
+		schedule();
+		current->flags |= saved_flags;
+		local_irq_disable();
+		state = xchg(&task->state, TASK_UNINTERRUPTIBLE);
+		if (state == TASK_RUNNING)
+			got_wakeup = 1;
+	}
+	/*
+	 * Only set the task's state to TASK_RUNNING if it got
+	 * a non-mutex wakeup. We keep the original state otherwise.
+	 * A mutex wakeup changes the task's state to TASK_RUNNING_MUTEX,
+	 * not TASK_RUNNING - hence we can differentiate between the two
+	 * cases:
+	 */
+	state = xchg(&task->state, saved_state);
+	if (state == TASK_RUNNING)
+		got_wakeup = 1;
+	if (got_wakeup)
+		task->state = TASK_RUNNING;
+	local_irq_enable();
+	preempt_check_resched();
+
+	current->flags |= nosched_flag;
+}
+
+/*
+ * TODO: push this into __down_mutex()
+ *
+ * BKL users expect the BKL to be held across spinlock/rwlock-acquire.
+ * Save and clear it, this will cause the scheduler to not drop the
+ * BKL semaphore if we end up scheduling:
+ */
+#define SAVE_BKL(ACTION)					\
+{								\
+	struct task_struct *task = current;			\
+	int saved_lock_depth;					\
+								\
+	saved_lock_depth = task->lock_depth;			\
+	task->lock_depth = -1;					\
+								\
+	might_sleep();						\
+	ACTION;							\
+								\
+	task->lock_depth = saved_lock_depth;			\
+}
+
+
+static void __sched down_write_mutex(struct rw_semaphore *rwsem,
+					unsigned long eip)
+{
+	SAVE_BKL(__down_mutex(&rwsem->lock, eip));
+}
+
+static void __sched down_read_mutex(struct rw_semaphore *rwsem,
+					unsigned long eip)
+{
+	/*
+	 * Read locks within the write lock succeed.
+	 */
+	if (rwsem->lock.owner == current) {
+		rwsem->read_depth++;
+		return;
+	}
+	SAVE_BKL(__down_mutex(&rwsem->lock, eip));
+}
+
+/*
+ * get a lock - interruptible: returns 0 if acquired, -EINTR on a signal
+ */
+static int __sched __down_interruptible(struct rt_mutex *lock,
+					unsigned long eip)
+{
+	struct task_struct *task = current;
+	unsigned long flags, nosched_flag;
+	struct rt_mutex_waiter waiter;
+	int ret;
+
+	trace_lock_irqsave(&trace_lock, flags);
+	TRACE_BUG_ON(!irqs_disabled());
+	spin_lock(&lock->wait_lock);
+
+	init_lists(lock);
+
+	if (!lock->owner) {
+		/* granted */
+		TRACE_WARN_ON(!list_empty(&lock->wait_list));
+		spin_lock(&pi_lock);
+		set_new_owner(lock, NULL, task, eip);
+		spin_unlock(&pi_lock);
+		spin_unlock(&lock->wait_lock);
+		trace_unlock_irqrestore(&trace_lock, flags);
+
+		return 0;
+	}
+
+	set_task_state(task, TASK_INTERRUPTIBLE);
+
+	task_blocks_on_lock(&waiter, task, lock, eip);
+
+	TRACE_BUG_ON(!irqs_disabled());
+	/* we don't need to touch the lock struct anymore */
+	spin_unlock(&lock->wait_lock);
+	trace_unlock_irqrestore(&trace_lock, flags);
+
+	might_sleep();
+
+	nosched_flag = current->flags & PF_NOSCHED;
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+	if (!lock->debug)
+#endif
+		current->flags &= ~PF_NOSCHED;
+
+	ret = 0;
+	/* wait to be given the lock */
+	for (;;) {
+		if (signal_pending(current)) {
+			/*
+			 * Remove ourselves from the wait list if we
+			 * didn't get the lock - else return success:
+			 */
+			trace_lock_irq(&trace_lock);
+			spin_lock(&lock->wait_lock);
+			if (waiter.task) {
+				list_del_init(&waiter.list);
+				/*
+				 * Leave the PI list (a lingering boost is ok,
+				 * owner restores prio), unhook stack waiter:
+				 */
+				spin_lock(&pi_lock);
+				list_del_init(&waiter.pi_list);
+				task->blocked_on = NULL;
+				spin_unlock(&pi_lock);
+				ret = -EINTR;
+			}
+			spin_unlock(&lock->wait_lock);
+			trace_unlock_irq(&trace_lock);
+			break;
+		}
+		if (!waiter.task)
+			break;
+		schedule();
+		set_task_state(task, TASK_INTERRUPTIBLE);
+	}
+
+	task->state = TASK_RUNNING;
+	current->flags |= nosched_flag;
+
+	return ret;
+}
+
+int fastcall __sched down_write_interruptible(struct rw_semaphore *rwsem)
+{
+	return __down_interruptible(&rwsem->lock, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(down_write_interruptible);
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+static int __down_trylock(struct rt_mutex *lock, unsigned long eip)
+{
+	struct task_struct *task = current;
+	unsigned long flags;
+	int ret = 0;
+
+	trace_lock_irqsave(&trace_lock, flags);
+	TRACE_BUG_ON(!irqs_disabled());
+	spin_lock(&lock->wait_lock);
+
+	init_lists(lock);
+
+	if (!lock->owner) {
+		/* granted */
+		TRACE_WARN_ON(!list_empty(&lock->wait_list));
+		spin_lock(&pi_lock);
+		set_new_owner(lock, NULL, task, eip);
+		spin_unlock(&pi_lock);
+		ret = 1;
+	}
+
+	spin_unlock(&lock->wait_lock);
+	trace_unlock_irqrestore(&trace_lock, flags);
+
+	return ret;
+}
+
+int fastcall down_write_trylock(struct rw_semaphore *rwsem)
+{
+	return __down_trylock(&rwsem->lock, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(down_write_trylock);
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+int fastcall down_read_trylock(struct rw_semaphore *rwsem)
+{
+	/*
+	 * Read locks within the self-held write lock succeed.
+	 */
+	if (rwsem->lock.owner == current) {
+		rwsem->read_depth++;
+		return 1;
+	}
+	return __down_trylock(&rwsem->lock, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(down_read_trylock);
+
+static int down_write_trylock_mutex(struct rw_semaphore *rwsem)
+{
+	return __down_trylock(&rwsem->lock, CALLER_ADDR0);
+}
+
+static int down_read_trylock_mutex(struct rw_semaphore *rwsem)
+{
+	/*
+	 * Read locks within the self-held write lock succeed.
+	 */
+	if (rwsem->lock.owner == current) {
+		rwsem->read_depth++;
+		return 1;
+	}
+	return __down_trylock(&rwsem->lock, CALLER_ADDR0);
+}
+
+/*
+ * release the lock:
+ */
+static void __up_mutex(struct rt_mutex *lock, int save_state, unsigned long eip)
+{
+	struct task_struct *old_owner, *new_owner;
+	struct rt_mutex_waiter *w;
+	struct list_head *curr;
+	unsigned long flags;
+	int prio;
+
+	TRACE_WARN_ON(save_state != lock->save_state);
+
+	trace_lock_irqsave(&trace_lock, flags);
+	TRACE_BUG_ON(!irqs_disabled());
+	spin_lock(&lock->wait_lock);
+	TRACE_BUG_ON(!lock->wait_list.prev && !lock->wait_list.next);
+
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+	if (lock->debug) {
+		TRACE_WARN_ON(list_empty(&lock->held_list));
+		list_del_init(&lock->held_list);
+	}
+#endif
+	spin_lock(&pi_lock);
+
+	old_owner = lock->owner;
+#ifdef ALL_TASKS_PI
+	if (list_empty(&lock->wait_list))
+		check_pi_list_empty(lock, old_owner);
+#endif
+	lock->owner = NULL;
+	new_owner = NULL;
+	if (!list_empty(&lock->wait_list))
+		new_owner = pick_new_owner(lock, old_owner, save_state, eip);
+
+	/*
+	 * If the owner got priority-boosted then restore it
+	 * to the previous priority (or to the next highest prio
+	 * waiter's priority):
+	 */
+	prio = mutex_getprio(old_owner);
+	list_for_each(curr, &old_owner->pi_waiters) {
+		w = list_entry(curr, struct rt_mutex_waiter, pi_list);
+		if (w->task->prio < prio)
+			prio = w->task->prio;
+		trace_special_pid(w->task->pid, w->task->prio, 0);
+	}
+	if (prio != old_owner->prio)
+		pi_setprio(lock, old_owner, prio);
+	if (new_owner) {
+		if (save_state)
+			wake_up_process_mutex(new_owner);
+		else
+			wake_up_process(new_owner);
+	}
+	spin_unlock(&pi_lock);
+	spin_unlock(&lock->wait_lock);
+
+#ifdef PREEMPT_DIRECT
+	trace_unlock(&trace_lock);
+	/*
+	 * Common place where preemption is requested - if we can
+	 * reschedule then do it here without enabling interrupts
+	 * again (and lengthening latency):
+	 */
+	if (need_resched() && !irqs_disabled_flags(flags) && !preempt_count())
+		preempt_schedule_irq();
+	local_irq_restore(flags);
+#else
+	trace_unlock_irqrestore(&trace_lock, flags);
+#endif
+	/* no need to check for preempt here - we just handled it */
+}
+
+/*
+ * Do owner check too:
+ */
+void fastcall up_write(struct rw_semaphore *rwsem)
+{
+	WARN_ON(rwsem->lock.owner != current);
+	BUG_ON(rwsem->read_depth);
+	__up_mutex(&rwsem->lock, 0, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(up_write);
+
+static void _up_write(struct rw_semaphore *rwsem, unsigned long eip)
+{
+	WARN_ON(rwsem->lock.owner != current);
+	BUG_ON(rwsem->read_depth);
+	__up_mutex(&rwsem->lock, 0, eip);
+}
+
+void fastcall up_write_mutex(struct rw_semaphore *rwsem, unsigned long eip)
+{
+	TRACE_WARN_ON(rwsem->lock.save_state != 1);
+	WARN_ON(rwsem->lock.owner != current);
+	BUG_ON(rwsem->read_depth);
+	__up_mutex(&rwsem->lock, 1, eip);
+}
+
+/*
+ * release a read lock on the semaphore (or unnest one, see below)
+ */
+void fastcall up_read(struct rw_semaphore *rwsem)
+{
+	/*
+	 * A read lock nested within the self-held write lock: unnest.
+	 */
+	if (rwsem->lock.owner == current && rwsem->read_depth) {
+		rwsem->read_depth--;
+		return;
+	}
+	_up_write(rwsem, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(up_read);
+
+void fastcall up_read_mutex(struct rw_semaphore *rwsem, unsigned long eip)
+{
+	TRACE_WARN_ON(rwsem->lock.save_state != 1);
+	/*
+	 * A read lock nested within the self-held write lock: unnest.
+	 */
+	if (rwsem->lock.owner == current && rwsem->read_depth) {
+		rwsem->read_depth--;
+		return;
+	}
+	up_write_mutex(rwsem, eip);
+}
+
+/*
+ * downgrade a write lock into a read lock
+ * - just wake up any readers at the front of the queue
+ */
+void fastcall downgrade_write(struct rw_semaphore *rwsem)
+{
+	BUG();
+}
+EXPORT_SYMBOL(downgrade_write);
+
+static int rt_mutex_is_locked(struct rt_mutex *lock)
+{
+	/*
+	 * Issue a full memory barrier first, then report whether
+	 * the lock currently has an owner:
+	 */
+	mb();
+	return lock->owner != NULL;
+}
+
+int fastcall rwsem_is_locked(struct rw_semaphore *rwsem)
+{
+	return rt_mutex_is_locked(&rwsem->lock);	/* nonzero while the lock has an owner */
+}
+EXPORT_SYMBOL(rwsem_is_locked);
+
+static void _down_mutex(struct rt_mutex *lock, unsigned long eip)
+{
+	TRACE_WARN_ON(lock->save_state != 1);	/* same check as in up_mutex() */
+	__down_mutex(lock, eip);	/* eip: caller address handed down by __spin_lock() */
+}
+
+void fastcall __sema_init(struct semaphore *sem, int val, int debug,
+			  char *name, char *file, int line)
+{
+	atomic_set(&sem->count, val);
+	__init_rt_mutex(&sem->lock, 0, debug, name, file, line);
+
+	/*
+	 * A semaphore created with a zero count starts out with its
+	 * blocking mutex already taken, so that the first down() has
+	 * to wait for an up():
+	 */
+	if (val == 0)
+		__down(&sem->lock, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(__sema_init);
+
+void fastcall __init_MUTEX(struct semaphore *sem, char *name, char *file,
+			   int line)
+{
+	__sema_init(sem, 1, 1, name, file, line);	/* val 1, debug 1 */
+}
+EXPORT_SYMBOL(__init_MUTEX);
+
+/*
+ * We initialize them to nodebug because mutexes that are initialized
+ * locked are almost always used for completion purposes, not genuine
+ * locking:
+ */
+void fastcall __init_MUTEX_LOCKED(struct semaphore *sem, char *name,
+				  char *file, int line)
+{
+	__sema_init(sem, 0, 0, name, file, line);	/* val 0 (starts locked), debug 0 */
+}
+EXPORT_SYMBOL(__init_MUTEX_LOCKED);
+
+static int down_trylock_mutex(struct rt_mutex *lock, unsigned long eip)
+{
+	TRACE_WARN_ON(lock->save_state != 1);
+	return __down_trylock(lock, eip);	/* nonzero means the lock was taken */
+}
+
+void fastcall up_mutex(struct rt_mutex *lock, unsigned long eip)
+{
+	TRACE_WARN_ON(lock->save_state != 1);
+	WARN_ON(lock->owner != current);	/* only the owner may unlock */
+	__up_mutex(lock, 1, eip);
+}
+
+/*
+ * Linux Semaphores implemented via RT-mutexes.
+ *
+ * In the down() variants we use the mutex as the semaphore blocking
+ * object: we always acquire it, decrease the counter and keep the lock
+ * locked if we did the 1->0 transition. The next down() will then block.
+ *
+ * In the up() path we atomically increase the counter and do the
+ * unlock if we were the one doing the 0->1 transition.
+ */
+
+static inline void __down_complete(struct semaphore *sem, unsigned long eip)
+{
+	int count = atomic_dec_return(&sem->count);
+
+	TRACE_WARN_ON(sem->lock.save_state != 0);
+	WARN_ON(count < 0);	/* the count must never drop below zero */
+
+	if (count > 0)	/* counts remain: let the next down() in */
+		__up_mutex(&sem->lock, 0, eip);
+}
+
+void fastcall down(struct semaphore *sem)
+{
+	TRACE_WARN_ON(sem->lock.save_state != 0);
+	__down(&sem->lock, CALLER_ADDR0);	/* take the blocking mutex first */
+	__down_complete(sem, CALLER_ADDR0);	/* then consume one count */
+}
+EXPORT_SYMBOL(down);
+
+int fastcall down_interruptible(struct semaphore *sem)
+{
+	int err;
+
+	TRACE_WARN_ON(sem->lock.save_state != 0);
+	err = __down_interruptible(&sem->lock, CALLER_ADDR0);
+	/* only a zero return lets us go on and consume a count: */
+	if (!err)
+		__down_complete(sem, CALLER_ADDR0);
+	return err;
+}
+EXPORT_SYMBOL(down_interruptible);
+
+/*
+ * try to down the semaphore, 0 on success and 1 on failure. (inverted)
+ */
+int fastcall down_trylock(struct semaphore *sem)
+{
+	TRACE_WARN_ON(sem->lock.save_state != 0);
+	/*
+	 * Unlike ordinary Linux semaphores this can fail 'transiently':
+	 * a task taking the count from, say, 9 to 8 briefly holds the
+	 * embedded mutex, and a trylock racing with that sees it as busy.
+	 * Removing these transient failures would be quite complex, so
+	 * the simple approach is tried first:
+	 */
+	if (!__down_trylock(&sem->lock, CALLER_ADDR0))
+		return 1;
+
+	__down_complete(sem, CALLER_ADDR0);
+	return 0;
+}
+EXPORT_SYMBOL(down_trylock);
+
+void fastcall up(struct semaphore *sem)
+{
+	TRACE_WARN_ON(sem->lock.save_state != 0);
+
+	/*
+	 * Preemption stays off between the increment and the unlock, so
+	 * that a high-priority task looping on a trylock cannot preempt
+	 * us in between and get into an infinite loop:
+	 */
+	preempt_disable();
+	/*
+	 * Whoever moves the count from 0 to 1 is the one to release
+	 * the blocking mutex:
+	 */
+	if (atomic_inc_return(&sem->count) == 1)
+		__up_mutex(&sem->lock, 0, CALLER_ADDR0);
+	preempt_enable();
+}
+EXPORT_SYMBOL(up);
+
+int fastcall sem_is_locked(struct semaphore *sem)
+{
+	TRACE_WARN_ON(sem->lock.save_state != 0);
+	return rt_mutex_is_locked(&sem->lock);	/* nonzero while the mutex has an owner */
+}
+EXPORT_SYMBOL(sem_is_locked);
+
+int fastcall sema_count(struct semaphore *sem)
+{
+	TRACE_WARN_ON(sem->lock.save_state != 0);
+	return atomic_read(&sem->count);	/* plain snapshot, no lock taken */
+}
+EXPORT_SYMBOL(sema_count);
+
+/*
+ * Spinlock wrappers - all lock variants funnel into __spin_lock():
+ */
+
+static void __spin_lock(spinlock_t *lock, unsigned long eip)
+{
+	SAVE_BKL(_down_mutex(&lock->lock, eip));
+}
+
+void _spin_lock(spinlock_t *spin)
+{
+	__spin_lock(spin, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(_spin_lock);
+
+void _spin_lock_bh(spinlock_t *spin)
+{
+	__spin_lock(spin, CALLER_ADDR0);	/* same body as _spin_lock() */
+}
+EXPORT_SYMBOL(_spin_lock_bh);
+
+void _spin_lock_irq(spinlock_t *spin)
+{
+	__spin_lock(spin, CALLER_ADDR0);	/* same body as _spin_lock() */
+}
+EXPORT_SYMBOL(_spin_lock_irq);
+
+unsigned long _spin_lock_irqsave(spinlock_t *spin)
+{
+	unsigned long flags;
+
+	__spin_lock(spin, CALLER_ADDR0);
+	local_save_flags(flags);	/* flags are only sampled, after the lock is held */
+
+	return flags;
+}
+EXPORT_SYMBOL(_spin_lock_irqsave);
+
+void _spin_unlock(spinlock_t *lock)
+{
+	up_mutex(&lock->lock, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(_spin_unlock);
+
+void _spin_unlock_wait(spinlock_t *lock)
+{
+	do {
+		barrier();
+	} while (_spin_is_locked(lock));	/* poll until the lock has no owner */
+}
+EXPORT_SYMBOL(_spin_unlock_wait);
+
+void _spin_unlock_bh(spinlock_t *lock)
+{
+	up_mutex(&lock->lock, CALLER_ADDR0);	/* same body as _spin_unlock() */
+}
+EXPORT_SYMBOL(_spin_unlock_bh);
+
+void _spin_unlock_irq(spinlock_t *lock)
+{
+	up_mutex(&lock->lock, CALLER_ADDR0);	/* same body as _spin_unlock() */
+}
+EXPORT_SYMBOL(_spin_unlock_irq);
+
+void _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+{
+	up_mutex(&lock->lock, CALLER_ADDR0);	/* 'flags' is not used here */
+}
+EXPORT_SYMBOL(_spin_unlock_irqrestore);
+
+int _spin_trylock(spinlock_t *lock)
+{
+	return down_trylock_mutex(&lock->lock, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(_spin_trylock);
+
+int _spin_trylock_bh(spinlock_t *lock)
+{
+	return down_trylock_mutex(&lock->lock, CALLER_ADDR0);	/* same as _spin_trylock() */
+}
+EXPORT_SYMBOL(_spin_trylock_bh);
+
+int _spin_trylock_irq(spinlock_t *lock)
+{
+	return down_trylock_mutex(&lock->lock, CALLER_ADDR0);	/* same as _spin_trylock() */
+}
+EXPORT_SYMBOL(_spin_trylock_irq);
+
+int _spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
+{
+	local_save_flags(*flags);	/* stored whether or not the trylock succeeds */
+	return down_trylock_mutex(&lock->lock, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(_spin_trylock_irqsave);
+
+int _spin_is_locked(spinlock_t *lock)
+{
+	return rt_mutex_is_locked(&lock->lock);	/* nonzero while the lock has an owner */
+}
+EXPORT_SYMBOL(_spin_is_locked);
+
+int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
+{
+	__spin_lock(lock, CALLER_ADDR0);
+	if (!atomic_dec_and_test(atomic)) {
+		_spin_unlock(lock);	/* counter not yet zero: drop the lock again */
+		return 0;
+	}
+	return 1;	/* counter hit zero: return with the lock held */
+}
+EXPORT_SYMBOL(atomic_dec_and_spin_lock);
+
+void _spin_lock_init(spinlock_t *lock, char *name, char *file, int line)
+{
+	__init_rt_mutex(&lock->lock, 1, 1, name, file, line);	/* semaphores pass 0 as 2nd arg */
+}
+EXPORT_SYMBOL(_spin_lock_init);
+
+
+/*
+ * RW-lock wrappers - thin shims over the rwsem *_mutex primitives:
+ */
+int _read_trylock(rwlock_t *rwlock)
+{
+	return down_read_trylock_mutex(&rwlock->lock);
+}
+EXPORT_SYMBOL(_read_trylock);
+
+int _write_trylock(rwlock_t *rwlock)
+{
+	return down_write_trylock_mutex(&rwlock->lock);
+}
+EXPORT_SYMBOL(_write_trylock);
+
+void _write_lock(rwlock_t *rwlock)
+{
+	down_write_mutex(&rwlock->lock, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(_write_lock);
+
+void _read_lock(rwlock_t *rwlock)
+{
+	down_read_mutex(&rwlock->lock, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(_read_lock);
+
+void _write_unlock(rwlock_t *rwlock)
+{
+	up_write_mutex(&rwlock->lock, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(_write_unlock);
+
+void _read_unlock(rwlock_t *rwlock)
+{
+	up_read_mutex(&rwlock->lock, CALLER_ADDR0);
+}
+EXPORT_SYMBOL(_read_unlock);
+
+unsigned long _write_lock_irqsave(rwlock_t *rwlock)
+{
+	unsigned long flags;
+
+	down_write_mutex(&rwlock->lock, CALLER_ADDR0);
+
+	local_save_flags(flags);	/* flags are only sampled, after the lock is held */
+	return flags;
+}
+EXPORT_SYMBOL(_write_lock_irqsave);
+
+unsigned long _read_lock_irqsave(rwlock_t *rwlock)
+{
+	unsigned long flags;
+
+	down_read_mutex(&rwlock->lock, CALLER_ADDR0);
+
+	local_save_flags(flags);	/* flags are only sampled, after the lock is held */
+	return flags;
+}
+EXPORT_SYMBOL(_read_lock_irqsave);
+
+void _write_lock_irq(rwlock_t *rwlock)
+{
+	down_write_mutex(&rwlock->lock, CALLER_ADDR0);	/* same body as _write_lock() */
+}
+EXPORT_SYMBOL(_write_lock_irq);
+
+void _read_lock_irq(rwlock_t *rwlock)
+{
+	down_read_mutex(&rwlock->lock, CALLER_ADDR0);	/* same body as _read_lock() */
+}
+EXPORT_SYMBOL(_read_lock_irq);
+
+void _write_lock_bh(rwlock_t *rwlock)
+{
+	down_write_mutex(&rwlock->lock, CALLER_ADDR0);	/* same body as _write_lock() */
+}
+EXPORT_SYMBOL(_write_lock_bh);
+
+void _read_lock_bh(rwlock_t *rwlock)
+{
+	down_read_mutex(&rwlock->lock, CALLER_ADDR0);	/* same body as _read_lock() */
+}
+EXPORT_SYMBOL(_read_lock_bh);
+
+void _write_unlock_irq(rwlock_t *rwlock)
+{
+	up_write_mutex(&rwlock->lock, CALLER_ADDR0);	/* same body as _write_unlock() */
+}
+EXPORT_SYMBOL(_write_unlock_irq);
+
+void _read_unlock_irq(rwlock_t *rwlock)
+{
+	up_read_mutex(&rwlock->lock, CALLER_ADDR0);	/* same body as _read_unlock() */
+}
+EXPORT_SYMBOL(_read_unlock_irq);
+
+void _write_unlock_bh(rwlock_t *rwlock)
+{
+	up_write_mutex(&rwlock->lock, CALLER_ADDR0);	/* same body as _write_unlock() */
+}
+EXPORT_SYMBOL(_write_unlock_bh);
+
+void _read_unlock_bh(rwlock_t *rwlock)
+{
+	up_read_mutex(&rwlock->lock, CALLER_ADDR0);	/* same body as _read_unlock() */
+}
+EXPORT_SYMBOL(_read_unlock_bh);
+
+void _write_unlock_irqrestore(rwlock_t *rwlock,
+				       unsigned long flags)
+{
+	up_write_mutex(&rwlock->lock, CALLER_ADDR0);	/* 'flags' is not used here */
+}
+EXPORT_SYMBOL(_write_unlock_irqrestore);
+
+void _read_unlock_irqrestore(rwlock_t *rwlock, unsigned long flags)
+{
+	up_read_mutex(&rwlock->lock, CALLER_ADDR0);	/* 'flags' is not used here */
+}
+EXPORT_SYMBOL(_read_unlock_irqrestore);
+
+void _rwlock_init(rwlock_t *rwlock, char *name, char *file, int line)
+{
+	__init_rwsem(&rwlock->lock, 1, 1, name, file, line);
+}
+EXPORT_SYMBOL(_rwlock_init);
+
+int _rwlock_is_locked(rwlock_t *rwlock)
+{
+	return rwsem_is_locked(&rwlock->lock);	/* nonzero while the lock has an owner */
+}
+EXPORT_SYMBOL(_rwlock_is_locked);
+
--- linux/kernel/time.c.orig
+++ linux/kernel/time.c
@@ -98,6 +98,20 @@ asmlinkage long sys_stime(time_t __user 
 
 asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz)
 {
+#ifdef CONFIG_LATENCY_TRACE
+	if (!tv && ((long)tz == 1))
+		return user_trace_start();
+	if (!tv && !tz)
+		return user_trace_stop();
+#endif
+	if (((long)tv == 1) && ((long)tz == 1)) {
+		current->flags |= PF_NOSCHED;
+		return 0;
+	}
+	if (((long)tv == 1) && ((long)tz == 0)) {
+		current->flags &= ~PF_NOSCHED;
+		return 0;
+	}
 	if (likely(tv != NULL)) {
 		struct timeval ktv;
 		do_gettimeofday(&ktv);
--- linux/kernel/exit.c.orig
+++ linux/kernel/exit.c
@@ -47,8 +47,11 @@ static void __unhash_process(struct task
 	if (thread_group_leader(p)) {
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
-		if (p->pid)
+		if (p->pid) {
+			preempt_disable();
 			__get_cpu_var(process_counts)--;
+			preempt_enable();
+		}
 	}
 
 	REMOVE_LINKS(p);
@@ -372,8 +375,10 @@ static inline void close_files(struct fi
 		while (set) {
 			if (set & 1) {
 				struct file * file = xchg(&files->fd[i], NULL);
-				if (file)
+				if (file) {
 					filp_close(file, files);
+					cond_resched();
+				}
 			}
 			i++;
 			set >>= 1;
@@ -503,9 +508,11 @@ static inline void __exit_mm(struct task
 	if (mm != tsk->active_mm) BUG();
 	/* more a memory barrier than a real lock */
 	task_lock(tsk);
+	preempt_disable(); // FIXME
 	tsk->mm = NULL;
 	up_read(&mm->mmap_sem);
 	enter_lazy_tlb(mm, current);
+	preempt_enable();
 	task_unlock(tsk);
 	mmput(mm);
 }
@@ -776,10 +783,6 @@ static void exit_notify(struct task_stru
 	/* If the process is dead, release it - nobody will wait for it */
 	if (state == EXIT_DEAD)
 		release_task(tsk);
-
-	/* PF_DEAD causes final put_task_struct after we schedule. */
-	preempt_disable();
-	tsk->flags |= PF_DEAD;
 }
 
 fastcall NORET_TYPE void do_exit(long code)
@@ -838,12 +841,18 @@ fastcall NORET_TYPE void do_exit(long co
 	mpol_free(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 #endif
-
-	BUG_ON(!(current->flags & PF_DEAD));
-	schedule();
-	BUG();
-	/* Avoid "noreturn function does return".  */
-	for (;;) ;
+	check_no_held_locks(tsk);
+	/* PF_DEAD causes final put_task_struct after we schedule. */
+again:
+	local_irq_disable();
+	tsk->flags |= PF_DEAD;
+	__schedule();
+	printk(KERN_ERR "BUG: dead task %s:%d back from the grave!\n",
+		current->comm, current->pid);
+	printk(KERN_ERR ".... flags: %08lx, count: %d, state: %08lx\n",
+		current->flags, atomic_read(&current->usage), current->state);
+	printk(KERN_ERR ".... trying again ...\n");
+	goto again;
 }
 
 NORET_TYPE void complete_and_exit(struct completion *comp, long code)
@@ -867,8 +876,21 @@ task_t fastcall *next_thread(const task_
 	if (!p->sighand)
 		BUG();
 	if (!spin_is_locked(&p->sighand->siglock) &&
-				!rwlock_is_locked(&tasklist_lock))
+				!rwlock_is_locked(&tasklist_lock)) {
+#ifdef CONFIG_PREEMPT_RT
+#if 0
+		printk("hm #1, siglock: %d. tasklist_lock: %d.\n",
+			atomic_read(&p->sighand->siglock.lock.count),
+			tasklist_lock.lock.activity);
+		spin_lock(&tasklist_lock.lock.wait_lock);
+		spin_unlock(&tasklist_lock.lock.wait_lock);
+		printk("hm #2, siglock: %d. tasklist_lock: %d.\n",
+			atomic_read(&p->sighand->siglock.lock.count),
+			tasklist_lock.lock.activity);
+#endif
+#endif
 		BUG();
+	}
 #endif
 	return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
 }
@@ -1348,6 +1370,7 @@ repeat:
 		list_for_each(_p,&tsk->children) {
 			p = list_entry(_p,struct task_struct,sibling);
 
+			BUG_ON(!atomic_read(&p->usage));
 			ret = eligible_child(pid, options, p);
 			if (!ret)
 				continue;
--- linux/kernel/printk.c.orig
+++ linux/kernel/printk.c
@@ -78,7 +78,7 @@ static int console_locked;
  * It is also used in interesting ways to provide interlocking in
  * release_console_sem().
  */
-static DEFINE_SPINLOCK(logbuf_lock);
+static DEFINE_RAW_SPINLOCK(logbuf_lock);
 
 static char __log_buf[__LOG_BUF_LEN];
 static char *log_buf = __log_buf;
@@ -390,10 +390,12 @@ static void __call_console_drivers(unsig
 {
 	struct console *con;
 
+	touch_critical_timing();
 	for (con = console_drivers; con; con = con->next) {
 		if ((con->flags & CON_ENABLED) && con->write)
 			con->write(con, &LOG_BUF(start), end - start);
 	}
+	touch_critical_timing();
 }
 
 /*
@@ -497,6 +499,7 @@ static void zap_locks(void)
 	spin_lock_init(&logbuf_lock);
 	/* And make sure that we print immediately */
 	init_MUTEX(&console_sem);
+	zap_rt_locks();
 }
 
 /*
@@ -651,8 +654,17 @@ void release_console_sem(void)
 	}
 	console_locked = 0;
 	console_may_schedule = 0;
-	up(&console_sem);
 	spin_unlock_irqrestore(&logbuf_lock, flags);
+	up(&console_sem);
+	/*
+	 * On PREEMPT_RT kernels __wake_up may sleep, so wake syslogd
+	 * up only if we are in a preemptible section. We normally don't
+	 * printk from non-preemptible sections so this is for the emergency
+	 * case only.
+	 */
+#ifdef CONFIG_PREEMPT_RT
+	if (!in_atomic() && !irqs_disabled())
+#endif
 	if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait))
 		wake_up_interruptible(&log_wait);
 }
@@ -875,7 +887,7 @@ void tty_write_message(struct tty_struct
  */
 int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
 {
-	static DEFINE_SPINLOCK(ratelimit_lock);
+	static DEFINE_RAW_SPINLOCK(ratelimit_lock);
 	static unsigned long toks = 10*5*HZ;
 	static unsigned long last_msg;
 	static int missed;
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -4,6 +4,7 @@
  *  Kernel scheduler and related syscalls
  *
  *  Copyright (C) 1991-2002  Linus Torvalds
+ *  Copyright (C) 2004 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  *
  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
  *		make semaphores SMP safe
@@ -16,6 +17,7 @@
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
+ *  2004-10-13  Real-Time Preemption support by Ingo Molnar
  */
 
 #include <linux/mm.h>
@@ -48,6 +50,7 @@
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
+#include <linux/kallsyms.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -186,6 +189,7 @@ static unsigned int task_timeslice(task_
 typedef struct runqueue runqueue_t;
 
 struct prio_array {
+	runqueue_t *rq;
 	unsigned int nr_active;
 	unsigned long bitmap[BITMAP_SIZE];
 	struct list_head queue[MAX_PRIO];
@@ -199,7 +203,7 @@ struct prio_array {
  * acquire operations must be ordered by ascending &runqueue.
  */
 struct runqueue {
-	spinlock_t lock;
+	raw_spinlock_t lock;
 
 	/*
 	 * nr_running and cpu_load should be in the same cacheline because
@@ -207,6 +211,9 @@ struct runqueue {
 	 */
 	unsigned long nr_running;
 #ifdef CONFIG_SMP
+#ifdef CONFIG_PREEMPT_RT
+	unsigned long rt_nr_running;
+#endif
 	unsigned long cpu_load;
 #endif
 	unsigned long long nr_switches;
@@ -291,12 +298,18 @@ static DEFINE_PER_CPU(struct runqueue, r
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+#ifdef CONFIG_PREEMPT_RT
+# ifdef prepare_arch_switch
+#   error FIXME
+# endif
+#endif
+
 /*
  * Default context-switch locking:
  */
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(rq, next)	do { } while (0)
-# define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
+# define _finish_arch_switch(rq, next)	spin_unlock(&(rq)->lock)
 # define task_running(rq, p)		((rq)->curr == (p))
 #endif
 
@@ -563,6 +576,33 @@ static inline void sched_info_switch(tas
 #define sched_info_switch(t, next)	do { } while (0)
 #endif /* CONFIG_SCHEDSTATS */
 
+int rt_overload_schedule, rt_overload_wakeup, rt_overload_pulled;
+
+__cacheline_aligned_in_smp atomic_t rt_overload;
+
+static inline void inc_rt_tasks(task_t *p, runqueue_t *rq)
+{
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	if (!rt_task(p))
+		return;
+	/* the second queued RT task bumps the global rt_overload count: */
+	if (++rq->rt_nr_running == 2)
+		atomic_inc(&rt_overload);
+#endif
+}
+
+static inline void dec_rt_tasks(task_t *p, runqueue_t *rq)
+{
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	if (!rt_task(p))
+		return;
+	WARN_ON(!rq->rt_nr_running);
+	/* back down to one queued RT task: undo the rt_overload bump */
+	if (--rq->rt_nr_running == 1)
+		atomic_dec(&rt_overload);
+#endif
+}
+
 /*
  * Adding/removing a task to/from a priority array:
  */
@@ -572,15 +612,21 @@ static void dequeue_task(struct task_str
 	list_del(&p->run_list);
 	if (list_empty(array->queue + p->prio))
 		__clear_bit(p->prio, array->bitmap);
+	dec_rt_tasks(p, array->rq);
 }
 
 static void enqueue_task(struct task_struct *p, prio_array_t *array)
 {
+	if (p->flags & PF_DEAD) {
+		printk("BUG: %s/%d: dead task enqueued!\n", p->comm, p->pid);
+		dump_stack();
+	}
 	sched_info_queued(p);
 	list_add_tail(&p->run_list, array->queue + p->prio);
 	__set_bit(p->prio, array->bitmap);
 	array->nr_active++;
 	p->array = array;
+	inc_rt_tasks(p, array->rq);
 }
 
 /*
@@ -619,13 +665,11 @@ static inline void enqueue_task_head(str
  *
  * Both properties are important to certain workloads.
  */
-static int effective_prio(task_t *p)
+
+static inline int __effective_prio(task_t *p)
 {
 	int bonus, prio;
 
-	if (rt_task(p))
-		return p->prio;
-
 	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
 	prio = p->static_prio - bonus;
@@ -636,22 +680,52 @@ static int effective_prio(task_t *p)
 	return prio;
 }
 
+static int effective_prio(task_t *p)
+{
+	/* RT tasks keep their current prio; others get the bonus-based one: */
+	return rt_task(p) ? p->prio :
+			    __effective_prio(p);
+}
+
+static inline void trace_start_sched_wakeup(task_t *p, runqueue_t *rq)
+{
+	if (TASK_PREEMPTS_CURR(p, rq) && (p != rq->curr))	/* p would preempt rq->curr */
+		__trace_start_sched_wakeup(p);
+}
+
 /*
  * __activate_task - move a task to the runqueue.
  */
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
+	trace_special_pid(p->pid, p->prio, rq->nr_running);
 	enqueue_task(p, rq->active);
 	rq->nr_running++;
 }
 
 /*
+ * __activate_task_after - move a task to the runqueue,
+ *                         to execute after a specific task.
+ */
+static inline
+void __activate_task_after(task_t *p, task_t *parent, runqueue_t *rq)
+{
+	// FIXME: to head rather? (list_add_tail() links p just before parent)
+	list_add_tail(&p->run_list, &parent->run_list);
+	p->array = parent->array;	/* no bitmap update: p joins parent's queue */
+	p->array->nr_active++;
+	rq->nr_running++;
+	inc_rt_tasks(p, rq);	/* enqueue_task() is bypassed, so account here */
+}
+
+/*
  * __activate_idle_task - move idle task to the _front_ of runqueue.
  */
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task_head(p, rq->active);
 	rq->nr_running++;
+	WARN_ON(rt_task(p));
 }
 
 static void recalc_task_prio(task_t *p, unsigned long long now)
@@ -984,7 +1058,7 @@ static inline int wake_idle(int cpu, tas
  *
  * returns failure only if the task is already active.
  */
-static int try_to_wake_up(task_t * p, unsigned int state, int sync)
+static int try_to_wake_up(task_t * p, unsigned int state, int sync, int mutex)
 {
 	int cpu, this_cpu, success = 0;
 	unsigned long flags;
@@ -996,12 +1070,25 @@ static int try_to_wake_up(task_t * p, un
 	int new_cpu;
 #endif
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * sync wakeups can increase wakeup latencies:
+	 */
+	sync = 0;
+#endif
+
 	rq = task_rq_lock(p, &flags);
 	schedstat_inc(rq, ttwu_cnt);
 	old_state = p->state;
 	if (!(old_state & state))
 		goto out;
 
+	if (p->flags & PF_DEAD) {
+		printk("BUG: %s/%d: dead task woken up!\n", p->comm, p->pid);
+		dump_stack();
+		goto out;
+	}
+
 	if (p->array)
 		goto out_running;
 
@@ -1086,6 +1173,16 @@ out_set_cpu:
 
 		this_cpu = smp_processor_id();
 		cpu = task_cpu(p);
+	} else {
+		/*
+		 * If a newly woken up RT task cannot preempt the
+		 * current (RT) task then try to find another
+		 * CPU it can preempt:
+		 */
+		if (rt_task(p) && !TASK_PREEMPTS_CURR(p, rq)) {
+			smp_send_reschedule_allbutself();
+			rt_overload_wakeup++;
+		}
 	}
 
 out_activate:
@@ -1112,27 +1209,62 @@ out_activate:
 		if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);
 	}
+	trace_start_sched_wakeup(p, rq);
+	if (rq->curr && p && rq && _need_resched())
+		trace_special_pid(p->pid, p->prio, rq->curr->prio);
 	success = 1;
 
 out_running:
-	p->state = TASK_RUNNING;
+	if (mutex)
+		p->state = TASK_RUNNING_MUTEX;
+	else
+		p->state = TASK_RUNNING;
 out:
-	task_rq_unlock(rq, &flags);
+#ifdef PREEMPT_DIRECT
+	spin_unlock(&rq->lock);
+	/*
+	 * Common place where preemption is requested - if we can
+	 * reschedule then do it here without enabling interrupts
+	 * again (and lengthening latency):
+	 */
+	if (_need_resched() && !irqs_disabled_flags(flags) && !preempt_count())
+		preempt_schedule_irq();
+	local_irq_restore(flags);
+#else
+	spin_unlock_irqrestore(&rq->lock, flags);
+#endif
+	/* no need to check for preempt here - we just handled it */
 
 	return success;
 }
 
 int fastcall wake_up_process(task_t * p)
 {
-	return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
-				 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
+	int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
+				 TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE |
+				 TASK_UNINTERRUPTIBLE, 0, 0);
+	mcount();
+	return ret;
 }
 
 EXPORT_SYMBOL(wake_up_process);
 
+int fastcall wake_up_process_mutex(task_t * p)
+{
+	int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
+				 TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE |
+				 TASK_UNINTERRUPTIBLE, 0, 1);	/* mutex=1: p ends up TASK_RUNNING_MUTEX */
+	mcount();
+	return ret;
+}
+
+EXPORT_SYMBOL(wake_up_process_mutex);
+
 int fastcall wake_up_state(task_t *p, unsigned int state)
 {
-	return try_to_wake_up(p, state, 0);
+	int ret = try_to_wake_up(p, state | TASK_RUNNING_MUTEX, 0, 0);
+	mcount();
+	return ret;
 }
 
 #ifdef CONFIG_SMP
@@ -1239,15 +1371,16 @@ void fastcall wake_up_new_task(task_t * 
 				__activate_task(p, rq);
 			else {
 				p->prio = current->prio;
-				list_add_tail(&p->run_list, &current->run_list);
-				p->array = current->array;
-				p->array->nr_active++;
-				rq->nr_running++;
+				__activate_task_after(p, current, rq);
 			}
 			set_need_resched();
-		} else
+			trace_start_sched_wakeup(p, rq);
+		} else {
 			/* Run child last */
 			__activate_task(p, rq);
+			if (rt_task(p) && TASK_PREEMPTS_CURR(p, rq))
+				set_need_resched();
+		}
 		/*
 		 * We skip the following code due to cpu == this_cpu
 	 	 *
@@ -1326,13 +1459,14 @@ void fastcall sched_exit(task_t * p)
  * details.)
  */
 static void finish_task_switch(task_t *prev)
-	__releases(rq->lock)
+	__releases(this_rq->lock)
 {
-	runqueue_t *rq = this_rq();
-	struct mm_struct *mm = rq->prev_mm;
+	int this_cpu = smp_processor_id();
+	runqueue_t *this_rq = cpu_rq(this_cpu);
+	struct mm_struct *mm = this_rq->prev_mm;
 	unsigned long prev_task_flags;
 
-	rq->prev_mm = NULL;
+	this_rq->prev_mm = NULL;
 
 	/*
 	 * A task struct has one reference for the use as "current".
@@ -1346,11 +1480,28 @@ static void finish_task_switch(task_t *p
 	 *		Manfred Spraul <manfred@colorfullife.com>
 	 */
 	prev_task_flags = prev->flags;
-	finish_arch_switch(rq, prev);
+
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	/*
+	 * If we pushed an RT task off the runqueue,
+	 * then kick other CPUs, they might run it:
+	 */
+	if (unlikely(rt_task(current) && prev->array && rt_task(prev))) {
+		rt_overload_schedule++;
+		smp_send_reschedule_allbutself();
+	}
+#endif
+	_finish_arch_switch(this_rq, prev);
+
+	trace_stop_sched_switched(current);
+	/*
+	 * Delay the final freeing of the mm or task, so that we don't have
+	 * to do complex work from within the scheduler:
+	 */
 	if (mm)
-		mmdrop(mm);
+		mmdrop_delayed(mm);
 	if (unlikely(prev_task_flags & PF_DEAD))
-		put_task_struct(prev);
+		put_task_struct_delayed(prev);
 }
 
 /**
@@ -1360,7 +1511,11 @@ static void finish_task_switch(task_t *p
 asmlinkage void schedule_tail(task_t *prev)
 	__releases(rq->lock)
 {
+	preempt_disable(); // TODO: move this to fork setup
 	finish_task_switch(prev);
+	preempt_enable_no_resched();
+	local_irq_enable();
+	preempt_check_resched();
 
 	if (current->set_child_tid)
 		put_user(current->pid, current->set_child_tid);
@@ -1389,6 +1544,8 @@ task_t * context_switch(runqueue_t *rq, 
 		rq->prev_mm = oldmm;
 	}
 
+	trace_cmdline();
+
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
 
@@ -1429,6 +1586,21 @@ unsigned long nr_uninterruptible(void)
 	return sum;
 }
 
+unsigned long nr_uninterruptible_cpu(int cpu)
+{
+	return cpu_rq(cpu)->nr_uninterruptible;	/* read without taking rq->lock */
+}
+
+unsigned long rt_nr_running_cpu(int cpu)
+{
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	return cpu_rq(cpu)->rt_nr_running;	/* read without taking rq->lock */
+#else
+	return 0;	/* the counter only exists with PREEMPT_RT && SMP */
+#endif
+}
+
+
 unsigned long long nr_context_switches(void)
 {
 	unsigned long long i, sum = 0;
@@ -1510,6 +1682,100 @@ static void double_lock_balance(runqueue
 	}
 }
 
+#ifdef CONFIG_PREEMPT_RT
+
+static task_t * pick_rt_task(runqueue_t *src_rq, int this_cpu)
+{
+	struct list_head *head, *curr;
+	prio_array_t *array;
+	task_t *tmp;
+	int idx;
+
+	WARN_ON(!spin_is_locked(&src_rq->lock));	/* caller is expected to hold it */
+	/*
+	 * Look for an RT task that this_cpu could take over: only the
+	 * active array is searched, and it must hold at least 2 tasks:
+	 */
+	array = src_rq->active;
+	if (unlikely(array->nr_active < 2))
+		return NULL;
+
+	idx = sched_find_first_bit(array->bitmap);	/* first non-empty queue */
+next_in_bitmap:
+	/*
+	 * Only non-RT tasks available - abort the search:
+	 */
+	if (idx >= MAX_RT_PRIO)
+		return NULL;
+
+	head = array->queue + idx;
+	curr = head->next;
+next_in_queue:
+	tmp = list_entry(curr, task_t, run_list);
+	/*
+	 * Return the highest-prio non-running RT task (if task
+	 * may run on this CPU):
+	 */
+	if (!task_running(src_rq, tmp) &&
+				cpu_isset(this_cpu, tmp->cpus_allowed))
+		return tmp;
+
+	curr = curr->next;
+	if (curr != head)
+		goto next_in_queue;
+
+	idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1);	/* next RT queue, if any */
+	goto next_in_bitmap;
+}
+
+/*
+ * Pull RT tasks from other CPUs in the RT-overload
+ * case. Interrupts are disabled, local rq is locked.
+ */
+static void pull_rt_tasks(runqueue_t *this_rq, int this_cpu)
+{
+	runqueue_t *src_rq;
+	task_t *p;
+	int cpu;
+
+	WARN_ON(!irqs_disabled());
+
+	for_each_online_cpu(cpu) {
+		if (cpu == this_cpu)
+			continue;
+		src_rq = cpu_rq(cpu);
+		if (src_rq->rt_nr_running <= 1)	/* peeked before src_rq is locked */
+			continue;
+
+		double_lock_balance(this_rq, src_rq);
+
+		p = pick_rt_task(src_rq, this_cpu);
+
+		if (p /* && TASK_PREEMPTS_CURR(p, this_rq) */ ) {
+			WARN_ON(p == src_rq->curr);
+			WARN_ON(!p->array);
+			rt_overload_pulled++;
+
+			set_task_cpu(p, this_cpu);
+
+			p->timestamp = p->timestamp -
+				src_rq->timestamp_last_tick
+				+ this_rq->timestamp_last_tick;	/* rebase onto this rq's tick stamp */
+			deactivate_task(p, src_rq);
+			activate_task(p, this_rq, 0);
+			/*
+			 * We continue with the search, just in
+			 * case there's an even higher prio task
+			 * in another runqueue.
+			 */
+		}
+		spin_unlock(&src_rq->lock);	/* only the remote rq is released here */
+	}
+}
+
+#endif
+
+
 /*
  * find_idlest_cpu - find the least busy runqueue.
  */
@@ -2271,7 +2537,9 @@ static inline void account_it_virt(struc
 	    cputime_gt(cputime, cputime_zero)) {
 		if (cputime_ge(cputime, it_virt)) {
 			it_virt = cputime_add(it_virt, p->it_virt_incr);
+#if 0
 			send_sig(SIGVTALRM, p, 1);
+#endif
 		}
 		it_virt = cputime_sub(it_virt, cputime);
 		p->it_virt_value = it_virt;
@@ -2291,7 +2559,9 @@ static void account_it_prof(struct task_
 	    cputime_gt(cputime, cputime_zero)) {
 		if (cputime_ge(cputime, it_prof)) {
 			it_prof = cputime_add(it_prof, p->it_prof_incr);
+#if 0
 			send_sig(SIGPROF, p, 1);
+#endif
 		}
 		it_prof = cputime_sub(it_prof, cputime);
 		p->it_prof_value = it_prof;
@@ -2313,12 +2583,18 @@ static void check_rlimit(struct task_str
 	if (unlikely(cputime_gt(total, tmp))) {
 		/* Send SIGXCPU every second. */
 		tmp = cputime_sub(total, cputime);
-		if (cputime_to_secs(tmp) < cputime_to_secs(total))
+		if (cputime_to_secs(tmp) < cputime_to_secs(total)) {
+#if 0
 			send_sig(SIGXCPU, p, 1);
+#endif
+		}
 		/* and SIGKILL when we go over max.. */
 		tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_max);
-		if (cputime_gt(total, tmp))
+		if (cputime_gt(total, tmp)) {
+#if 0
 			send_sig(SIGKILL, p, 1);
+#endif
+		}
 	}
 }
 
@@ -2413,6 +2689,8 @@ void scheduler_tick(void)
 	runqueue_t *rq = this_rq();
 	task_t *p = current;
 
+	BUG_ON(!irqs_disabled());
+
 	rq->timestamp_last_tick = sched_clock();
 
 	if (p == rq->idle) {
@@ -2434,6 +2712,8 @@ void scheduler_tick(void)
 	 * priority until it either goes to sleep or uses up its
 	 * timeslice. This makes it possible for interactive tasks
 	 * to use up their timeslices at their highest priority levels.
+	 *
+	 * Priority-boosted SCHED_NORMAL tasks may go here too.
 	 */
 	if (rt_task(p)) {
 		/*
@@ -2622,42 +2902,51 @@ static inline int dependent_sleeper(int 
 }
 #endif
 
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_LATENCY_TRACE) && defined(CONFIG_RT_DEADLOCK_DETECT)
 
-void fastcall add_preempt_count(int val)
+static void trace_array(prio_array_t *array)
 {
-	/*
-	 * Underflow?
-	 */
-	BUG_ON(((int)preempt_count() < 0));
-	preempt_count() += val;
-	/*
-	 * Spinlock count overflowing soon?
-	 */
-	BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
+	int i;
+	task_t *p;
+	struct list_head *head, *tmp;
+
+	for (i = 0; i < MAX_PRIO; i++) {
+		head = array->queue + i;
+		if (list_empty(head)) {
+			WARN_ON(test_bit(i, array->bitmap));
+			continue;
+		}
+		WARN_ON(!test_bit(i, array->bitmap));
+		list_for_each(tmp, head) {
+			p = list_entry(tmp, task_t, run_list);
+			trace_special_pid(p->pid, p->prio,
+				p->policy == SCHED_NORMAL ?
+					p->static_prio :
+					MAX_USER_RT_PRIO - p->rt_priority);
+		}
+	}
 }
-EXPORT_SYMBOL(add_preempt_count);
 
-void fastcall sub_preempt_count(int val)
+static inline void trace_all_runnable_tasks(runqueue_t *rq)
+{
+	if (trace_enabled) {
+		trace_array(rq->active);
+		trace_array(rq->expired);
+	}
+}
+
+#else
+
+static inline void trace_all_runnable_tasks(runqueue_t *rq)
 {
-	/*
-	 * Underflow?
-	 */
-	BUG_ON(val > preempt_count());
-	/*
-	 * Is the spinlock portion underflowing?
-	 */
-	BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
-	preempt_count() -= val;
 }
-EXPORT_SYMBOL(sub_preempt_count);
 
 #endif
 
 /*
- * schedule() is the main scheduler function.
+ * __schedule() is the main scheduler function.
  */
-asmlinkage void __sched schedule(void)
+void __sched __schedule(void)
 {
 	long *switch_count;
 	task_t *prev, *next;
@@ -2668,26 +2957,24 @@ asmlinkage void __sched schedule(void)
 	unsigned long run_time;
 	int cpu, idx;
 
+	WARN_ON(system_state == SYSTEM_BOOTING);
 	/*
-	 * Test if we are atomic.  Since do_exit() needs to call into
-	 * schedule() atomically, we ignore that path for now.
-	 * Otherwise, whine if we are scheduling when we should not be.
-	 */
-	if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) {
-		if (unlikely(in_atomic())) {
-			printk(KERN_ERR "scheduling while atomic: "
-				"%s/0x%08x/%d\n",
-				current->comm, preempt_count(), current->pid);
-			dump_stack();
-		}
+	 * Test if we are atomic.
+	 */
+	if (unlikely(in_atomic())) {
+		stop_trace();
+		printk(KERN_ERR "BUG: scheduling while atomic: "
+			"%s/0x%08x/%d\n",
+			current->comm, preempt_count(), current->pid);
+		print_symbol("caller is %s\n",
+			(long)__builtin_return_address(0));
+		dump_stack();
 	}
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 
-need_resched:
-	preempt_disable();
+	preempt_disable(); // FIXME: disable irqs here
 	prev = current;
 	release_kernel_lock(prev);
-need_resched_nonpreemptible:
 	rq = this_rq();
 
 	/*
@@ -2695,7 +2982,7 @@ need_resched_nonpreemptible:
 	 * Remove this check after it has been exercised a bit.
 	 */
 	if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
-		printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+		printk(KERN_ERR "BUG: scheduling from the idle thread!\n");
 		dump_stack();
 	}
 
@@ -2712,16 +2999,17 @@ need_resched_nonpreemptible:
 	 */
 	run_time /= (CURRENT_BONUS(prev) ? : 1);
 
+	cpu = smp_processor_id();
 	spin_lock_irq(&rq->lock);
 
-	if (unlikely(prev->flags & PF_DEAD))
-		prev->state = EXIT_DEAD;
 	/*
 	 * if entering off of a kernel preemption go straight
 	 * to picking the next task.
 	 */
-	switch_count = &prev->nivcsw;
-	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+	switch_count = &prev->nvcsw; // TODO: temporary - to see it in vmstat
+
+	if ((prev->state & ~TASK_RUNNING_MUTEX) &&
+				!(preempt_count() & PREEMPT_ACTIVE)) {
 		switch_count = &prev->nvcsw;
 		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
 				unlikely(signal_pending(prev))))
@@ -2732,8 +3020,23 @@ need_resched_nonpreemptible:
 			deactivate_task(prev, rq);
 		}
 	}
+	if (preempt_count() & PREEMPT_ACTIVE)
+		sub_preempt_count(PREEMPT_ACTIVE);
+	if (unlikely(prev->flags & PF_DEAD)) {
+		if (prev->state != TASK_RUNNING) {
+			printk("prev->state: %ld != TASK_RUNNING??\n",
+				prev->state);
+			WARN_ON(1);
+		} else
+			deactivate_task(prev, rq);
+		prev->state = EXIT_DEAD;
+	}
+
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	if (unlikely(atomic_read(&rt_overload)))
+		pull_rt_tasks(rq, cpu);
+#endif
 
-	cpu = smp_processor_id();
 	if (unlikely(!rq->nr_running)) {
 go_idle:
 		idle_balance(cpu, rq);
@@ -2805,6 +3108,8 @@ switch_tasks:
 		prev->sleep_avg = 0;
 	prev->timestamp = prev->last_ran = now;
 
+	trace_all_runnable_tasks(rq);
+
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
 		next->timestamp = now;
@@ -2815,22 +3120,79 @@ switch_tasks:
 		prepare_arch_switch(rq, next);
 		prev = context_switch(rq, prev, next);
 		barrier();
-
+		if (prev && current)
+			trace_special_pid(prev->pid, prev->prio, current->prio);
 		finish_task_switch(prev);
-	} else
-		spin_unlock_irq(&rq->lock);
+		preempt_enable_no_resched();
+	} else {
+		trace_stop_sched_switched(next);
+		preempt_enable_no_resched();
+		spin_unlock(&rq->lock);
+	}
 
-	prev = current;
-	if (unlikely(reacquire_kernel_lock(prev) < 0))
-		goto need_resched_nonpreemptible;
-	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
-		goto need_resched;
+	reacquire_kernel_lock(current);
+}
+
+/*
+ * schedule() - sanity-check the calling context, then run __schedule() until no reschedule is pending.
+ */
+asmlinkage void __sched schedule(void)
+{
+	WARN_ON(system_state == SYSTEM_BOOTING);
+	/*
+	 * Entering with interrupts disabled is reported as a BUG (with a
+	 * stack dump), but scheduling still proceeds.
+	 */
+	if (unlikely(irqs_disabled())) {
+		stop_trace();
+		printk(KERN_ERR "BUG: scheduling with irqs disabled: "
+			"%s/0x%08x/%d\n",
+				current->comm, preempt_count(), current->pid);
+		print_symbol("caller is %s\n",
+			(long)__builtin_return_address(0));
+		dump_stack();
+	}
+	if (unlikely(current->flags & PF_NOSCHED)) {
+		current->flags &= ~PF_NOSCHED; /* one-shot: cleared before it is reported */
+		printk(KERN_ERR "%s:%d userspace BUG: scheduling in user-atomic context!\n", current->comm, current->pid);
+		dump_stack();
+		send_sig(SIGUSR2, current, 1);
+	}
+	do { /* repeat while TIF_NEED_RESCHED is still set after a pass */
+		__schedule();
+	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+	local_irq_enable(); // TODO: do sti; ret
 }
 
 EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_PREEMPT
+
+int kernel_preemption = 1; /* 0: preempt_schedule() and preempt_schedule_irq() return at once */
+/* Handler for the "preempt=" boot option: accepts "off", "on", or a value handed to get_option(). */
+static int __init preempt_setup (char *str)
+{
+	if (!strncmp(str, "off", 3)) {
+		if (kernel_preemption) {
+			printk("turning off kernel preemption!\n");
+			kernel_preemption = 0;
+		}
+		return 1;
+	}
+	if (!strncmp(str, "on", 2)) {
+		if (!kernel_preemption) {
+			printk("turning on kernel preemption!\n");
+			kernel_preemption = 1;
+		}
+		return 1;
+	}
+	get_option(&str, &kernel_preemption); /* neither keyword matched: let get_option() fill in the value */
+
+	return 1;
+}
+
+__setup("preempt=", preempt_setup);
+
+
 /*
  * this is is the entry point to schedule() from in-kernel preemption
  * off of preempt_enable.  Kernel preemptions off return from interrupt
@@ -2843,6 +3205,9 @@ asmlinkage void __sched preempt_schedule
 	struct task_struct *task = current;
 	int saved_lock_depth;
 #endif
+
+	if (!kernel_preemption)
+		return;
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task.  Just return..
@@ -2851,6 +3216,7 @@ asmlinkage void __sched preempt_schedule
 		return;
 
 need_resched:
+	local_irq_disable();
 	add_preempt_count(PREEMPT_ACTIVE);
 	/*
 	 * We keep the big kernel semaphore locked, but we
@@ -2861,25 +3227,72 @@ need_resched:
 	saved_lock_depth = task->lock_depth;
 	task->lock_depth = -1;
 #endif
-	schedule();
+	__schedule();
 #ifdef CONFIG_PREEMPT_BKL
 	task->lock_depth = saved_lock_depth;
 #endif
-	sub_preempt_count(PREEMPT_ACTIVE);
 
 	/* we could miss a preemption opportunity between schedule and now */
 	barrier();
 	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
 		goto need_resched;
+	local_irq_enable();
 }
 
 EXPORT_SYMBOL(preempt_schedule);
+
+/*
+ * This is the entry point for the IRQ return path. Called with
+ * interrupts disabled.  To avoid infinite irq-entry recursion problems
+ * with fast-paced IRQ sources we do all of this carefully to never
+ * enable interrupts again.
+ */
+asmlinkage void __sched preempt_schedule_irq(void)
+{
+	struct thread_info *ti = current_thread_info();
+#ifdef CONFIG_PREEMPT_BKL
+	struct task_struct *task = current;
+	int saved_lock_depth;
+#endif
+
+	if (!kernel_preemption)
+		return;
+	/*
+	 * If there is a non-zero preempt_count then just return.
+	 * (interrupts are disabled)
+	 */
+	if (unlikely(ti->preempt_count))
+		return;
+
+need_resched:
+	add_preempt_count(PREEMPT_ACTIVE); /* dropped again inside __schedule() */
+	/*
+	 * We keep the big kernel semaphore locked, but we
+	 * clear ->lock_depth so that __schedule() doesn't
+	 * auto-release the semaphore:
+	 */
+#ifdef CONFIG_PREEMPT_BKL
+	saved_lock_depth = task->lock_depth;
+	task->lock_depth = -1;
+#endif
+	__schedule();
+	local_irq_disable(); /* re-disable irqs as soon as __schedule() returns */
+#ifdef CONFIG_PREEMPT_BKL
+	task->lock_depth = saved_lock_depth;
+#endif
+
+	/* we could miss a preemption opportunity between schedule and now */
+	barrier();
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+		goto need_resched;
+}
+
 #endif /* CONFIG_PREEMPT */
 
 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
 {
 	task_t *p = curr->task;
-	return try_to_wake_up(p, mode, sync);
+	return try_to_wake_up(p, mode | TASK_RUNNING_MUTEX, sync, 0);
 }
 
 EXPORT_SYMBOL(default_wake_function);
@@ -2990,6 +3403,13 @@ void fastcall complete_all(struct comple
 }
 EXPORT_SYMBOL(complete_all);
 
+unsigned int fastcall completion_done(struct completion *x)
+{
+	return x->done; /* snapshot of the done counter, read without taking x->wait.lock */
+}
+EXPORT_SYMBOL(completion_done);
+
+
 void fastcall __sched wait_for_completion(struct completion *x)
 {
 	might_sleep();
@@ -3012,6 +3432,129 @@ void fastcall __sched wait_for_completio
 }
 EXPORT_SYMBOL(wait_for_completion);
 
+/*
+ * Wait for @x to complete, giving up once schedule_timeout() reports
+ * that @timeout has run out.
+ *
+ * Returns 0 if the timeout expired, otherwise the remaining timeout.
+ */
+unsigned long fastcall __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+	might_sleep();
+
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		wait.flags |= WQ_FLAG_EXCLUSIVE;
+		__add_wait_queue_tail(&x->wait, &wait);
+		do {
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			spin_unlock_irq(&x->wait.lock);
+			timeout = schedule_timeout(timeout);
+			spin_lock_irq(&x->wait.lock);
+			if (!timeout) {
+				/* 'wait' is on our stack: unlink before returning */
+				__remove_wait_queue(&x->wait, &wait);
+				goto out;
+			}
+		} while (!x->done);
+		__remove_wait_queue(&x->wait, &wait);
+	}
+	x->done--;
+out:
+	spin_unlock_irq(&x->wait.lock);
+	return timeout;
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+/*
+ * Wait for @x to complete, or until a signal is pending for current.
+ *
+ * Returns 0 once a completion has been consumed, -ERESTARTSYS if
+ * interrupted by a signal.
+ */
+int fastcall __sched wait_for_completion_interruptible(struct completion *x)
+{
+	int ret = 0;
+
+	might_sleep();
+
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		wait.flags |= WQ_FLAG_EXCLUSIVE;
+		__add_wait_queue_tail(&x->wait, &wait);
+		do {
+			if (signal_pending(current)) {
+				ret = -ERESTARTSYS;
+				/* 'wait' is on our stack: unlink before returning */
+				__remove_wait_queue(&x->wait, &wait);
+				goto out;
+			}
+			__set_current_state(TASK_INTERRUPTIBLE);
+			spin_unlock_irq(&x->wait.lock);
+			schedule();
+			spin_lock_irq(&x->wait.lock);
+		} while (!x->done);
+		__remove_wait_queue(&x->wait, &wait);
+	}
+	x->done--;
+out:
+	spin_unlock_irq(&x->wait.lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+/*
+ * Wait for @x to complete, until a signal is pending for current, or
+ * until @timeout runs out.
+ *
+ * Returns 0 on timeout, -ERESTARTSYS (converted to unsigned long) if
+ * interrupted by a signal, otherwise the remaining timeout.
+ */
+unsigned long fastcall __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+					  unsigned long timeout)
+{
+	might_sleep();
+
+	spin_lock_irq(&x->wait.lock);
+	if (!x->done) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		wait.flags |= WQ_FLAG_EXCLUSIVE;
+		__add_wait_queue_tail(&x->wait, &wait);
+		do {
+			if (signal_pending(current)) {
+				timeout = -ERESTARTSYS;
+				/* 'wait' is on our stack: unlink before returning */
+				__remove_wait_queue(&x->wait, &wait);
+				goto out;
+			}
+			__set_current_state(TASK_INTERRUPTIBLE);
+			spin_unlock_irq(&x->wait.lock);
+			timeout = schedule_timeout(timeout);
+			spin_lock_irq(&x->wait.lock);
+			if (!timeout) {
+				/* timed out: unlink the on-stack entry as well */
+				__remove_wait_queue(&x->wait, &wait);
+				goto out;
+			}
+		} while (!x->done);
+		__remove_wait_queue(&x->wait, &wait);
+	}
+	x->done--;
+out:
+	spin_unlock_irq(&x->wait.lock);
+	return timeout;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+
 #define	SLEEP_ON_VAR					\
 	unsigned long flags;				\
 	wait_queue_t wait;				\
@@ -3237,6 +3752,65 @@ static void __setscheduler(struct task_s
 		p->prio = p->static_prio;
 }
 
+int mutex_getprio(task_t *p)
+{
+	int prio; /* computed and returned only: p->prio is read for tracing, never written here */
+
+	if (p->policy != SCHED_NORMAL)
+		prio = MAX_USER_RT_PRIO-1 - p->rt_priority; /* larger rt_priority gives a numerically smaller prio */
+	else
+		prio = __effective_prio(p);
+	trace_special_pid(p->pid, p->prio, prio);
+	return prio;
+}
+
+/*
+ * Set p->prio to @prio under the runqueue lock, requeueing p if it is
+ * queued; used by the PREEMPT_RT priority inheritance logic:
+ */
+void mutex_setprio(task_t *p, int prio)
+{
+	unsigned long flags;
+	prio_array_t *array;
+	runqueue_t *rq;
+	int oldprio, prev_resched;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = task_rq_lock(p, &flags);
+
+	oldprio = p->prio;
+	array = p->array; /* NULL: p is not queued, so only p->prio is updated */
+	if (array)
+		dequeue_task(p, array);
+	p->prio = prio;
+
+	trace_special_pid(p->pid, oldprio, prio);
+	prev_resched = _need_resched(); /* sampled only for the trace_special() call below */
+	if (array) {
+		/*
+		 * If changing to an RT priority then queue it
+		 * in the active array!
+		 */
+		if (rt_task(p))
+			array = rq->active;
+		enqueue_task(p, array);
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running on
+		 * this runqueue and our priority is higher than the current's
+		 */
+		if (task_running(rq, p)) {
+			if (p->prio > oldprio) /* numerically larger prio means lower priority */
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+	trace_special(prev_resched, _need_resched(), 0);
+
+	task_rq_unlock(rq, &flags);
+}
+
 /*
  * setscheduler - change the scheduling policy and/or RT priority of a thread.
  */
@@ -3602,21 +4176,28 @@ asmlinkage long sys_sched_yield(void)
 	 * no need to preempt or enable interrupts:
 	 */
 	__release(rq->lock);
-	_raw_spin_unlock(&rq->lock);
+	__raw_spin_unlock(&rq->lock);
 	preempt_enable_no_resched();
 
-	schedule();
+	__schedule();
+	local_irq_enable();
+	preempt_check_resched();
 
 	return 0;
 }
 
-static inline void __cond_resched(void)
+static void __cond_resched(void)
 {
+	if (system_state == SYSTEM_BOOTING || !current->pid)
+		return;
+	if (preempt_count() & PREEMPT_ACTIVE)
+		return;
 	do {
+		local_irq_disable();
 		add_preempt_count(PREEMPT_ACTIVE);
-		schedule();
-		sub_preempt_count(PREEMPT_ACTIVE);
+		__schedule();
 	} while (need_resched());
+	local_irq_enable();
 }
 
 int __sched cond_resched(void)
@@ -3638,7 +4219,7 @@ EXPORT_SYMBOL(cond_resched);
  * operations here to prevent schedule() from being called twice (once via
  * spin_unlock(), once by hand).
  */
-int cond_resched_lock(spinlock_t * lock)
+int __cond_resched_raw_spinlock(raw_spinlock_t *lock)
 {
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
 	if (lock->break_lock) {
@@ -3649,7 +4230,7 @@ int cond_resched_lock(spinlock_t * lock)
 	}
 #endif
 	if (need_resched()) {
-		_raw_spin_unlock(lock);
+		__raw_spin_unlock(lock);
 		preempt_enable_no_resched();
 		__cond_resched();
 		spin_lock(lock);
@@ -3658,23 +4239,104 @@ int cond_resched_lock(spinlock_t * lock)
 	return 0;
 }
 
-EXPORT_SYMBOL(cond_resched_lock);
+EXPORT_SYMBOL(__cond_resched_raw_spinlock);
+
+#ifdef CONFIG_PREEMPT_RT
 
+int __cond_resched_spinlock(spinlock_t *lock)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+	if (lock->break_lock) { /* yields only when break_lock is set; need_resched() is not consulted */
+		lock->break_lock = 0;
+		_spin_unlock(lock);
+		__cond_resched();
+		_spin_lock(lock);
+	}
+#endif
+	return 0; /* NOTE(review): 0 even after dropping the lock and rescheduling above - confirm callers ignore it */
+}
+
+EXPORT_SYMBOL(__cond_resched_spinlock);
+
+#endif
+
+
+/*
+ * Preempt a softirq context if necessary:
+ */
 int __sched cond_resched_softirq(void)
 {
+#ifndef CONFIG_PREEMPT_RT
 	BUG_ON(!in_softirq());
 
-	if (need_resched()) {
+	if (softirq_need_resched()) {
 		__local_bh_enable();
 		__cond_resched();
 		local_bh_disable();
 		return 1;
 	}
+#endif
 	return 0;
 }
 
 EXPORT_SYMBOL(cond_resched_softirq);
 
+/*
+ * Preempt a hardirq context if necessary; returns 1 if __cond_resched() was called, else 0:
+ */
+int cond_resched_hardirq(void)
+{
+	unsigned long flags;
+
+	BUG_ON(!in_irq());
+	if (hardirq_need_resched()) {
+		local_save_flags(flags);
+		irq_exit(); /* step out of hardirq context around the reschedule */
+		__cond_resched();
+		local_irq_restore(flags); /* __cond_resched() toggles the irq state; put it back */
+		irq_enter();
+		return 1;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(cond_resched_hardirq);
+
+/*
+ * Preempt any context: dispatch on hardirq_count()/softirq_count() to the matching helper.
+ */
+int cond_resched_all(void)
+{
+	if (hardirq_count())
+		return cond_resched_hardirq();
+	if (softirq_count())
+		return cond_resched_softirq();
+	return cond_resched(); /* neither count is set */
+}
+
+EXPORT_SYMBOL(cond_resched_all);
+
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+
+int voluntary_preemption = 1;
+
+EXPORT_SYMBOL(voluntary_preemption);
+/* Handler for the "voluntary-preempt=" boot option: "off", or a value handed to get_option(). */
+static int __init voluntary_preempt_setup (char *str)
+{
+	if (!strncmp(str, "off", 3))
+		voluntary_preemption = 0;
+	else
+		get_option(&str, &voluntary_preemption);
+	if (!voluntary_preemption) /* reached for "off" and for a value that leaves it 0 */
+		printk("turning off voluntary preemption!\n");
+
+	return 1;
+}
+
+__setup("voluntary-preempt=", voluntary_preempt_setup);
+
+#endif
 
 /**
  * yield - yield the current processor to other threads.
@@ -3828,23 +4490,27 @@ static void show_task(task_t * p)
 	unsigned long free = 0;
 	static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
 
-	printk("%-13.13s ", p->comm);
+	printk("%-13.13s [%p]", p->comm, p);
 	state = p->state ? __ffs(p->state) + 1 : 0;
 	if (state < ARRAY_SIZE(stat_nam))
 		printk(stat_nam[state]);
 	else
 		printk("?");
 #if (BITS_PER_LONG == 32)
-	if (state == TASK_RUNNING)
+	if (0 && (state == TASK_RUNNING))
 		printk(" running ");
 	else
 		printk(" %08lX ", thread_saved_pc(p));
 #else
-	if (state == TASK_RUNNING)
+	if (0 && (state == TASK_RUNNING))
 		printk("  running task   ");
 	else
 		printk(" %016lx ", thread_saved_pc(p));
 #endif
+	if (task_curr(p))
+		printk("[curr] ");
+	else if (p->array)
+		printk("[on rq] ");
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	{
 		unsigned long * n = (unsigned long *) (p->thread_info+1);
@@ -3878,6 +4544,7 @@ static void show_task(task_t * p)
 void show_state(void)
 {
 	task_t *g, *p;
+	int do_unlock = 1;
 
 #if (BITS_PER_LONG == 32)
 	printk("\n"
@@ -3888,7 +4555,16 @@ void show_state(void)
 	       "                                                       sibling\n");
 	printk("  task                 PC          pid father child younger older\n");
 #endif
+#ifdef CONFIG_PREEMPT_RT
+	if (!read_trylock(&tasklist_lock)) {
+		printk("hm, tasklist_lock write-locked.\n");
+		printk("ignoring ...\n");
+		do_unlock = 0;
+	}
+#else
 	read_lock(&tasklist_lock);
+#endif
+
 	do_each_thread(g, p) {
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
@@ -3898,7 +4574,9 @@ void show_state(void)
 		show_task(p);
 	} while_each_thread(g, p);
 
-	read_unlock(&tasklist_lock);
+	if (do_unlock)
+		read_unlock(&tasklist_lock);
+	show_all_locks();
 }
 
 void __devinit init_idle(task_t *idle, int cpu)
@@ -3918,7 +4596,9 @@ void __devinit init_idle(task_t *idle, i
 	spin_unlock_irqrestore(&rq->lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
-#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
+#if defined(CONFIG_PREEMPT) && \
+	!defined(CONFIG_PREEMPT_BKL) && \
+		!defined(CONFIG_PREEMPT_RT)
 	idle->thread_info->preempt_count = (idle->lock_depth >= 0);
 #else
 	idle->thread_info->preempt_count = 0;
@@ -4004,12 +4684,13 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
  */
-static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
 	runqueue_t *rq_dest, *rq_src;
+	int ret = 0;
 
 	if (unlikely(cpu_is_offline(dest_cpu)))
-		return;
+		return 0;
 
 	rq_src = cpu_rq(src_cpu);
 	rq_dest = cpu_rq(dest_cpu);
@@ -4022,7 +4703,9 @@ static void __migrate_task(struct task_s
 	if (!cpu_isset(dest_cpu, p->cpus_allowed))
 		goto out;
 
+	WARN_ON(p == rq_src->curr);
 	set_task_cpu(p, dest_cpu);
+
 	if (p->array) {
 		/*
 		 * Sync timestamp with rq_dest's before activating.
@@ -4036,10 +4719,13 @@ static void __migrate_task(struct task_s
 		activate_task(p, rq_dest, 0);
 		if (TASK_PREEMPTS_CURR(p, rq_dest))
 			resched_task(rq_dest->curr);
+		ret = 1;
 	}
 
 out:
 	double_rq_unlock(rq_src, rq_dest);
+
+	return ret;
 }
 
 /*
@@ -4820,6 +5506,7 @@ void __init sched_init(void)
 
 		for (j = 0; j < 2; j++) {
 			array = rq->arrays + j;
+			array->rq = rq;
 			for (k = 0; k < MAX_PRIO; k++) {
 				INIT_LIST_HEAD(array->queue + k);
 				__clear_bit(k, array->bitmap);
@@ -4835,6 +5522,9 @@ void __init sched_init(void)
 	atomic_inc(&init_mm.mm_count);
 	enter_lazy_tlb(&init_mm, current);
 
+#ifdef CONFIG_PREEMPT_RT
+	printk("Real-Time Preemption Support (c) Ingo Molnar\n");
+#endif
 	/*
 	 * Make us the idle thread. Technically, schedule() should not be
 	 * called from this thread, however somewhere below it might be,
@@ -4844,7 +5534,7 @@ void __init sched_init(void)
 	init_idle(current, smp_processor_id());
 }
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT)
 void __might_sleep(char *file, int line)
 {
 #if defined(in_atomic)
@@ -4852,13 +5542,17 @@ void __might_sleep(char *file, int line)
 
 	if ((in_atomic() || irqs_disabled()) &&
 	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
+		if (debug_direct_keyboard && hardirq_count())
+			return;
 		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 			return;
 		prev_jiffy = jiffies;
-		printk(KERN_ERR "Debug: sleeping function called from invalid"
-				" context at %s:%d\n", file, line);
-		printk("in_atomic():%d, irqs_disabled():%d\n",
-			in_atomic(), irqs_disabled());
+		stop_trace();
+		printk(KERN_ERR "BUG: sleeping function called from invalid"
+				" context %s(%d) at %s:%d\n",
+				current->comm, current->pid, file, line);
+		printk("in_atomic():%d [%08x], irqs_disabled():%d\n",
+			in_atomic(), preempt_count(), irqs_disabled());
 		dump_stack();
 	}
 #endif
--- linux/kernel/fork.c.orig
+++ linux/kernel/fork.c
@@ -41,6 +41,8 @@
 #include <linux/profile.h>
 #include <linux/rmap.h>
 #include <linux/acct.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -63,6 +65,16 @@ DEFINE_PER_CPU(unsigned long, process_co
 
 EXPORT_SYMBOL(tasklist_lock);
 
+/*
+ * Delayed mmdrop/put_task_struct. In the PREEMPT_RT case we
+ * don't want to do this from the scheduling context.
+ */
+static DEFINE_PER_CPU(struct task_struct *, desched_task);
+
+static DEFINE_PER_CPU(struct list_head, delayed_put_list);
+static DEFINE_PER_CPU(struct list_head, delayed_drop_list);
+
+
 int nr_processes(void)
 {
 	int cpu;
@@ -89,6 +101,8 @@ EXPORT_SYMBOL(free_task);
 
 void __put_task_struct(struct task_struct *tsk)
 {
+	BUG_ON(atomic_read(&tsk->usage));
+	WARN_ON(!(tsk->flags & PF_DEAD));
 	WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
@@ -103,8 +117,29 @@ void __put_task_struct(struct task_struc
 		free_task(tsk);
 }
 
+void put_task_struct(struct task_struct *tsk)
+{
+	BUG_ON(!atomic_read(&tsk->usage)); /* dropping a reference when the count is already 0 is a bug */
+	/* Drop one reference; only when atomic_dec_and_test() succeeds do we go on to __put_task_struct(). */
+	if (!atomic_dec_and_test(&tsk->usage))
+		return;
+	__put_task_struct(tsk);
+}
+
+EXPORT_SYMBOL(put_task_struct);
+
+void get_task_struct(struct task_struct *tsk)
+{
+	BUG_ON(!atomic_read(&tsk->usage)); /* taking a reference on a task whose count is 0 is a bug */
+	atomic_inc(&tsk->usage);
+}
+
+EXPORT_SYMBOL(get_task_struct);
+
 void __init fork_init(unsigned long mempages)
 {
+	int i;
+
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
@@ -130,6 +165,11 @@ void __init fork_init(unsigned long memp
 
 	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
 	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i));
+		INIT_LIST_HEAD(&per_cpu(delayed_put_list, i));
+	}
 }
 
 static struct task_struct *dup_task_struct(struct task_struct *orig)
@@ -305,6 +345,7 @@ static struct mm_struct * mm_init(struct
 	rwlock_init(&mm->ioctx_list_lock);
 	mm->ioctx_list = NULL;
 	mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
+	INIT_LIST_HEAD(&mm->delayed_drop);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 
 	if (likely(!mm_alloc_pgd(mm))) {
@@ -905,6 +946,9 @@ static task_t *copy_process(unsigned lon
  		goto bad_fork_cleanup;
  	}
 #endif
+	INIT_LIST_HEAD(&p->delayed_put);
+	INIT_LIST_HEAD(&p->pi_waiters);
+	p->blocked_on = NULL; /* not blocked yet */
 
 	p->tgid = p->pid;
 	if (clone_flags & CLONE_THREAD)
@@ -978,8 +1022,10 @@ static task_t *copy_process(unsigned lon
 	 * another CPU - so we re-copy it here and set the child's CPU to
 	 * the parent's CPU. This avoids alot of nasty races.
 	 */
+	preempt_disable();
 	p->cpus_allowed = current->cpus_allowed;
 	set_task_cpu(p, smp_processor_id());
+	preempt_enable();
 
 	/*
 	 * Check for pending SIGKILL! The new thread should not be allowed
@@ -1037,8 +1083,11 @@ static task_t *copy_process(unsigned lon
 	if (thread_group_leader(p)) {
 		attach_pid(p, PIDTYPE_PGID, process_group(p));
 		attach_pid(p, PIDTYPE_SID, p->signal->session);
-		if (p->pid)
+		if (p->pid) {
+			preempt_disable();
 			__get_cpu_var(process_counts)++;
+			preempt_enable();
+		}
 	}
 
 	nr_threads++;
@@ -1232,3 +1281,173 @@ void __init proc_caches_init(void)
 			sizeof(struct mm_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 }
+
+static int put_task_complete(void)
+{
+	struct list_head *head;
+	int ret = 0;
+	/* Drain this CPU's delayed_put_list; returns 1 if at least one task was released, else 0. */
+	head = &get_cpu_var(delayed_put_list);
+	while (!list_empty(head)) {
+		struct task_struct *task = list_entry(head->next,
+					struct task_struct, delayed_put);
+		list_del(&task->delayed_put);
+		put_cpu_var(delayed_put_list);
+		/* the per-CPU list is released around the release call and re-fetched afterwards */
+		__put_task_struct(task);
+		ret = 1;
+
+		head = &get_cpu_var(delayed_put_list);
+	}
+	put_cpu_var(delayed_put_list);
+
+	return ret;
+}
+
+/*
+ * We don't want to do complex work from the scheduler, thus we queue the
+ * task on this CPU's delayed_put_list and wake the per-CPU worker thread:
+ */
+void fastcall __put_task_struct_delayed(struct task_struct *task)
+{
+	struct task_struct *desched_task;
+	struct list_head *head;
+
+	head = &get_cpu_var(delayed_put_list);
+	list_add_tail(&task->delayed_put, head);
+	desched_task = __get_cpu_var(desched_task);
+	if (desched_task) /* NULL when no worker is registered for this CPU: the entry just stays queued */
+		wake_up_process(desched_task);
+	put_cpu_var(delayed_put_list);
+}
+
+void put_task_struct_delayed(struct task_struct *tsk)
+{
+	BUG_ON(!atomic_read(&tsk->usage)); /* dropping a reference when the count is already 0 is a bug */
+	/* Same reference drop as put_task_struct(), but the final release is queued instead of done inline. */
+	if (!atomic_dec_and_test(&tsk->usage))
+		return;
+	__put_task_struct_delayed(tsk);
+}
+
+static int mmdrop_complete(void)
+{
+	struct list_head *head;
+	int ret = 0;
+	/* Drain this CPU's delayed_drop_list; returns 1 if at least one mm was dropped, else 0. */
+	head = &get_cpu_var(delayed_drop_list);
+	while (!list_empty(head)) {
+		struct mm_struct *mm = list_entry(head->next,
+					struct mm_struct, delayed_drop);
+		list_del(&mm->delayed_drop);
+		put_cpu_var(delayed_drop_list);
+		/* the per-CPU list is released around the __mmdrop() call and re-fetched afterwards */
+		__mmdrop(mm);
+		ret = 1;
+
+		head = &get_cpu_var(delayed_drop_list);
+	}
+	put_cpu_var(delayed_drop_list);
+
+	return ret;
+}
+
+/*
+ * We don't want to do complex work from the scheduler, thus we queue the
+ * mm on this CPU's delayed_drop_list and wake the per-CPU worker thread:
+ */
+void fastcall __mmdrop_delayed(struct mm_struct *mm)
+{
+	struct task_struct *desched_task;
+	struct list_head *head;
+
+	head = &get_cpu_var(delayed_drop_list);
+	list_add_tail(&mm->delayed_drop, head);
+	desched_task = __get_cpu_var(desched_task);
+	if (desched_task) /* NULL when no worker is registered for this CPU: the entry just stays queued */
+		wake_up_process(desched_task);
+	put_cpu_var(delayed_drop_list);
+}
+
+static int desched_thread(void * __bind_cpu)
+{
+	printk("desched thread %ld started up.\n", (long) __bind_cpu);
+	/* Worker thread body: performs the queued put_task_struct/mmdrop work until asked to stop. */
+	set_user_nice(current, -10);
+	current->flags |= PF_NOFREEZE | PF_SOFTIRQ;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	/* Drain both delayed lists; call schedule() only after a pass that found no work. */
+	while (!kthread_should_stop()) {
+		int ret;
+
+		ret = put_task_complete();
+		ret |= mmdrop_complete();
+		if (ret)
+			continue;
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int __devinit cpu_callback(struct notifier_block *nfb,
+				  unsigned long action,
+				  void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+	struct task_struct *p;
+
+	printk("desched cpu_callback %ld/%p\n", action, hcpu);
+	switch (action) {
+	case CPU_UP_PREPARE:
+		/* reset the CPU's delayed lists, create its worker thread (not yet woken) and bind it */
+		BUG_ON(per_cpu(desched_task, hotcpu));
+		INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu));
+		INIT_LIST_HEAD(&per_cpu(delayed_put_list, hotcpu));
+		p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu);
+		if (IS_ERR(p)) {
+			printk("desched_thread for %i failed\n", hotcpu);
+			return NOTIFY_BAD;
+		}
+  		per_cpu(desched_task, hotcpu) = p;
+		kthread_bind(p, hotcpu);
+ 		break;
+	case CPU_ONLINE:
+		/* start the worker that CPU_UP_PREPARE created */
+		wake_up_process(per_cpu(desched_task, hotcpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+
+		/* Unbind so it can run.  Fall thru. */
+		kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id());
+	case CPU_DEAD:
+		/* unregister the worker for this CPU, then stop it */
+		p = per_cpu(desched_task, hotcpu);
+		per_cpu(desched_task, hotcpu) = NULL;
+		kthread_stop(p);
+		takeover_tasklets(hotcpu); /* NOTE(review): tasklet takeover in the desched notifier looks out of place - confirm it is intended */
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ 	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
+__init int spawn_desched_task(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	/* Run the UP_PREPARE and ONLINE steps by hand for the current CPU, then register the notifier. */
+
+	printk("spawn_desched_task(%p)\n", cpu);
+	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); /* NOTE(review): a NOTIFY_BAD result is ignored here */
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+	return 0;
+}
+
--- linux/kernel/irq/proc.c.orig
+++ linux/kernel/irq/proc.c
@@ -7,9 +7,12 @@
  */
 
 #include <linux/irq.h>
+#include <asm/uaccess.h>
 #include <linux/proc_fs.h>
 #include <linux/interrupt.h>
 
+#include "internals.h"
+
 static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
 
 #ifdef CONFIG_SMP
@@ -63,37 +66,6 @@ static int irq_affinity_write_proc(struc
 
 #endif
 
-#define MAX_NAMELEN 128
-
-static int name_unique(unsigned int irq, struct irqaction *new_action)
-{
-	struct irq_desc *desc = irq_desc + irq;
-	struct irqaction *action;
-
-	for (action = desc->action ; action; action = action->next)
-		if ((action != new_action) && action->name &&
-				!strcmp(new_action->name, action->name))
-			return 0;
-	return 1;
-}
-
-void register_handler_proc(unsigned int irq, struct irqaction *action)
-{
-	char name [MAX_NAMELEN];
-
-	if (!irq_dir[irq] || action->dir || !action->name ||
-					!name_unique(irq, action))
-		return;
-
-	memset(name, 0, MAX_NAMELEN);
-	snprintf(name, MAX_NAMELEN, "%s", action->name);
-
-	/* create /proc/irq/1234/handler/ */
-	action->dir = proc_mkdir(name, irq_dir[irq]);
-}
-
-#undef MAX_NAMELEN
-
 #define MAX_NAMELEN 10
 
 void register_irq_proc(unsigned int irq)
@@ -133,10 +105,96 @@ void register_irq_proc(unsigned int irq)
 
 void unregister_handler_proc(unsigned int irq, struct irqaction *action)
 {
+	if (action->threaded)
+		remove_proc_entry(action->threaded->name, action->dir);
 	if (action->dir)
 		remove_proc_entry(action->dir->name, irq_dir[irq]);
 }
 
+#ifndef CONFIG_PREEMPT_RT
+
+static int threaded_read_proc(char *page, char **start, off_t off,
+			      int count, int *eof, void *data)
+{
+	return sprintf(page, "%c\n", /* '0' when the action has SA_NODELAY set, '1' otherwise */
+		((struct irqaction *)data)->flags & SA_NODELAY ? '0' : '1');
+}
+
+static int threaded_write_proc(struct file *file, const char __user *buffer,
+			       unsigned long count, void *data)
+{
+	int c;
+	struct irqaction *action = data;
+	irq_desc_t *desc = irq_desc + action->irq;
+	/* Only the first byte written is examined: '0' sets SA_NODELAY, '1' clears it. */
+	if (get_user(c, buffer))
+		return -EFAULT;
+	if (c != '0' && c != '1')
+		return -EINVAL;
+
+	spin_lock_irq(&desc->lock);
+
+	if (c == '0')
+		action->flags |= SA_NODELAY;
+	if (c == '1')
+		action->flags &= ~SA_NODELAY;
+	recalculate_desc_flags(desc); /* refresh IRQ_NODELAY in desc->status from the action flags */
+
+	spin_unlock_irq(&desc->lock);
+
+	return 1; /* NOTE(review): reports one byte consumed regardless of count - confirm this is intended */
+}
+
+#endif
+
+#define MAX_NAMELEN 128
+
+static int name_unique(unsigned int irq, struct irqaction *new_action)
+{
+	struct irq_desc *desc = irq_desc + irq;
+	struct irqaction *action;
+	/* Returns 0 if another action on this irq already carries new_action's name, else 1. */
+	for (action = desc->action ; action; action = action->next)
+		if ((action != new_action) && action->name &&
+				!strcmp(new_action->name, action->name))
+			return 0;
+	return 1;
+}
+
+void register_handler_proc(unsigned int irq, struct irqaction *action)
+{
+	char name [MAX_NAMELEN];
+	/* Skip if the irq has no proc dir, the action already has one, it has no name, or the name is taken. */
+	if (!irq_dir[irq] || action->dir || !action->name ||
+					!name_unique(irq, action))
+		return;
+
+	memset(name, 0, MAX_NAMELEN);
+	snprintf(name, MAX_NAMELEN, "%s", action->name);
+
+	/* create /proc/irq/1234/handler/ */
+	action->dir = proc_mkdir(name, irq_dir[irq]);
+	if (!action->dir)
+		return;
+#ifndef CONFIG_PREEMPT_RT
+	{
+		struct proc_dir_entry *entry;
+		/* create /proc/irq/1234/handler/threaded */
+		entry = create_proc_entry("threaded", 0600, action->dir);
+		if (!entry)
+			return;
+		entry->nlink = 1;
+		entry->data = (void *)action; /* handed back as 'data' to the read/write handlers */
+		entry->read_proc = threaded_read_proc;
+		entry->write_proc = threaded_write_proc;
+		action->threaded = entry; /* remembered so unregister_handler_proc() can remove it */
+	}
+#endif
+}
+
+#undef MAX_NAMELEN
+
+
 void init_irq_proc(void)
 {
 	int i;
@@ -146,6 +204,9 @@ void init_irq_proc(void)
 	if (!root_irq_dir)
 		return;
 
+	/* create /proc/irq/prof_cpu_mask */
+	create_prof_cpu_mask(root_irq_dir);
+
 	/*
 	 * Create entries for all existing IRQs.
 	 */
--- linux/kernel/irq/manage.c.orig
+++ linux/kernel/irq/manage.c
@@ -7,8 +7,10 @@
  */
 
 #include <linux/irq.h>
-#include <linux/module.h>
 #include <linux/random.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/syscalls.h>
 #include <linux/interrupt.h>
 
 #include "internals.h"
@@ -28,8 +30,12 @@ void synchronize_irq(unsigned int irq)
 {
 	struct irq_desc *desc = irq_desc + irq;
 
-	while (desc->status & IRQ_INPROGRESS)
-		cpu_relax();
+	if (hardirq_preemption && !(desc->status & IRQ_NODELAY))
+		wait_event(desc->wait_for_handler,
+			!(desc->status & IRQ_INPROGRESS));
+	else
+		while (desc->status & IRQ_INPROGRESS)
+			cpu_relax();
 }
 
 EXPORT_SYMBOL(synchronize_irq);
@@ -125,6 +131,21 @@ void enable_irq(unsigned int irq)
 EXPORT_SYMBOL(enable_irq);
 
 /*
+ * If any action has SA_NODELAY then turn IRQ_NODELAY on:
+ */
+void recalculate_desc_flags(struct irq_desc *desc)
+{
+	struct irqaction *action;
+
+	desc->status &= ~IRQ_NODELAY;
+	for (action = desc->action ; action; action = action->next)
+		if (action->flags & SA_NODELAY)
+			desc->status |= IRQ_NODELAY;
+}
+
+static int start_irq_thread(int irq, struct irq_desc *desc);
+
+/*
  * Internal function that tells the architecture code whether a
  * particular irq has been exclusively allocated or is available
  * for driver use.
@@ -179,6 +200,9 @@ int setup_irq(unsigned int irq, struct i
 		rand_initialize_irq(irq);
 	}
 
+	if (!(new->flags & SA_NODELAY))
+		if (start_irq_thread(irq, desc))
+			return -ENOMEM;
 	/*
 	 * The following block of code has to be executed atomically
 	 */
@@ -201,6 +225,11 @@ int setup_irq(unsigned int irq, struct i
 
 	*p = new;
 
+	/*
+	 * Propagate any possible SA_NODELAY flag into IRQ_NODELAY:
+	 */
+	recalculate_desc_flags(desc);
+
 	if (!shared) {
 		desc->depth = 0;
 		desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
@@ -214,7 +243,7 @@ int setup_irq(unsigned int irq, struct i
 
 	new->irq = irq;
 	register_irq_proc(irq);
-	new->dir = NULL;
+	new->dir = new->threaded = NULL;
 	register_handler_proc(irq, new);
 
 	return 0;
@@ -264,6 +293,7 @@ int teardown_irq(unsigned int irq, struc
 				else
 					desc->handler->disable(irq);
 			}
+			recalculate_desc_flags(desc);
 			spin_unlock_irqrestore(&desc->lock,flags);
 			unregister_handler_proc(irq, action);
 
@@ -388,3 +418,175 @@ int request_irq(unsigned int irq,
 
 EXPORT_SYMBOL(request_irq);
 
+#ifdef CONFIG_PREEMPT_HARDIRQS
+
+int hardirq_preemption = 1;
+
+EXPORT_SYMBOL(hardirq_preemption);
+
+/*
+ * Real-Time Preemption depends on hardirq threading:
+ */
+#ifndef CONFIG_PREEMPT_RT
+
+static int __init hardirq_preempt_setup (char *str)
+{
+	if (!strncmp(str, "off", 3))
+		hardirq_preemption = 0;
+	else
+		get_option(&str, &hardirq_preemption);
+	if (!hardirq_preemption)
+		printk("turning off hardirq preemption!\n");
+
+	return 1;
+}
+
+__setup("hardirq-preempt=", hardirq_preempt_setup);
+
+#endif
+
+static void do_hardirq(struct irq_desc *desc)
+{
+	struct irqaction * action;
+	unsigned int irq = desc - irq_desc;
+
+	local_irq_disable();
+
+	if (desc->status & IRQ_INPROGRESS) {
+		action = desc->action;
+		spin_lock(&desc->lock);
+		for (;;) {
+			irqreturn_t action_ret = 0;
+
+			if (action) {
+				spin_unlock(&desc->lock);
+				action_ret = handle_IRQ_event(irq, NULL,action);
+				local_irq_enable();
+				cond_resched_all();
+				spin_lock_irq(&desc->lock);
+			}
+			if (!noirqdebug)
+				note_interrupt(irq, desc, action_ret);
+			if (likely(!(desc->status & IRQ_PENDING)))
+				break;
+			desc->status &= ~IRQ_PENDING;
+		}
+		desc->status &= ~IRQ_INPROGRESS;
+		/*
+		 * The ->end() handler has to deal with interrupts which got
+		 * disabled while the handler was running.
+		 */
+		desc->handler->end(irq);
+		spin_unlock(&desc->lock);
+	}
+	local_irq_enable();
+	if (waitqueue_active(&desc->wait_for_handler))
+		wake_up(&desc->wait_for_handler);
+}
+
+extern asmlinkage void __do_softirq(void);
+
+static int curr_irq_prio = 49;
+
+static int do_irqd(void * __desc)
+{
+	struct sched_param param = { 0, };
+	struct irq_desc *desc = __desc;
+#ifdef CONFIG_SMP
+	int irq = desc - irq_desc;
+	cpumask_t mask;
+
+	mask = cpumask_of_cpu(any_online_cpu(irq_affinity[irq]));
+	set_cpus_allowed(current, mask);
+#endif
+	current->flags |= PF_NOFREEZE | PF_HARDIRQ;
+
+	/*
+	 * Hand each new IRQ thread the next-lower RT prio: 49 down to 25
+	 */
+	param.sched_priority = curr_irq_prio;
+	if (param.sched_priority > 25)
+		curr_irq_prio = param.sched_priority - 1;
+
+	sys_sched_setscheduler(current->pid, SCHED_FIFO, &param);
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		do_hardirq(desc);
+		cond_resched_all();
+		__do_softirq();
+		local_irq_enable();
+#ifdef CONFIG_SMP
+		/*
+		 * Did IRQ affinities change?
+		 */
+		if (!cpu_isset(smp_processor_id(), irq_affinity[irq])) {
+			mask = cpumask_of_cpu(any_online_cpu(irq_affinity[irq]));
+			set_cpus_allowed(current, mask);
+		}
+#endif
+		schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int ok_to_create_irq_threads;
+
+static int start_irq_thread(int irq, struct irq_desc *desc)
+{
+	struct task_struct *thread;
+
+	if (desc->thread || !ok_to_create_irq_threads)
+		return 0;
+	/* kthread_create() reports failure via ERR_PTR(), never NULL: */
+	thread = kthread_create(do_irqd, desc, "IRQ %d", irq);
+	if (IS_ERR(thread)) {
+		printk(KERN_ERR "irqd: could not create IRQ thread %d!\n", irq);
+		return -ENOMEM;
+	}
+	desc->thread = thread;
+	/*
+	 * An interrupt may have come in before desc->thread was stored;
+	 * make sure the thread gets woken up in such a case:
+	 */
+	smp_mb();
+	wake_up_process(desc->thread);
+	return 0;
+}
+
+/* Allow IRQ-thread creation and start threads for already-registered IRQs. */
+void __init init_hardirqs(void)
+{
+	int irq;
+
+	ok_to_create_irq_threads = 1;
+	for (irq = 0; irq < NR_IRQS; irq++) {
+		if (!irq_desc[irq].action || (irq_desc[irq].status & IRQ_NODELAY))
+			continue;
+		start_irq_thread(irq, irq_desc + irq);
+	}
+}
+
+#else
+
+void __init init_hardirqs(void)
+{
+}
+
+static int start_irq_thread(int irq, struct irq_desc *desc)
+{
+	return 0;
+}
+
+#endif
+
+void __init early_init_hardirqs(void)
+{	
+	int i;
+
+	for (i = 0; i < NR_IRQS; i++)
+		init_waitqueue_head(&irq_desc[i].wait_for_handler);
+}
+
+
--- linux/kernel/irq/handle.c.orig
+++ linux/kernel/irq/handle.c
@@ -9,6 +9,7 @@
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/kallsyms.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
 
@@ -31,7 +32,7 @@
 irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
 	[0 ... NR_IRQS-1] = {
 		.handler = &no_irq_type,
-		.lock = SPIN_LOCK_UNLOCKED
+		.lock = RAW_SPIN_LOCK_UNLOCKED
 	}
 };
 
@@ -73,6 +74,32 @@ irqreturn_t no_action(int cpl, void *dev
 }
 
 /*
+ * Hack - used for development only.
+ */
+int debug_direct_keyboard = 0;
+
+int redirect_hardirq(struct irq_desc *desc)
+{
+	/*
+	 * Direct execution:
+	 */
+	if (!hardirq_preemption || (desc->status & IRQ_NODELAY) ||
+							!desc->thread)
+		return 0;
+
+#ifdef __i386__
+	if (debug_direct_keyboard && (desc - irq_desc == 1))
+		return 0;
+#endif
+		 
+	BUG_ON(!irqs_disabled());
+	if (desc->thread && desc->thread->state != TASK_RUNNING)
+		wake_up_process(desc->thread);
+
+	return 1;
+}
+
+/*
  * Have got an event to handle:
  */
 fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
@@ -80,30 +107,48 @@ fastcall int handle_IRQ_event(unsigned i
 {
 	int ret, retval = 0, status = 0;
 
-	if (!(action->flags & SA_INTERRUPT))
+	/*
+	 * Unconditionally enable interrupts for threaded
+	 * IRQ handlers:
+	 */
+	if (!hardirq_count() || !(action->flags & SA_INTERRUPT))
 		local_irq_enable();
 
 	do {
+		unsigned int preempt_count = preempt_count();
+
 		ret = action->handler(irq, action->dev_id, regs);
+		if (preempt_count() != preempt_count) {
+			print_symbol("BUG: unbalanced irq-handler preempt count in %s!\n", (unsigned long) action->handler);
+			printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count());
+			preempt_count() = preempt_count;
+		}
 		if (ret == IRQ_HANDLED)
 			status |= action->flags;
 		retval |= ret;
 		action = action->next;
 	} while (action);
 
-	if (status & SA_SAMPLE_RANDOM)
+	if (status & SA_SAMPLE_RANDOM) {
+		local_irq_enable();
 		add_interrupt_randomness(irq);
+	}
 	local_irq_disable();
 
 	return retval;
 }
 
+cycles_t irq_timestamp(unsigned int irq)
+{
+	return irq_desc[irq].timestamp;
+}
+
 /*
  * do_IRQ handles all normal device IRQ's (the special
  * SMP cross-CPU interrupts have their own specific
  * handlers).
  */
-fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
+fastcall notrace unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
 {
 	irq_desc_t *desc = irq_desc + irq;
 	struct irqaction * action;
@@ -123,6 +168,7 @@ fastcall unsigned int __do_IRQ(unsigned 
 		desc->handler->end(irq);
 		return 1;
 	}
+	desc->timestamp = get_cycles();
 
 	spin_lock(&desc->lock);
 	desc->handler->ack(irq);
@@ -155,6 +201,12 @@ fastcall unsigned int __do_IRQ(unsigned 
 		goto out;
 
 	/*
+	 * hardirq redirection to the irqd process context:
+	 */
+	if (redirect_hardirq(desc))
+		goto out_no_end;
+
+	/*
 	 * Edge triggered interrupts need to remember
 	 * pending events.
 	 * This applies to any hw interrupts that allow a second
@@ -179,13 +231,13 @@ fastcall unsigned int __do_IRQ(unsigned 
 		desc->status &= ~IRQ_PENDING;
 	}
 	desc->status &= ~IRQ_INPROGRESS;
-
 out:
 	/*
 	 * The ->end() handler has to deal with interrupts which got
 	 * disabled while the handler was running.
 	 */
 	desc->handler->end(irq);
+out_no_end:
 	spin_unlock(&desc->lock);
 
 	return 1;
--- linux/kernel/irq/autoprobe.c.orig
+++ linux/kernel/irq/autoprobe.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/irq.h>
+#include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
 
@@ -26,7 +27,7 @@ static DECLARE_MUTEX(probe_sem);
  */
 unsigned long probe_irq_on(void)
 {
-	unsigned long val, delay;
+	unsigned long val;
 	irq_desc_t *desc;
 	unsigned int i;
 
@@ -44,9 +45,10 @@ unsigned long probe_irq_on(void)
 		spin_unlock_irq(&desc->lock);
 	}
 
-	/* Wait for longstanding interrupts to trigger. */
-	for (delay = jiffies + HZ/50; time_after(delay, jiffies); )
-		/* about 20ms delay */ barrier();
+	/*
+	 * Wait for longstanding interrupts to trigger, 20 msec delay:
+	 */
+	msleep(20);	/* msleep() takes milliseconds, not jiffies */
 
 	/*
 	 * enable any unassigned irqs
@@ -66,10 +68,9 @@ unsigned long probe_irq_on(void)
 	}
 
 	/*
-	 * Wait for spurious interrupts to trigger
+	 * Wait for spurious interrupts to trigger, 100 msec delay:
 	 */
-	for (delay = jiffies + HZ/10; time_after(delay, jiffies); )
-		/* about 100ms delay */ barrier();
+	msleep(100);	/* msleep() takes milliseconds, not jiffies */
 
 	/*
 	 * Now filter out any obviously spurious interrupts
--- linux/kernel/irq/internals.h.orig
+++ linux/kernel/irq/internals.h
@@ -4,6 +4,8 @@
 
 extern int noirqdebug;
 
+void recalculate_desc_flags(struct irq_desc *desc);
+
 #ifdef CONFIG_PROC_FS
 extern void register_irq_proc(unsigned int irq);
 extern void register_handler_proc(unsigned int irq, struct irqaction *action);
--- linux/kernel/signal.c.orig
+++ linux/kernel/signal.c
@@ -845,11 +845,11 @@ specific_send_sig_info(int sig, struct s
 {
 	int ret = 0;
 
-	if (!irqs_disabled())
-		BUG();
+#ifndef CONFIG_PREEMPT_RT
+	BUG_ON(!irqs_disabled());
+#endif
 #ifdef CONFIG_SMP
-	if (!spin_is_locked(&t->sighand->siglock))
-		BUG();
+	BUG_ON(!spin_is_locked(&t->sighand->siglock));
 #endif
 
 	if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
@@ -1593,6 +1593,7 @@ static void ptrace_stop(int exit_code, i
 		do_notify_parent_cldstop(current, current->parent,
 					 CLD_TRAPPED);
 		read_unlock(&tasklist_lock);
+		current->flags &= ~PF_NOSCHED;
 		schedule();
 	} else {
 		/*
@@ -1661,6 +1662,7 @@ finish_stop(int stop_count)
 		read_unlock(&tasklist_lock);
 	}
 
+	current->flags &= ~PF_NOSCHED;
 	schedule();
 	/*
 	 * Now we don't run again until continued.
@@ -1818,6 +1820,9 @@ int get_signal_to_deliver(siginfo_t *inf
 	sigset_t *mask = &current->blocked;
 	int signr = 0;
 
+#ifdef CONFIG_PREEMPT_RT
+	might_sleep();
+#endif
 relock:
 	spin_lock_irq(&current->sighand->siglock);
 	for (;;) {
--- linux/kernel/workqueue.c.orig
+++ linux/kernel/workqueue.c
@@ -25,6 +25,7 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/kthread.h>
+#include <linux/syscalls.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use cpu 0's).
@@ -93,10 +94,12 @@ static void __queue_work(struct cpu_work
  *
  * We queue the work to the CPU it was submitted, but there is no
  * guarantee that it will be processed by that CPU.
+ *
+ * Especially no such guarantee on PREEMPT_RT.
  */
 int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
-	int ret = 0, cpu = get_cpu();
+	int ret = 0, cpu = _smp_processor_id();
 
 	if (!test_and_set_bit(0, &work->pending)) {
 		if (unlikely(is_single_threaded(wq)))
@@ -105,7 +108,6 @@ int fastcall queue_work(struct workqueue
 		__queue_work(wq->cpu_wq + cpu, work);
 		ret = 1;
 	}
-	put_cpu();
 	return ret;
 }
 
@@ -365,6 +367,39 @@ static void cleanup_workqueue_thread(str
 		kthread_stop(p);
 }
 
+void set_workqueue_thread_prio(struct workqueue_struct *wq, int cpu,
+				int policy, int rt_priority, int nice)
+{
+	struct task_struct *p = wq->cpu_wq[cpu].thread;
+	struct sched_param param = { .sched_priority = rt_priority };
+	int ret;
+
+	set_user_nice(p, nice);
+	ret = sys_sched_setscheduler(p->pid, policy, &param);
+	if (ret)
+		printk("BUG: wq(%s) setscheduler() returned: %d.\n",
+			wq->name, ret);
+	
+}
+
+void set_workqueue_prio(struct workqueue_struct *wq, int policy,
+			int rt_priority, int nice)
+{
+	int cpu;
+
+	/* We don't need the distraction of CPUs appearing and vanishing. */
+	lock_cpu_hotplug();
+	if (is_single_threaded(wq))
+		set_workqueue_thread_prio(wq, 0, policy, rt_priority, nice);
+	else {
+		for_each_online_cpu(cpu)
+			set_workqueue_thread_prio(wq, cpu, policy,
+						  rt_priority, nice);
+	}
+	unlock_cpu_hotplug();
+}
+
+
 void destroy_workqueue(struct workqueue_struct *wq)
 {
 	int cpu;
@@ -539,6 +574,7 @@ void init_workqueues(void)
 	hotcpu_notifier(workqueue_cpu_callback, 0);
 	keventd_wq = create_workqueue("events");
 	BUG_ON(!keventd_wq);
+	set_workqueue_prio(keventd_wq, SCHED_FIFO, 1, -20);
 }
 
 EXPORT_SYMBOL_GPL(__create_workqueue);
--- linux/kernel/module.c.orig
+++ linux/kernel/module.c
@@ -35,6 +35,7 @@
 #include <linux/notifier.h>
 #include <linux/stop_machine.h>
 #include <linux/device.h>
+#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 #include <asm/cacheflush.h>
@@ -96,6 +97,16 @@ static inline int strong_try_module_get(
  */
 void __module_put_and_exit(struct module *mod, long code)
 {
+	/*
+	 * Release the kernel lock if held:
+	 */
+	if (current->lock_depth >= 0) {
+		printk("BUG: module %s holds the BKL [%d] at exit time!\n",
+			mod->name, current->lock_depth);
+		dump_stack();
+		while (current->lock_depth >= 0)
+			unlock_kernel();
+	}
 	module_put(mod);
 	do_exit(code);
 }
--- linux/kernel/profile.c.orig
+++ linux/kernel/profile.c
@@ -23,6 +23,7 @@
 #include <linux/cpu.h>
 #include <linux/profile.h>
 #include <linux/highmem.h>
+#include <linux/interrupt.h>
 #include <asm/sections.h>
 #include <asm/semaphore.h>
 
@@ -41,6 +42,7 @@ static atomic_t *prof_buffer;
 static unsigned long prof_len, prof_shift;
 static int prof_on;
 static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
+int prof_pid = -1;
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
 static DEFINE_PER_CPU(int, cpu_profile_flip);
@@ -51,17 +53,25 @@ static int __init profile_setup(char * s
 {
 	int par;
 
+	if (!strncmp(str, "preempt", 7)) {
+		prof_on = PREEMPT_PROFILING;
+		printk(KERN_INFO "kernel preemption profiling enabled\n");
+		if (str[7] == ',')
+			str += 8;
+	}
 	if (!strncmp(str, "schedule", 8)) {
 		prof_on = SCHED_PROFILING;
 		printk(KERN_INFO "kernel schedule profiling enabled\n");
-		if (str[7] == ',')
-			str += 8;
+		if (str[8] == ',')
+			str += 9;
 	}
 	if (get_option(&str,&par)) {
 		prof_shift = par;
-		prof_on = CPU_PROFILING;
-		printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
-			prof_shift);
+		if (!prof_on) {
+			prof_on = CPU_PROFILING;
+			printk(KERN_INFO "kernel CPU profiling enabled\n");
+		}
+		printk(KERN_INFO "kernel profiling shift: %ld\n", prof_shift);
 	}
 	return 1;
 }
@@ -273,7 +283,7 @@ static void profile_discard_flip_buffers
 	up(&profile_flip_mutex);
 }
 
-void profile_hit(int type, void *__pc)
+void notrace profile_hit(int type, void *__pc)
 {
 	unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
 	int i, j, cpu;
@@ -381,12 +391,36 @@ void profile_hit(int type, void *__pc)
 }
 #endif /* !CONFIG_SMP */
 
-void profile_tick(int type, struct pt_regs *regs)
+#ifdef CONFIG_PREEMPT
+static void preemption_enabled(void)
+{
+}
+#endif
+
+static void preemption_disabled(void)
+{
+}
+
+void notrace profile_tick(int type, struct pt_regs *regs)
 {
 	if (type == CPU_PROFILING && timer_hook)
 		timer_hook(regs);
-	if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
-		profile_hit(type, (void *)profile_pc(regs));
+	if (!user_mode(regs) && (prof_pid == -1 || prof_pid == current->pid) &&
+			cpu_isset(smp_processor_id(), prof_cpu_mask)) {
+		if (prof_on == PREEMPT_PROFILING && type == CPU_PROFILING) {
+#ifdef CONFIG_PREEMPT
+			int count = preempt_count() - HARDIRQ_OFFSET;
+
+			if (!count)
+				profile_hit(PREEMPT_PROFILING,
+						(void *)preemption_enabled);
+			else
+#endif
+				profile_hit(PREEMPT_PROFILING,
+						(void *)preemption_disabled);
+		} else
+			profile_hit(type, (void *)profile_pc(regs));
+	}
 }
 
 #ifdef CONFIG_PROC_FS
--- linux/kernel/Makefile.orig
+++ linux/kernel/Makefile
@@ -9,6 +9,11 @@ obj-y     = sched.o fork.o exec_domain.o
 	    rcupdate.o intermodule.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o
 
+obj-$(CONFIG_PREEMPT_RT) += rt.o
+
+obj-$(CONFIG_DEBUG_PREEMPT) += latency.o
+obj-$(CONFIG_LATENCY_TIMING) += latency.o
+
 obj-$(CONFIG_FUTEX) += futex.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
--- linux/kernel/spinlock.c.orig
+++ linux/kernel/spinlock.c
@@ -17,151 +17,149 @@
  * Generic declaration of the raw read_trylock() function,
  * architectures are supposed to optimize this:
  */
-int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
+int __lockfunc generic_raw_read_trylock(raw_rwlock_t *lock)
 {
-	_raw_read_lock(lock);
+	__raw_read_lock(lock);
 	return 1;
 }
 EXPORT_SYMBOL(generic_raw_read_trylock);
 
-int __lockfunc _spin_trylock(spinlock_t *lock)
+int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
 {
 	preempt_disable();
-	if (_raw_spin_trylock(lock))
+	if (__raw_spin_trylock(lock))
 		return 1;
 	
 	preempt_enable();
 	return 0;
 }
-EXPORT_SYMBOL(_spin_trylock);
+EXPORT_SYMBOL(_raw_spin_trylock);
 
-int __lockfunc _read_trylock(rwlock_t *lock)
+int __lockfunc _raw_read_trylock(raw_rwlock_t *lock)
 {
 	preempt_disable();
-	if (_raw_read_trylock(lock))
+	if (__raw_read_trylock(lock))
 		return 1;
 
 	preempt_enable();
 	return 0;
 }
-EXPORT_SYMBOL(_read_trylock);
+EXPORT_SYMBOL(_raw_read_trylock);
 
-int __lockfunc _write_trylock(rwlock_t *lock)
+int __lockfunc _raw_write_trylock(raw_rwlock_t *lock)
 {
 	preempt_disable();
-	if (_raw_write_trylock(lock))
+	if (__raw_write_trylock(lock))
 		return 1;
 
 	preempt_enable();
 	return 0;
 }
-EXPORT_SYMBOL(_write_trylock);
+EXPORT_SYMBOL(_raw_write_trylock);
 
 #ifndef CONFIG_PREEMPT
 
-void __lockfunc _read_lock(rwlock_t *lock)
+void __lockfunc _raw_read_lock(raw_rwlock_t *lock)
 {
 	preempt_disable();
-	_raw_read_lock(lock);
+	__raw_read_lock(lock);
 }
-EXPORT_SYMBOL(_read_lock);
+EXPORT_SYMBOL(_raw_read_lock);
 
-unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
+unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
 	preempt_disable();
-	_raw_spin_lock_flags(lock, flags);
+	__raw_spin_lock_flags(lock, flags);
 	return flags;
 }
-EXPORT_SYMBOL(_spin_lock_irqsave);
+EXPORT_SYMBOL(_raw_spin_lock_irqsave);
 
-void __lockfunc _spin_lock_irq(spinlock_t *lock)
+void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
 {
 	local_irq_disable();
 	preempt_disable();
-	_raw_spin_lock(lock);
+	__raw_spin_lock(lock);
 }
-EXPORT_SYMBOL(_spin_lock_irq);
+EXPORT_SYMBOL(_raw_spin_lock_irq);
 
-void __lockfunc _spin_lock_bh(spinlock_t *lock)
+void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
 {
 	local_bh_disable();
 	preempt_disable();
-	_raw_spin_lock(lock);
+	__raw_spin_lock(lock);
 }
-EXPORT_SYMBOL(_spin_lock_bh);
+EXPORT_SYMBOL(_raw_spin_lock_bh);
 
-unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
+unsigned long __lockfunc _raw_read_lock_irqsave(raw_rwlock_t *lock)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
 	preempt_disable();
-	_raw_read_lock(lock);
+	__raw_read_lock(lock);
 	return flags;
 }
-EXPORT_SYMBOL(_read_lock_irqsave);
+EXPORT_SYMBOL(_raw_read_lock_irqsave);
 
-void __lockfunc _read_lock_irq(rwlock_t *lock)
+void __lockfunc _raw_read_lock_irq(raw_rwlock_t *lock)
 {
 	local_irq_disable();
 	preempt_disable();
-	_raw_read_lock(lock);
+	__raw_read_lock(lock);
 }
-EXPORT_SYMBOL(_read_lock_irq);
+EXPORT_SYMBOL(_raw_read_lock_irq);
 
-void __lockfunc _read_lock_bh(rwlock_t *lock)
+void __lockfunc _raw_read_lock_bh(raw_rwlock_t *lock)
 {
 	local_bh_disable();
 	preempt_disable();
-	_raw_read_lock(lock);
+	__raw_read_lock(lock);
 }
-EXPORT_SYMBOL(_read_lock_bh);
+EXPORT_SYMBOL(_raw_read_lock_bh);
 
-unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
+unsigned long __lockfunc _raw_write_lock_irqsave(raw_rwlock_t *lock)
 {
 	unsigned long flags;
 
 	local_irq_save(flags);
 	preempt_disable();
-	_raw_write_lock(lock);
+	__raw_write_lock(lock);
 	return flags;
 }
-EXPORT_SYMBOL(_write_lock_irqsave);
+EXPORT_SYMBOL(_raw_write_lock_irqsave);
 
-void __lockfunc _write_lock_irq(rwlock_t *lock)
+void __lockfunc _raw_write_lock_irq(raw_rwlock_t *lock)
 {
 	local_irq_disable();
 	preempt_disable();
-	_raw_write_lock(lock);
+	__raw_write_lock(lock);
 }
-EXPORT_SYMBOL(_write_lock_irq);
+EXPORT_SYMBOL(_raw_write_lock_irq);
 
-void __lockfunc _write_lock_bh(rwlock_t *lock)
+void __lockfunc _raw_write_lock_bh(raw_rwlock_t *lock)
 {
 	local_bh_disable();
 	preempt_disable();
-	_raw_write_lock(lock);
+	__raw_write_lock(lock);
 }
-EXPORT_SYMBOL(_write_lock_bh);
+EXPORT_SYMBOL(_raw_write_lock_bh);
 
-void __lockfunc _spin_lock(spinlock_t *lock)
+void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
 {
 	preempt_disable();
-	_raw_spin_lock(lock);
+	__raw_spin_lock(lock);
 }
+EXPORT_SYMBOL(_raw_spin_lock);
 
-EXPORT_SYMBOL(_spin_lock);
-
-void __lockfunc _write_lock(rwlock_t *lock)
+void __lockfunc _raw_write_lock(raw_rwlock_t *lock)
 {
 	preempt_disable();
-	_raw_write_lock(lock);
+	__raw_write_lock(lock);
 }
-
-EXPORT_SYMBOL(_write_lock);
+EXPORT_SYMBOL(_raw_write_lock);
 
 #else /* CONFIG_PREEMPT: */
 
@@ -174,11 +172,11 @@ EXPORT_SYMBOL(_write_lock);
  */
 
 #define BUILD_LOCK_OPS(op, locktype)					\
-void __lockfunc _##op##_lock(locktype *lock)				\
+void __lockfunc _raw_##op##_lock(locktype *lock)			\
 {									\
 	preempt_disable();						\
 	for (;;) {							\
-		if (likely(_raw_##op##_trylock(lock)))			\
+		if (likely(__raw_##op##_trylock(lock)))			\
 			break;						\
 		preempt_enable();					\
 		if (!(lock)->break_lock)				\
@@ -188,16 +186,16 @@ void __lockfunc _##op##_lock(locktype *l
 	}								\
 }									\
 									\
-EXPORT_SYMBOL(_##op##_lock);						\
+EXPORT_SYMBOL(_raw_##op##_lock);					\
 									\
-unsigned long __lockfunc _##op##_lock_irqsave(locktype *lock)		\
+unsigned long __lockfunc _raw_##op##_lock_irqsave(locktype *lock)	\
 {									\
 	unsigned long flags;						\
 									\
 	preempt_disable();						\
 	for (;;) {							\
 		local_irq_save(flags);					\
-		if (likely(_raw_##op##_trylock(lock)))			\
+		if (likely(__raw_##op##_trylock(lock)))			\
 			break;						\
 		local_irq_restore(flags);				\
 									\
@@ -210,16 +208,16 @@ unsigned long __lockfunc _##op##_lock_ir
 	return flags;							\
 }									\
 									\
-EXPORT_SYMBOL(_##op##_lock_irqsave);					\
+EXPORT_SYMBOL(_raw_##op##_lock_irqsave);				\
 									\
-void __lockfunc _##op##_lock_irq(locktype *lock)			\
+void __lockfunc _raw_##op##_lock_irq(locktype *lock)			\
 {									\
-	_##op##_lock_irqsave(lock);					\
+	_raw_##op##_lock_irqsave(lock);					\
 }									\
 									\
-EXPORT_SYMBOL(_##op##_lock_irq);					\
+EXPORT_SYMBOL(_raw_##op##_lock_irq);					\
 									\
-void __lockfunc _##op##_lock_bh(locktype *lock)				\
+void __lockfunc _raw_##op##_lock_bh(locktype *lock)			\
 {									\
 	unsigned long flags;						\
 									\
@@ -228,12 +226,12 @@ void __lockfunc _##op##_lock_bh(locktype
 	/* irq-disabling. We use the generic preemption-aware	*/	\
 	/* function:						*/	\
 	/**/								\
-	flags = _##op##_lock_irqsave(lock);				\
+	flags = _raw_##op##_lock_irqsave(lock);				\
 	local_bh_disable();						\
 	local_irq_restore(flags);					\
 }									\
 									\
-EXPORT_SYMBOL(_##op##_lock_bh)
+EXPORT_SYMBOL(_raw_##op##_lock_bh)
 
 /*
  * Build preemption-friendly versions of the following
@@ -244,119 +242,156 @@ EXPORT_SYMBOL(_##op##_lock_bh)
  *         _[spin|read|write]_lock_irqsave()
  *         _[spin|read|write]_lock_bh()
  */
-BUILD_LOCK_OPS(spin, spinlock_t);
-BUILD_LOCK_OPS(read, rwlock_t);
-BUILD_LOCK_OPS(write, rwlock_t);
+BUILD_LOCK_OPS(spin, raw_spinlock_t);
+BUILD_LOCK_OPS(read, raw_rwlock_t);
+BUILD_LOCK_OPS(write, raw_rwlock_t);
 
 #endif /* CONFIG_PREEMPT */
 
-void __lockfunc _spin_unlock(spinlock_t *lock)
+void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
 {
-	_raw_spin_unlock(lock);
+	__raw_spin_unlock(lock);
 	preempt_enable();
 }
-EXPORT_SYMBOL(_spin_unlock);
+EXPORT_SYMBOL(_raw_spin_unlock);
 
-void __lockfunc _write_unlock(rwlock_t *lock)
+void __lockfunc _raw_write_unlock(raw_rwlock_t *lock)
 {
-	_raw_write_unlock(lock);
+	__raw_write_unlock(lock);
 	preempt_enable();
 }
-EXPORT_SYMBOL(_write_unlock);
+EXPORT_SYMBOL(_raw_write_unlock);
 
-void __lockfunc _read_unlock(rwlock_t *lock)
+void __lockfunc _raw_read_unlock(raw_rwlock_t *lock)
 {
-	_raw_read_unlock(lock);
+	__raw_read_unlock(lock);
 	preempt_enable();
 }
-EXPORT_SYMBOL(_read_unlock);
+EXPORT_SYMBOL(_raw_read_unlock);
 
-void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
 {
-	_raw_spin_unlock(lock);
+	__raw_spin_unlock(lock);
+	preempt_enable_no_resched();
 	local_irq_restore(flags);
-	preempt_enable();
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_spin_unlock_irqrestore);
+EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
 
-void __lockfunc _spin_unlock_irq(spinlock_t *lock)
+void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
 {
-	_raw_spin_unlock(lock);
+	__raw_spin_unlock(lock);
+	preempt_enable_no_resched();
 	local_irq_enable();
-	preempt_enable();
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_spin_unlock_irq);
+EXPORT_SYMBOL(_raw_spin_unlock_irq);
 
-void __lockfunc _spin_unlock_bh(spinlock_t *lock)
+void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
 {
-	_raw_spin_unlock(lock);
-	preempt_enable();
+	__raw_spin_unlock(lock);
+	preempt_enable_no_resched();
 	local_bh_enable();
 }
-EXPORT_SYMBOL(_spin_unlock_bh);
+EXPORT_SYMBOL(_raw_spin_unlock_bh);
 
-void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+void __lockfunc _raw_read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags)
 {
-	_raw_read_unlock(lock);
+	__raw_read_unlock(lock);
+	preempt_enable_no_resched();
 	local_irq_restore(flags);
-	preempt_enable();
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_read_unlock_irqrestore);
+EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
 
-void __lockfunc _read_unlock_irq(rwlock_t *lock)
+void __lockfunc _raw_read_unlock_irq(raw_rwlock_t *lock)
 {
-	_raw_read_unlock(lock);
+	__raw_read_unlock(lock);
+	preempt_enable_no_resched();
 	local_irq_enable();
-	preempt_enable();
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_read_unlock_irq);
+EXPORT_SYMBOL(_raw_read_unlock_irq);
 
-void __lockfunc _read_unlock_bh(rwlock_t *lock)
+void __lockfunc _raw_read_unlock_bh(raw_rwlock_t *lock)
 {
-	_raw_read_unlock(lock);
-	preempt_enable();
+	__raw_read_unlock(lock);
+	preempt_enable_no_resched();
 	local_bh_enable();
 }
-EXPORT_SYMBOL(_read_unlock_bh);
+EXPORT_SYMBOL(_raw_read_unlock_bh);
 
-void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+void __lockfunc _raw_write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags)
 {
-	_raw_write_unlock(lock);
+	__raw_write_unlock(lock);
+	preempt_enable_no_resched();
 	local_irq_restore(flags);
-	preempt_enable();
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_write_unlock_irqrestore);
+EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
 
-void __lockfunc _write_unlock_irq(rwlock_t *lock)
+void __lockfunc _raw_write_unlock_irq(raw_rwlock_t *lock)
 {
-	_raw_write_unlock(lock);
+	__raw_write_unlock(lock);
+	preempt_enable_no_resched();
 	local_irq_enable();
-	preempt_enable();
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_write_unlock_irq);
+EXPORT_SYMBOL(_raw_write_unlock_irq);
 
-void __lockfunc _write_unlock_bh(rwlock_t *lock)
+void __lockfunc _raw_write_unlock_bh(raw_rwlock_t *lock)
 {
-	_raw_write_unlock(lock);
-	preempt_enable();
+	__raw_write_unlock(lock);
+	preempt_enable_no_resched();
 	local_bh_enable();
 }
-EXPORT_SYMBOL(_write_unlock_bh);
+EXPORT_SYMBOL(_raw_write_unlock_bh);
 
-int __lockfunc _spin_trylock_bh(spinlock_t *lock)
+int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
 {
 	local_bh_disable();
 	preempt_disable();
-	if (_raw_spin_trylock(lock))
+	if (__raw_spin_trylock(lock))
 		return 1;
 
-	preempt_enable();
+	preempt_enable_no_resched();
 	local_bh_enable();
 	return 0;
 }
-EXPORT_SYMBOL(_spin_trylock_bh);
+EXPORT_SYMBOL(_raw_spin_trylock_bh);
+
+int __lockfunc _raw_spin_trylock_irq(raw_spinlock_t *lock)
+{
+	local_irq_disable();
+	preempt_disable();
+	if (__raw_spin_trylock(lock))
+		return 1;
+
+	preempt_enable_no_resched();
+	local_irq_enable();
+	preempt_check_resched();
+
+	return 0;
+}
+EXPORT_SYMBOL(_raw_spin_trylock_irq);
+
+int __lockfunc _raw_spin_trylock_irqsave(raw_spinlock_t *lock,
+					 unsigned long *flags)
+{
+	local_irq_save(*flags);
+	preempt_disable();
+	if (__raw_spin_trylock(lock))
+		return 1;
+
+	preempt_enable_no_resched();
+	local_irq_restore(*flags);
+	preempt_check_resched();
+
+	return 0;
+}
+EXPORT_SYMBOL(_raw_spin_trylock_irqsave);
 
-int in_lock_functions(unsigned long addr)
+int notrace in_lock_functions(unsigned long addr)
 {
 	/* Linker adds these: start and end of __lockfunc functions */
 	extern char __lock_text_start[], __lock_text_end[];
--- linux/kernel/timer.c.orig
+++ linux/kernel/timer.c
@@ -33,6 +33,7 @@
 #include <linux/cpu.h>
 #include <linux/perfctr.h>
 #include <linux/syscalls.h>
+#include <linux/kallsyms.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -68,6 +69,7 @@ struct tvec_t_base_s {
 	spinlock_t lock;
 	unsigned long timer_jiffies;
 	struct timer_list *running_timer;
+	wait_queue_head_t wait_for_running_timer;
 	tvec_root_t tv1;
 	tvec_t tv2;
 	tvec_t tv3;
@@ -159,14 +161,15 @@ int __mod_timer(struct timer_list *timer
 {
 	tvec_base_t *old_base, *new_base;
 	unsigned long flags;
-	int ret = 0;
+	int ret = 0, cpu;
 
 	BUG_ON(!timer->function);
 
 	check_timer(timer);
 
 	spin_lock_irqsave(&timer->lock, flags);
-	new_base = &__get_cpu_var(tvec_bases);
+	cpu = _smp_processor_id();
+	new_base = &per_cpu(tvec_bases, cpu);
 repeat:
 	old_base = timer->base;
 
@@ -354,10 +357,8 @@ del_again:
 	for_each_online_cpu(i) {
 		base = &per_cpu(tvec_bases, i);
 		if (base->running_timer == timer) {
-			while (base->running_timer == timer) {
-				cpu_relax();
-				preempt_check_resched();
-			}
+			wait_event(base->wait_for_running_timer,
+				base->running_timer != timer);
 			break;
 		}
 	}
@@ -441,7 +442,23 @@ static inline void __run_timers(tvec_bas
 		struct list_head work_list = LIST_HEAD_INIT(work_list);
 		struct list_head *head = &work_list;
  		int index = base->timer_jiffies & TVR_MASK;
- 
+
+		if (softirq_need_resched()) {
+			/* running_timer might be stale: */
+			set_running_timer(base, NULL);
+//			if (waitqueue_active(&base->wait_running_timer))
+				wake_up(&base->wait_for_running_timer);
+			spin_unlock_irq(&base->lock);
+			cond_resched_all();
+			cpu_relax();
+			spin_lock_irq(&base->lock);
+			/*
+			 * We can simply continue after preemption, nobody
+			 * else can touch timer_jiffies so 'index' is still
+			 * valid. Any new jiffy will be taken care of in
+			 * subsequent loops:
+			 */
+		}
 		/*
 		 * Cascade timers:
 		 */
@@ -470,16 +487,20 @@ repeat:
 				u32 preempt_count = preempt_count();
 				fn(data);
 				if (preempt_count != preempt_count()) {
-					printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count());
-					BUG();
+					print_symbol("BUG: unbalanced timer-handler preempt count in %s!\n", (unsigned long) fn);
+					printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count());
+					preempt_count() = preempt_count;
 				}
 			}
+			cond_resched_all();
 			spin_lock_irq(&base->lock);
 			goto repeat;
 		}
 	}
 	set_running_timer(base, NULL);
 	spin_unlock_irq(&base->lock);
+//	if (waitqueue_active(&base->wait_running_timer))
+		wake_up(&base->wait_for_running_timer);
 }
 
 #ifdef CONFIG_NO_IDLE_HZ
@@ -833,7 +854,14 @@ void update_process_times(int user_tick)
  */
 static unsigned long count_active_tasks(void)
 {
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * -1 for the timer IRQ thread:
+	 */
+	return (nr_running() - 1 + nr_uninterruptible()) * FIXED_1;
+#else
 	return (nr_running() + nr_uninterruptible()) * FIXED_1;
+#endif
 }
 
 /*
@@ -873,23 +901,12 @@ unsigned long wall_jiffies = INITIAL_JIF
  * playing with xtime and avenrun.
  */
 #ifndef ARCH_HAVE_XTIME_LOCK
-seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
+DECLARE_SEQLOCK(xtime_lock);
 
 EXPORT_SYMBOL(xtime_lock);
 #endif
 
 /*
- * This function runs timers and the timer-tq in bottom half context.
- */
-static void run_timer_softirq(struct softirq_action *h)
-{
-	tvec_base_t *base = &__get_cpu_var(tvec_bases);
-
-	if (time_after_eq(jiffies, base->timer_jiffies))
-		__run_timers(base);
-}
-
-/*
  * Called by the local, per-CPU timer interrupt on SMP.
  */
 void run_local_timers(void)
@@ -898,22 +915,48 @@ void run_local_timers(void)
 }
 
 /*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
+ * Time of day handling:
  */
 static inline void update_times(void)
 {
-	unsigned long ticks;
+	unsigned long ticks = 0;
+	/*
+	 * First test outside the lock for performance reasons:
+	 */
+	if (jiffies != wall_jiffies) {
+		unsigned long flags;
 
-	ticks = jiffies - wall_jiffies;
-	if (ticks) {
-		wall_jiffies += ticks;
-		update_wall_time(ticks);
+		write_seqlock_irqsave(&xtime_lock, flags);
+		while (jiffies != wall_jiffies) {
+			wall_jiffies++;
+			ticks++;
+			update_wall_time(1);
+			/*
+			 * Unlock unconditionally, to make sure
+			 * we dont keep irqs off for a long time!
+			 */
+			write_sequnlock_irqrestore(&xtime_lock, flags);
+			cond_resched_softirq();
+			write_seqlock_irqsave(&xtime_lock, flags);
+		}
+		calc_load(ticks);
+		write_sequnlock_irqrestore(&xtime_lock, flags);
 	}
-	calc_load(ticks);
 }
   
 /*
+ * This function runs timers and the timer-tq in bottom half context.
+ */
+static void run_timer_softirq(struct softirq_action *h)
+{
+	tvec_base_t *base = &__get_cpu_var(tvec_bases);
+
+	update_times();
+	if (time_after_eq(jiffies, base->timer_jiffies))
+		__run_timers(base);
+}
+
+/*
  * The 64-bit jiffies value is not atomic - you MUST NOT read it
  * without sampling the sequence number in xtime_lock.
  * jiffies is defined in the linker script...
@@ -922,7 +965,6 @@ static inline void update_times(void)
 void do_timer(struct pt_regs *regs)
 {
 	jiffies_64++;
-	update_times();
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
@@ -1286,6 +1328,8 @@ static void __devinit init_timers_cpu(in
        
 	base = &per_cpu(tvec_bases, cpu);
 	spin_lock_init(&base->lock);
+	init_waitqueue_head(&base->wait_for_running_timer);
+
 	for (j = 0; j < TVN_SIZE; j++) {
 		INIT_LIST_HEAD(base->tv5.vec + j);
 		INIT_LIST_HEAD(base->tv4.vec + j);
--- linux/kernel/sys.c.orig
+++ linux/kernel/sys.c
@@ -164,7 +164,7 @@ EXPORT_SYMBOL(notifier_chain_unregister)
  *	of the last notifier function called.
  */
  
-int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
+int notrace notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
 {
 	int ret=NOTIFY_DONE;
 	struct notifier_block *nb = *n;
--- linux/kernel/latency.c.orig
+++ linux/kernel/latency.c
@@ -0,0 +1,1772 @@
+/*
+ *  kernel/latency.c
+ *
+ *  Copyright (C) 2004 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+
+#include <linux/mm.h>
+#include <linux/nmi.h>
+#include <linux/sched.h>
+#include <linux/percpu.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/profile.h>
+#include <linux/bootmem.h>
+#include <linux/version.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <asm/rtc.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+#ifdef __i386__
+static inline cycles_t cycles(void)
+{
+        unsigned long long ret;
+
+        rdtscll(ret);
+
+        return ret;
+}
+#else
+# define cycles() get_cycles()
+#endif
+
+#ifdef CONFIG_WAKEUP_TIMING
+struct sch_struct {
+	raw_spinlock_t trace_lock;
+	struct task_struct *task;
+	int cpu;
+	struct cpu_trace *tr;
+} ____cacheline_aligned_in_smp;
+
+static __cacheline_aligned_in_smp struct sch_struct sch =
+		{ trace_lock: RAW_SPIN_LOCK_UNLOCKED };
+
+int wakeup_timing = 1;
+#endif
+
+#ifdef CONFIG_LATENCY_TIMING
+
+/*
+ * Maximum preemption latency measured. Initialize to maximum,
+ * we clear it after bootup.
+ */
+static cycles_t preempt_max_latency = (cycles_t)ULONG_MAX;
+static cycles_t preempt_thresh;
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int report_latency(cycles_t delta)
+{
+	if (preempt_thresh) {
+		if (delta < preempt_thresh)
+			return 0;
+	} else {
+		if (delta <= preempt_max_latency)
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Track maximum latencies and save the trace:
+ */
+static __cacheline_aligned_in_smp DECLARE_MUTEX(max_mutex);
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc. Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesnt
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp int max_sequence;
+
+enum trace_type
+{
+	__TRACE_FIRST_TYPE = 0,
+
+	TRACE_FN,
+	TRACE_SPECIAL,
+	TRACE_SPECIAL_PID,
+	TRACE_CMDLINE,
+	TRACE_SYSCALL,
+	TRACE_SYSRET,
+
+	__TRACE_LAST_TYPE
+};
+
+enum trace_flag_type
+{
+	TRACE_FLAG_IRQS_OFF		= 0x01,
+	TRACE_FLAG_NEED_RESCHED		= 0x02,
+	TRACE_FLAG_HARDIRQ		= 0x04,
+	TRACE_FLAG_SOFTIRQ		= 0x08,
+};
+
+
+#ifdef CONFIG_LATENCY_TRACE
+
+#define MAX_TRACE (unsigned long)(4096-1)
+
+#define CMDLINE_BYTES 16
+
+/*
+ * 32 bytes on 32-bit platforms:
+ */
+struct trace_entry {
+	char type;
+	char cpu;
+	char flags;
+	char preempt_count; // assumes PREEMPT_MASK is 8 bits or less
+	int pid;
+	cycles_t timestamp;
+	union {
+		struct {
+			unsigned long eip;
+			unsigned long parent_eip;
+		} fn;
+		struct {
+			unsigned long eip;
+			unsigned long v1, v2, v3;
+		} special;
+		struct {
+			unsigned char str[CMDLINE_BYTES];
+		} cmdline;
+		struct {
+			unsigned int nr;
+			unsigned long p1, p2, p3;
+		} syscall;
+		struct {
+			unsigned int ret;
+		} sysret;
+		struct {
+			int __pad3[4];
+		} pad;
+	} u;
+} __attribute__((packed));
+
+#endif
+
+struct cpu_trace {
+	atomic_t disabled;
+	unsigned long trace_idx;
+	cycles_t preempt_timestamp;
+	unsigned long critical_start, critical_end;
+	int critical_sequence;
+	int early_warning;
+
+#ifdef CONFIG_LATENCY_TRACE
+	struct trace_entry trace[MAX_TRACE];
+	char comm[CMDLINE_BYTES];
+	pid_t pid;
+	unsigned long uid;
+	unsigned long nice;
+	unsigned long policy;
+	unsigned long rt_priority;
+	unsigned long saved_latency;
+#endif
+
+} ____cacheline_aligned_in_smp;
+
+static struct cpu_trace cpu_traces[NR_CPUS] ____cacheline_aligned_in_smp;
+
+static unsigned long notrace cycles_to_usecs(cycles_t delta)
+{
+#ifdef CONFIG_X86
+	do_div(delta, cpu_khz/1000+1);
+#elif defined(CONFIG_PPC)
+	delta = mulhwu(tb_to_us, delta);
+#else
+	#error Implement cycles_to_usecs.
+#endif
+
+	return (unsigned long) delta;
+}
+
+static cycles_t notrace usecs_to_cycles(unsigned long delta)
+{
+	return (cycles_t) delta * (cycles_t) (cpu_khz/1000+1);
+}
+
+#ifdef CONFIG_LATENCY_TRACE
+
+int trace_enabled = 1;
+int mcount_enabled = 1;
+int trace_freerunning = 0;
+int trace_print_at_crash = 0;
+int trace_verbose = 0;
+int trace_all_cpus = 0;
+
+/*
+ * user-triggered via gettimeofday(0,1)/gettimeofday(0,0)
+ */
+int trace_user_triggered = 0;
+
+struct saved_trace_struct {
+	int cpu;
+	cycles_t first_timestamp, last_timestamp;
+	struct cpu_trace traces[NR_CPUS];
+} ____cacheline_aligned_in_smp;
+
+/*
+ * The current worst-case trace:
+ */
+static struct saved_trace_struct max_tr;
+
+/*
+ * /proc/latency_trace atomicity:
+ */
+static DECLARE_MUTEX(out_mutex);
+
+static struct saved_trace_struct out_tr;
+
+
+static inline void notrace
+____trace(int cpu, enum trace_type type, struct cpu_trace *tr,
+	  unsigned long eip, unsigned long parent_eip,
+	  unsigned long v1, unsigned long v2, unsigned long v3)
+{
+	struct trace_entry *entry;
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+	/* Debugging check for stack overflow: is there less than 1KB free? */
+	{
+		long esp;
+
+		__asm__ __volatile__("andl %%esp,%0" :
+					"=r" (esp) : "0" (THREAD_SIZE - 1));
+		if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
+			printk("BUG: mcount: stack overflow: %ld [%08lx...%08lx...%08lx]\n",
+				esp - sizeof(struct thread_info), (long)&esp, (long)current_thread_info(), (long)current_thread_info() + THREAD_SIZE);
+			dump_stack();
+		}
+	}
+#endif
+
+	if (likely(tr->critical_start) || unlikely(trace_user_triggered || trace_all_cpus))
+	if (tr->trace_idx < MAX_TRACE) {
+		u32 pc = preempt_count();
+
+		entry = tr->trace + tr->trace_idx;
+		entry->type = type;
+#ifdef CONFIG_SMP
+		entry->cpu = cpu;
+#endif
+		entry->flags = (irqs_disabled() ? TRACE_FLAG_IRQS_OFF : 0) |
+			((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
+			((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
+			(_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
+		entry->preempt_count = pc & 0xff;
+		entry->pid = current->pid;
+		entry->timestamp = cycles();
+
+		switch (type) {
+		case TRACE_FN:
+			entry->u.fn.eip = eip;
+			entry->u.fn.parent_eip = parent_eip;
+			break;
+		case TRACE_SPECIAL:
+		case TRACE_SPECIAL_PID:
+			entry->u.special.eip = eip;
+			entry->u.special.v1 = v1;
+			entry->u.special.v2 = v2;
+			entry->u.special.v3 = v3;
+			break;
+		case TRACE_SYSCALL:
+			entry->u.syscall.nr = eip;
+			entry->u.syscall.p1 = v1;
+			entry->u.syscall.p2 = v2;
+			entry->u.syscall.p3 = v3;
+			break;
+		case TRACE_SYSRET:
+			entry->u.sysret.ret = eip;
+			break;
+		case TRACE_CMDLINE:
+			memcpy(entry->u.cmdline.str, current->comm, CMDLINE_BYTES);
+			break;
+		default:
+			break;
+		}
+	}
+	tr->trace_idx++;
+	if (unlikely(trace_freerunning && (tr->trace_idx >= MAX_TRACE)))
+		tr->trace_idx = 0;
+}
+
+static inline void notrace
+___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip,
+		unsigned long v1, unsigned long v2,
+			unsigned long v3)
+{
+	int cpu = _smp_processor_id();
+	struct cpu_trace *tr;
+
+	if (unlikely(trace_enabled <= 0))
+		return;
+
+	/*
+	 * Trace on the CPU where the current highest-prio task
+	 * is waiting to become runnable:
+	 */
+#ifdef CONFIG_WAKEUP_TIMING 
+	if (wakeup_timing && !trace_all_cpus) {
+		if (!sch.tr || cpu != sch.cpu)
+			return;
+		tr = sch.tr;
+	} else
+		tr = cpu_traces + cpu;
+#else
+	tr = cpu_traces + cpu;
+#endif
+	if (likely(!atomic_read(&tr->disabled))) {
+		atomic_inc(&tr->disabled);
+		____trace(cpu, type, tr, eip, parent_eip, v1, v2, v3);
+		atomic_dec(&tr->disabled);
+	}
+}
+
+/*
+ * Special, ad-hoc tracepoints:
+ */
+void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3)
+{
+	___trace(TRACE_SPECIAL, CALLER_ADDR0, 0, v1, v2, v3);
+}
+
+EXPORT_SYMBOL(trace_special);
+
+void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2)
+{
+	___trace(TRACE_SPECIAL_PID, CALLER_ADDR0, 0, pid, v1, v2);
+}
+
+EXPORT_SYMBOL(trace_special_pid);
+
+/*
+ * Non-inlined function:
+ */
+void notrace __trace(unsigned long eip, unsigned long parent_eip)
+{
+	___trace(TRACE_FN, eip, parent_eip, 0, 0, 0);
+}
+
+extern void mcount(void);
+
+EXPORT_SYMBOL(mcount);
+
+void notrace __mcount(void)
+{
+	___trace(TRACE_FN, CALLER_ADDR1, CALLER_ADDR2, 0, 0, 0);
+}
+
+void notrace
+sys_call(int nr, unsigned long p1, unsigned long p2, unsigned long p3)
+{
+	___trace(TRACE_SYSCALL, nr, 0, p1, p2, p3);
+}
+
+void notrace sys_ret(int ret)
+{
+	___trace(TRACE_SYSRET, ret, 0, 0, 0, 0);
+}
+
+static void notrace print_name(struct seq_file *m, unsigned long eip)
+{
+	char namebuf[KSYM_NAME_LEN+1];
+	unsigned long size, offset;
+	const char *sym_name;
+	char *modname;
+
+	/*
+	 * Special trace values:
+	 */
+	if (((long)eip < 10000L) && ((long)eip > -10000L)) {
+		seq_printf(m, "(%ld)", eip);
+		return;
+	}
+	sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf);
+	if (sym_name)
+		seq_puts(m, sym_name);
+	else
+		seq_printf(m, "<%08lx>", eip);
+}
+
+static void notrace printk_name(unsigned long eip)
+{
+	char namebuf[KSYM_NAME_LEN+1];
+	unsigned long size, offset;
+	const char *sym_name;
+	char *modname;
+
+	sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf);
+	if (sym_name)
+		printk("%s+%#lx/%#lx", sym_name, offset, size);
+	else
+		printk("<%08lx>", eip);
+}
+
+
+static void notrace print_name_offset(struct seq_file *m, unsigned long eip)
+{
+	char namebuf[KSYM_NAME_LEN+1];
+	unsigned long size, offset;
+	const char *sym_name;
+	char *modname;
+
+	sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf);
+	if (sym_name)
+		seq_printf(m, "%s+%#lx/%#lx <%08lx>",
+					sym_name, offset, size, eip);
+	else
+		seq_printf(m, "<%08lx>", eip);
+}
+
+static unsigned int out_sequence = -1;
+static int pid_to_cmdline_array[PID_MAX_DEFAULT+1];
+
+void notrace trace_cmdline(void)
+{
+	___trace(TRACE_CMDLINE, 0, 0, 0, 0, 0);
+}
+
+static void construct_pid_to_cmdline(void)
+{
+	struct cpu_trace *tr = out_tr.traces;
+	unsigned int i, j, entries, pid;
+
+//	printk("cs: %d, ls: %d, ms: %d\n",
+//		tr->critical_sequence, last_sequence, max_sequence);
+	if (tr->critical_sequence == out_sequence)
+		return;
+	out_sequence = tr->critical_sequence;
+
+	memset(pid_to_cmdline_array, -1, sizeof(int) * (PID_MAX_DEFAULT + 1));
+
+	entries = min(tr->trace_idx, MAX_TRACE-1);
+//	printk("entries: %d\n", entries);
+
+	for (i = 0; i < entries; i++) {
+		struct trace_entry *entry = tr->trace + i;
+
+		if (entry->type != TRACE_CMDLINE)
+			continue;
+		pid = entry->pid;
+		if (pid < PID_MAX_DEFAULT) {
+			pid_to_cmdline_array[pid] = i;
+//			printk("pid %d -> idx %d [%16s]\n",
+//				pid, i, tr->trace[pid_to_cmdline_array[pid]].u.cmdline.str);
+			/*
+			 * Replace space with underline - makes it easier
+			 * to process for tools:
+			 */
+			for (j = 0; j < CMDLINE_BYTES; j++)
+				if (entry->u.cmdline.str[j] == ' ')
+					entry->u.cmdline.str[j] = '_';
+		}
+	}
+}
+
+char *pid_to_cmdline(unsigned long pid)
+{
+	struct cpu_trace *tr = out_tr.traces;
+	char *cmdline = "<unknown>";
+	int idx;
+
+	pid = min(pid, (unsigned long)PID_MAX_DEFAULT);
+	if (pid_to_cmdline_array[pid] != -1) {
+		idx = pid_to_cmdline_array[pid];
+		if (tr->trace[idx].type == TRACE_CMDLINE)
+			cmdline = tr->trace[idx].u.cmdline.str;
+	}
+	return cmdline;
+}
+
+struct block_idx {
+	int idx[NR_CPUS];
+};
+
+/*
+ * Return the CPU whose next unread max_tr entry has the smallest timestamp,
+ * or -1 when all cursors in @bidx are past their CPU's valid entries:
+ */
+static int min_idx(struct block_idx *bidx)
+{
+	cycles_t min_stamp = (cycles_t) -1;
+	struct trace_entry *entry;
+	int cpu, min_cpu = -1, idx;
+
+	for_each_online_cpu(cpu) {
+		idx = bidx->idx[cpu];
+		if (idx >= min(max_tr.traces[cpu].trace_idx, MAX_TRACE))
+			continue;
+		entry = max_tr.traces[cpu].trace + idx;
+		if (entry->timestamp < min_stamp) {
+			min_cpu = cpu;
+			min_stamp = entry->timestamp;
+		}
+	}
+
+	return min_cpu;
+}
+
+/*
+ * This code is called to construct an output trace from
+ * the maximum trace. Having separate traces serves both
+ * atomicity (a new max might be saved while we are busy
+ * accessing /proc/latency_trace) and it is also used to
+ * delay the (expensive) sorting of the output trace by
+ * timestamps, in the trace_all_cpus case.
+ */
+static void update_out_trace(void)
+{
+	int cpu, sum, entries;
+	struct cpu_trace *tmp_max, *tmp_out;
+	struct trace_entry *out_entry, *entry;
+	struct block_idx bidx = { { 0, } };
+	cycles_t stamp, first_stamp = 0, last_stamp = (cycles_t)-1;
+
+	/*
+	 * Nasty trick. We might overflow the first array but
+	 * there are NR_CPUS of them so we use it as a 'big'
+	 * trace buffer.
+	 */
+	tmp_out = out_tr.traces + 0;
+	*tmp_out = max_tr.traces[max_tr.cpu];
+	out_tr.cpu = max_tr.cpu;
+	out_entry = tmp_out->trace + 0;
+
+	if (!trace_all_cpus) {
+		entries = min(tmp_out->trace_idx, MAX_TRACE-1);
+		if (!entries)
+			return;
+		out_tr.first_timestamp = tmp_out->trace[0].timestamp;
+		out_tr.last_timestamp = tmp_out->trace[entries-1].timestamp;
+		return;
+	}
+	/*
+	 * Find the range of timestamps that are fully traced in
+	 * all CPU traces. (since CPU traces can cover a variable
+	 * range of time, we have to find the best range.)
+	 */
+	for_each_online_cpu(cpu) {
+		tmp_max = max_tr.traces + cpu;
+		stamp = tmp_max->trace[0].timestamp;
+//		printk("cpu%d stamp0: %016Lx [trace_idx: %ld]\n",
+//			cpu, stamp, tmp_max->trace_idx);
+		if (stamp > first_stamp)
+			first_stamp = stamp;
+	}
+//	printk("first_stamp: %016Lx\n", first_stamp);
+	/*
+	 * Save the timestamp range:
+	 */
+
+	tmp_max = max_tr.traces + max_tr.cpu;
+	entries = min(tmp_max->trace_idx, MAX_TRACE-1);
+	/*
+	 * No saved trace yet?
+	 */
+	if (!entries) {
+		out_tr.traces[0].trace_idx = 0;
+		return;
+	}
+
+	last_stamp = tmp_max->trace[entries-1].timestamp;
+//	printk(" last_stamp: %016Lx [max cpu: %d]\n",
+//		last_stamp, max_tr.cpu);
+
+	WARN_ON(last_stamp < first_stamp);
+
+	out_tr.first_timestamp = first_stamp;
+	out_tr.last_timestamp = last_stamp;
+
+
+	/*
+	 * Fetch trace entries one by one, in increasing timestamp
+	 * order. Start at first_stamp, stop at last_stamp:
+	 */
+	sum = 0;
+	for (;;) {
+		cpu = min_idx(&bidx);
+//		printk("cpu: %d\n", cpu);
+		if (cpu == -1)
+			break;
+		entry = max_tr.traces[cpu].trace + bidx.idx[cpu];
+//		printk("entry [%d][%d], stamp: %016Lx",
+//			cpu, bidx.idx[cpu], entry->timestamp);
+		if (entry->timestamp > last_stamp) {
+//			printk(" ... skipped\n");
+			break;
+		}
+//		printk(" ... copied.\n");
+
+		bidx.idx[cpu]++;
+		if (entry->timestamp < first_stamp)
+			continue;
+		*out_entry = *entry;
+		out_entry++;
+		sum++;
+	}
+//	printk("sum: %d\n\n", sum);
+	
+	WARN_ON(sum > MAX_TRACE*NR_CPUS);
+	tmp_out->trace_idx = sum;
+}
+
+static void * notrace l_start(struct seq_file *m, loff_t *pos)
+{
+	loff_t n = *pos;
+	unsigned long entries;
+	struct cpu_trace *tr;
+
+	down(&out_mutex);	/* released by l_stop() on every return path */
+	/*
+	 * if the file is being read newly, update the output trace:
+	 */
+	if (!n) {
+		// TODO: use the sequence counter here to optimize
+		down(&max_mutex);
+		update_out_trace();
+		up(&max_mutex);
+		if (!out_tr.traces[0].trace_idx) {
+			/* no up() here: seq_file still calls l_stop() */
+			return NULL;
+		}
+		construct_pid_to_cmdline();
+	}
+	tr = out_tr.traces;
+	entries = min(tr->trace_idx, MAX_TRACE);
+
+	if (!n) {
+		seq_printf(m, "preemption latency trace v1.1.4 on %s\n", UTS_RELEASE);
+		seq_puts(m, "--------------------------------------------------------------------\n");
+		seq_printf(m, " latency: %lu �s, #%lu/%lu, CPU#%d | (M:%s VP:%d, KP:%d, SP:%d HP:%d #P:%d)\n",
+			cycles_to_usecs(tr->saved_latency),
+			entries, tr->trace_idx, out_tr.cpu,
+#if defined(CONFIG_PREEMPT_NONE)
+			"server",
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+			"desktop",
+#elif defined(CONFIG_PREEMPT_DESKTOP)
+			"preempt",
+#else
+			"rt",
+#endif
+			voluntary_preemption, kernel_preemption,
+			softirq_preemption, hardirq_preemption,
+			num_online_cpus());
+		seq_puts(m, "    -----------------\n");
+		seq_printf(m, "    | task: %.16s-%d (uid:%ld nice:%ld policy:%ld rt_prio:%ld)\n",
+			tr->comm, tr->pid, tr->uid, tr->nice,
+			tr->policy, tr->rt_priority);
+		seq_puts(m, "    -----------------\n");
+		if (trace_user_triggered) {
+			seq_puts(m, " => started at: ");
+			print_name_offset(m, tr->critical_start);
+			seq_puts(m, "\n => ended at:   ");
+			print_name_offset(m, tr->critical_end);
+			seq_puts(m, "\n");
+		}
+		seq_puts(m, "\n");
+
+		seq_puts(m, "                 _------=> CPU#            \n");
+		seq_puts(m, "                / _-----=> irqs-off        \n");
+		seq_puts(m, "               | / _----=> need-resched    \n");
+		seq_puts(m, "               || / _---=> hardirq/softirq \n");
+		seq_puts(m, "               ||| / _--=> preempt-depth   \n");
+		seq_puts(m, "               |||| /                      \n");
+		seq_puts(m, "               |||||     delay             \n");
+		seq_puts(m, "   cmd     pid ||||| time  |   caller      \n");
+		seq_puts(m, "      \\   /    |||||   \\   |   /           \n");
+
+	}
+	if (n >= entries)
+		return NULL;
+
+	return tr->trace + n;
+}
+
+static void * notrace l_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct cpu_trace *tr = out_tr.traces;
+	unsigned long entries = min(tr->trace_idx, MAX_TRACE);
+
+	if (++*pos >= entries) {
+		if (*pos == entries)
+			seq_puts(m, "\n\nvim:ft=help\n");
+		return NULL;
+	}
+	return tr->trace + *pos;
+}
+
+static void notrace l_stop(struct seq_file *m, void *p)
+{
+	up(&out_mutex);
+}
+
+static void print_timestamp(struct seq_file *m, unsigned long abs_usecs,
+						unsigned long rel_usecs)
+{
+	seq_printf(m, " %4ld�s", abs_usecs);
+	if (rel_usecs > 100)
+		seq_puts(m, "!: ");
+	else if (rel_usecs > 1)
+		seq_puts(m, "+: ");
+	else
+		seq_puts(m, " : ");
+}
+
+static void
+print_timestamp_short(struct seq_file *m, unsigned long abs_usecs,
+			unsigned long rel_usecs)
+{
+	seq_printf(m, " %4ld�s", abs_usecs);
+	if (rel_usecs > 100)
+		seq_putc(m, '!');
+	else if (rel_usecs > 1)
+		seq_putc(m, '+');
+	else
+		seq_putc(m, ' ');
+}
+
+static void
+print_generic(struct seq_file *m, struct trace_entry *entry)
+{
+	int hardirq, softirq;
+
+	seq_printf(m, "%8.8s-%-5d ", pid_to_cmdline(entry->pid), entry->pid);
+	seq_printf(m, "%d", entry->cpu);
+	seq_printf(m, "%c%c",
+		(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
+		(entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'n' : '.');
+
+	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+	if (hardirq && softirq)
+		seq_putc(m, 'H');
+	else {
+		if (hardirq)
+			seq_putc(m, 'h');
+		else {
+			if (softirq)
+				seq_putc(m, 's');
+			else
+				seq_putc(m, '.');
+		}
+	}
+
+	if (entry->preempt_count)
+		seq_printf(m, "%x", entry->preempt_count);
+	else
+		seq_puts(m, ".");
+}
+
+
+static int notrace l_show_fn(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+
+	abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp);
+
+	if (trace_verbose) {
+		seq_printf(m, "%16s %5d %d %d %08x %08lx [%016Lu] %ld.%03ldms (+%ld.%03ldms): ",
+			pid_to_cmdline(entry->pid),
+			entry->pid, entry->cpu, entry->flags,
+			entry->preempt_count, trace_idx,
+			entry->timestamp, abs_usecs/1000,
+			abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000);
+		print_name_offset(m, entry->u.fn.eip);
+		seq_puts(m, " (");
+		print_name_offset(m, entry->u.fn.parent_eip);
+		seq_puts(m, ")\n");
+	} else {
+		print_generic(m, entry);
+		print_timestamp(m, abs_usecs, rel_usecs);
+		print_name(m, entry->u.fn.eip);
+		seq_puts(m, " (");
+		print_name(m, entry->u.fn.parent_eip);
+		seq_puts(m, ")\n");
+	}
+	return 0;
+}
+
+static int notrace l_show_special(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+
+	abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp);
+
+	print_generic(m, entry);
+	print_timestamp(m, abs_usecs, rel_usecs);
+	if (trace_verbose)
+		print_name_offset(m, entry->u.special.eip);
+	else
+		print_name(m, entry->u.special.eip);
+	seq_printf(m, " (%lx %lx %lx)\n",
+		entry->u.special.v1, entry->u.special.v2, entry->u.special.v3);
+
+	return 0;
+}
+
+static int notrace
+l_show_special_pid(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+	unsigned int pid;
+
+	pid = entry->u.special.v1;
+
+	abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp);
+
+	print_generic(m, entry);
+	print_timestamp(m, abs_usecs, rel_usecs);
+	if (trace_verbose)
+		print_name_offset(m, entry->u.special.eip);
+	else
+		print_name(m, entry->u.special.eip);
+	seq_printf(m, " <%.8s-%d> (%lx %lx): ",
+		pid_to_cmdline(pid), pid,
+		entry->u.special.v2, entry->u.special.v3);
+
+	seq_puts(m, "\n");
+
+	return 0;
+}
+
+
+static int notrace l_show_cmdline(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+
+	return 0;
+
+	abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp);
+
+	seq_printf(m,
+		"[ => %16s ] %ld.%03ldms (+%ld.%03ldms)\n",
+			entry->u.cmdline.str,
+			abs_usecs/1000, abs_usecs % 1000,
+			rel_usecs/1000, rel_usecs % 1000);
+
+	return 0;
+}
+
+extern unsigned long sys_call_table[NR_syscalls];
+
+static int notrace l_show_syscall(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+	unsigned int nr;
+
+	abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp);
+
+	print_generic(m, entry);
+	print_timestamp_short(m, abs_usecs, rel_usecs);
+
+	seq_puts(m, "> ");
+	nr = entry->u.syscall.nr;
+	if (nr < NR_syscalls)
+		print_name(m, sys_call_table[entry->u.syscall.nr]);
+	else
+		seq_puts(m, "<badsys>");
+
+	seq_printf(m, " (%08lx %08lx %08lx)\n",
+		entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3);
+
+	return 0;
+}
+
+static int notrace l_show_sysret(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+
+	abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp);
+
+	print_generic(m, entry);
+	print_timestamp_short(m, abs_usecs, rel_usecs);
+
+	seq_printf(m, "< (%d)\n", entry->u.sysret.ret);
+
+	return 0;
+}
+
+
+static int notrace l_show(struct seq_file *m, void *p)
+{
+	struct cpu_trace *tr = out_tr.traces;
+	struct trace_entry *entry, *entry0, *next_entry;
+	unsigned long trace_idx;
+
+	entry = p;
+	if (entry->timestamp < out_tr.first_timestamp)
+		return 0;
+	if (entry->timestamp > out_tr.last_timestamp)
+		return 0;
+
+	entry0 = tr->trace;
+	trace_idx = entry - entry0;
+
+	if (trace_idx + 1 < tr->trace_idx)
+		next_entry = entry + 1;
+	else
+		next_entry = entry;
+
+	switch (entry->type) {
+		case TRACE_FN:
+			l_show_fn(m, trace_idx, entry, entry0, next_entry);
+			break;
+		case TRACE_SPECIAL:
+			l_show_special(m, trace_idx, entry, entry0, next_entry);
+			break;
+		case TRACE_SPECIAL_PID:
+			l_show_special_pid(m, trace_idx, entry, entry0, next_entry);
+			break;
+		case TRACE_CMDLINE:
+			l_show_cmdline(m, trace_idx, entry, entry0, next_entry);
+			break;
+		case TRACE_SYSCALL:
+			l_show_syscall(m, trace_idx, entry, entry0, next_entry);
+			break;
+		case TRACE_SYSRET:
+			l_show_sysret(m, trace_idx, entry, entry0, next_entry);
+			break;
+		default:
+			seq_printf(m, "unknown trace type %d\n", entry->type);
+	}
+	return 0;
+}
+
+struct seq_operations latency_trace_op = {
+	.start	= l_start,
+	.next	= l_next,
+	.stop	= l_stop,
+	.show	= l_show
+};
+
+static void copy_trace(struct cpu_trace *save, struct cpu_trace *tr)
+{
+	/* free-running needs reordering */
+	if (trace_freerunning) {
+		int i, idx, idx0 = tr->trace_idx;
+
+		for (i = 0; i < MAX_TRACE; i++) {
+			idx = (idx0 + i) % MAX_TRACE;
+			save->trace[i] = tr->trace[idx];
+		}
+		save->trace_idx = MAX_TRACE-1;
+	} else {
+		save->trace_idx = tr->trace_idx;
+
+		memcpy(save->trace, tr->trace,
+			min(save->trace_idx + 1, MAX_TRACE) *
+					sizeof(struct trace_entry));
+	}
+}
+
+static void update_max_tr(struct cpu_trace *tr)
+{
+	struct cpu_trace *save;
+	int this_cpu = smp_processor_id(), cpu, all_cpus = 0;
+
+	WARN_ON(!preempt_count() && !irqs_disabled());
+
+	max_tr.cpu = this_cpu;
+	save = max_tr.traces + this_cpu;
+
+	if ((wakeup_timing || trace_user_triggered) && trace_all_cpus) {
+		all_cpus = 1;
+		for_each_online_cpu(cpu)
+			atomic_inc(&cpu_traces[cpu].disabled);
+	}
+//	printk("this_cpu: %d, trace_idx: %ld.\n", this_cpu, tr->trace_idx);
+//	for_each_online_cpu(cpu)
+//		printk(".. cpu%d: %ld.\n", cpu, cpu_traces[cpu].trace_idx);
+
+	save->saved_latency = preempt_max_latency;
+	save->preempt_timestamp = tr->preempt_timestamp;
+	save->critical_start = tr->critical_start;
+	save->critical_end = tr->critical_end;
+	save->critical_sequence = tr->critical_sequence;
+
+	memcpy(save->comm, current->comm, CMDLINE_BYTES);
+	save->pid = current->pid;
+	save->uid = current->uid;
+	save->nice = current->static_prio - 20 - MAX_RT_PRIO;
+	save->policy = current->policy;
+	save->rt_priority = current->rt_priority;
+
+	if (all_cpus) {
+		for_each_online_cpu(cpu) {
+			copy_trace(max_tr.traces + cpu, cpu_traces + cpu);
+			atomic_dec(&cpu_traces[cpu].disabled);
+		}
+	} else
+		copy_trace(save, tr);
+}
+
+#else /* !LATENCY_TRACE */
+
+static inline void notrace
+____trace(int cpu, enum trace_type type, struct cpu_trace *tr,
+	  unsigned long eip, unsigned long parent_eip,
+	  unsigned long v1, unsigned long v2, unsigned long v3)
+{
+}
+
+static inline void notrace
+___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip,
+		unsigned long v1, unsigned long v2,
+			unsigned long v3)
+{
+}
+
+static inline void notrace __trace(unsigned long eip, unsigned long parent_eip)
+{
+}
+
+static inline void update_max_tr(struct cpu_trace *tr)
+{
+}
+
+#endif
+
+static int setup_preempt_thresh(char *s)
+{
+	int thresh = 0;	/* stays 0 (ignored) if get_option() parses nothing */
+
+	get_option(&s, &thresh);
+	if (thresh > 0) {
+		preempt_thresh = usecs_to_cycles(thresh);
+		printk("Preemption threshold = %u �s\n", thresh);
+	}
+	return 1;
+}
+__setup("preempt_thresh=", setup_preempt_thresh);
+
+#ifdef CONFIG_CRITICAL_TIMING
+
+static void notrace
+check_critical_timing(struct cpu_trace *tr, unsigned long parent_eip)
+{
+	unsigned long latency, t0, t1;
+	cycles_t T1, T0, delta;
+	/* Measure the section open since tr->preempt_timestamp; report it if accepted. */
+	if (trace_user_triggered)
+		return;
+	/*
+	 * usecs conversion is slow so we try to delay the conversion
+	 * as long as possible:
+	 */
+	T0 = tr->preempt_timestamp;
+	T1 = cycles();
+	delta = T1-T0;
+
+	if (!report_latency(delta))
+		goto out;
+	___trace(TRACE_FN, CALLER_ADDR0, parent_eip, 0, 0, 0);
+	/*
+	 * Update the timestamp, because the trace entry above
+	 * might change it (it can only get larger so the latency
+	 * is fair to be reported):
+	 */
+	T1 = cycles();
+	delta = T1-T0;
+
+	if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex))
+		goto out;
+
+	preempt_max_latency = delta;
+	t0 = cycles_to_usecs(T0);
+	t1 = cycles_to_usecs(T1);
+	latency = cycles_to_usecs(delta);
+
+	tr->critical_end = parent_eip;
+
+	update_max_tr(tr);
+
+	if (preempt_thresh)
+		printk("(%16s-%-5d|#%d): %lu us critical section "
+			"violates %lu us threshold.\n"
+			" => started at timestamp %lu: ",
+				current->comm, current->pid,
+				_smp_processor_id(),
+				latency, cycles_to_usecs(preempt_thresh), t0);
+	else
+		printk("(%16s-%-5d|#%d): new %lu us maximum-latency "
+			"critical section.\n => started at timestamp %lu: ",
+				current->comm, current->pid,
+				_smp_processor_id(),
+				latency, t0);
+
+	print_symbol("<%s>\n", tr->critical_start);
+	printk(" =>   ended at timestamp %lu: ", t1);
+	print_symbol("<%s>\n", tr->critical_end);
+	dump_stack();
+	t1 = cycles_to_usecs(cycles());
+	printk(" =>   dump-end timestamp %lu\n\n", t1);
+
+	max_sequence++;
+
+	up(&max_mutex);
+out:	/* reached on every path past the first check: restart timestamp and trace index */
+	tr->critical_sequence = max_sequence;
+	tr->preempt_timestamp = cycles();
+	tr->early_warning = 0;
+	tr->trace_idx = 0;
+	trace_cmdline();
+	__trace(CALLER_ADDR0, parent_eip);
+}
+
+void notrace touch_critical_timing(void)
+{
+	struct cpu_trace *tr = cpu_traces + _smp_processor_id();
+	/* Nothing to do unless a section is being timed on this CPU's trace. */
+	if (!tr->critical_start || atomic_read(&tr->disabled) ||
+			trace_user_triggered || wakeup_timing)
+		return;
+	/* Check the interval so far, then restart it from our caller's address. */
+	if (preempt_count() > 0 && tr->critical_start) {
+		atomic_inc(&tr->disabled);
+		check_critical_timing(tr, CALLER_ADDR0);
+		tr->critical_start = CALLER_ADDR0;
+		tr->critical_sequence = max_sequence;
+		atomic_dec(&tr->disabled);
+	}
+}
+EXPORT_SYMBOL(touch_critical_timing);
+
+void notrace stop_critical_timing(void)
+{
+	struct cpu_trace *tr = cpu_traces + _smp_processor_id();
+	/* Zero critical_start: touch_critical_timing()/__stop_critical_timing() then return early. */
+	tr->critical_start = 0;
+}
+EXPORT_SYMBOL(stop_critical_timing);
+
+static inline void notrace
+__start_critical_timing(unsigned long eip, unsigned long parent_eip)
+{
+	int cpu = _smp_processor_id();
+	struct cpu_trace *tr = cpu_traces + cpu;
+	/* Skip if a section is already open, this trace is disabled, or another mode is active. */
+	if (tr->critical_start || atomic_read(&tr->disabled) ||
+			trace_user_triggered || wakeup_timing)
+		return;
+
+	atomic_inc(&tr->disabled);
+	/* Open a new section at eip: fresh timestamp, trace index reset to 0. */
+	tr->critical_sequence = max_sequence;
+	tr->preempt_timestamp = cycles();
+	tr->critical_start = eip;
+	tr->trace_idx = 0;
+	trace_cmdline();
+	____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0);
+
+	atomic_dec(&tr->disabled);
+}
+
+static inline void notrace
+__stop_critical_timing(unsigned long eip, unsigned long parent_eip)
+{
+	int cpu = _smp_processor_id();
+	struct cpu_trace *tr = cpu_traces + cpu;
+	/* Skip if no section is open, this trace is disabled, or another mode is active. */
+	if (!tr->critical_start || atomic_read(&tr->disabled) ||
+			trace_user_triggered || wakeup_timing)
+		return;
+	/* Log the exit point, evaluate the latency, then mark the section closed. */
+	atomic_inc(&tr->disabled);
+	____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0);
+	check_critical_timing(tr, eip);
+	tr->critical_start = 0;
+	atomic_dec(&tr->disabled);
+}
+
+#endif
+
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+
+void notrace trace_irqs_off(void)
+{
+	unsigned long flags;
+	/* Start timing only when preempt_count() is 0 and the saved flags show IRQs off. */
+	local_save_flags(flags);
+
+	if (!preempt_count() && irqs_disabled_flags(flags))
+		__start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+
+EXPORT_SYMBOL(trace_irqs_off);
+
+void notrace trace_irqs_on(void)
+{
+	unsigned long flags;
+	/* Stop timing only when preempt_count() is 0 and the saved flags still show IRQs off. */
+	local_save_flags(flags);
+
+	if (!preempt_count() && irqs_disabled_flags(flags))
+		__stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+
+EXPORT_SYMBOL(trace_irqs_on);
+
+#endif
+
+#endif /* LATENCY_TIMING */
+
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING)
+
+void notrace add_preempt_count(int val)
+{
+	unsigned long eip = CALLER_ADDR0;
+	unsigned long parent_eip = CALLER_ADDR1;
+	/* Raise the preempt count by val; optionally sanity-check, record the caller, start timing. */
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Underflow?
+	 */
+	BUG_ON(((int)preempt_count() < 0));
+	/*
+	 * Spinlock count overflowing soon?
+	 */
+	BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
+#endif
+
+	preempt_count() += val;
+#ifdef CONFIG_PREEMPT_TRACE
+	if (val <= 10) {
+		unsigned int idx = preempt_count() & PREEMPT_MASK;	/* slot = new nesting level */
+		if (idx < MAX_PREEMPT_TRACE) {
+			current->preempt_trace_eip[idx] = eip;
+			current->preempt_trace_parent_eip[idx] = parent_eip;
+		}
+	}
+#endif
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+	{
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+		unsigned long flags;
+
+		local_save_flags(flags);
+
+		if (!irqs_disabled_flags(flags))
+#endif
+			if (preempt_count() == val)	/* count was 0 before this call */
+				__start_critical_timing(eip, parent_eip);
+	}
+#endif
+}
+EXPORT_SYMBOL(add_preempt_count);
+
+void notrace sub_preempt_count(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Underflow?
+	 */
+	BUG_ON(unlikely(val > preempt_count()));
+
+	/*
+	 * Is the spinlock portion underflowing?
+	 */
+	BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
+#endif
+	/* Timing is stopped first, while the count still equals val; the count is lowered last. */
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+	{
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+		unsigned long flags;
+
+		local_save_flags(flags);
+
+		if (!irqs_disabled_flags(flags))
+#endif
+			if (preempt_count() == val)	/* count becomes 0 after this call */
+				__stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+	}
+#endif
+	preempt_count() -= val;
+}
+
+EXPORT_SYMBOL(sub_preempt_count);
+
+#endif
+
+/*
+ * Wakeup latency timing/tracing. We get upcalls from the scheduler
+ * when a task is being woken up and we time/trace it until it gets
+ * to a CPU - or an even-higher-prio task supersedes it. (in that
+ * case we throw away the currently traced task - we don't try to
+ * handle nesting, that simplifies things significantly)
+ */
+#ifdef CONFIG_WAKEUP_TIMING
+
+static void notrace
+check_wakeup_timing(struct cpu_trace *tr, unsigned long parent_eip)
+{
+	unsigned long latency;
+	unsigned long t0, t1;
+	cycles_t T0, T1, delta;
+	/* Measure the wakeup latency since tr->preempt_timestamp; report it if accepted. */
+	if (trace_user_triggered)
+		return;
+
+	atomic_inc(&tr->disabled);
+	if (atomic_read(&tr->disabled) != 1)	/* was already disabled before our increment */
+		goto out;
+
+	T0 = tr->preempt_timestamp;
+	T1 = cycles();
+	delta = T1-T0;
+
+	if (!report_latency(delta))
+		goto out;
+
+	____trace(smp_processor_id(), TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0);
+	T1 = cycles();
+	delta = T1-T0;
+
+	if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex))
+		goto out;
+
+	preempt_max_latency = delta;
+	t0 = cycles_to_usecs(T0);
+	t1 = cycles_to_usecs(T1);
+	latency = cycles_to_usecs(delta);
+
+	tr->critical_end = parent_eip;
+
+	update_max_tr(tr);
+
+	if (preempt_thresh)
+		printk("(%16s-%-5d|#%d): %lu us wakeup latency "
+			"violates %lu us threshold.\n",
+				current->comm, current->pid,
+				_smp_processor_id(), latency,
+				cycles_to_usecs(preempt_thresh));
+	else
+		printk("(%16s-%-5d|#%d): new %lu us maximum-latency "
+			"wakeup.\n", current->comm, current->pid,
+				_smp_processor_id(), latency);
+
+	max_sequence++;
+
+	up(&max_mutex);
+out:
+	atomic_dec(&tr->disabled);
+}
+
+/*
+ * Start wakeup latency tracing - called with the runqueue held
+ * and interrupts disabled:
+ */
+void __trace_start_sched_wakeup(struct task_struct *p)
+{
+	struct cpu_trace *tr;
+	int cpu;
+
+	if (trace_user_triggered || !wakeup_timing)
+		return;
+	/* A lower ->prio value means a higher priority: keep the current candidate unless p beats it. */
+	spin_lock(&sch.trace_lock);
+	if (sch.task && (sch.task->prio <= p->prio))
+		goto out_unlock;
+	/*
+	 * New highest-prio task just woke up - start tracing:
+	 */
+	sch.task = p;
+	sch.cpu = task_cpu(p);
+	/*
+	 * We keep using this CPU's trace buffer even if the task
+	 * gets migrated to another CPU. Tracing only happens on
+	 * the CPU that 'owns' the highest-prio task so it's
+	 * fundamentally single-threaded.
+	 */
+	sch.tr = tr = cpu_traces + sch.cpu;
+	if (trace_all_cpus)
+		for_each_online_cpu(cpu)
+			cpu_traces[cpu].trace_idx = 0;
+	else
+		tr->trace_idx = 0;
+
+	tr->critical_sequence = max_sequence;
+	tr->preempt_timestamp = cycles();
+	tr->critical_start = CALLER_ADDR0;
+	trace_cmdline();
+	mcount();
+out_unlock:
+	spin_unlock(&sch.trace_lock);
+}
+
+void trace_stop_sched_switched(struct task_struct *p)
+{
+	struct cpu_trace *tr;
+	unsigned long flags;
+
+	trace_cmdline();
+	if (trace_user_triggered || !wakeup_timing)
+		return;
+
+	trace_special_pid(p->pid, p->prio, 0);
+	/* If p is the task being timed, close the measurement; otherwise p may supersede it. */
+	spin_lock_irqsave(&sch.trace_lock, flags);
+	if (p == sch.task) {
+		sch.task = NULL;
+		tr = sch.tr;
+		sch.tr = NULL;
+		WARN_ON(!tr);
+		/*
+		 * Somewhat racy but safer:
+		 */
+		spin_unlock(&sch.trace_lock);	/* lock dropped, IRQs stay off until the restore below */
+		check_wakeup_timing(tr, CALLER_ADDR0);
+		local_irq_restore(flags);
+	} else {
+		if (sch.task)
+			trace_special_pid(sch.task->pid, sch.task->prio, p->prio);
+		if (sch.task && (sch.task->prio >= p->prio))
+			sch.task = NULL;	/* drop the traced task; sch.tr is left as is */
+		spin_unlock_irqrestore(&sch.trace_lock, flags);
+	}
+}
+
+void trace_change_sched_cpu(struct task_struct *p, int new_cpu)
+{
+	unsigned long flags;
+	/* Note a CPU change of the traced task: only sch.cpu is updated, sch.tr is untouched. */
+	if (!wakeup_timing)
+		return;
+
+	trace_special(task_cpu(p), task_cpu(p), new_cpu);
+	spin_lock_irqsave(&sch.trace_lock, flags);
+	if (p == sch.task && task_cpu(p) != new_cpu) {
+		sch.cpu = new_cpu;
+		trace_special(task_cpu(p), new_cpu, 0);
+	}
+	spin_unlock_irqrestore(&sch.trace_lock, flags);
+}
+
+#endif
+
+#ifdef CONFIG_LATENCY_TRACE
+
+long user_trace_start(void)
+{
+	struct cpu_trace *tr;
+	unsigned long flags;
+	int cpu;
+	/* Returns 0; -EINVAL if user triggering is off or crash printing is on; -EAGAIN if max_mutex is busy. */
+	if (!trace_user_triggered || trace_print_at_crash)
+		return -EINVAL;
+
+	if (down_trylock(&max_mutex))
+		return -EAGAIN;
+
+	preempt_disable();
+	tr = cpu_traces + smp_processor_id();
+
+	if (wakeup_timing) {	/* register the caller as the traced task */
+		spin_lock_irqsave(&sch.trace_lock, flags);
+		sch.task = current;
+		sch.cpu = smp_processor_id();
+		sch.tr = tr;
+		spin_unlock_irqrestore(&sch.trace_lock, flags);
+	}
+
+	if (trace_all_cpus)
+		for_each_online_cpu(cpu)
+			cpu_traces[cpu].trace_idx = 0;
+	else
+		tr->trace_idx = 0;
+	tr->critical_sequence = max_sequence;
+	tr->preempt_timestamp = cycles();	/* non-zero timestamp is what user_trace_stop() tests */
+	trace_cmdline();
+	mcount();
+	preempt_enable();
+
+	up(&max_mutex);
+
+	return 0;
+}
+
+long user_trace_stop(void)
+{
+	unsigned long latency, flags;
+	struct cpu_trace *tr;
+	cycles_t delta;
+	/* Returns 0, or -EINVAL (mode off, crash printing on, or caller is not the traced task). */
+	if (!trace_user_triggered || trace_print_at_crash)
+		return -EINVAL;
+
+	preempt_disable();
+	mcount();
+
+	if (wakeup_timing) {
+		spin_lock_irqsave(&sch.trace_lock, flags);
+		if (current != sch.task) {
+			spin_unlock_irqrestore(&sch.trace_lock, flags);
+			preempt_enable();
+			return -EINVAL;
+		}
+		sch.task = NULL;
+		tr = sch.tr;
+		sch.tr = NULL;
+		spin_unlock_irqrestore(&sch.trace_lock, flags);
+	} else
+		tr = cpu_traces + smp_processor_id();
+
+	atomic_inc(&tr->disabled);
+	if (tr->preempt_timestamp) {
+		delta = cycles() - tr->preempt_timestamp;
+		if (!report_latency(delta))
+			goto out;
+		if (tr->critical_sequence != max_sequence ||
+						down_trylock(&max_mutex))
+			goto out;
+
+		preempt_max_latency = delta;
+		update_max_tr(tr);
+
+		latency = cycles_to_usecs(delta);
+
+		if (preempt_thresh)
+			printk("(%16s-%-5d|#%d): %lu us user-latency "
+				"violates %lu us threshold.\n",
+					current->comm, current->pid,
+					_smp_processor_id(), latency,
+					cycles_to_usecs(preempt_thresh));
+		else
+			printk("(%16s-%-5d|#%d): new %lu us user-latency.\n",
+				current->comm, current->pid,
+					_smp_processor_id(), latency);
+
+		max_sequence++;
+		up(&max_mutex);
+out:
+		tr->preempt_timestamp = 0;	/* trace consumed: a second stop reports nothing */
+	}
+	atomic_dec(&tr->disabled);
+	preempt_enable();
+
+	return 0;
+}
+
+EXPORT_SYMBOL(user_trace_stop);
+
+void stop_trace(void)
+{
+	if (trace_print_at_crash)
+		trace_enabled = -1;	/* -1 is the value print_last_trace() requires */
+}
+
+static void print_entry(struct trace_entry *entry, struct trace_entry *entry0,
+			struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs;
+	/* Print one entry: pid, flags/preempt_count, time since entry0, eip <= (parent_eip). */
+	abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp);
+	/* next_entry is unused: the delta to it was computed but never printed. */
+
+	printk("%-5d %d/%d %ld.%03ldms: ",
+		entry->pid, entry->flags, entry->preempt_count,
+			abs_usecs/1000, abs_usecs % 1000);
+
+	printk_name(entry->u.fn.eip);
+	printk("  <= (");
+	printk_name(entry->u.fn.parent_eip);
+	printk(")\n");
+}
+
+void print_last_trace(void)
+{
+	unsigned int idx0, idx, i;
+	struct cpu_trace *tr;
+	struct trace_entry *entry0, *entry, *next_entry;
+	/* Dump this CPU's trace buffer; only runs once stop_trace() has set trace_enabled to -1. */
+	if (trace_enabled != -1)
+		return;
+
+	preempt_disable();
+	tr = cpu_traces + smp_processor_id();
+
+	printk("Last %ld trace entries:\n", MAX_TRACE);
+	idx0 = tr->trace_idx;
+	printk("curr idx: %u\n", idx0);
+	if (idx0 >= MAX_TRACE)
+		idx0 = MAX_TRACE-1;
+	idx = idx0;
+	entry0 = tr->trace + idx0;
+	/* Walk MAX_TRACE slots starting at idx0, wrapping at the end of the buffer. */
+	for (i = 0; i < MAX_TRACE; i++) {
+		entry = tr->trace + idx;
+		idx++;
+		if (idx == MAX_TRACE)
+			idx = 0;
+		next_entry = tr->trace + idx;
+		if (entry->type == TRACE_FN)
+			print_entry(entry, entry0, next_entry);
+	}
+	trace_print_at_crash = 1;
+	preempt_enable();
+}
+
+#ifdef CONFIG_SMP
+/*
+ * On SMP, try to 'peek' on other CPU's traces and record them
+ * in this CPU's trace. This way we get a rough idea about what's
+ * going on there, without the overhead of global tracing.
+ *
+ * (no need to make this PER_CPU, we bounce it around anyway.)
+ */
+unsigned long nmi_eips[NR_CPUS];
+unsigned long nmi_flags[NR_CPUS];
+
+void notrace nmi_trace(unsigned long eip, unsigned long parent_eip,
+			unsigned long flags)
+{
+	int cpu, this_cpu = smp_processor_id();
+	/* Trace our own entry, publish our values, then log what other online CPUs last published. */
+	__trace(eip, parent_eip);
+
+	nmi_eips[this_cpu] = parent_eip;
+	nmi_flags[this_cpu] = flags;
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		if (cpu_online(cpu) && cpu != this_cpu) {
+			__trace(eip, nmi_eips[cpu]);
+			__trace(eip, nmi_flags[cpu]);	/* flags are logged in the parent_eip slot */
+		}
+}
+#else
+/*
+ * On UP, NMI tracing is quite simple:
+ */
+void notrace nmi_trace(unsigned long eip, unsigned long parent_eip,
+			unsigned long flags)
+{
+	__trace(eip, parent_eip);	/* flags is not used in the UP variant */
+}
+#endif
+
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACE
+
+static void print_preempt_trace(struct task_struct *task)
+{
+	unsigned int count = task->thread_info->preempt_count;
+	unsigned int i, lim = count & PREEMPT_MASK;	/* nesting depth field of the count */
+	if (lim >= MAX_PREEMPT_TRACE)
+		lim = MAX_PREEMPT_TRACE-1;	/* clamp to the last recorded slot */
+	printk("---------------------------\n");
+	printk("| preempt count: %08x ]\n", count);
+	printk("| %u-level deep critical section nesting:\n", lim);
+	printk("----------------------------------------\n");
+	for (i = 1; i <= lim; i++) {	/* add_preempt_count() records level N in slot N */
+		printk(".. [<%08lx>] .... ", task->preempt_trace_eip[i]);
+		print_symbol("%s\n", task->preempt_trace_eip[i]);
+		printk(".....[<%08lx>] ..   ( <= ",
+				task->preempt_trace_parent_eip[i]);
+		print_symbol("%s)\n", task->preempt_trace_parent_eip[i]);
+	}
+	printk("\n");
+}
+
+#endif
+
+#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_LATENCY_TRACE)
+void print_traces(struct task_struct *task)
+{
+	preempt_disable();	/* both dumps below run with preemption off */
+#ifdef CONFIG_PREEMPT_TRACE
+	print_preempt_trace(task);
+#endif
+#ifdef CONFIG_LATENCY_TRACE
+	print_last_trace();	/* dumps the current CPU's buffer, independent of task */
+#endif
+	preempt_enable();
+}
+#endif
+
+#ifdef CONFIG_LATENCY_TIMING
+
+static int preempt_read_proc(char *page, char **start, off_t off,
+			      int count, int *eof, void *data)
+{
+	const cycles_t *cycles_val = data;
+	/* Emit the stored cycle count as usecs; start/off/count/eof are not consulted. */
+	return sprintf(page, "%ld\n", cycles_to_usecs(*cycles_val));
+}
+
+static int preempt_write_proc(struct file *file, const char __user *buffer,
+			       unsigned long count, void *data)
+{
+	unsigned int c, done = 0, val, sum = 0;
+	cycles_t *max = data;
+	/* Parse decimal usecs up to NUL/newline; -EINVAL on a non-digit or on unsigned overflow. */
+	while (count) {
+		if (get_user(c, buffer))
+			return -EFAULT;
+		val = c - '0';
+		buffer++;
+		done++;
+		count--;
+		if (c == 0 || c == '\n')
+			break;
+		if (val > 9 || sum > (~0U - val) / 10)
+			return -EINVAL;
+		sum *= 10;
+		sum += val;
+	}
+	*max = usecs_to_cycles(sum);
+	return done;	/* bytes consumed, terminator included */
+}
+
+static __init int latency_init(void)
+{
+	struct proc_dir_entry *entry;
+	/* Register the two tunables; an entry that cannot be created is skipped, not dereferenced. */
+	entry = create_proc_entry("sys/kernel/preempt_max_latency", 0600, NULL);
+	if (entry) {
+		entry->nlink = 1;
+		entry->data = &preempt_max_latency;
+		entry->read_proc = preempt_read_proc;
+		entry->write_proc = preempt_write_proc;
+	}
+	entry = create_proc_entry("sys/kernel/preempt_thresh", 0600, NULL);
+	if (entry) {
+		entry->nlink = 1;
+		entry->data = &preempt_thresh;
+		entry->read_proc = preempt_read_proc;
+		entry->write_proc = preempt_write_proc;
+	}
+	return 0;
+}
+__initcall(latency_init);
+
+#endif
+
--- linux/kernel/softirq.c.orig
+++ linux/kernel/softirq.c
@@ -16,6 +16,8 @@
 #include <linux/cpu.h>
 #include <linux/kthread.h>
 #include <linux/rcupdate.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
 
 #include <asm/irq.h>
 /*
@@ -71,7 +73,7 @@ static inline void wakeup_softirqd(void)
  */
 #define MAX_SOFTIRQ_RESTART 10
 
-asmlinkage void __do_softirq(void)
+asmlinkage void ___do_softirq(void)
 {
 	struct softirq_action *h;
 	__u32 pending;
@@ -80,7 +82,6 @@ asmlinkage void __do_softirq(void)
 
 	pending = local_softirq_pending();
 
-	local_bh_disable();
 	cpu = smp_processor_id();
 restart:
 	/* Reset the pending bitmask before enabling irqs */
@@ -92,8 +93,17 @@ restart:
 
 	do {
 		if (pending & 1) {
-			h->action(h);
+			{
+				u32 preempt_count = preempt_count();
+				h->action(h);
+				if (preempt_count != preempt_count()) {
+					print_symbol("softirq preempt bug: exited %s with wrong preemption count!\n", (unsigned long) h->action);
+					printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count());
+					preempt_count() = preempt_count;
+				}
+			}
 			rcu_bh_qsctr_inc(cpu);
+			cond_resched_all();
 		}
 		h++;
 		pending >>= 1;
@@ -107,10 +117,51 @@ restart:
 
 	if (pending)
 		wakeup_softirqd();
+}
+
+asmlinkage void __do_softirq(void)
+{
+	unsigned long p_flags;
+
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+	/*
+	 * 'preempt harder'. Push all softirq processing off to ksoftirqd.
+	 */
+	if (softirq_preemption) {
+		if (local_softirq_pending())
+			wakeup_softirqd();
+		return;
+	}
+#endif
+	/*
+	 * 'immediate' softirq execution:
+	 */
+	local_bh_disable();
+	p_flags = current->flags & PF_HARDIRQ;
+	current->flags &= ~PF_HARDIRQ;
 
+	___do_softirq();
 	__local_bh_enable();
+
+	current->flags |= p_flags;
 }
 
+/*
+ * 'delayed' softirq execution. Does not disable bhs and thus
+ * makes most of the softirq handlers preemptable - as long as
+ * they are not executed 'directly'.
+ */
+asmlinkage void _do_softirq(void)
+{
+	local_irq_disable();
+	if (softirq_preemption)
+		___do_softirq();	/* run the handlers directly, without the bh-disable wrapper */
+	else
+		__do_softirq();		/* regular 'immediate' path */
+	local_irq_enable();
+}
+
+
 #ifndef __ARCH_HAS_DO_SOFTIRQ
 
 asmlinkage void do_softirq(void)
@@ -135,6 +186,8 @@ EXPORT_SYMBOL(do_softirq);
 
 #endif
 
+#ifndef CONFIG_PREEMPT_RT
+
 void local_bh_enable(void)
 {
 	WARN_ON(irqs_disabled());
@@ -152,6 +205,8 @@ void local_bh_enable(void)
 }
 EXPORT_SYMBOL(local_bh_enable);
 
+#endif
+
 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
 # define invoke_softirq()	__do_softirq()
 #else
@@ -349,8 +404,14 @@ void __init softirq_init(void)
 
 static int ksoftirqd(void * __bind_cpu)
 {
-	set_user_nice(current, 19);
-	current->flags |= PF_NOFREEZE;
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO/4-1 };
+
+	printk("ksoftirqd started up.\n");
+
+	printk("softirq RT prio: %d.\n", param.sched_priority);
+//	sys_sched_setscheduler(current->pid, SCHED_FIFO, &param);
+	set_user_nice(current, -10);
+	current->flags |= PF_NOFREEZE | PF_SOFTIRQ;
 
 	set_current_state(TASK_INTERRUPTIBLE);
 
@@ -367,8 +428,8 @@ static int ksoftirqd(void * __bind_cpu)
 			preempt_disable();
 			if (cpu_is_offline((long)__bind_cpu))
 				goto wait_to_die;
-			do_softirq();
 			preempt_enable();
+			_do_softirq();
 			cond_resched();
 		}
 
@@ -419,7 +480,7 @@ void tasklet_kill_immediate(struct taskl
 	BUG();
 }
 
-static void takeover_tasklets(unsigned int cpu)
+void takeover_tasklets(unsigned int cpu)
 {
 	struct tasklet_struct **i;
 
@@ -490,3 +551,33 @@ __init int spawn_ksoftirqd(void)
 	register_cpu_notifier(&cpu_nfb);
 	return 0;
 }
+
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+
+int softirq_preemption = 1;
+
+EXPORT_SYMBOL(softirq_preemption);
+
+/*
+ * Real-Time Preemption depends on softirq threading:
+ */
+#ifndef CONFIG_PREEMPT_RT
+
+static int __init softirq_preempt_setup (char *str)
+{
+	if (strncmp(str, "off", 3) == 0)	/* "softirq-preempt=off" */
+		softirq_preemption = 0;
+	else					/* otherwise expect an integer */
+		get_option(&str, &softirq_preemption);
+	if (softirq_preemption == 0)
+		printk("turning off softirq preemption!\n");
+
+	return 1;
+}
+
+__setup("softirq-preempt=", softirq_preempt_setup);
+
+#endif
+
+#endif
+
--- linux/kernel/futex.c.orig
+++ linux/kernel/futex.c
@@ -539,8 +539,13 @@ static int futex_wait(unsigned long uadd
 	 * !list_empty() is safe here without any lock.
 	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
 	 */
-	if (likely(!list_empty(&q.list)))
+	if (likely(!list_empty(&q.list))) {
+		unsigned long nosched_flag = current->flags & PF_NOSCHED;
+
+		current->flags &= ~PF_NOSCHED;
 		time = schedule_timeout(time);
+		current->flags |= nosched_flag;
+	}
 	__set_current_state(TASK_RUNNING);
 
 	/*
--- linux/kernel/sysctl.c.orig
+++ linux/kernel/sysctl.c
@@ -41,6 +41,7 @@
 #include <linux/limits.h>
 #include <linux/dcache.h>
 #include <linux/syscalls.h>
+#include <linux/profile.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -274,6 +275,130 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "prof_pid",
+		.data		= &prof_pid,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#ifdef CONFIG_PREEMPT
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "kernel_preemption",
+		.data		= &kernel_preemption,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "voluntary_preemption",
+		.data		= &voluntary_preemption,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#if defined(CONFIG_PREEMPT_SOFTIRQS) && !defined(CONFIG_PREEMPT_RT)
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "softirq_preemption",
+		.data		= &softirq_preemption,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#if defined(CONFIG_PREEMPT_HARDIRQS) && !defined(CONFIG_PREEMPT_RT)
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "hardirq_preemption",
+		.data		= &hardirq_preemption,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_WAKEUP_TIMING
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "wakeup_timing",
+		.data		= &wakeup_timing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_LATENCY_TRACE
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "trace_enabled",
+		.data		= &trace_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "mcount_enabled",
+		.data		= &mcount_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "trace_user_triggered",
+		.data		= &trace_user_triggered,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "trace_freerunning",
+		.data		= &trace_freerunning,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "trace_print_at_crash",
+		.data		= &trace_print_at_crash,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "trace_verbose",
+		.data		= &trace_verbose,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "trace_all_cpus",
+		.data		= &trace_all_cpus,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "debug_direct_keyboard",
+		.data		= &debug_direct_keyboard,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		.ctl_name	= KERN_CORE_USES_PID,
 		.procname	= "core_uses_pid",
 		.data		= &core_uses_pid,
--- linux/ipc/msg.c.orig
+++ linux/ipc/msg.c
@@ -62,9 +62,14 @@ static atomic_t msg_hdrs = ATOMIC_INIT(0
 
 static struct ipc_ids msg_ids;
 
-#define msg_lock(id)	((struct msg_queue*)ipc_lock(&msg_ids,id))
-#define msg_unlock(msq)	ipc_unlock(&(msq)->q_perm)
-#define msg_rmid(id)	((struct msg_queue*)ipc_rmid(&msg_ids,id))
+#define msg_lock(id)		((struct msg_queue*)ipc_lock(&msg_ids,id))
+#define msg_lock_writer(id)	((struct msg_queue*)ipc_lock_writer(&msg_ids,id))
+#define msg_lock_ptr(msq)	ipc_lock_by_ptr(&msg_ids, &(msq)->q_perm)
+#define msg_lock_ptr_writer(msq) \
+		ipc_lock_by_ptr_writer(&msg_ids, &(msq)->q_perm)
+#define msg_unlock(msq)		ipc_unlock(&msg_ids, &(msq)->q_perm)
+#define msg_unlock_writer(msq)	ipc_unlock_writer(&msg_ids, &(msq)->q_perm)
+#define msg_rmid(id)		((struct msg_queue*)ipc_rmid(&msg_ids,id))
 #define msg_checkid(msq, msgid)	\
 	ipc_checkid(&msg_ids,&msq->q_perm,msgid)
 #define msg_buildid(id, seq) \
@@ -105,7 +110,7 @@ static int newque (key_t key, int msgflg
 		return retval;
 	}
 
-	id = ipc_addid(&msg_ids, &msq->q_perm, msg_ctlmni);
+	id = ipc_addid_writer(&msg_ids, &msq->q_perm, msg_ctlmni);
 	if(id == -1) {
 		security_msg_queue_free(msq);
 		ipc_rcu_putref(msq);
@@ -120,7 +125,7 @@ static int newque (key_t key, int msgflg
 	INIT_LIST_HEAD(&msq->q_messages);
 	INIT_LIST_HEAD(&msq->q_receivers);
 	INIT_LIST_HEAD(&msq->q_senders);
-	msg_unlock(msq);
+	msg_unlock_writer(msq);
 
 	return msg_buildid(id,msq->q_perm.seq);
 }
@@ -185,7 +190,7 @@ static void freeque (struct msg_queue *m
 	expunge_all(msq,-EIDRM);
 	ss_wakeup(&msq->q_senders,1);
 	msq = msg_rmid(id);
-	msg_unlock(msq);
+	msg_unlock_writer(msq);
 		
 	tmp = msq->q_messages.next;
 	while(tmp != &msq->q_messages) {
@@ -215,7 +220,7 @@ asmlinkage long sys_msgget (key_t key, i
 	} else if (msgflg & IPC_CREAT && msgflg & IPC_EXCL) {
 		ret = -EEXIST;
 	} else {
-		msq = msg_lock(id);
+		msq = msg_lock_writer(id);
 		if(msq==NULL)
 			BUG();
 		if (ipcperms(&msq->q_perm, msgflg))
@@ -226,7 +231,7 @@ asmlinkage long sys_msgget (key_t key, i
 			if (!ret)
 				ret = qid;
 		}
-		msg_unlock(msq);
+		msg_unlock_writer(msq);
 	}
 	up(&msg_ids.sem);
 	return ret;
@@ -433,7 +438,7 @@ asmlinkage long sys_msgctl (int msqid, i
 	}
 
 	down(&msg_ids.sem);
-	msq = msg_lock(msqid);
+	msq = msg_lock_writer(msqid);
 	err=-EINVAL;
 	if (msq == NULL)
 		goto out_up;
@@ -474,7 +479,7 @@ asmlinkage long sys_msgctl (int msqid, i
 		 * due to a larger queue size.
 		 */
 		ss_wakeup(&msq->q_senders,0);
-		msg_unlock(msq);
+		msg_unlock_writer(msq);
 		break;
 	}
 	case IPC_RMID:
@@ -486,7 +491,7 @@ out_up:
 	up(&msg_ids.sem);
 	return err;
 out_unlock_up:
-	msg_unlock(msq);
+	msg_unlock_writer(msq);
 	goto out_up;
 out_unlock:
 	msg_unlock(msq);
@@ -602,7 +607,7 @@ asmlinkage long sys_msgsnd (int msqid, s
 		msg_unlock(msq);
 		schedule();
 
-		ipc_lock_by_ptr(&msq->q_perm);
+		msg_lock_ptr(msq);
 		ipc_rcu_putref(msq);
 		if (msq->q_perm.deleted) {
 			err = -EIDRM;
@@ -749,7 +754,7 @@ asmlinkage long sys_msgrcv (int msqid, s
 		 * rcu_read_lock() prevents preemption between reading r_msg
 		 * and the spin_lock() inside ipc_lock_by_ptr().
 		 */
-		rcu_read_lock();
+		rcu_read_lock_sem(&msg_ids.sem);
 
 		/* Lockless receive, part 2:
 		 * Wait until pipelined_send or expunge_all are outside of
@@ -767,15 +772,17 @@ asmlinkage long sys_msgrcv (int msqid, s
 		 * locking.
 		 */
 		if(msg != ERR_PTR(-EAGAIN)) {
-			rcu_read_unlock();
+			rcu_read_unlock_sem(&msg_ids.sem);
 			break;
 		}
 
 		/* Lockless receive, part 3:
 		 * Acquire the queue spinlock.
+		 *
+		 * in the PREEMPT_RT case keep the semaphore held:
 		 */
-		ipc_lock_by_ptr(&msq->q_perm);
-		rcu_read_unlock();
+		msg_lock_ptr_writer(msq);
+		rcu_read_unlock_nort();
 
 		/* Lockless receive, part 4:
 		 * Repeat test after acquiring the spinlock.
@@ -816,7 +823,7 @@ static int sysvipc_msg_read_proc(char *b
 
 	for(i = 0; i <= msg_ids.max_id; i++) {
 		struct msg_queue * msq;
-		msq = msg_lock(i);
+		msq = msg_lock_writer(i);
 		if(msq != NULL) {
 			len += sprintf(buffer + len, "%10d %10d  %4o  %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n",
 				msq->q_perm.key,
@@ -833,7 +840,7 @@ static int sysvipc_msg_read_proc(char *b
 				msq->q_stime,
 				msq->q_rtime,
 				msq->q_ctime);
-			msg_unlock(msq);
+			msg_unlock_writer(msq);
 
 			pos += len;
 			if(pos < offset) {
--- linux/ipc/sem.c.orig
+++ linux/ipc/sem.c
@@ -76,9 +76,12 @@
 #include "util.h"
 
 
-#define sem_lock(id)	((struct sem_array*)ipc_lock(&sem_ids,id))
-#define sem_unlock(sma)	ipc_unlock(&(sma)->sem_perm)
-#define sem_rmid(id)	((struct sem_array*)ipc_rmid(&sem_ids,id))
+#define sem_lock(id)		((struct sem_array*)ipc_lock(&sem_ids,id))
+#define sem_lock_writer(id)	((struct sem_array*)ipc_lock_writer(&sem_ids,id))
+#define sem_lock_ptr(sma)	ipc_lock_by_ptr(&sem_ids,&(sma)->sem_perm)
+#define sem_unlock(sma)		ipc_unlock(&sem_ids, &(sma)->sem_perm)
+#define sem_unlock_writer(sma)	ipc_unlock_writer(&sem_ids, &(sma)->sem_perm)
+#define sem_rmid(id)		((struct sem_array*)ipc_rmid(&sem_ids,id))
 #define sem_checkid(sma, semid)	\
 	ipc_checkid(&sem_ids,&sma->sem_perm,semid)
 #define sem_buildid(id, seq) \
@@ -184,7 +187,7 @@ static int newary (key_t key, int nsems,
 		return retval;
 	}
 
-	id = ipc_addid(&sem_ids, &sma->sem_perm, sc_semmni);
+	id = ipc_addid_writer(&sem_ids, &sma->sem_perm, sc_semmni);
 	if(id == -1) {
 		security_sem_free(sma);
 		ipc_rcu_putref(sma);
@@ -198,7 +201,7 @@ static int newary (key_t key, int nsems,
 	/* sma->undo = NULL; */
 	sma->sem_nsems = nsems;
 	sma->sem_ctime = get_seconds();
-	sem_unlock(sma);
+	sem_unlock_writer(sma);
 
 	return sem_buildid(id, sma->sem_perm.seq);
 }
@@ -210,35 +213,44 @@ asmlinkage long sys_semget (key_t key, i
 
 	if (nsems < 0 || nsems > sc_semmsl)
 		return -EINVAL;
-	down(&sem_ids.sem);
 	
 	if (key == IPC_PRIVATE) {
+		down(&sem_ids.sem);
 		err = newary(key, nsems, semflg);
-	} else if ((id = ipc_findkey(&sem_ids, key)) == -1) {  /* key not used */
+		up(&sem_ids.sem);
+		return err;
+	}
+
+	down(&sem_ids.sem);
+	if ((id = ipc_findkey(&sem_ids, key)) == -1) {  /* key not used */
 		if (!(semflg & IPC_CREAT))
 			err = -ENOENT;
 		else
 			err = newary(key, nsems, semflg);
-	} else if (semflg & IPC_CREAT && semflg & IPC_EXCL) {
-		err = -EEXIST;
-	} else {
-		sma = sem_lock(id);
-		if(sma==NULL)
-			BUG();
-		if (nsems > sma->sem_nsems)
-			err = -EINVAL;
-		else if (ipcperms(&sma->sem_perm, semflg))
-			err = -EACCES;
-		else {
-			int semid = sem_buildid(id, sma->sem_perm.seq);
-			err = security_sem_associate(sma, semflg);
-			if (!err)
-				err = semid;
-		}
-		sem_unlock(sma);
+		up(&sem_ids.sem);
+		return err;
 	}
 
+	if (semflg & IPC_CREAT && semflg & IPC_EXCL) {
+		err = -EEXIST;
+		up(&sem_ids.sem);
+		return err;
+	}
+	sma = sem_lock_writer(id);
+	BUG_ON(!sma);
+	if (nsems > sma->sem_nsems)
+		err = -EINVAL;
+	else if (ipcperms(&sma->sem_perm, semflg))
+		err = -EACCES;
+	else {
+		int semid = sem_buildid(id, sma->sem_perm.seq);
+		err = security_sem_associate(sma, semflg);
+		if (!err)
+			err = semid;
+	}
+	sem_unlock_writer(sma);
 	up(&sem_ids.sem);
+
 	return err;
 }
 
@@ -464,7 +476,7 @@ static void freeary (struct sem_array *s
 
 	/* Remove the semaphore set from the ID array*/
 	sma = sem_rmid(id);
-	sem_unlock(sma);
+	sem_unlock_writer(sma);
 
 	used_sems -= sma->sem_nsems;
 	size = sizeof (*sma) + sma->sem_nsems * sizeof (struct sem);
@@ -615,13 +627,13 @@ static int semctl_main(int semid, int se
 
 			sem_io = ipc_alloc(sizeof(ushort)*nsems);
 			if(sem_io == NULL) {
-				ipc_lock_by_ptr(&sma->sem_perm);
+				sem_lock_ptr(sma);
 				ipc_rcu_putref(sma);
 				sem_unlock(sma);
 				return -ENOMEM;
 			}
 
-			ipc_lock_by_ptr(&sma->sem_perm);
+			sem_lock_ptr(sma);
 			ipc_rcu_putref(sma);
 			if (sma->sem_perm.deleted) {
 				sem_unlock(sma);
@@ -649,7 +661,7 @@ static int semctl_main(int semid, int se
 		if(nsems > SEMMSL_FAST) {
 			sem_io = ipc_alloc(sizeof(ushort)*nsems);
 			if(sem_io == NULL) {
-				ipc_lock_by_ptr(&sma->sem_perm);
+				sem_lock_ptr(sma);
 				ipc_rcu_putref(sma);
 				sem_unlock(sma);
 				return -ENOMEM;
@@ -657,7 +669,7 @@ static int semctl_main(int semid, int se
 		}
 
 		if (copy_from_user (sem_io, arg.array, nsems*sizeof(ushort))) {
-			ipc_lock_by_ptr(&sma->sem_perm);
+			sem_lock_ptr(sma);
 			ipc_rcu_putref(sma);
 			sem_unlock(sma);
 			err = -EFAULT;
@@ -666,14 +678,14 @@ static int semctl_main(int semid, int se
 
 		for (i = 0; i < nsems; i++) {
 			if (sem_io[i] > SEMVMX) {
-				ipc_lock_by_ptr(&sma->sem_perm);
+				sem_lock_ptr(sma);
 				ipc_rcu_putref(sma);
 				sem_unlock(sma);
 				err = -ERANGE;
 				goto out_free;
 			}
 		}
-		ipc_lock_by_ptr(&sma->sem_perm);
+		sem_lock_ptr(sma);
 		ipc_rcu_putref(sma);
 		if (sma->sem_perm.deleted) {
 			sem_unlock(sma);
@@ -804,7 +816,7 @@ static int semctl_down(int semid, int se
 		if(copy_semid_from_user (&setbuf, arg.buf, version))
 			return -EFAULT;
 	}
-	sma = sem_lock(semid);
+	sma = sem_lock_writer(semid);
 	if(sma==NULL)
 		return -EINVAL;
 
@@ -835,18 +847,18 @@ static int semctl_down(int semid, int se
 		ipcp->mode = (ipcp->mode & ~S_IRWXUGO)
 				| (setbuf.mode & S_IRWXUGO);
 		sma->sem_ctime = get_seconds();
-		sem_unlock(sma);
+		sem_unlock_writer(sma);
 		err = 0;
 		break;
 	default:
-		sem_unlock(sma);
+		sem_unlock_writer(sma);
 		err = -EINVAL;
 		break;
 	}
 	return err;
 
 out_unlock:
-	sem_unlock(sma);
+	sem_unlock_writer(sma);
 	return err;
 }
 
@@ -1004,7 +1016,7 @@ static struct sem_undo *find_undo(int se
 
 	new = (struct sem_undo *) kmalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
 	if (!new) {
-		ipc_lock_by_ptr(&sma->sem_perm);
+		sem_lock_ptr(sma);
 		ipc_rcu_putref(sma);
 		sem_unlock(sma);
 		return ERR_PTR(-ENOMEM);
@@ -1018,12 +1030,12 @@ static struct sem_undo *find_undo(int se
 	if (un) {
 		unlock_semundo();
 		kfree(new);
-		ipc_lock_by_ptr(&sma->sem_perm);
+		sem_lock_ptr(sma);
 		ipc_rcu_putref(sma);
 		sem_unlock(sma);
 		goto out;
 	}
-	ipc_lock_by_ptr(&sma->sem_perm);
+	sem_lock_ptr(sma);
 	ipc_rcu_putref(sma);
 	if (sma->sem_perm.deleted) {
 		sem_unlock(sma);
@@ -1343,7 +1355,7 @@ static int sysvipc_sem_read_proc(char *b
 
 	for(i = 0; i <= sem_ids.max_id; i++) {
 		struct sem_array *sma;
-		sma = sem_lock(i);
+		sma = sem_lock_writer(i);
 		if(sma) {
 			len += sprintf(buffer + len, "%10d %10d  %4o %10lu %5u %5u %5u %5u %10lu %10lu\n",
 				sma->sem_perm.key,
@@ -1356,7 +1368,7 @@ static int sysvipc_sem_read_proc(char *b
 				sma->sem_perm.cgid,
 				sma->sem_otime,
 				sma->sem_ctime);
-			sem_unlock(sma);
+			sem_unlock_writer(sma);
 
 			pos += len;
 			if(pos < offset) {
--- linux/ipc/util.h.orig
+++ linux/ipc/util.h
@@ -34,7 +34,7 @@ void __init ipc_init_ids(struct ipc_ids*
 
 /* must be called with ids->sem acquired.*/
 int ipc_findkey(struct ipc_ids* ids, key_t key);
-int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size);
+int ipc_addid_writer(struct ipc_ids* ids, struct kern_ipc_perm* new, int size);
 
 /* must be called with both locks acquired. */
 struct kern_ipc_perm* ipc_rmid(struct ipc_ids* ids, int id);
@@ -59,8 +59,11 @@ void ipc_rcu_putref(void *ptr);
 
 struct kern_ipc_perm* ipc_get(struct ipc_ids* ids, int id);
 struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id);
-void ipc_lock_by_ptr(struct kern_ipc_perm *ipcp);
-void ipc_unlock(struct kern_ipc_perm* perm);
+struct kern_ipc_perm* ipc_lock_writer(struct ipc_ids* ids, int id);
+void ipc_lock_by_ptr(struct ipc_ids* ids, struct kern_ipc_perm *ipcp);
+void ipc_lock_by_ptr_writer(struct ipc_ids* ids, struct kern_ipc_perm *ipcp);
+void ipc_unlock(struct ipc_ids* ids, struct kern_ipc_perm* perm);
+void ipc_unlock_writer(struct ipc_ids* ids, struct kern_ipc_perm* perm);
 int ipc_buildid(struct ipc_ids* ids, int id, int seq);
 int ipc_checkid(struct ipc_ids* ids, struct kern_ipc_perm* ipcp, int uid);
 
--- linux/ipc/util.c.orig
+++ linux/ipc/util.c
@@ -166,7 +166,7 @@ static int grow_ary(struct ipc_ids* ids,
  *	Called with ipc_ids.sem held.
  */
  
-int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
+int ipc_addid_writer(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
 {
 	int id;
 
@@ -195,7 +195,11 @@ found:
 
 	spin_lock_init(&new->lock);
 	new->deleted = 0;
-	rcu_read_lock();
+	/*
+	 * We cannot use rcu_read_lock_sem(&ids->sem) here because
+	 * we are holding it already - so it must not be taken again.
+	 */
+	rcu_read_lock_nort();
 	spin_lock(&new->lock);
 	ids->entries->p[id] = new;
 	return id;
@@ -507,15 +511,45 @@ struct kern_ipc_perm* ipc_lock(struct ip
 	int lid = id % SEQ_MULTIPLIER;
 	struct ipc_id_ary* entries;
 
-	rcu_read_lock();
+	rcu_read_lock_sem(&ids->sem);
+	entries = rcu_dereference(ids->entries);
+	if(lid >= entries->size) {
+		rcu_read_unlock_sem(&ids->sem);
+		return NULL;
+	}
+	out = entries->p[lid];
+	if(out == NULL) {
+		rcu_read_unlock_sem(&ids->sem);
+		return NULL;
+	}
+	spin_lock(&out->lock);
+	
+	/* ipc_rmid() may have already freed the ID while ipc_lock
+	 * was spinning: here verify that the structure is still valid
+	 */
+	if (out->deleted) {
+		spin_unlock(&out->lock);
+		rcu_read_unlock_sem(&ids->sem);
+		return NULL;
+	}
+	return out;
+}
+
+struct kern_ipc_perm* ipc_lock_writer(struct ipc_ids* ids, int id)
+{
+	struct kern_ipc_perm* out;
+	int lid = id % SEQ_MULTIPLIER;
+	struct ipc_id_ary* entries;
+
+	rcu_read_lock_nort();
 	entries = rcu_dereference(ids->entries);
 	if(lid >= entries->size) {
-		rcu_read_unlock();
+		rcu_read_unlock_nort();
 		return NULL;
 	}
 	out = entries->p[lid];
 	if(out == NULL) {
-		rcu_read_unlock();
+		rcu_read_unlock_nort();
 		return NULL;
 	}
 	spin_lock(&out->lock);
@@ -525,24 +559,45 @@ struct kern_ipc_perm* ipc_lock(struct ip
 	 */
 	if (out->deleted) {
 		spin_unlock(&out->lock);
-		rcu_read_unlock();
+		rcu_read_unlock_nort();
 		return NULL;
 	}
 	return out;
 }
 
-void ipc_lock_by_ptr(struct kern_ipc_perm *perm)
+void ipc_lock_by_ptr(struct ipc_ids* ids, struct kern_ipc_perm *perm)
 {
-	rcu_read_lock();
+	rcu_read_lock_sem(&ids->sem);
 	spin_lock(&perm->lock);
 }
 
-void ipc_unlock(struct kern_ipc_perm* perm)
+void ipc_lock_by_ptr_writer(struct ipc_ids* ids, struct kern_ipc_perm *perm)
+{
+	rcu_read_lock_nort();
+	spin_lock(&perm->lock);
+}
+
+void ipc_unlock(struct ipc_ids* ids, struct kern_ipc_perm* perm)
 {
 	spin_unlock(&perm->lock);
+	rcu_read_unlock_sem(&ids->sem);
+}
+
+/*
+ * In the PREEMPT_RT case it is important to distinguish unlocks done
+ * while holding ids.sem (for array growing purposes) from those done via
+ * ipc_unlock(): this function does not drop the semaphore.
+ * rcu_read_lock/unlock can nest just fine in the PREEMPT_RT case.
+ */
+void ipc_unlock_writer(struct ipc_ids* ids, struct kern_ipc_perm* perm)
+{
+	spin_unlock(&perm->lock);
+#ifndef CONFIG_PREEMPT_RT
 	rcu_read_unlock();
+#endif
 }
 
+
 int ipc_buildid(struct ipc_ids* ids, int id, int seq)
 {
 	return SEQ_MULTIPLIER*seq + id;
--- linux/ipc/shm.c.orig
+++ linux/ipc/shm.c
@@ -38,9 +38,11 @@ static struct vm_operations_struct shm_v
 
 static struct ipc_ids shm_ids;
 
-#define shm_lock(id)	((struct shmid_kernel*)ipc_lock(&shm_ids,id))
-#define shm_unlock(shp)	ipc_unlock(&(shp)->shm_perm)
-#define shm_get(id)	((struct shmid_kernel*)ipc_get(&shm_ids,id))
+#define shm_lock(id)		((struct shmid_kernel*)ipc_lock(&shm_ids,id))
+#define shm_lock_writer(id)	((struct shmid_kernel*)ipc_lock_writer(&shm_ids,id))
+#define shm_unlock(shp)		ipc_unlock(&shm_ids,&(shp)->shm_perm)
+#define shm_unlock_writer(shp)	ipc_unlock_writer(&shm_ids,&(shp)->shm_perm)
+#define shm_get(id)		((struct shmid_kernel*)ipc_get(&shm_ids,id))
 #define shm_buildid(id, seq) \
 	ipc_buildid(&shm_ids, id, seq)
 
@@ -77,9 +79,9 @@ static inline struct shmid_kernel *shm_r
 	return (struct shmid_kernel *)ipc_rmid(&shm_ids,id);
 }
 
-static inline int shm_addid(struct shmid_kernel *shp)
+static inline int shm_addid_writer(struct shmid_kernel *shp)
 {
-	return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni);
+	return ipc_addid_writer(&shm_ids, &shp->shm_perm, shm_ctlmni);
 }
 
 
@@ -109,11 +111,11 @@ static void shm_open (struct vm_area_str
  * It has to be called with shp and shm_ids.sem locked,
  * but returns with shp unlocked and freed.
  */
-static void shm_destroy (struct shmid_kernel *shp)
+static void shm_destroy_writer(struct shmid_kernel *shp)
 {
 	shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	shm_rmid (shp->id);
-	shm_unlock(shp);
+	shm_unlock_writer(shp);
 	if (!is_file_hugepages(shp->shm_file))
 		shmem_lock(shp->shm_file, 0, shp->mlock_user);
 	else
@@ -138,16 +140,16 @@ static void shm_close (struct vm_area_st
 
 	down (&shm_ids.sem);
 	/* remove from the list of attaches of the shm segment */
-	if(!(shp = shm_lock(id)))
+	if(!(shp = shm_lock_writer(id)))
 		BUG();
 	shp->shm_lprid = current->tgid;
 	shp->shm_dtim = get_seconds();
 	shp->shm_nattch--;
 	if(shp->shm_nattch == 0 &&
 	   shp->shm_flags & SHM_DEST)
-		shm_destroy (shp);
+		shm_destroy_writer(shp);
 	else
-		shm_unlock(shp);
+		shm_unlock_writer(shp);
 	up (&shm_ids.sem);
 }
 
@@ -216,7 +218,7 @@ static int newseg (key_t key, int shmflg
 		goto no_file;
 
 	error = -ENOSPC;
-	id = shm_addid(shp);
+	id = shm_addid_writer(shp);
 	if(id == -1) 
 		goto no_id;
 
@@ -234,7 +236,7 @@ static int newseg (key_t key, int shmflg
 	else
 		file->f_op = &shm_file_operations;
 	shm_tot += numpages;
-	shm_unlock(shp);
+	shm_unlock_writer(shp);
 	return shp->id;
 
 no_id:
@@ -261,7 +263,7 @@ asmlinkage long sys_shmget (key_t key, s
 	} else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) {
 		err = -EEXIST;
 	} else {
-		shp = shm_lock(id);
+		shp = shm_lock_writer(id);
 		if(shp==NULL)
 			BUG();
 		if (shp->shm_segsz < size)
@@ -274,7 +276,7 @@ asmlinkage long sys_shmget (key_t key, s
 			if (!err)
 				err = shmid;
 		}
-		shm_unlock(shp);
+		shm_unlock_writer(shp);
 	}
 	up(&shm_ids.sem);
 
@@ -564,7 +566,7 @@ asmlinkage long sys_shmctl (int shmid, i
 		 *	the name away when the usage hits zero.
 		 */
 		down(&shm_ids.sem);
-		shp = shm_lock(shmid);
+		shp = shm_lock_writer(shmid);
 		err = -EINVAL;
 		if (shp == NULL) 
 			goto out_up;
@@ -587,9 +589,9 @@ asmlinkage long sys_shmctl (int shmid, i
 			shp->shm_flags |= SHM_DEST;
 			/* Do not find it any more */
 			shp->shm_perm.key = IPC_PRIVATE;
-			shm_unlock(shp);
+			shm_unlock_writer(shp);
 		} else
-			shm_destroy (shp);
+			shm_destroy_writer(shp);
 		up(&shm_ids.sem);
 		goto out;
 	}
@@ -601,7 +603,7 @@ asmlinkage long sys_shmctl (int shmid, i
 			goto out;
 		}
 		down(&shm_ids.sem);
-		shp = shm_lock(shmid);
+		shp = shm_lock_writer(shmid);
 		err=-EINVAL;
 		if(shp==NULL)
 			goto out_up;
@@ -634,7 +636,7 @@ asmlinkage long sys_shmctl (int shmid, i
 
 	err = 0;
 out_unlock_up:
-	shm_unlock(shp);
+	shm_unlock_writer(shp);
 out_up:
 	up(&shm_ids.sem);
 	goto out;
@@ -750,14 +752,14 @@ invalid:
 	up_write(&current->mm->mmap_sem);
 
 	down (&shm_ids.sem);
-	if(!(shp = shm_lock(shmid)))
+	if(!(shp = shm_lock_writer(shmid)))
 		BUG();
 	shp->shm_nattch--;
 	if(shp->shm_nattch == 0 &&
 	   shp->shm_flags & SHM_DEST)
-		shm_destroy (shp);
+		shm_destroy_writer(shp);
 	else
-		shm_unlock(shp);
+		shm_unlock_writer(shp);
 	up (&shm_ids.sem);
 
 	*raddr = (unsigned long) user_addr;
@@ -864,7 +866,7 @@ static int sysvipc_shm_read_proc(char *b
 	for(i = 0; i <= shm_ids.max_id; i++) {
 		struct shmid_kernel* shp;
 
-		shp = shm_lock(i);
+		shp = shm_lock_writer(i);
 		if(shp!=NULL) {
 #define SMALL_STRING "%10d %10d  %4o %10u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu\n"
 #define BIG_STRING   "%10d %10d  %4o %21u %5u %5u  %5d %5u %5u %5u %5u %10lu %10lu %10lu\n"
@@ -889,7 +891,7 @@ static int sysvipc_shm_read_proc(char *b
 				shp->shm_atim,
 				shp->shm_dtim,
 				shp->shm_ctim);
-			shm_unlock(shp);
+			shm_unlock_writer(shp);
 
 			pos += len;
 			if(pos < offset) {
--- linux/init/main.c.orig
+++ linux/init/main.c
@@ -45,6 +45,7 @@
 #include <linux/efi.h>
 #include <linux/unistd.h>
 #include <linux/rmap.h>
+#include <linux/irq.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
 
@@ -373,6 +374,8 @@ static void __init smp_init(void)
 static void noinline rest_init(void)
 	__releases(kernel_lock)
 {
+	system_state = SYSTEM_BOOTING_SCHEDULER_OK;
+
 	kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND);
 	numa_default_policy();
 	/*
@@ -453,6 +456,7 @@ asmlinkage void __init start_kernel(void
 	preempt_disable();
 	build_all_zonelists();
 	page_alloc_init();
+	early_init_hardirqs();
 	trap_init();
 	printk("Kernel command line: %s\n", saved_command_line);
 	parse_early_param();
@@ -599,12 +603,14 @@ static void __init do_basic_setup(void)
 static void do_pre_smp_initcalls(void)
 {
 	extern int spawn_ksoftirqd(void);
+	extern int spawn_desched_task(void);
 #ifdef CONFIG_SMP
 	extern int migration_init(void);
 
 	migration_init();
 #endif
 	spawn_ksoftirqd();
+	spawn_desched_task();
 }
 
 static void run_init_process(char *init_filename)
@@ -647,6 +653,8 @@ static int init(void * unused)
 	/* Sets up cpus_possible() */
 	smp_prepare_cpus(max_cpus);
 
+	init_hardirqs();
+
 	do_pre_smp_initcalls();
 
 	fixup_cpu_present_map();
--- linux/arch/x86_64/mm/fault.c.orig
+++ linux/arch/x86_64/mm/fault.c
@@ -39,6 +39,7 @@ void bust_spinlocks(int yes)
 {
 	int loglevel_save = console_loglevel;
 	if (yes) {
+		stop_trace();
 		oops_in_progress = 1;
 	} else {
 #ifdef CONFIG_VT
--- linux/arch/x86_64/Kconfig.orig
+++ linux/arch/x86_64/Kconfig
@@ -34,13 +34,6 @@ config ISA
 config SBUS
 	bool
 
-config RWSEM_GENERIC_SPINLOCK
-	bool
-	default y
-
-config RWSEM_XCHGADD_ALGORITHM
-	bool
-
 config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
@@ -233,33 +226,6 @@ config SMP
 
 	  If you don't know what to do here, say N.
 
-config PREEMPT
-	bool "Preemptible Kernel"
-	---help---
-	  This option reduces the latency of the kernel when reacting to
-	  real-time or interactive events by allowing a low priority process to
-	  be preempted even if it is in kernel mode executing a system call.
-	  This allows applications to run more reliably even when the system is
-	  under load. On contrary it may also break your drivers and add
-	  priority inheritance problems to your system. Don't select it if
-	  you rely on a stable system or have slightly obscure hardware.
-	  It's also not very well tested on x86-64 currently.
-	  You have been warned.
-
-	  Say Y here if you are feeling brave and building a kernel for a
-	  desktop, embedded or real-time system.  Say N if you are unsure.
-
-config PREEMPT_BKL
-	bool "Preempt The Big Kernel Lock"
-	depends on PREEMPT || SMP
-	default y
-	help
-	  This option reduces the latency of the kernel by making the
-	  big kernel lock preemptible.
-
-	  Say Y here if you are building a kernel for a desktop system.
-	  Say N if you are unsure.
-
 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"
 	depends on SMP
@@ -270,6 +236,16 @@ config SCHED_SMT
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
+source "lib/Kconfig.RT"
+
+config RWSEM_GENERIC_SPINLOCK
+	bool
+	depends on !PREEMPT_RT
+	default y
+
+config RWSEM_XCHGADD_ALGORITHM
+	bool
+
 config K8_NUMA
        bool "K8 NUMA support"
        select NUMA
--- linux/arch/x86_64/kernel/x8664_ksyms.c.orig
+++ linux/arch/x86_64/kernel/x8664_ksyms.c
@@ -14,6 +14,7 @@
 #include <linux/syscalls.h>
 #include <linux/tty.h>
 #include <linux/ioctl32.h>
+#include <linux/mc146818rtc.h>
 
 #include <asm/semaphore.h>
 #include <asm/processor.h>
@@ -34,8 +35,6 @@
 #include <asm/tlbflush.h>
 #include <asm/kdebug.h>
 
-extern spinlock_t rtc_lock;
-
 #ifdef CONFIG_SMP
 extern void __write_lock_failed(rwlock_t *rw);
 extern void __read_lock_failed(rwlock_t *rw);
@@ -63,10 +62,12 @@ EXPORT_SYMBOL(pm_idle);
 EXPORT_SYMBOL(pm_power_off);
 EXPORT_SYMBOL(get_cmos_time);
 
+#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
 EXPORT_SYMBOL(__down_failed);
 EXPORT_SYMBOL(__down_failed_interruptible);
 EXPORT_SYMBOL(__down_failed_trylock);
 EXPORT_SYMBOL(__up_wakeup);
+#endif
 /* Networking helper routines. */
 EXPORT_SYMBOL(csum_partial_copy_nocheck);
 EXPORT_SYMBOL(ip_compute_csum);
--- linux/arch/x86_64/kernel/io_apic.c.orig
+++ linux/arch/x86_64/kernel/io_apic.c
@@ -42,7 +42,7 @@
 
 int sis_apic_bug; /* not actually supported, dummy for compile */
 
-static spinlock_t ioapic_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
 
 /*
  * # of IRQ routing registers
@@ -112,6 +112,9 @@ static void add_pin_to_irq(unsigned int 
 		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
 		reg ACTION;						\
 		io_apic_modify(entry->apic, reg);			\
+		 /* Force POST flush by reading: */			\
+		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
+									\
 		if (!entry->next)					\
 			break;						\
 		entry = irq_2_pin + entry->next;			\
@@ -124,10 +127,8 @@ static void add_pin_to_irq(unsigned int 
 	static void name##_IO_APIC_irq (unsigned int irq)		\
 	__DO_ACTION(R, ACTION, FINAL)
 
-DO_ACTION( __mask,             0, |= 0x00010000, io_apic_sync(entry->apic) )
-						/* mask = 1 */
-DO_ACTION( __unmask,           0, &= 0xfffeffff, )
-						/* mask = 0 */
+DO_ACTION( __mask,             0, |= 0x00010000, ) /* mask = 1 */
+DO_ACTION( __unmask,           0, &= 0xfffeffff, ) /* mask = 0 */
 
 static void mask_IO_APIC_irq (unsigned int irq)
 {
@@ -1059,7 +1060,6 @@ void print_all_local_APICs (void)
 
 void __apicdebuginit print_PIC(void)
 {
-	extern spinlock_t i8259A_lock;
 	unsigned int v;
 	unsigned long flags;
 
@@ -1342,11 +1342,48 @@ static unsigned int startup_level_ioapic
 	return 0; /* don't check for pending */
 }
 
+/*
+ * In the preemptible case mask and ack the IRQ first, handle it, then unmask.
+ *
+ * (In the non-preemptible case we keep the IRQ unacked in the local APIC
+ * and don't need to do the masking, because the code executes atomically.)
+ */
+#ifdef CONFIG_PREEMPT_HARDIRQS
+
+static void mask_and_ack_level_ioapic_irq(unsigned int irq)
+{
+	mask_IO_APIC_irq(irq);
+	ack_APIC_irq();
+}
+
+static void end_level_ioapic_irq(unsigned int irq)
+{
+	if (!(irq_desc[irq].status & IRQ_INPROGRESS))
+		unmask_IO_APIC_irq(irq);
+}
+
+static void enable_level_ioapic_irq(unsigned int irq)
+{
+	unmask_IO_APIC_irq(irq);
+}
+
+#else /* !CONFIG_PREEMPT_HARDIRQS */
+
+static void mask_and_ack_level_ioapic_irq(unsigned int irq)
+{
+}
+
 static void end_level_ioapic_irq (unsigned int irq)
 {
 	ack_APIC_irq();
 }
 
+static void enable_level_ioapic_irq(unsigned int irq)
+{
+	unmask_IO_APIC_irq(irq);
+}
+#endif /* !CONFIG_PREEMPT_HARDIRQS */
+
 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 {
 	unsigned long flags;
@@ -1386,6 +1423,13 @@ static unsigned int startup_level_ioapic
 	return startup_level_ioapic_irq (irq);
 }
 
+static void mask_and_ack_level_ioapic_vector (unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	mask_and_ack_level_ioapic_irq(irq);
+}
+
 static void end_level_ioapic_vector (unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
@@ -1393,6 +1437,11 @@ static void end_level_ioapic_vector (uns
 	end_level_ioapic_irq(irq);
 }
 
+static void enable_level_ioapic_vector(unsigned int vector)
+{
+	enable_level_ioapic_irq(vector_to_irq(vector));
+}
+
 static void mask_IO_APIC_vector (unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
--- linux/arch/x86_64/kernel/time.c.orig
+++ linux/arch/x86_64/kernel/time.c
@@ -49,8 +49,8 @@ static void cpufreq_delayed_get(void);
 
 extern int using_apic_timer;
 
-spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
-spinlock_t i8253_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_RAW_SPINLOCK(rtc_lock);
+DEFINE_RAW_SPINLOCK(i8253_lock);
 
 static int nohpet __initdata = 0;
 static int notsc __initdata = 0;
@@ -863,7 +863,7 @@ int __init time_setup(char *str)
 }
 
 static struct irqaction irq0 = {
-	timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL
+	timer_interrupt, SA_INTERRUPT | SA_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL
 };
 
 extern void __init config_acpi_tables(void);
--- linux/arch/x86_64/kernel/nmi.c.orig
+++ linux/arch/x86_64/kernel/nmi.c
@@ -43,7 +43,7 @@
  * This is maintained separately from nmi_active because the NMI
  * watchdog may also be driven from the I/O APIC timer.
  */
-static spinlock_t lapic_nmi_owner_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_RAW_SPINLOCK(lapic_nmi_owner_lock);
 static unsigned int lapic_nmi_owner;
 #define LAPIC_NMI_WATCHDOG	(1<<0)
 #define LAPIC_NMI_RESERVED	(1<<1)
@@ -376,12 +376,41 @@ void touch_nmi_watchdog (void)
 		alert_counter[i] = 0;
 }
 
+int nmi_show_regs[NR_CPUS];
+
+void nmi_show_all_regs(void)
+{
+	int i;
+
+	if (nmi_watchdog == NMI_NONE)
+		return;
+	if (system_state != SYSTEM_RUNNING) {
+		printk("nmi_show_all_regs(): system state %d, not doing.\n",
+			system_state);
+		return;
+	}
+		
+	for_each_online_cpu(i)
+		nmi_show_regs[i] = 1;
+	for_each_online_cpu(i)
+		while (nmi_show_regs[i] == 1)
+			barrier();
+}
+
+static DEFINE_RAW_SPINLOCK(nmi_print_lock);
+
 void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
 {
 	int sum, cpu;
 
 	cpu = safe_smp_processor_id();
 	sum = read_pda(apic_timer_irqs);
+	if (nmi_show_regs[cpu]) {
+		nmi_show_regs[cpu] = 0;
+		spin_lock(&nmi_print_lock);
+		show_regs(regs);
+		spin_unlock(&nmi_print_lock);
+	}
 	if (last_irq_sums[cpu] == sum) {
 		/*
 		 * Ayiee, looks like this CPU is stuck ...
@@ -389,6 +418,12 @@ void nmi_watchdog_tick (struct pt_regs *
 		 */
 		alert_counter[cpu]++;
 		if (alert_counter[cpu] == 5*nmi_hz) {
+			int i;
+
+			for (i = 0; i < NR_CPUS; i++)
+				nmi_show_regs[i] = 1;
+		}
+		if (alert_counter[cpu] == 5*nmi_hz) {
 			if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
 							== NOTIFY_STOP) {
 				alert_counter[cpu] = 0; 
--- linux/arch/x86_64/kernel/entry.S.orig
+++ linux/arch/x86_64/kernel/entry.S
@@ -929,3 +929,40 @@ ENTRY(machine_check)
 ENTRY(call_debug)
        zeroentry do_call_debug
 
+#ifdef CONFIG_LATENCY_TRACE
+
+ENTRY(mcount)
+	cmpq $0, trace_enabled
+	jz out
+
+	push %rbp
+	mov %rsp,%rbp
+
+	push %r9
+	push %r8
+	push %rdi
+	push %rsi
+	push %rdx
+	push %rcx
+	push %rax
+
+	mov 0x0(%rbp),%rax
+	mov 0x8(%rbp),%rdi
+	mov 0x8(%rax),%rsi
+	
+	call   __trace
+
+	pop %rax
+	pop %rcx
+	pop %rdx
+	pop %rsi
+	pop %rdi
+	pop %r8
+	pop %r9
+
+	leaveq
+out:
+	ret
+
+#endif
+
--- linux/arch/x86_64/kernel/process.c.orig
+++ linux/arch/x86_64/kernel/process.c
@@ -90,7 +90,8 @@ void default_idle(void)
 			safe_halt();
 		else
 			local_irq_enable();
-	}
+	} else
+		local_irq_enable();
 }
 
 /*
@@ -163,9 +164,10 @@ void cpu_idle (void)
 			idle = pm_idle;
 			if (!idle)
 				idle = default_idle;
+			stop_critical_timing();
 			idle();
 		}
-		schedule();
+		__schedule();
 	}
 }
 
@@ -272,7 +274,7 @@ void __show_regs(struct pt_regs * regs)
 void show_regs(struct pt_regs *regs)
 {
 	__show_regs(regs);
-	show_trace(&regs->rsp);
+	show_trace(current, &regs->rsp);
 }
 
 /*
@@ -283,13 +285,14 @@ void exit_thread(void)
 	struct task_struct *me = current;
 	struct thread_struct *t = &me->thread;
 	if (me->thread.io_bitmap_ptr) { 
-		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+		struct tss_struct *tss;
 
 		kfree(t->io_bitmap_ptr);
 		t->io_bitmap_ptr = NULL;
 		/*
 		 * Careful, clear this in the TSS too:
 		 */
+		tss = &per_cpu(init_tss, get_cpu());
 		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
 		t->io_bitmap_max = 0;
 		put_cpu();
--- linux/arch/x86_64/kernel/Makefile.orig
+++ linux/arch/x86_64/kernel/Makefile
@@ -4,11 +4,12 @@
 
 extra-y 	:= head.o head64.o init_task.o vmlinux.lds
 EXTRA_AFLAGS	:= -traditional
-obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o \
+obj-y	:= process.o signal.o entry.o traps.o irq.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
 		x8664_ksyms.o i387.o syscall.o vsyscall.o \
 		setup64.o bootflag.o e820.o reboot.o quirks.o
 
+obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += semaphore.o
 obj-$(CONFIG_X86_MCE)         += mce.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o
 obj-$(CONFIG_MTRR)		+= ../../i386/kernel/cpu/mtrr/
--- linux/arch/x86_64/kernel/traps.c.orig
+++ linux/arch/x86_64/kernel/traps.c
@@ -143,7 +143,7 @@ unsigned long *in_exception_stack(int cp
  * Check and process them in order.
  */
 
-void show_trace(unsigned long *stack)
+void show_trace(struct task_struct *task, unsigned long *stack)
 {
 	unsigned long addr;
 	unsigned long *irqstack, *irqstack_end, *estack_end;
@@ -212,6 +212,7 @@ void show_trace(unsigned long *stack)
 		}
 	}
 	printk("\n");
+	print_traces(task);
 }
 
 void show_stack(struct task_struct *tsk, unsigned long * rsp)
@@ -247,7 +248,7 @@ void show_stack(struct task_struct *tsk,
 			printk("\n       ");
 		printk("%016lx ", *stack++);
 	}
-	show_trace((unsigned long *)rsp);
+	show_trace(tsk, (unsigned long *)rsp);
 }
 
 /*
@@ -256,7 +257,7 @@ void show_stack(struct task_struct *tsk,
 void dump_stack(void)
 {
 	unsigned long dummy;
-	show_trace(&dummy);
+	show_trace(current, &dummy);
 }
 
 EXPORT_SYMBOL(dump_stack);
@@ -327,7 +328,7 @@ void out_of_line_bug(void)
 	BUG(); 
 } 
 
-static spinlock_t die_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_RAW_SPINLOCK(die_lock);
 static int die_owner = -1;
 
 void oops_begin(void)
--- linux/arch/x86_64/kernel/i8259.c.orig
+++ linux/arch/x86_64/kernel/i8259.c
@@ -131,7 +131,7 @@ void (*interrupt[NR_IRQS])(void) = {
  * moves to arch independent land
  */
 
-spinlock_t i8259A_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_RAW_SPINLOCK(i8259A_lock);
 
 static void end_8259A_irq (unsigned int irq)
 {
@@ -455,7 +455,7 @@ device_initcall(i8259A_init_sysfs);
  * IRQ2 is cascade interrupt to second interrupt controller
  */
 
-static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
+static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL};
 
 void __init init_ISA_irqs (void)
 {
--- linux/arch/x86_64/kernel/smp.c.orig
+++ linux/arch/x86_64/kernel/smp.c
@@ -40,7 +40,7 @@
 static cpumask_t flush_cpumask;
 static struct mm_struct * flush_mm;
 static unsigned long flush_va;
-static spinlock_t tlbstate_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_RAW_SPINLOCK(tlbstate_lock);
 #define FLUSH_ALL	0xffffffff
 
 /*
@@ -281,7 +281,7 @@ void smp_send_nmi_allbutself(void)
  * Structure and data for smp_call_function(). This is designed to minimise
  * static memory requirements. It also looks cleaner.
  */
-static spinlock_t call_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_RAW_SPINLOCK(call_lock);
 
 struct call_data_struct {
 	void (*func) (void *info);
--- linux/arch/x86_64/kernel/vsyscall.c.orig
+++ linux/arch/x86_64/kernel/vsyscall.c
@@ -53,7 +53,7 @@
 #define force_inline __attribute__((always_inline)) inline
 
 int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
-seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
+raw_seqlock_t __xtime_lock __section_xtime_lock = RAW_SEQLOCK_UNLOCKED;
 
 #include <asm/unistd.h>
 
--- linux/arch/x86_64/lib/dec_and_lock.c.orig
+++ linux/arch/x86_64/lib/dec_and_lock.c
@@ -10,7 +10,7 @@
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
 
-int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
+int _atomic_dec_and_lock(atomic_t *atomic, raw_spinlock_t *lock)
 {
 	int counter;
 	int newcount;
--- linux/arch/x86_64/lib/thunk.S.orig
+++ linux/arch/x86_64/lib/thunk.S
@@ -43,11 +43,13 @@
 	thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
 #endif	
 	thunk do_softirq_thunk,do_softirq
-	
+
+#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
 	thunk __down_failed,__down
 	thunk_retrax __down_failed_interruptible,__down_interruptible
 	thunk_retrax __down_failed_trylock,__down_trylock
 	thunk __up_wakeup,__up
+#endif
 	
 	/* SAVE_ARGS below is used only for the .cfi directives it contains. */
 	CFI_STARTPROC
--- linux/arch/i386/mm/highmem.c.orig
+++ linux/arch/i386/mm/highmem.c
@@ -17,6 +17,27 @@ void kunmap(struct page *page)
 	kunmap_high(page);
 }
 
+void kunmap_virt(void *ptr)
+{
+	struct page *page;
+
+	if ((unsigned long)ptr < PKMAP_ADDR(0))
+		return;
+	page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]);
+	kunmap(page);
+}
+
+struct page *kmap_to_page(void *ptr)
+{
+	struct page *page;
+
+	if ((unsigned long)ptr < PKMAP_ADDR(0))
+		return virt_to_page(ptr);
+	page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]);
+	return page;
+}
+
+
 /*
  * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
  * no global lock is needed and because the kmap code must perform a global TLB
@@ -25,7 +46,7 @@ void kunmap(struct page *page)
  * However when holding an atomic kmap is is not legal to sleep, so atomic
  * kmaps are appropriate for short, tight code paths only.
  */
-char *kmap_atomic(struct page *page, enum km_type type)
+char *__kmap_atomic(struct page *page, enum km_type type)
 {
 	enum fixed_addresses idx;
 	unsigned long vaddr;
@@ -47,7 +68,7 @@ char *kmap_atomic(struct page *page, enu
 	return (char *)vaddr;
 }
 
-void kunmap_atomic(char *kaddr, enum km_type type)
+void __kunmap_atomic(char *kaddr, enum km_type type)
 {
 #ifdef CONFIG_DEBUG_HIGHMEM
 	unsigned long vaddr = (unsigned long)kaddr & PAGE_MASK;
@@ -77,7 +98,7 @@ void kunmap_atomic(char *kaddr, enum km_
 /* This is the same as kmap_atomic() but can map memory that doesn't
  * have a struct page associated with it.
  */
-char *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
+char *__kmap_atomic_pfn(unsigned long pfn, enum km_type type)
 {
 	enum fixed_addresses idx;
 	unsigned long vaddr;
@@ -92,7 +113,7 @@ char *kmap_atomic_pfn(unsigned long pfn,
 	return (char *)vaddr;
 }
 
-struct page *kmap_atomic_to_page(char *ptr)
+struct page *__kmap_atomic_to_page(char *ptr)
 {
 	unsigned long idx, vaddr = (unsigned long)ptr;
 	pte_t *pte;
--- linux/arch/i386/mm/fault.c.orig
+++ linux/arch/i386/mm/fault.c
@@ -38,6 +38,8 @@ void bust_spinlocks(int yes)
 	int loglevel_save = console_loglevel;
 
 	if (yes) {
+		stop_trace();
+		zap_rt_locks();
 		oops_in_progress = 1;
 		return;
 	}
@@ -213,7 +215,7 @@ fastcall void do_invalid_op(struct pt_re
  *	bit 1 == 0 means read, 1 means write
  *	bit 2 == 0 means kernel, 1 means user-mode
  */
-fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code)
+fastcall notrace void do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
 	struct task_struct *tsk;
 	struct mm_struct *mm;
@@ -225,6 +227,7 @@ fastcall void do_page_fault(struct pt_re
 
 	/* get the address */
 	__asm__("movl %%cr2,%0":"=r" (address));
+	trace_special(regs->eip, error_code, address);
 
 	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
 					SIGSEGV) == NOTIFY_STOP)
@@ -454,9 +457,9 @@ no_context:
 	}
 #endif
 	if (address < PAGE_SIZE)
-		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
+		printk(KERN_ALERT "BUG: Unable to handle kernel NULL pointer dereference");
 	else
-		printk(KERN_ALERT "Unable to handle kernel paging request");
+		printk(KERN_ALERT "BUG: Unable to handle kernel paging request");
 	printk(" at virtual address %08lx\n",address);
 	printk(KERN_ALERT " printing eip:\n");
 	printk("%08lx\n", regs->eip);
--- linux/arch/i386/mm/pgtable.c.orig
+++ linux/arch/i386/mm/pgtable.c
@@ -169,7 +169,7 @@ void pmd_ctor(void *pmd, kmem_cache_t *c
  * recommendations and having no core impact whatsoever.
  * -- wli
  */
-spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_RAW_SPINLOCK(pgd_lock);
 struct page *pgd_list;
 
 static inline void pgd_list_add(pgd_t *pgd)
--- linux/arch/i386/boot/compressed/misc.c.orig
+++ linux/arch/i386/boot/compressed/misc.c
@@ -16,6 +16,12 @@
 #include <asm/io.h>
 #include <asm/segment.h>
 
+#ifdef CONFIG_MCOUNT
+void notrace mcount(void)
+{
+}
+#endif
+
 /*
  * gzip declarations
  */
@@ -113,7 +119,7 @@ static long free_mem_end_ptr;
 #define INPLACE_MOVE_ROUTINE  0x1000
 #define LOW_BUFFER_START      0x2000
 #define LOW_BUFFER_MAX       0x90000
-#define HEAP_SIZE             0x3000
+#define HEAP_SIZE             0x4000
 static unsigned int low_buffer_end, low_buffer_size;
 static int high_loaded =0;
 static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
--- linux/arch/i386/Kconfig.orig
+++ linux/arch/i386/Kconfig
@@ -368,16 +368,6 @@ config X86_L1_CACHE_SHIFT
 	default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2
 	default "6" if MK7 || MK8 || MPENTIUMM
 
-config RWSEM_GENERIC_SPINLOCK
-	bool
-	depends on M386
-	default y
-
-config RWSEM_XCHGADD_ALGORITHM
-	bool
-	depends on !M386
-	default y
-
 config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
@@ -434,7 +424,7 @@ config X86_USE_PPRO_CHECKSUM
 
 config X86_USE_3DNOW
 	bool
-	depends on MCYRIXIII || MK7
+	depends on (MCYRIXIII || MK7) && !PREEMPT_RT
 	default y
 
 config X86_OOSTORE
@@ -510,28 +500,22 @@ config SCHED_SMT
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
-config PREEMPT
-	bool "Preemptible Kernel"
-	help
-	  This option reduces the latency of the kernel when reacting to
-	  real-time or interactive events by allowing a low priority process to
-	  be preempted even if it is in kernel mode executing a system call.
-	  This allows applications to run more reliably even when the system is
-	  under load.
+source "lib/Kconfig.RT"
 
-	  Say Y here if you are building a kernel for a desktop, embedded
-	  or real-time system.  Say N if you are unsure.
+config RWSEM_GENERIC_SPINLOCK
+	bool
+	depends on M386 && !PREEMPT_RT
+	default y
 
-config PREEMPT_BKL
-	bool "Preempt The Big Kernel Lock"
-	depends on PREEMPT || SMP
+config ASM_SEMAPHORES
+	bool
+	depends on !PREEMPT_RT
 	default y
-	help
-	  This option reduces the latency of the kernel by making the
-	  big kernel lock preemptible.
 
-	  Say Y here if you are building a kernel for a desktop system.
-	  Say N if you are unsure.
+config RWSEM_XCHGADD_ALGORITHM
+	bool
+	depends on !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT
+	default y
 
 config X86_UP_APIC
 	bool "Local APIC support on uniprocessors" if !SMP
@@ -883,7 +867,7 @@ config BOOT_IOREMAP
 
 config REGPARM
 	bool "Use register arguments (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on EXPERIMENTAL && !MCOUNT
 	default n
 	help
 	Compile the kernel with -mregparm=3. This uses an different ABI
--- linux/arch/i386/kernel/io_apic.c.orig
+++ linux/arch/i386/kernel/io_apic.c
@@ -37,6 +37,7 @@
 #include <asm/smp.h>
 #include <asm/desc.h>
 #include <asm/timer.h>
+#include <asm/i8259.h>
 
 #include <mach_apic.h>
 
@@ -45,7 +46,7 @@
 int (*ioapic_renumber_irq)(int ioapic, int irq);
 atomic_t irq_mis_count;
 
-static spinlock_t ioapic_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
 
 /*
  *	Is the SiS APIC rmw bug present ?
@@ -127,6 +128,37 @@ static void __init replace_pin_at_irq(un
 	}
 }
 
+/*
+ * Cache the register used by the irq-redirection hotpath (0x10 + pin*2):
+ */
+static unsigned int io_apic_cache[MAX_IO_APICS][NR_IRQS]
+		____cacheline_aligned_in_smp;
+
+static void update_io_apic_cache(unsigned int irq)
+{
+	struct irq_pin_list *entry = irq_2_pin + irq;
+	unsigned int pin;
+
+	for (;;) {	/* walk the irq_2_pin chain of this irq */
+		pin = entry->pin;
+		if (pin == -1)
+			break;
+		io_apic_cache[entry->apic][irq] =
+			io_apic_read(entry->apic, 0x10 + pin*2);
+		if (!entry->next)	/* next == 0: no further entry */
+			break;
+		entry = irq_2_pin + entry->next;
+	}
+}
+
+#define IOAPIC_CACHE
+/*
+ * Some systems need a POST flush or else level-triggered interrupts
+ * generate lots of spurious interrupts due to the POST-ed write not
+ * reaching the IOAPIC before the IRQ is ACK-ed in the local APIC.
+ */
+#define IOAPIC_POSTFLUSH
+
 static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
 {
 	struct irq_pin_list *entry = irq_2_pin + irq;
@@ -136,10 +168,29 @@ static void __modify_IO_APIC_irq (unsign
 		pin = entry->pin;
 		if (pin == -1)
 			break;
+#ifdef IOAPIC_CACHE
+		reg = io_apic_cache[entry->apic][irq];
+		if (unlikely(!reg)) {	/* empty cache slot: read the hardware */
+			reg = io_apic_read(entry->apic, 0x10 + pin*2);
+			io_apic_cache[entry->apic][irq] = reg;
+			printk("hm: ioapic cache empty for irq %d (e:%08lx/d:%08lx) %08x\n",
+				irq, enable, disable, reg);
+		}
+		reg &= ~disable;
+		reg |= enable;
+		io_apic_write(entry->apic, 0x10 + pin*2, reg);
+#else
 		reg = io_apic_read(entry->apic, 0x10 + pin*2);
 		reg &= ~disable;
 		reg |= enable;
 		io_apic_modify(entry->apic, 0x10 + pin*2, reg);
+#endif
+#ifdef IOAPIC_POSTFLUSH
+		/*
+		 * Force POST flush of the write above by reading back:
+		 */
+		reg = *(IO_APIC_BASE(entry->apic)+4);
+#endif
 		if (!entry->next)
 			break;
 		entry = irq_2_pin + entry->next;
@@ -158,18 +209,6 @@ static void __unmask_IO_APIC_irq (unsign
 	__modify_IO_APIC_irq(irq, 0, 0x00010000);
 }
 
-/* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
-{
-	__modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
-}
-
-/* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
-{
-	__modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
-}
-
 static void mask_IO_APIC_irq (unsigned int irq)
 {
 	unsigned long flags;
@@ -1249,6 +1288,7 @@ void __init setup_IO_APIC_irqs(void)
 		io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
 		io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
 		spin_unlock_irqrestore(&ioapic_lock, flags);
+		update_io_apic_cache(irq);
 	}
 	}
 
@@ -1564,7 +1604,6 @@ void print_all_local_APICs (void)
 
 void /*__init*/ print_PIC(void)
 {
-	extern spinlock_t i8259A_lock;
 	unsigned int v;
 	unsigned long flags;
 
@@ -1855,6 +1894,37 @@ static unsigned int startup_level_ioapic
 	return 0; /* don't check for pending */
 }
 
+/*
+ * Level-triggered interrupt handling is different for RT kernels.
+ *
+ * With CONFIG_PREEMPT_HARDIRQS the IRQ is moved (move_irq()), masked in
+ * the IO-APIC and then acked; the IRQ thread then will handle it
+ * (sometime later) and end_level_ioapic_irq() is what unmasks it again.
+ */
+#if defined(CONFIG_PREEMPT_HARDIRQS)
+
+static void mask_and_ack_level_ioapic_irq(unsigned int irq)
+{
+	move_irq(irq);
+	mask_IO_APIC_irq(irq);
+	ack_APIC_irq();
+}
+
+static void end_level_ioapic_irq(unsigned int irq)
+{
+#ifndef CONFIG_PCI_MSI
+	if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)) &&
+							irq_desc[irq].action)
+#endif /* !CONFIG_PCI_MSI */
+		unmask_IO_APIC_irq(irq);
+}
+
+#else /* !CONFIG_PREEMPT_HARDIRQS */
+
+static void mask_and_ack_level_ioapic_irq(unsigned int irq)
+{
+}
+
 static void end_level_ioapic_irq (unsigned int irq)
 {
 	unsigned long v;
@@ -1889,12 +1959,21 @@ static void end_level_ioapic_irq (unsign
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
 		spin_lock(&ioapic_lock);
-		__mask_and_edge_IO_APIC_irq(irq);
-		__unmask_and_level_IO_APIC_irq(irq);
+		/* mask = 1, trigger = 0 */
+		__modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
+		/* mask = 0, trigger = 1 */
+		__modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
 		spin_unlock(&ioapic_lock);
 	}
 }
 
+#endif /* !CONFIG_PREEMPT_HARDIRQS */
+
+static void enable_level_ioapic_irq(unsigned int irq)
+{
+	unmask_IO_APIC_irq(irq);
+}
+
 #ifdef CONFIG_PCI_MSI
 static unsigned int startup_edge_ioapic_vector(unsigned int vector)
 {
@@ -1917,11 +1996,27 @@ static unsigned int startup_level_ioapic
 	return startup_level_ioapic_irq (irq);
 }
 
+static void mask_and_ack_level_ioapic_vector (unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	mask_and_ack_level_ioapic_irq(irq);
+}
+
 static void end_level_ioapic_vector (unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
-	end_level_ioapic_irq(irq);
+#if defined(CONFIG_PREEMPT_HARDIRQS)
+	if (!(irq_desc[vector].status & (IRQ_DISABLED | IRQ_INPROGRESS)) &&
+							irq_desc[vector].action)
+#endif
+		end_level_ioapic_irq(irq);
+}
+
+static void enable_level_ioapic_vector(unsigned int vector)
+{
+	enable_level_ioapic_irq(vector_to_irq(vector));
 }
 
 static void mask_IO_APIC_vector (unsigned int vector)
--- linux/arch/i386/kernel/apic.c.orig
+++ linux/arch/i386/kernel/apic.c
@@ -35,6 +35,7 @@
 #include <asm/desc.h>
 #include <asm/arch_hooks.h>
 #include <asm/hpet.h>
+#include <asm/i8253.h>
 
 #include <mach_apic.h>
 
@@ -874,7 +875,6 @@ fake_ioapic_page:
  */
 static unsigned int __init get_8254_timer_count(void)
 {
-	extern spinlock_t i8253_lock;
 	unsigned long flags;
 
 	unsigned int count;
@@ -1137,6 +1137,7 @@ inline void smp_local_timer_interrupt(st
 	int cpu = smp_processor_id();
 
 	profile_tick(CPU_PROFILING, regs);
+
 	if (--per_cpu(prof_counter, cpu) <= 0) {
 		/*
 		 * The multiplier may have changed since the last time we got
@@ -1182,7 +1183,7 @@ inline void smp_local_timer_interrupt(st
  *   interrupt as well. Thus we cannot inline the local irq ... ]
  */
 
-fastcall void smp_apic_timer_interrupt(struct pt_regs *regs)
+fastcall notrace void smp_apic_timer_interrupt(struct pt_regs *regs)
 {
 	int cpu = smp_processor_id();
 
@@ -1191,6 +1192,8 @@ fastcall void smp_apic_timer_interrupt(s
 	 */
 	irq_stat[cpu].apic_timer_irqs++;
 
+	trace_special(regs->eip, 0, 0);
+
 	/*
 	 * NOTE! We'd better ACK the irq immediately,
 	 * because timer handling can be slow.
--- linux/arch/i386/kernel/i386_ksyms.c.orig
+++ linux/arch/i386/kernel/i386_ksyms.c
@@ -16,6 +16,7 @@
 #include <linux/tty.h>
 #include <linux/highmem.h>
 #include <linux/time.h>
+#include <linux/mc146818rtc.h>
 
 #include <asm/semaphore.h>
 #include <asm/processor.h>
@@ -34,7 +35,6 @@
 #include <asm/kdebug.h>
 
 extern void dump_thread(struct pt_regs *, struct user *);
-extern spinlock_t rtc_lock;
 
 /* This is definitely a GPL-only symbol */
 EXPORT_SYMBOL_GPL(cpu_gdt_table);
@@ -83,10 +83,12 @@ EXPORT_SYMBOL(get_cmos_time);
 EXPORT_SYMBOL(cpu_khz);
 EXPORT_SYMBOL(apm_info);
 
+#ifdef CONFIG_ASM_SEMAPHORES
 EXPORT_SYMBOL(__down_failed);
 EXPORT_SYMBOL(__down_failed_interruptible);
 EXPORT_SYMBOL(__down_failed_trylock);
 EXPORT_SYMBOL(__up_wakeup);
+#endif
 /* Networking helper routines. */
 EXPORT_SYMBOL(csum_partial_copy_generic);
 /* Delay loops */
@@ -138,8 +140,10 @@ EXPORT_SYMBOL(cpu_sibling_map);
 EXPORT_SYMBOL(cpu_data);
 EXPORT_SYMBOL(cpu_online_map);
 EXPORT_SYMBOL(cpu_callout_map);
+#ifdef CONFIG_ASM_SEMAPHORES
 EXPORT_SYMBOL(__write_lock_failed);
 EXPORT_SYMBOL(__read_lock_failed);
+#endif
 
 /* Global SMP stuff */
 EXPORT_SYMBOL(smp_call_function);
@@ -174,17 +178,19 @@ EXPORT_SYMBOL(memcmp);
 
 EXPORT_SYMBOL(register_die_notifier);
 #ifdef CONFIG_HAVE_DEC_LOCK
-EXPORT_SYMBOL(_atomic_dec_and_lock);
+EXPORT_SYMBOL(_atomic_dec_and_raw_spin_lock);
 #endif
 
 EXPORT_SYMBOL(__PAGE_KERNEL);
 
 #ifdef CONFIG_HIGHMEM
 EXPORT_SYMBOL(kmap);
+EXPORT_SYMBOL(kmap_to_page);
 EXPORT_SYMBOL(kunmap);
-EXPORT_SYMBOL(kmap_atomic);
-EXPORT_SYMBOL(kunmap_atomic);
-EXPORT_SYMBOL(kmap_atomic_to_page);
+EXPORT_SYMBOL(__kmap_atomic);
+EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(kunmap_virt);
+EXPORT_SYMBOL(__kmap_atomic_to_page);
 #endif
 
 #if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
--- linux/arch/i386/kernel/semaphore.c.orig
+++ linux/arch/i386/kernel/semaphore.c
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/err.h>
 #include <linux/init.h>
+#include <linux/module.h>
 #include <asm/semaphore.h>
 
 /*
@@ -263,35 +264,10 @@ asm(
 	"ret"
 );
 
-/*
- * rw spinlock fallbacks
- */
-#if defined(CONFIG_SMP)
-asm(
-".section .sched.text\n"
-".align	4\n"
-".globl	__write_lock_failed\n"
-"__write_lock_failed:\n\t"
-	LOCK "addl	$" RW_LOCK_BIAS_STR ",(%eax)\n"
-"1:	rep; nop\n\t"
-	"cmpl	$" RW_LOCK_BIAS_STR ",(%eax)\n\t"
-	"jne	1b\n\t"
-	LOCK "subl	$" RW_LOCK_BIAS_STR ",(%eax)\n\t"
-	"jnz	__write_lock_failed\n\t"
-	"ret"
-);
+int fastcall sem_is_locked(struct semaphore *sem)
+{
+	return (int) atomic_read(&sem->count) < 0; /* NOTE(review): a count of exactly 0 is reported as not locked - confirm intended */
+}
+
+EXPORT_SYMBOL(sem_is_locked);
 
-asm(
-".section .sched.text\n"
-".align	4\n"
-".globl	__read_lock_failed\n"
-"__read_lock_failed:\n\t"
-	LOCK "incl	(%eax)\n"
-"1:	rep; nop\n\t"
-	"cmpl	$1,(%eax)\n\t"
-	"js	1b\n\t"
-	LOCK "decl	(%eax)\n\t"
-	"js	__read_lock_failed\n\t"
-	"ret"
-);
-#endif
--- linux/arch/i386/kernel/timers/timer_hpet.c.orig
+++ linux/arch/i386/kernel/timers/timer_hpet.c
@@ -24,7 +24,7 @@ static unsigned long hpet_last; 	/* hpet
 static unsigned long last_tsc_low;	/* lsb 32 bits of Time Stamp Counter */
 static unsigned long last_tsc_high; 	/* msb 32 bits of Time Stamp Counter */
 static unsigned long long monotonic_base;
-static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
+static DECLARE_RAW_SEQLOCK(monotonic_lock);
 
 /* convert from cycles(64bits) => nanoseconds (64bits)
  *  basic equation:
--- linux/arch/i386/kernel/timers/timer_tsc.c.orig
+++ linux/arch/i386/kernel/timers/timer_tsc.c
@@ -24,6 +24,7 @@
 #include "mach_timer.h"
 
 #include <asm/hpet.h>
+#include <asm/i8253.h>
 
 #ifdef CONFIG_HPET_TIMER
 static unsigned long hpet_usec_quotient;
@@ -35,8 +36,6 @@ static inline void cpufreq_delayed_get(v
 
 int tsc_disable __initdata = 0;
 
-extern spinlock_t i8253_lock;
-
 static int use_tsc;
 /* Number of usecs that the last interrupt was delayed */
 static int delay_at_last_interrupt;
@@ -44,7 +43,7 @@ static int delay_at_last_interrupt;
 static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
 static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */
 static unsigned long long monotonic_base;
-static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
+static DECLARE_SEQLOCK(monotonic_lock);
 
 /* convert from cycles(64bits) => nanoseconds (64bits)
  *  basic equation:
@@ -171,9 +170,9 @@ static void delay_tsc(unsigned long loop
 static void mark_offset_tsc_hpet(void)
 {
 	unsigned long long this_offset, last_offset;
- 	unsigned long offset, temp, hpet_current;
+ 	unsigned long offset, temp, hpet_current, flags;
 
-	write_seqlock(&monotonic_lock);
+	write_seqlock_irqsave(&monotonic_lock, flags);
 	last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
 	/*
 	 * It is important that these two operations happen almost at
@@ -201,7 +200,7 @@ static void mark_offset_tsc_hpet(void)
 	/* update the monotonic base value */
 	this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
 	monotonic_base += cycles_2_ns(this_offset - last_offset);
-	write_sequnlock(&monotonic_lock);
+	write_sequnlock_irqrestore(&monotonic_lock, flags);
 
 	/* calculate delay_at_last_interrupt */
 	/*
@@ -322,7 +321,7 @@ static inline void cpufreq_delayed_get(v
 
 static void mark_offset_tsc(void)
 {
-	unsigned long lost,delay;
+	unsigned long lost,delay, flags, flags2;
 	unsigned long delta = last_tsc_low;
 	int count;
 	int countmp;
@@ -330,7 +329,7 @@ static void mark_offset_tsc(void)
 	unsigned long long this_offset, last_offset;
 	static int lost_count = 0;
 
-	write_seqlock(&monotonic_lock);
+	write_seqlock_irqsave(&monotonic_lock, flags);
 	last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
 	/*
 	 * It is important that these two operations happen almost at
@@ -348,24 +347,26 @@ static void mark_offset_tsc(void)
 
 	rdtsc(last_tsc_low, last_tsc_high);
 
-	spin_lock(&i8253_lock);
-	outb_p(0x00, PIT_MODE);     /* latch the count ASAP */
-
-	count = inb_p(PIT_CH0);    /* read the latched count */
+	spin_lock_irqsave(&i8253_lock, flags2);
+	outb(0x00, PIT_MODE);     /* latch the count ASAP */
+	count = inb(PIT_CH0);    /* read the latched count */
 	count |= inb(PIT_CH0) << 8;
 
+#undef VIA686A_WORKAROUND
 	/*
 	 * VIA686a test code... reset the latch if count > max + 1
 	 * from timer_pit.c - cjb
 	 */
+#ifdef VIA686A_WORKAROUND
 	if (count > LATCH) {
 		outb_p(0x34, PIT_MODE);
 		outb_p(LATCH & 0xff, PIT_CH0);
 		outb(LATCH >> 8, PIT_CH0);
 		count = LATCH - 1;
 	}
+#endif
 
-	spin_unlock(&i8253_lock);
+	spin_unlock_irqrestore(&i8253_lock, flags2);
 
 	if (pit_latch_buggy) {
 		/* get center value of last 3 time lutch */
@@ -418,7 +419,7 @@ static void mark_offset_tsc(void)
 	/* update the monotonic base value */
 	this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
 	monotonic_base += cycles_2_ns(this_offset - last_offset);
-	write_sequnlock(&monotonic_lock);
+	write_sequnlock_irqrestore(&monotonic_lock, flags);
 
 	/* calculate delay_at_last_interrupt */
 	count = ((LATCH-1) - count) * TICK_SIZE;
--- linux/arch/i386/kernel/timers/timer_pm.c.orig
+++ linux/arch/i386/kernel/timers/timer_pm.c
@@ -41,7 +41,7 @@ static u32 offset_tick;
 static u32 offset_delay;
 
 static unsigned long long monotonic_base;
-static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
+static DECLARE_RAW_SEQLOCK(monotonic_lock);
 
 #define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
 
--- linux/arch/i386/kernel/timers/timer_cyclone.c.orig
+++ linux/arch/i386/kernel/timers/timer_cyclone.c
@@ -17,9 +17,9 @@
 #include <asm/io.h>
 #include <asm/pgtable.h>
 #include <asm/fixmap.h>
-#include "io_ports.h"
+#include <asm/i8253.h>
 
-extern spinlock_t i8253_lock;
+#include "io_ports.h"
 
 /* Number of usecs that the last interrupt was delayed */
 static int delay_at_last_interrupt;
@@ -36,7 +36,7 @@ static u32* volatile cyclone_timer;	/* C
 static u32 last_cyclone_low;
 static u32 last_cyclone_high;
 static unsigned long long monotonic_base;
-static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
+static DECLARE_RAW_SEQLOCK(monotonic_lock);
 
 /* helper macro to atomically read both cyclone counter registers */
 #define read_cyclone_counter(low,high) \
--- linux/arch/i386/kernel/timers/timer_pit.c.orig
+++ linux/arch/i386/kernel/timers/timer_pit.c
@@ -15,9 +15,8 @@
 #include <asm/smp.h>
 #include <asm/io.h>
 #include <asm/arch_hooks.h>
+#include <asm/i8253.h>
 
-extern spinlock_t i8259A_lock;
-extern spinlock_t i8253_lock;
 #include "do_timer.h"
 #include "io_ports.h"
 
@@ -166,7 +165,6 @@ struct init_timer_opts __initdata timer_
 
 void setup_pit_timer(void)
 {
-	extern spinlock_t i8253_lock;
 	unsigned long flags;
 
 	spin_lock_irqsave(&i8253_lock, flags);
--- linux/arch/i386/kernel/time.c.orig
+++ linux/arch/i386/kernel/time.c
@@ -67,7 +67,8 @@
 
 #include "io_ports.h"
 
-extern spinlock_t i8259A_lock;
+#include <asm/i8259.h>
+
 int pit_latch_buggy;              /* extern */
 
 #include "do_timer.h"
@@ -80,9 +81,11 @@ unsigned long cpu_khz;	/* Detected as we
 
 extern unsigned long wall_jiffies;
 
-spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(rtc_lock);
+
+#include <asm/i8253.h>
 
-spinlock_t i8253_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_RAW_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
 
 struct timer_opts *cur_timer = &timer_none;
@@ -201,7 +204,7 @@ unsigned long long monotonic_clock(void)
 EXPORT_SYMBOL(monotonic_clock);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
-unsigned long profile_pc(struct pt_regs *regs)
+unsigned long notrace profile_pc(struct pt_regs *regs)
 {
 	unsigned long pc = instruction_pointer(regs);
 
@@ -213,6 +216,19 @@ unsigned long profile_pc(struct pt_regs 
 EXPORT_SYMBOL(profile_pc);
 #endif
 
+#ifdef CONFIG_PREEMPT_HARDIRQS
+
+/*
+ * If the timer interrupt is redirected then this is the minimal
+ * interrupt-context processing we have to do (the timer hook):
+ */
+void direct_timer_interrupt(struct pt_regs *regs)
+{
+	do_timer_interrupt_hook(regs);
+}
+
+#endif /* CONFIG_PREEMPT_HARDIRQS */
+
 /*
  * timer_interrupt() needs to keep up the real-time clock,
  * as well as call the "do_timer()" routine every clocktick
@@ -222,21 +238,24 @@ static inline void do_timer_interrupt(in
 {
 #ifdef CONFIG_X86_IO_APIC
 	if (timer_ack) {
+		unsigned long flags;
 		/*
 		 * Subtle, when I/O APICs are used we have to ack timer IRQ
 		 * manually to reset the IRR bit for do_slow_gettimeoffset().
 		 * This will also deassert NMI lines for the watchdog if run
 		 * on an 82489DX-based system.
 		 */
-		spin_lock(&i8259A_lock);
+		spin_lock_irqsave(&i8259A_lock, flags);
 		outb(0x0c, PIC_MASTER_OCW3);
 		/* Ack the IRQ; AEOI will end it automatically. */
 		inb(PIC_MASTER_POLL);
-		spin_unlock(&i8259A_lock);
+		spin_unlock_irqrestore(&i8259A_lock, flags);
 	}
 #endif
 
+#ifndef CONFIG_PREEMPT_HARDIRQS
 	do_timer_interrupt_hook(regs);
+#endif
 
 	/*
 	 * If we have an externally synchronized Linux clock, then update
--- linux/arch/i386/kernel/apm.c.orig
+++ linux/arch/i386/kernel/apm.c
@@ -228,10 +228,10 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/desc.h>
+#include <asm/i8253.h>
 
 #include "io_ports.h"
 
-extern spinlock_t i8253_lock;
 extern unsigned long get_cmos_time(void);
 extern void machine_real_restart(unsigned char *, int);
 
@@ -1168,8 +1168,7 @@ static void get_time_diff(void)
 static void reinit_timer(void)
 {
 #ifdef INIT_TIMER_AFTER_SUSPEND
-	unsigned long	flags;
-	extern spinlock_t i8253_lock;
+	unsigned long flags;
 
 	spin_lock_irqsave(&i8253_lock, flags);
 	/* set the clock to 100 Hz */
--- linux/arch/i386/kernel/cpu/mtrr/generic.c.orig
+++ linux/arch/i386/kernel/cpu/mtrr/generic.c
@@ -231,7 +231,7 @@ static unsigned long set_mtrr_state(u32 
 
 static unsigned long cr4 = 0;
 static u32 deftype_lo, deftype_hi;
-static spinlock_t set_atomicity_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
 
 static void prepare_set(void)
 {
--- linux/arch/i386/kernel/signal.c.orig
+++ linux/arch/i386/kernel/signal.c
@@ -591,6 +591,13 @@ int fastcall do_signal(struct pt_regs *r
 	int signr;
 	struct k_sigaction ka;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Fully-preemptible kernel does not need interrupts disabled:
+	 */
+	local_irq_enable();
+	preempt_check_resched();
+#endif
 	/*
 	 * We want the common case to go fast, which
 	 * is why we may in certain cases get here from
--- linux/arch/i386/kernel/kgdb_stub.c.orig
+++ linux/arch/i386/kernel/kgdb_stub.c
@@ -365,8 +365,8 @@ __asm__("fn_rtn_stub:\n\t"
 
 #ifdef CONFIG_SMP
 static int in_kgdb_called;
-static spinlock_t waitlocks[MAX_NO_CPUS] =
-    {[0 ... MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED };
+static raw_spinlock_t waitlocks[MAX_NO_CPUS] =
+    {[0 ... MAX_NO_CPUS - 1] = RAW_SPIN_LOCK_UNLOCKED };
 /*
  * The following array has the thread pointer of each of the "other"
  * cpus.  We make it global so it can be seen by gdb.
@@ -374,9 +374,9 @@ static spinlock_t waitlocks[MAX_NO_CPUS]
 volatile int in_kgdb_entry_log[MAX_NO_CPUS];
 volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS];
 /*
-static spinlock_t continuelocks[MAX_NO_CPUS];
+static raw_spinlock_t continuelocks[MAX_NO_CPUS];
 */
-spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED;
+raw_spinlock_t kgdb_spinlock = RAW_SPIN_LOCK_UNLOCKED;
 /* waiters on our spinlock plus us */
 static atomic_t spinlock_waiters = ATOMIC_INIT(1);
 static int spinlock_count = 0;
@@ -2404,7 +2404,7 @@ int kgdb_and_then_count;
 void
 kgdb_tstamp(int line, char *source, int data0, int data1)
 {
-	static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED;
+	static raw_spinlock_t ts_spin = RAW_SPIN_LOCK_UNLOCKED;
 	int flags;
 	kgdb_local_irq_save(flags);
 	spin_lock(&ts_spin);
--- linux/arch/i386/kernel/nmi.c.orig
+++ linux/arch/i386/kernel/nmi.c
@@ -46,7 +46,7 @@ unsigned int nmi_watchdog = NMI_NONE;
 #endif
 
 extern int unknown_nmi_panic;
-static unsigned int nmi_hz = HZ;
+static unsigned int nmi_hz = 1000;
 static unsigned int nmi_perfctr_msr;	/* the MSR to reset in NMI handler */
 static unsigned int nmi_p4_cccr_val;
 extern void show_registers(struct pt_regs *regs);
@@ -122,7 +122,7 @@ int __init check_nmi_watchdog (void)
 	for (cpu = 0; cpu < NR_CPUS; cpu++)
 		prev_nmi_count[cpu] = irq_stat[cpu].__nmi_count;
 	local_irq_enable();
-	mdelay((10*1000)/nmi_hz); // wait 10 ticks
+	mdelay((100*1000)/nmi_hz); // wait 100 ticks
 
 	/* FIXME: Only boot CPU is online at this stage.  Check CPUs
            as they come up. */
@@ -141,7 +141,7 @@ int __init check_nmi_watchdog (void)
 	/* now that we know it works we can reduce NMI frequency to
 	   something more reasonable; makes a difference in some configs */
 	if (nmi_watchdog == NMI_LOCAL_APIC)
-		nmi_hz = 1;
+		nmi_hz = 1000;
 
 	return 0;
 }
@@ -342,8 +342,8 @@ static void setup_k7_watchdog(void)
 		| K7_NMI_EVENT;
 
 	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
-	Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
-	wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
+	Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz*1000/nmi_hz));
+	wrmsr(MSR_K7_PERFCTR0, -(cpu_khz*1000/nmi_hz), -1);
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	evntsel |= K7_EVNTSEL_ENABLE;
 	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
@@ -364,8 +364,8 @@ static void setup_p6_watchdog(void)
 		| P6_NMI_EVENT;
 
 	wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
-	Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
-	wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), 0);
+	Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz*1000/nmi_hz));
+	wrmsr(MSR_P6_PERFCTR0, -(cpu_khz*1000/nmi_hz), 0);
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	evntsel |= P6_EVNTSEL0_ENABLE;
 	wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
@@ -405,8 +405,8 @@ static int setup_p4_watchdog(void)
 
 	wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
 	wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
-	Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
-	wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
+	Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz*1000/nmi_hz));
+	wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz*1000/nmi_hz), -1);
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
 	wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
 	return 1;
@@ -482,7 +482,29 @@ int tune_watchdog = 5*HZ;
 
 extern void die_nmi(struct pt_regs *, const char *msg);
 
-void nmi_watchdog_tick (struct pt_regs * regs)
+int nmi_show_regs[NR_CPUS];	/* 1 = dump requested; nmi_watchdog_tick() clears it */
+
+void nmi_show_all_regs(void)
+{
+	int i;
+
+	if (nmi_watchdog == NMI_NONE)
+		return;
+	if (system_state != SYSTEM_RUNNING) {
+		printk("nmi_show_all_regs(): system state %d, not doing.\n",
+			system_state);
+		return;
+	}
+	/* Flag each online CPU for an NMI register dump, then wait for pickup: */
+	for_each_online_cpu(i)
+		nmi_show_regs[i] = 1;
+	for_each_online_cpu(i)
+		while (nmi_show_regs[i] == 1)
+			barrier();
+}
+static DEFINE_RAW_SPINLOCK(nmi_print_lock);
+
+void notrace nmi_watchdog_tick (struct pt_regs * regs)
 {
 
 	/*
@@ -490,10 +512,17 @@ void nmi_watchdog_tick (struct pt_regs *
 	 * always switch the stack NMI-atomically, it's safe to use
 	 * smp_processor_id().
 	 */
-	int sum, cpu = smp_processor_id();
+	int sum, cpu = _smp_processor_id();
 
 	sum = irq_stat[cpu].apic_timer_irqs;
 
+	if (nmi_show_regs[cpu]) {
+		nmi_show_regs[cpu] = 0;
+		spin_lock(&nmi_print_lock);
+		show_regs(regs);
+		spin_unlock(&nmi_print_lock);
+	}
+
 #ifdef CONFIG_KGDB
 	if (!in_kgdb(regs) && last_irq_sums[cpu] == sum) {
 
@@ -512,6 +541,13 @@ void nmi_watchdog_tick (struct pt_regs *
 			alert_counter[cpu] = 0;
 		}
 #endif
+		if (alert_counter[cpu] == 5*nmi_hz) {
+			int i;
+
+			bust_spinlocks(1);
+			for (i = 0; i < NR_CPUS; i++)
+				nmi_show_regs[i] = 1;
+		}
 		if (alert_counter[cpu] == 5*nmi_hz)
 			die_nmi(regs, "NMI Watchdog detected LOCKUP");
 	} else {
@@ -536,7 +572,7 @@ void nmi_watchdog_tick (struct pt_regs *
 			 * other P6 variant */
 			apic_write(APIC_LVTPC, APIC_DM_NMI);
 		}
-		wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
+		wrmsr(nmi_perfctr_msr, -(cpu_khz*1000/nmi_hz), -1);
 	}
 }
 
--- linux/arch/i386/kernel/entry.S.orig
+++ linux/arch/i386/kernel/entry.S
@@ -189,6 +189,8 @@ ENTRY(resume_userspace)
 
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
+	cmpl $0, kernel_preemption
+	jz restore_all
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
 	jnz restore_all
 need_resched:
@@ -197,9 +199,8 @@ need_resched:
 	jz restore_all
 	testl $IF_MASK,EFLAGS(%esp)     # interrupts off (exception path) ?
 	jz restore_all
-	sti
-	call preempt_schedule
 	cli
+	call preempt_schedule_irq
 	movl $0,TI_preempt_count(%ebp)
 	jmp need_resched
 #endif
@@ -233,6 +234,11 @@ sysenter_past_esp:
 
 	pushl %eax
 	SAVE_ALL
+#ifdef CONFIG_LATENCY_TRACE
+	pushl %edx; pushl %ecx; pushl %ebx; pushl %eax
+	call sys_call
+	popl %eax; popl %ebx; popl %ecx; popl %edx
+#endif
 	GET_THREAD_INFO(%ebp)
 
 	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
@@ -245,6 +251,11 @@ sysenter_past_esp:
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
 	jne syscall_exit_work
+#ifdef CONFIG_LATENCY_TRACE
+	pushl %eax
+	call sys_ret
+	popl %eax
+#endif
 /* if something modifies registers it must also disable sysexit */
 	movl EIP(%esp), %edx
 	movl OLDESP(%esp), %ecx
@@ -257,6 +268,11 @@ sysenter_past_esp:
 ENTRY(system_call)
 	pushl %eax			# save orig_eax
 	SAVE_ALL
+#ifdef CONFIG_LATENCY_TRACE
+	pushl %edx; pushl %ecx; pushl %ebx; pushl %eax
+	call sys_call
+	popl %eax; popl %ebx; popl %ecx; popl %edx
+#endif
 	GET_THREAD_INFO(%ebp)
 					# system call tracing in operation
 	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
@@ -274,6 +290,9 @@ syscall_exit:
 	testw $_TIF_ALLWORK_MASK, %cx	# current->work
 	jne syscall_exit_work
 restore_all:
+#ifdef CONFIG_CRITICAL_TIMING
+	call touch_critical_timing
+#endif
 #ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS
 	movl EFLAGS(%esp), %eax		# mix EFLAGS and CS
 	movb CS(%esp), %al
@@ -287,6 +306,16 @@ restore_all:
 
 resume_kernelX:
 #endif
+#if defined(CONFIG_CRITICAL_IRQSOFF_TIMING) || defined(CONFIG_LATENCY_TRACE)
+	pushl %eax
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+	call trace_irqs_on
+#endif
+#ifdef CONFIG_LATENCY_TRACE
+	call sys_ret
+#endif
+	popl %eax
+#endif
 	RESTORE_ALL
 
 	# perform work that needs to be done immediately before resumption
@@ -295,8 +324,9 @@ work_pending:
 	testb $_TIF_NEED_RESCHED, %cl
 	jz work_notifysig
 work_resched:
-	call schedule
-	cli				# make sure we don't miss an interrupt
+	cli
+	call __schedule
+					# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	movl TI_flags(%ebp), %ecx
@@ -343,6 +373,11 @@ syscall_trace_entry:
 syscall_exit_work:
 	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
 	jz work_pending
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+	pushl %eax
+	call trace_irqs_on
+	popl %eax
+#endif
 	sti				# could let do_syscall_trace() call
 					# schedule() instead
 	movl %esp, %eax
--- linux/arch/i386/kernel/process.c.orig
+++ linux/arch/i386/kernel/process.c
@@ -106,6 +106,7 @@ void default_idle(void)
 		else
 			local_irq_enable();
 	} else {
+		local_irq_enable();
 		cpu_relax();
 	}
 }
@@ -197,9 +198,10 @@ void cpu_idle (void)
 				play_dead();
 
 			irq_stat[cpu].idle_timestamp = jiffies;
+			stop_critical_timing();
 			idle();
 		}
-		schedule();
+		__schedule();
 	}
 }
 
@@ -360,11 +362,16 @@ void exit_thread(void)
 
 	/* The process may have allocated an io port bitmap... nuke it. */
 	if (unlikely(NULL != t->io_bitmap_ptr)) {
-		int cpu = get_cpu();
-		struct tss_struct *tss = &per_cpu(init_tss, cpu);
+		int cpu;
+		struct tss_struct *tss;
+		void *io_bitmap_ptr = t->io_bitmap_ptr;
 
-		kfree(t->io_bitmap_ptr);
 		t->io_bitmap_ptr = NULL;
+		mb();
+		kfree(io_bitmap_ptr);
+
+		cpu = get_cpu();
+		tss = &per_cpu(init_tss, cpu);
 		/*
 		 * Careful, clear this in the TSS too:
 		 */
--- linux/arch/i386/kernel/Makefile.orig
+++ linux/arch/i386/kernel/Makefile
@@ -4,11 +4,12 @@
 
 extra-y := head.o init_task.o vmlinux.lds
 
-obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
+obj-y	:= process.o signal.o entry.o traps.o irq.o vm86.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
 		pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
 		doublefault.o quirks.o
 
+obj-$(CONFIG_ASM_SEMAPHORES)	+= semaphore.o
 obj-y				+= cpu/
 obj-y				+= timers/
 obj-$(CONFIG_ACPI_BOOT)		+= acpi/
@@ -21,6 +22,7 @@ obj-$(CONFIG_MICROCODE)		+= microcode.o
 obj-$(CONFIG_APM)		+= apm.o
 obj-$(CONFIG_X86_SMP)		+= smp.o smpboot.o
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
+obj-$(CONFIG_MCOUNT)		+= mcount-wrapper.o
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
--- linux/arch/i386/kernel/traps.c.orig
+++ linux/arch/i386/kernel/traps.c
@@ -93,7 +93,7 @@ asmlinkage void machine_check(void);
 
 static int kstack_depth_to_print = 24;
 struct notifier_block *i386die_chain;
-static spinlock_t die_notifier_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_RAW_SPINLOCK(die_notifier_lock);
 
 int register_die_notifier(struct notifier_block *nb)
 {
@@ -148,22 +148,27 @@ static inline unsigned long print_contex
 				unsigned long *stack, unsigned long ebp)
 {
 	unsigned long addr;
+#ifndef CONFIG_FRAME_POINTER
+	unsigned long prev_frame;
+#endif
 
-#ifdef	CONFIG_FRAME_POINTER
+#ifdef CONFIG_FRAME_POINTER
 	while (valid_stack_ptr(tinfo, (void *)ebp)) {
 		addr = *(unsigned long *)(ebp + 4);
 		printk(" [<%08lx>] ", addr);
 		print_symbol("%s", addr);
-		printk("\n");
+		printk(" (%ld)\n", *(unsigned long *)ebp - ebp);
 		ebp = *(unsigned long *)ebp;
 	}
 #else
+	prev_frame = (unsigned long)stack;
 	while (valid_stack_ptr(tinfo, stack)) {
 		addr = *stack++;
 		if (__kernel_text_address(addr)) {
 			printk(" [<%08lx>]", addr);
 			print_symbol(" %s", addr);
-			printk("\n");
+			printk(" (%ld)\n", (unsigned long)stack - prev_frame);
+			prev_frame = (unsigned long)stack;
 		}
 	}
 #endif
@@ -195,6 +200,7 @@ void show_trace(struct task_struct *task
 			break;
 		printk(" =======================\n");
 	}
+	print_traces(task);
 }
 
 void show_stack(struct task_struct *task, unsigned long *esp)
@@ -257,8 +263,8 @@ void show_registers(struct pt_regs *regs
 		regs->eax, regs->ebx, regs->ecx, regs->edx);
 	printk("esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
 		regs->esi, regs->edi, regs->ebp, esp);
-	printk("ds: %04x   es: %04x   ss: %04x\n",
-		regs->xds & 0xffff, regs->xes & 0xffff, ss);
+	printk("ds: %04x   es: %04x   ss: %04x   preempt: %08x\n",
+		regs->xds & 0xffff, regs->xes & 0xffff, ss, preempt_count());
 	printk("Process %s (pid: %d, threadinfo=%p task=%p)",
 		current->comm, current->pid, current_thread_info(), current);
 	/*
@@ -329,11 +335,11 @@ bug:
 void die(const char * str, struct pt_regs * regs, long err)
 {
 	static struct {
-		spinlock_t lock;
+		raw_spinlock_t lock;
 		u32 lock_owner;
 		int lock_owner_depth;
 	} die = {
-		.lock =			SPIN_LOCK_UNLOCKED,
+		.lock =			RAW_SPIN_LOCK_UNLOCKED,
 		.lock_owner =		-1,
 		.lock_owner_depth =	0
 	};
@@ -412,6 +418,11 @@ static void do_trap(int trapnr, int sign
 	if (!(regs->xcs & 3))
 		goto kernel_trap;
 
+#ifdef CONFIG_PREEMPT_RT
+	local_irq_enable();
+	preempt_check_resched();
+#endif
+
 	trap_signal: {
 		struct task_struct *tsk = current;
 		tsk->thread.error_code = error_code;
@@ -596,10 +607,11 @@ static void unknown_nmi_error(unsigned c
 	printk("Do you have a strange power saving mode enabled?\n");
 }
 
-static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_RAW_SPINLOCK(nmi_print_lock);
 
 void die_nmi (struct pt_regs *regs, const char *msg)
 {
+	deadlock_trace_off();
 	spin_lock(&nmi_print_lock);
 	/*
 	* We are in trouble anyway, lets at least try
@@ -614,17 +626,19 @@ void die_nmi (struct pt_regs *regs, cons
 	console_silent();
 	spin_unlock(&nmi_print_lock);
 	bust_spinlocks(0);
+	nmi_exit();
 	do_exit(SIGSEGV);
 }
 
-static void default_do_nmi(struct pt_regs * regs)
+static void notrace default_do_nmi(struct pt_regs * regs)
 {
 	unsigned char reason = 0;
 
 	/* Only the BSP gets external NMIs from the system.  */
 	if (!smp_processor_id())
 		reason = get_nmi_reason();
- 
+
+//	trace_special(6, 0, 0);
 	if (!(reason & 0xc0)) {
 		if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
 							== NOTIFY_STOP)
@@ -636,6 +650,7 @@ static void default_do_nmi(struct pt_reg
 		 */
 		if (nmi_watchdog) {
 			nmi_watchdog_tick(regs);
+//			trace_special(6, 1, 0);
 			return;
 		}
 #endif
@@ -655,21 +670,26 @@ static void default_do_nmi(struct pt_reg
 	reassert_nmi();
 }
 
-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
+static notrace int dummy_nmi_callback(struct pt_regs * regs, int cpu)
 {
 	return 0;
 }
  
 static nmi_callback_t nmi_callback = dummy_nmi_callback;
  
-fastcall void do_nmi(struct pt_regs * regs, long error_code)
+fastcall notrace void do_nmi(struct pt_regs * regs, long error_code)
 {
 	int cpu;
 
 	nmi_enter();
+	nmi_trace((unsigned long)do_nmi, regs->eip, regs->eflags);
 
 	cpu = smp_processor_id();
 
+	if (__kernel_text_address(regs->eip) &&
+			*(unsigned char *)regs->eip == 0xf4)
+		regs->eip++;
+
 #ifdef CONFIG_HOTPLUG_CPU
 	if (!cpu_online(cpu)) {
 		nmi_exit();
--- linux/arch/i386/kernel/i8259.c.orig
+++ linux/arch/i386/kernel/i8259.c
@@ -39,7 +39,7 @@
  * moves to arch independent land
  */
 
-spinlock_t i8259A_lock = SPIN_LOCK_UNLOCKED;
+DEFINE_RAW_SPINLOCK(i8259A_lock);
 
 static void end_8259A_irq (unsigned int irq)
 {
@@ -195,14 +195,19 @@ void mask_and_ack_8259A(unsigned int irq
 		goto spurious_8259A_irq;
 	cached_irq_mask |= irqmask;
 
+#undef DO_DUMMY_IMR_READ
 handle_real_irq:
 	if (irq & 8) {
+#ifdef DO_DUMMY_IMR_READ
 		inb(PIC_SLAVE_IMR);	/* DUMMY - (do we need this?) */
+#endif
 		outb(cached_slave_mask, PIC_SLAVE_IMR);
 		outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
 		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
 	} else {
+#ifdef DO_DUMMY_IMR_READ
 		inb(PIC_MASTER_IMR);	/* DUMMY - (do we need this?) */
+#endif
 		outb(cached_master_mask, PIC_MASTER_IMR);
 		outb(0x60+irq,PIC_MASTER_CMD);	/* 'Specific EOI to master */
 	}
@@ -371,7 +376,7 @@ static irqreturn_t math_error_irq(int cp
  * New motherboards sometimes make IRQ 13 be a PCI interrupt,
  * so allow interrupt sharing.
  */
-static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL };
+static struct irqaction fpu_irq = { math_error_irq, SA_NODELAY, CPU_MASK_NONE, "fpu", NULL, NULL };
 
 void __init init_ISA_irqs (void)
 {
--- linux/arch/i386/kernel/smp.c.orig
+++ linux/arch/i386/kernel/smp.c
@@ -251,7 +251,7 @@ inline void send_IPI_mask_sequence(cpuma
 static cpumask_t flush_cpumask;
 static struct mm_struct * flush_mm;
 static unsigned long flush_va;
-static spinlock_t tlbstate_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_RAW_SPINLOCK(tlbstate_lock);
 #define FLUSH_ALL	0xffffffff
 
 /*
@@ -396,7 +396,7 @@ static void flush_tlb_others(cpumask_t c
 
 	while (!cpus_empty(flush_cpumask))
 		/* nothing. lockup detection does not belong here */
-		mb();
+		cpu_relax();
 
 	flush_mm = NULL;
 	flush_va = 0;
@@ -495,6 +495,16 @@ void smp_send_reschedule(int cpu)
 	send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
 }
 
+/*
+ * this function sends a 'reschedule' IPI to all other CPUs.
+ * This is used when RT tasks are starving and other CPUs
+ * might be able to run them:
+ */
+void smp_send_reschedule_allbutself(void)
+{
+	send_IPI_allbutself(RESCHEDULE_VECTOR);
+}
+
 void crash_dump_send_ipi(void)
 {
 	send_IPI_allbutself(CRASH_DUMP_VECTOR);
@@ -504,7 +514,7 @@ void crash_dump_send_ipi(void)
  * Structure and data for smp_call_function(). This is designed to minimise
  * static memory requirements. It also looks cleaner.
  */
-static spinlock_t call_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_RAW_SPINLOCK(call_lock);
 
 struct call_data_struct {
 	void (*func) (void *info);
@@ -607,8 +617,9 @@ void smp_send_stop(void)
  * all the work is done automatically when
  * we return from the interrupt.
  */
-fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
+fastcall notrace void smp_reschedule_interrupt(struct pt_regs *regs)
 {
+	trace_special(regs->eip, 0, 0);
 	ack_APIC_irq();
 }
 
--- linux/arch/i386/kernel/irq.c.orig
+++ linux/arch/i386/kernel/irq.c
@@ -48,7 +48,7 @@ static union irq_ctx *softirq_ctx[NR_CPU
  * SMP cross-CPU interrupts have their own specific
  * handlers).
  */
-fastcall unsigned int do_IRQ(struct pt_regs *regs)
+fastcall notrace unsigned int do_IRQ(struct pt_regs *regs)
 {	
 	/* high bits used in ret_from_ code */
 	int irq = regs->orig_eax & 0xff;
@@ -58,6 +58,7 @@ fastcall unsigned int do_IRQ(struct pt_r
 #endif
 
 	irq_enter();
+	trace_special(regs->eip, irq, 0);
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	/* Debugging check for stack overflow: is there less than 1KB free? */
 	{
@@ -66,12 +67,14 @@ fastcall unsigned int do_IRQ(struct pt_r
 		__asm__ __volatile__("andl %%esp,%0" :
 					"=r" (esp) : "0" (THREAD_SIZE - 1));
 		if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
-			printk("do_IRQ: stack overflow: %ld\n",
+			printk("BUG: do_IRQ: stack overflow: %ld\n",
 				esp - sizeof(struct thread_info));
 			dump_stack();
 		}
 	}
 #endif
+	if (unlikely(!irq))
+		direct_timer_interrupt(regs);
 
 #ifdef CONFIG_4KSTACKS
 
@@ -234,6 +237,7 @@ int show_interrupts(struct seq_file *p, 
 
 		for (action=action->next; action; action = action->next)
 			seq_printf(p, ", %s", action->name);
+		seq_printf(p, "  %d/%d", irq_desc[i].irqs_unhandled, irq_desc[i].irq_count);
 
 		seq_putc(p, '\n');
 skip:
--- linux/arch/i386/kernel/mcount-wrapper.S.orig
+++ linux/arch/i386/kernel/mcount-wrapper.S
@@ -0,0 +1,27 @@
+/*
+ *  linux/arch/i386/kernel/mcount-wrapper.S
+ *
+ *  Copyright (C) 2004 Ingo Molnar
+ */
+
+.globl mcount
+mcount:
+
+	cmpl $0, mcount_enabled
+	jz out
+
+	push %ebp
+	mov %esp, %ebp
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+
+	call __mcount
+
+	popl %edx
+	popl %ecx
+	popl %eax
+	popl %ebp
+out:
+	ret
+
--- linux/arch/i386/mach-voyager/voyager_basic.c.orig
+++ linux/arch/i386/mach-voyager/voyager_basic.c
@@ -30,6 +30,7 @@
 #include <linux/irq.h>
 #include <asm/tlbflush.h>
 #include <asm/arch_hooks.h>
+#include <asm/i8253.h>
 
 /*
  * Power off function, if any
@@ -184,7 +185,6 @@ voyager_timer_interrupt(struct pt_regs *
 		 * and swiftly introduce it to something sharp and
 		 * pointy.  */
 		__u16 val;
-		extern spinlock_t i8253_lock;
 
 		spin_lock(&i8253_lock);
 		
--- linux/arch/i386/mach-voyager/setup.c.orig
+++ linux/arch/i386/mach-voyager/setup.c
@@ -17,7 +17,7 @@ void __init pre_intr_init_hook(void)
 /*
  * IRQ2 is cascade interrupt to second interrupt controller
  */
-static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
+static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL};
 
 void __init intr_init_hook(void)
 {
@@ -40,7 +40,7 @@ void __init trap_init_hook(void)
 {
 }
 
-static struct irqaction irq0  = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL};
+static struct irqaction irq0  = { timer_interrupt, SA_INTERRUPT | SA_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL};
 
 void __init time_init_hook(void)
 {
--- linux/arch/i386/lib/dec_and_lock.c.orig
+++ linux/arch/i386/lib/dec_and_lock.c
@@ -10,7 +10,7 @@
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
 
-int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
+int _atomic_dec_and_raw_spin_lock(atomic_t *atomic, raw_spinlock_t *lock)
 {
 	int counter;
 	int newcount;
@@ -32,9 +32,9 @@ repeat:
 	return 0;
 
 slow_path:
-	spin_lock(lock);
+	_raw_spin_lock(lock);
 	if (atomic_dec_and_test(atomic))
 		return 1;
-	spin_unlock(lock);
+	_raw_spin_unlock(lock);
 	return 0;
 }
--- linux/arch/i386/lib/bitops.c.orig
+++ linux/arch/i386/lib/bitops.c
@@ -68,3 +68,37 @@ int find_next_zero_bit(const unsigned lo
 	return (offset + set + res);
 }
 EXPORT_SYMBOL(find_next_zero_bit);
+
+
+/*
+ * rw spinlock fallbacks
+ */
+#if defined(CONFIG_SMP)
+asm(
+".section .sched.text\n"
+".align	4\n"
+".globl	__write_lock_failed\n"
+"__write_lock_failed:\n\t"
+	LOCK "addl	$" RW_LOCK_BIAS_STR ",(%eax)\n"
+"1:	rep; nop\n\t"
+	"cmpl	$" RW_LOCK_BIAS_STR ",(%eax)\n\t"
+	"jne	1b\n\t"
+	LOCK "subl	$" RW_LOCK_BIAS_STR ",(%eax)\n\t"
+	"jnz	__write_lock_failed\n\t"
+	"ret"
+);
+
+asm(
+".section .sched.text\n"
+".align	4\n"
+".globl	__read_lock_failed\n"
+"__read_lock_failed:\n\t"
+	LOCK "incl	(%eax)\n"
+"1:	rep; nop\n\t"
+	"cmpl	$1,(%eax)\n\t"
+	"js	1b\n\t"
+	LOCK "decl	(%eax)\n\t"
+	"js	__read_lock_failed\n\t"
+	"ret"
+);
+#endif
--- linux/arch/i386/lib/kgdb_serial.c.orig
+++ linux/arch/i386/lib/kgdb_serial.c
@@ -104,9 +104,9 @@ read_data_bfr(struct async_struct *info)
  * but we will just depend on the uart status to help keep that straight.
 
  */
-static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED;
+static raw_spinlock_t uart_interrupt_lock = RAW_SPIN_LOCK_UNLOCKED;
 #ifdef CONFIG_SMP
-extern spinlock_t kgdb_spinlock;
+extern raw_spinlock_t kgdb_spinlock;
 #endif
 
 static int
@@ -343,7 +343,7 @@ program_uart(struct async_struct *info)
  */
 int kgdb_in_isr = 0;
 int kgdb_in_lsr = 0;
-extern spinlock_t kgdb_spinlock;
+extern raw_spinlock_t kgdb_spinlock;
 
 /* Caller takes needed protections */
 
@@ -381,7 +381,7 @@ tty_getDebugChar(void)
 }				/* tty_getDebugChar */
 
 static int count = 3;
-static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED;
+static raw_spinlock_t one_at_atime = RAW_SPIN_LOCK_UNLOCKED;
 
 static int __init
 kgdb_enable_ints(void)
@@ -435,7 +435,7 @@ kgdb_enable_ints_now(void)
 #endif
 		ints_disabled = request_irq(gdb_async_info->state->irq,
 					    gdb_interrupt,
-					    IRQ_T(gdb_async_info),
+					    IRQ_T(gdb_async_info) | SA_NODELAY,
 					    "KGDB-stub", NULL);
 		intprintk(("KGDB: request_irq returned %d\n", ints_disabled));
 	}
--- linux/arch/i386/mach-visws/visws_apic.c.orig
+++ linux/arch/i386/mach-visws/visws_apic.c
@@ -261,11 +261,13 @@ out_unlock:
 static struct irqaction master_action = {
 	.handler =	piix4_master_intr,
 	.name =		"PIIX4-8259",
+	.flags =	SA_NODELAY,
 };
 
 static struct irqaction cascade_action = {
 	.handler = 	no_action,
 	.name =		"cascade",
+	.flags =	SA_NODELAY,
 };
 
 
--- linux/arch/i386/mach-visws/setup.c.orig
+++ linux/arch/i386/mach-visws/setup.c
@@ -112,7 +112,7 @@ void __init pre_setup_arch_hook()
 
 static struct irqaction irq0 = {
 	.handler =	timer_interrupt,
-	.flags =	SA_INTERRUPT,
+	.flags =	SA_INTERRUPT | SA_NODELAY,
 	.name =		"timer",
 };
 
--- linux/arch/i386/mach-default/setup.c.orig
+++ linux/arch/i386/mach-default/setup.c
@@ -27,7 +27,7 @@ void __init pre_intr_init_hook(void)
 /*
  * IRQ2 is cascade interrupt to second interrupt controller
  */
-static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
+static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL};
 
 /**
  * intr_init_hook - post gate setup interrupt initialisation
--- linux/drivers/net/tg3.c.orig
+++ linux/drivers/net/tg3.c
@@ -3070,7 +3070,7 @@ static int tg3_start_xmit(struct sk_buff
 	 * So we really do need to disable interrupts when taking
 	 * tx_lock here.
 	 */
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 	if (!spin_trylock(&tp->tx_lock)) { 
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 		return NETDEV_TX_LOCKED; 
@@ -3230,7 +3230,8 @@ static int tg3_start_xmit(struct sk_buff
 
 out_unlock:
     	mmiowb();
-	spin_unlock_irqrestore(&tp->tx_lock, flags);
+	spin_unlock(&tp->tx_lock);
+	local_irq_restore_nort(flags);	/* pairs with local_irq_save_nort() */
 
 	dev->trans_start = jiffies;
 
--- linux/drivers/net/tulip/tulip_core.c.orig
+++ linux/drivers/net/tulip/tulip_core.c
@@ -1781,6 +1781,7 @@ static void __devexit tulip_remove_one (
 	pci_iounmap(pdev, tp->base_addr);
 	free_netdev (dev);
 	pci_release_regions (pdev);
+	pci_disable_device (pdev);
 	pci_set_drvdata (pdev, NULL);
 
 	/* pci_power_off (pdev, -1); */
--- linux/drivers/net/e1000/e1000_main.c.orig
+++ linux/drivers/net/e1000/e1000_main.c
@@ -1802,10 +1802,10 @@ e1000_xmit_frame(struct sk_buff *skb, st
 	if(adapter->pcix_82544)
 		count += nr_frags;
 
- 	local_irq_save(flags); 
+ 	local_irq_save_nort(flags); 
  	if (!spin_trylock(&adapter->tx_lock)) { 
  		/* Collision - tell upper layer to requeue */ 
- 		local_irq_restore(flags); 
+ 		local_irq_restore_nort(flags); 
  		return NETDEV_TX_LOCKED; 
  	} 
 
--- linux/drivers/net/3c59x.c.orig
+++ linux/drivers/net/3c59x.c
@@ -954,9 +954,13 @@ static void poll_vortex(struct net_devic
 	struct vortex_private *vp = netdev_priv(dev);
 	unsigned long flags;
 	local_save_flags(flags);
+#ifndef CONFIG_PREEMPT_RT
 	local_irq_disable();
+#endif
 	(vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev,NULL);
+#ifndef CONFIG_PREEMPT_RT
 	local_irq_restore(flags);
+#endif
 } 
 #endif
 
@@ -1985,12 +1989,16 @@ static void vortex_tx_timeout(struct net
 			 * Block interrupts because vortex_interrupt does a bare spin_lock()
 			 */
 			unsigned long flags;
+#ifndef CONFIG_PREEMPT_RT
 			local_irq_save(flags);
+#endif
 			if (vp->full_bus_master_tx)
 				boomerang_interrupt(dev->irq, dev, NULL);
 			else
 				vortex_interrupt(dev->irq, dev, NULL);
+#ifndef CONFIG_PREEMPT_RT
 			local_irq_restore(flags);
+#endif
 		}
 	}
 
--- linux/drivers/net/netconsole.c.orig
+++ linux/drivers/net/netconsole.c
@@ -74,10 +74,19 @@ static void write_msg(struct console *co
 		return;
 
 	local_irq_save(flags);
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * A bit hairy. Netconsole uses mutexes (indirectly) and
+	 * thus must have interrupts enabled:
+	 */
+	local_irq_enable();
+#endif
 
 	for(left = len; left; ) {
 		frag = min(left, MAX_PRINT_CHUNK);
+		WARN_ON_RT(irqs_disabled());
 		netpoll_send_udp(&np, msg, frag);
+		WARN_ON_RT(irqs_disabled());
 		msg += frag;
 		left -= frag;
 	}
--- linux/drivers/base/driver.c.orig
+++ linux/drivers/base/driver.c
@@ -79,14 +79,13 @@ void put_driver(struct device_driver * d
  *	since most of the things we have to do deal with the bus
  *	structures.
  *
- *	The one interesting aspect is that we initialize @drv->unload_sem
- *	to a locked state here. It will be unlocked when the driver
- *	reference count reaches 0.
+ *	We init the completion struct here. When the reference
+ *	count reaches zero, complete() is called from driver_release().
  */
 int driver_register(struct device_driver * drv)
 {
 	INIT_LIST_HEAD(&drv->devices);
-	init_MUTEX_LOCKED(&drv->unload_sem);
+	init_completion(&drv->unload_done);
 	return bus_add_driver(drv);
 }
 
@@ -97,18 +96,16 @@ int driver_register(struct device_driver
  *
  *	Again, we pass off most of the work to the bus-level call.
  *
- *	Though, once that is done, we attempt to take @drv->unload_sem.
- *	This will block until the driver refcount reaches 0, and it is
- *	released. Only modular drivers will call this function, and we
+ *	Though, once that is done, we wait until the driver refcount
+ *	reaches 0, and complete() is called in driver_release().
+ *	Only modular drivers will call this function, and we
  *	have to guarantee that it won't complete, letting the driver
  *	unload until all references are gone.
  */
-
 void driver_unregister(struct device_driver * drv)
 {
 	bus_remove_driver(drv);
-	down(&drv->unload_sem);
-	up(&drv->unload_sem);
+	wait_for_completion(&drv->unload_done);
 }
 
 /**
--- linux/drivers/base/bus.c.orig
+++ linux/drivers/base/bus.c
@@ -65,7 +65,7 @@ static struct sysfs_ops driver_sysfs_ops
 static void driver_release(struct kobject * kobj)
 {
 	struct device_driver * drv = to_driver(kobj);
-	up(&drv->unload_sem);
+	complete(&drv->unload_done);
 }
 
 static struct kobj_type ktype_driver = {
--- linux/drivers/char/Kconfig.orig
+++ linux/drivers/char/Kconfig
@@ -730,6 +730,22 @@ config RTC
 	  To compile this driver as a module, choose M here: the
 	  module will be called rtc.
 
+config RTC_HISTOGRAM
+	tristate "Real Time Clock Histogram Support"
+	default y
+	depends on RTC
+	---help---
+	  If you say Y here then the kernel will track the delivery and
+	  wakeup latency of /dev/rtc using tasks and will report a
+	  histogram to the kernel log when the application closes /dev/rtc.
+
+config BLOCKER
+	tristate "Priority Inheritance Debugging (Blocker) Device Support"
+	default y
+	---help---
+	  If you say Y here then a device will be created that the userspace
+	  pi_test suite uses to test and measure kernel locking primitives.
+
 config SGI_DS1286
 	tristate "SGI DS1286 RTC support"
 	depends on SGI_IP22
--- linux/drivers/char/vt.c.orig
+++ linux/drivers/char/vt.c
@@ -2174,6 +2174,13 @@ void vt_console_print(struct console *co
 	if (vcmode != KD_TEXT)
 		goto quit;
 
+	/*
+	 * Skip kernel message from within a critical section going
+	 * to a preemptible console (such as fbcon).
+	 */
+	if (in_atomic_rt() && sw->con_preemptible)
+		goto quit;
+
 	/* undraw cursor first */
 	if (IS_FG)
 		hide_cursor(currcons);
@@ -2817,8 +2824,8 @@ void do_blank_screen(int entering_gfx)
 		return;
 
 	if (vesa_off_interval) {
-		blank_state = blank_vesa_wait,
-		mod_timer(&console_timer, jiffies + vesa_off_interval);
+		blank_state = blank_vesa_wait;
+//		mod_timer(&console_timer, jiffies + vesa_off_interval);
 	}
 
     	if (vesa_blank_mode)
@@ -2848,7 +2855,10 @@ void do_unblank_screen(int leaving_gfx)
 		return; /* but leave console_blanked != 0 */
 
 	if (blankinterval) {
-		mod_timer(&console_timer, jiffies + blankinterval);
+#ifdef CONFIG_PREEMPT_RT
+		local_irq_enable();
+#endif
+//		mod_timer(&console_timer, jiffies + blankinterval);
 		blank_state = blank_normal_wait;
 	}
 
@@ -2891,16 +2901,16 @@ void poke_blanked_console(void)
 	/* This isn't perfectly race free, but a race here would be mostly harmless,
 	 * at worse, we'll do a spurrious blank and it's unlikely
 	 */
-	del_timer(&console_timer);
-	blank_timer_expired = 0;
+//	del_timer(&console_timer);
+//	blank_timer_expired = 0;
 
 	if (ignore_poke || !vt_cons[fg_console] || vt_cons[fg_console]->vc_mode == KD_GRAPHICS)
 		return;
 	if (console_blanked)
 		unblank_screen();
 	else if (blankinterval) {
-		mod_timer(&console_timer, jiffies + blankinterval);
-		blank_state = blank_normal_wait;
+//		mod_timer(&console_timer, jiffies + blankinterval);
+//		blank_state = blank_normal_wait;
 	}
 }
 
--- linux/drivers/char/blocker.c.orig
+++ linux/drivers/char/blocker.c
@@ -0,0 +1,118 @@
+/*
+ * priority inheritance testing device
+ */
+
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+
+#define BLOCKER_MINOR		221
+
+#define BLOCK_IOCTL		4245
+#define BLOCK_SET_DEPTH		4246
+
+#define MAX_LOCK_DEPTH		10
+
+u64 notrace get_cpu_tick(void)
+{
+	u64 tsc;
+#ifdef ARCHARM
+	tsc = *oscr;
+#else
+	__asm__ __volatile__("rdtsc" : "=A" (tsc));
+#endif
+	return tsc;
+}
+
+void notrace loop(int loops)
+{
+	int i;
+
+	for (i = 0; i < loops; i++)
+		get_cpu_tick();
+}
+
+static spinlock_t blocker_lock[MAX_LOCK_DEPTH];
+
+static unsigned int lock_depth = 1;
+
+void do_the_lock_and_loop(unsigned int args)
+{
+	int i, max;
+
+	if (rt_task(current))
+		max = lock_depth;
+	else if (lock_depth > 1)
+		max = (current->pid % lock_depth) + 1;
+	else
+		max = 1;
+
+	/* Always lock from the top down */
+	for (i = max-1; i >= 0; i--)
+		 spin_lock(&blocker_lock[i]);
+	loop(args);
+	for (i = 0; i < max; i++)
+		spin_unlock(&blocker_lock[i]);
+}
+
+static int blocker_open(struct inode *in, struct file *file)
+{
+	printk(KERN_INFO "blocker_open called\n");
+
+	return 0;
+}
+
+static int blocker_ioctl(struct inode *in, struct file *file,
+			 unsigned int cmd, unsigned long args)
+{
+	switch(cmd) {
+	case BLOCK_IOCTL:
+		do_the_lock_and_loop(args);
+		return 0;
+	case BLOCK_SET_DEPTH:
+		if (args >= MAX_LOCK_DEPTH)
+			return -EINVAL;
+		lock_depth = args;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static struct file_operations blocker_fops = {
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
+	.ioctl		= blocker_ioctl,	/* handler uses the (inode, file, cmd, arg) prototype */
+	.open		= blocker_open,
+};
+
+static struct miscdevice blocker_dev =
+{
+	BLOCKER_MINOR,
+	"blocker",
+	&blocker_fops
+};
+
+static int __init blocker_init(void)
+{
+	int i;
+
+	/* Make the locks usable before the device becomes visible. */
+	for (i = 0; i < MAX_LOCK_DEPTH; i++)
+		blocker_lock[i] = SPIN_LOCK_UNLOCKED;
+
+	if (misc_register(&blocker_dev))
+		return -ENODEV;
+	return 0;
+}
+
+void __exit blocker_exit(void)
+{
+	printk(KERN_INFO "blocker device uninstalled\n");
+	misc_deregister(&blocker_dev);
+}
+
+module_init(blocker_init);
+module_exit(blocker_exit);
+
+MODULE_LICENSE("GPL");
+
--- linux/drivers/char/ipmi/ipmi_watchdog.c.orig
+++ linux/drivers/char/ipmi/ipmi_watchdog.c
@@ -372,7 +372,7 @@ static void panic_halt_ipmi_set_timeout(
    when both messages are free. */
 static atomic_t heartbeat_tofree = ATOMIC_INIT(0);
 static DECLARE_MUTEX(heartbeat_lock);
-static DECLARE_MUTEX_LOCKED(heartbeat_wait_lock);
+static DECLARE_MUTEX_NOCHECK(heartbeat_wait_lock);
 static void heartbeat_free_smi(struct ipmi_smi_msg *msg)
 {
     if (atomic_dec_and_test(&heartbeat_tofree))
@@ -931,6 +931,8 @@ static int __init ipmi_wdog_init(void)
 	printk(KERN_INFO PFX "driver version "
 	       IPMI_WATCHDOG_VERSION "\n");
 
+	down(&heartbeat_wait_lock); // initialize as locked
+
 	if (strcmp(action, "reset") == 0) {
 		action_val = WDOG_TIMEOUT_RESET;
 	} else if (strcmp(action, "none") == 0) {
--- linux/drivers/char/tty_io.c.orig
+++ linux/drivers/char/tty_io.c
@@ -226,6 +226,7 @@ static int check_tty_count(struct tty_st
 		printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) "
 				    "!= #fd's(%d) in %s\n",
 		       tty->name, tty->count, count, routine);
+		dump_stack();
 		return count;
        }	
 #endif
@@ -829,8 +830,8 @@ void do_tty_hangup(void *data)
 				p->signal->tty = NULL;
 			if (!p->signal->leader)
 				continue;
-			send_group_sig_info(SIGHUP, SEND_SIG_PRIV, p);
-			send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p);
+			group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p);
+			group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p);
 			if (tty->pgrp > 0)
 				p->signal->tty_old_pgrp = tty->pgrp;
 		} while_each_task_pid(tty->session, PIDTYPE_SID, p);
--- linux/drivers/char/sysrq.c.orig
+++ linux/drivers/char/sysrq.c
@@ -175,6 +175,38 @@ static struct sysrq_key_op sysrq_showreg
 	.enable_mask	= SYSRQ_ENABLE_DUMP,
 };
 
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+ 
+static void sysrq_handle_showlocks(int key, struct pt_regs *pt_regs,
+				   struct tty_struct *tty) 
+{
+	show_all_locks();
+}
+
+static struct sysrq_key_op sysrq_showlocks_op = {
+	.handler	= sysrq_handle_showlocks,
+	.help_msg	= "show-all-locks(D)",
+	.action_msg	= "Show Locks Held",
+};
+
+#endif
+
+#if defined(__i386__)
+ 
+static void sysrq_handle_showallregs(int key, struct pt_regs *pt_regs,
+				     struct tty_struct *tty) 
+{
+	nmi_show_all_regs();
+}
+
+static struct sysrq_key_op sysrq_showallregs_op = {
+	.handler	= sysrq_handle_showallregs,
+	.help_msg	= "showalLcpupc",
+	.action_msg	= "Show Regs On All CPUs",
+};
+
+#endif
+
 
 static void sysrq_handle_showstate(int key, struct pt_regs *pt_regs,
 				   struct tty_struct *tty) 
@@ -290,7 +322,11 @@ static struct sysrq_key_op *sysrq_key_ta
 		 and will never arrive */
 /* b */	&sysrq_reboot_op,
 /* c */ NULL,
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+/* d */	&sysrq_showlocks_op,
+#else
 /* d */	NULL,
+#endif
 /* e */	&sysrq_term_op,
 /* f */	&sysrq_moom_op,
 /* g */	GDB_OP,
@@ -302,7 +338,11 @@ static struct sysrq_key_op *sysrq_key_ta
 #else
 /* k */	NULL,
 #endif
+#if defined(__i386__)
+/* l */	&sysrq_showallregs_op,
+#else
 /* l */	NULL,
+#endif
 /* m */	&sysrq_showmem_op,
 /* n */	&sysrq_unrt_op,
 /* o */	NULL, /* This will often be registered
--- linux/drivers/char/rtc.c.orig
+++ linux/drivers/char/rtc.c
@@ -77,6 +77,7 @@
 #include <linux/sysctl.h>
 #include <linux/wait.h>
 #include <linux/bcd.h>
+#include <linux/delay.h>
 
 #include <asm/current.h>
 #include <asm/uaccess.h>
@@ -86,6 +87,28 @@
 #include <asm/hpet.h>
 #endif
 
+#ifdef CONFIG_RTC_HISTOGRAM
+
+static cycles_t last_interrupt_time;
+
+#include <asm/timex.h>
+
+#define CPU_MHZ		(cpu_khz / 1000)
+
+#define HISTSIZE	10000
+static int histogram[HISTSIZE];
+
+static int rtc_state;
+
+enum rtc_states {
+	S_STARTUP,		/* First round - let the application start */
+	S_IDLE,			/* Waiting for an interrupt */
+	S_WAITING_FOR_READ,	/* Signal delivered. waiting for rtc_read() */
+	S_READ_MISSED,		/* Signal delivered, read() deadline missed */
+};
+
+#endif
+
 #ifdef __sparc__
 #include <linux/pci.h>
 #include <asm/ebus.h>
@@ -204,7 +227,147 @@ static inline unsigned char rtc_is_updat
 	return uip;
 }
 
+#ifndef RTC_IRQ
+# undef CONFIG_RTC_HISTOGRAM
+#endif
+
+static inline void rtc_open_event(void)
+{
+#ifdef CONFIG_RTC_HISTOGRAM
+	int i;
+
+	last_interrupt_time = 0;
+	rtc_state = S_STARTUP;
+	rtc_irq_data = 0;
+
+	for (i = 0; i < HISTSIZE; i++)
+		histogram[i] = 0;
+#endif
+}
+
+static inline void rtc_wake_event(void)
+{
+#ifndef CONFIG_RTC_HISTOGRAM
+	kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+#else
+	if (!(rtc_status & RTC_IS_OPEN))
+		return;
+
+	switch (rtc_state) {
+	/* Startup */
+	case S_STARTUP:
+		kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+		break;
+	/* Waiting for an interrupt */
+	case S_IDLE:
+		kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+		last_interrupt_time = get_cycles();
+		rtc_state = S_WAITING_FOR_READ;
+		break;
+
+	/* Signal has been delivered. waiting for rtc_read() */
+	case S_WAITING_FOR_READ:
+		/*
+		 * Well foo.  The usermode application didn't
+		 * schedule and read in time.
+		 */
+		rtc_state = S_READ_MISSED;
+		printk("`%s'[%d] is being piggy. need_resched=%d, cpu=%d\n",
+			current->comm, current->pid,
+				need_resched(), smp_processor_id());
+		printk("Read missed before next interrupt\n");
+		break;
+	/* Signal has been delivered, read() deadline was missed */
+	case S_READ_MISSED:
+		/*
+		 * Not much we can do here.  We're waiting for the usermode
+		 * application to read the rtc
+		 */
+		break;
+	}
+#endif
+}
+
+static inline void rtc_read_event(void)
+{
+#ifdef CONFIG_RTC_HISTOGRAM
+	cycles_t now = get_cycles();
+
+	switch (rtc_state) {
+	/* Startup */
+	case S_STARTUP:
+		rtc_state = S_IDLE;
+		break;
+		
+	/* Waiting for an interrupt */
+	case S_IDLE:
+		printk("bug in rtc_read(): called in state S_IDLE!\n");
+		break;
+	case S_WAITING_FOR_READ:	/*
+					 * Signal has been delivered.
+					 * waiting for rtc_read()
+					 */
+		/*
+		 * Well done
+		 */
+	case S_READ_MISSED:		/*
+					 * Signal has been delivered, read()
+					 * deadline was missed
+					 */
+		/*
+		 * So, you finally got here.
+		 */
+		if (!last_interrupt_time)
+			printk("bug in rtc_read(): last_interrupt_time = 0\n");
+		rtc_state = S_IDLE;
+		{
+			cycles_t latency = now - last_interrupt_time;
+			unsigned long delta;	/* Microseconds */
+
+			delta = latency;
+			delta /= CPU_MHZ;
+
+			if (delta > 1000 * 1000) {
+				printk("rtc: eek\n");
+			} else {
+				unsigned long slot = delta;
+				if (slot >= HISTSIZE)
+					slot = HISTSIZE - 1;
+				histogram[slot]++;
+				if (delta > 2000)
+					printk("wow!  That was a "
+							"%ld millisec bump\n",
+						delta / 1000);
+			}
+		}
+		rtc_state = S_IDLE;
+		break;
+	}
+#endif
+}
+
+static inline void rtc_close_event(void)
+{
+#ifdef CONFIG_RTC_HISTOGRAM
+	int i = 0;
+	unsigned long total = 0;
+
+	for (i = 0; i < HISTSIZE; i++)
+		total += histogram[i];
+	if (!total)
+		return;
+
+	printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n",
+		current->comm, current->pid, total);
+	for (i = 0; i < HISTSIZE; i++) {
+		if (histogram[i])
+			printk("%d %d\n", i, histogram[i]);
+	}
+#endif
+}
+
 #ifdef RTC_IRQ
+
 /*
  *	A very tiny interrupt handler. It runs with SA_INTERRUPT set,
  *	but there is possibility of conflicting with the set_rtc_mmss()
@@ -248,9 +411,9 @@ irqreturn_t rtc_interrupt(int irq, void 
 	if (rtc_callback)
 		rtc_callback->func(rtc_callback->private_data);
 	spin_unlock(&rtc_task_lock);
-	wake_up_interruptible(&rtc_wait);	
 
-	kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+	rtc_wake_event();
+	wake_up_interruptible(&rtc_wait);	
 
 	return IRQ_HANDLED;
 }
@@ -354,6 +517,8 @@ static ssize_t rtc_read(struct file *fil
 		schedule();
 	} while (1);
 
+	rtc_read_event();
+
 	if (count < sizeof(unsigned long))
 		retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); 
 	else
@@ -583,6 +748,11 @@ static int rtc_do_ioctl(unsigned int cmd
 		save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
 		CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
 
+		/*
+		 * Make CMOS date writes nonpreemptible even on PREEMPT_RT.
+		 * There's a limit to everything! =B-)
+		 */
+		preempt_disable();
 #ifdef CONFIG_MACH_DECSTATION
 		CMOS_WRITE(real_yrs, RTC_DEC_YEAR);
 #endif
@@ -592,6 +762,7 @@ static int rtc_do_ioctl(unsigned int cmd
 		CMOS_WRITE(hrs, RTC_HOURS);
 		CMOS_WRITE(min, RTC_MINUTES);
 		CMOS_WRITE(sec, RTC_SECONDS);
+		preempt_enable();
 
 		CMOS_WRITE(save_control, RTC_CONTROL);
 		CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
@@ -689,6 +860,7 @@ static int rtc_open(struct inode *inode,
 	if(rtc_status & RTC_IS_OPEN)
 		goto out_busy;
 
+	rtc_open_event();
 	rtc_status |= RTC_IS_OPEN;
 
 	rtc_irq_data = 0;
@@ -744,6 +916,7 @@ no_irq:
 	rtc_irq_data = 0;
 	rtc_status &= ~RTC_IS_OPEN;
 	spin_unlock_irq (&rtc_lock);
+	rtc_close_event();
 	return 0;
 }
 
@@ -886,7 +1059,6 @@ static int __init rtc_init(void)
 {
 #if defined(__alpha__) || defined(__mips__)
 	unsigned int year, ctrl;
-	unsigned long uip_watchdog;
 	char *guess = NULL;
 #endif
 #ifdef __sparc__
@@ -989,12 +1161,8 @@ no_irq:
 	/* Each operating system on an Alpha uses its own epoch.
 	   Let's try to guess which one we are using now. */
 	
-	uip_watchdog = jiffies;
 	if (rtc_is_updating() != 0)
-		while (jiffies - uip_watchdog < 2*HZ/100) { 
-			barrier();
-			cpu_relax();
-		}
+		msleep(20);	/* 20ms; msleep() takes milliseconds, not jiffies */
 	
 	spin_lock_irq(&rtc_lock);
 	year = CMOS_READ(RTC_YEAR);
@@ -1211,7 +1379,6 @@ static int rtc_read_proc(char *page, cha
 
 void rtc_get_rtc_time(struct rtc_time *rtc_tm)
 {
-	unsigned long uip_watchdog = jiffies;
 	unsigned char ctrl;
 #ifdef CONFIG_MACH_DECSTATION
 	unsigned int real_year;
@@ -1219,19 +1386,15 @@ void rtc_get_rtc_time(struct rtc_time *r
 
 	/*
 	 * read RTC once any update in progress is done. The update
-	 * can take just over 2ms. We wait 10 to 20ms. There is no need to
+	 * can take just over 2ms. We wait 20ms. There is no need
 	 * to poll-wait (up to 1s - eeccch) for the falling edge of RTC_UIP.
 	 * If you need to know *exactly* when a second has started, enable
 	 * periodic update complete interrupts, (via ioctl) and then 
 	 * immediately read /dev/rtc which will block until you get the IRQ.
 	 * Once the read clears, read the RTC time (again via ioctl). Easy.
 	 */
-
 	if (rtc_is_updating() != 0)
-		while (jiffies - uip_watchdog < 2*HZ/100) {
-			barrier();
-			cpu_relax();
-		}
+		msleep(20);	/* 20ms; msleep() takes milliseconds, not jiffies */
 
 	/*
 	 * Only the values that we read from the RTC are set. We leave
--- linux/drivers/char/Makefile.orig
+++ linux/drivers/char/Makefile
@@ -58,6 +58,7 @@ obj-$(CONFIG_R3964) += n_r3964.o
 obj-$(CONFIG_APPLICOM) += applicom.o
 obj-$(CONFIG_SONYPI) += sonypi.o
 obj-$(CONFIG_RTC) += rtc.o
+obj-$(CONFIG_BLOCKER) += blocker.o
 obj-$(CONFIG_HPET) += hpet.o
 obj-$(CONFIG_GEN_RTC) += genrtc.o
 obj-$(CONFIG_EFI_RTC) += efirtc.o
--- linux/drivers/char/random.c.orig
+++ linux/drivers/char/random.c
@@ -822,8 +822,11 @@ static void add_timer_randomness(struct 
 	preempt_disable();
 	/* if over the trickle threshold, use only 1 in 4096 samples */
 	if ( random_state->entropy_count > trickle_thresh &&
-	     (__get_cpu_var(trickle_count)++ & 0xfff))
-		goto out;
+	     (__get_cpu_var(trickle_count)++ & 0xfff)) {
+		preempt_enable();
+		return;
+	}
+	preempt_enable();
 
 	/*
 	 * Use get_cycles() if implemented, otherwise fall back to
@@ -872,8 +875,6 @@ static void add_timer_randomness(struct 
 		entropy = int_ln_12bits(delta);
 	}
 	batch_entropy_store(num, time, entropy);
-out:
-	preempt_enable();
 }
 
 void add_keyboard_randomness(unsigned char scancode)
--- linux/drivers/input/joystick/analog.c.orig
+++ linux/drivers/input/joystick/analog.c
@@ -141,12 +141,14 @@ struct analog_port {
  */
 
 #ifdef __i386__
+
+#include <asm/i8253.h>
+
 #define GET_TIME(x)	do { if (cpu_has_tsc) rdtscl(x); else x = get_time_pit(); } while (0)
 #define DELTA(x,y)	(cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? CLOCK_TICK_RATE / HZ : 0)))
 #define TIME_NAME	(cpu_has_tsc?"TSC":"PIT")
 static unsigned int get_time_pit(void)
 {
-        extern spinlock_t i8253_lock;
         unsigned long flags;
         unsigned int count;
 
--- linux/drivers/input/gameport/gameport.c.orig
+++ linux/drivers/input/gameport/gameport.c
@@ -37,12 +37,13 @@ static LIST_HEAD(gameport_dev_list);
 
 #ifdef __i386__
 
+#include <asm/i8253.h>
+
 #define DELTA(x,y)      ((y)-(x)+((y)<(x)?1193182/HZ:0))
 #define GET_TIME(x)     do { x = get_time_pit(); } while (0)
 
 static unsigned int get_time_pit(void)
 {
-	extern spinlock_t i8253_lock;
 	unsigned long flags;
 	unsigned int count;
 
--- linux/drivers/scsi/scsi_error.c.orig
+++ linux/drivers/scsi/scsi_error.c
@@ -477,10 +477,12 @@ static void scsi_eh_done(struct scsi_cmn
 static int scsi_send_eh_cmnd(struct scsi_cmnd *scmd, int timeout)
 {
 	struct Scsi_Host *host = scmd->device->host;
-	DECLARE_MUTEX_LOCKED(sem);
+	DECLARE_MUTEX_NOCHECK(sem);
 	unsigned long flags;
 	int rtn = SUCCESS;
 
+	down(&sem);
+
 	/*
 	 * we will use a queued command if possible, otherwise we will
 	 * emulate the queuing and calling of completion function ourselves.
@@ -1624,8 +1626,9 @@ int scsi_error_handler(void *data)
 {
 	struct Scsi_Host *shost = (struct Scsi_Host *) data;
 	int rtn;
-	DECLARE_MUTEX_LOCKED(sem);
+	DECLARE_MUTEX_NOCHECK(sem);
 
+	down(&sem);
 	/*
 	 *    Flush resources
 	 */
--- linux/drivers/scsi/aha152x.c.orig
+++ linux/drivers/scsi/aha152x.c
@@ -1160,11 +1160,13 @@ static void timer_expired(unsigned long 
 static int aha152x_device_reset(Scsi_Cmnd * SCpnt)
 {
 	struct Scsi_Host *shpnt = SCpnt->device->host;
-	DECLARE_MUTEX_LOCKED(sem);
+	DECLARE_MUTEX_NOCHECK(sem);
 	struct timer_list timer;
 	int ret, issued, disconnected;
 	unsigned long flags;
 
+	down(&sem);
+
 #if defined(AHA152X_DEBUG)
 	if(HOSTDATA(shpnt)->debug & debug_eh) {
 		printk(INFO_LEAD "aha152x_device_reset(%p)", CMDINFO(SCpnt), SCpnt);
--- linux/drivers/scsi/qla2xxx/qla_os.c.orig
+++ linux/drivers/scsi/qla2xxx/qla_os.c
@@ -3190,7 +3190,7 @@ qla2x00_free_sp_pool( scsi_qla_host_t *h
 static int
 qla2x00_do_dpc(void *data)
 {
-	DECLARE_MUTEX_LOCKED(sem);
+	DECLARE_MUTEX_NOCHECK(sem);
 	scsi_qla_host_t *ha;
 	fc_port_t	*fcport;
 	os_lun_t        *q;
@@ -3204,6 +3204,8 @@ qla2x00_do_dpc(void *data)
 	int t;
 	os_tgt_t *tq;
 
+	down(&sem);
+
 	ha = (scsi_qla_host_t *)data;
 
 	lock_kernel();
--- linux/drivers/ieee1394/raw1394.c.orig
+++ linux/drivers/ieee1394/raw1394.c
@@ -2529,7 +2529,7 @@ static int raw1394_open(struct inode *in
         fi->state = opened;
         INIT_LIST_HEAD(&fi->req_pending);
         INIT_LIST_HEAD(&fi->req_complete);
-        sema_init(&fi->complete_sem, 0);
+        sema_init_nocheck(&fi->complete_sem, 0);
         spin_lock_init(&fi->reqlists_lock);
         init_waitqueue_head(&fi->poll_wait_complete);
         INIT_LIST_HEAD(&fi->addr_list);
--- linux/drivers/ieee1394/ieee1394_core.c.orig
+++ linux/drivers/ieee1394/ieee1394_core.c
@@ -1003,7 +1003,7 @@ void abort_timedouts(unsigned long __opa
 static int khpsbpkt_pid = -1, khpsbpkt_kill;
 static DECLARE_COMPLETION(khpsbpkt_complete);
 struct sk_buff_head hpsbpkt_queue;
-static DECLARE_MUTEX_LOCKED(khpsbpkt_sig);
+static DECLARE_MUTEX_NOCHECK(khpsbpkt_sig);
 
 
 static void queue_packet_complete(struct hpsb_packet *packet)
@@ -1059,6 +1059,8 @@ static int __init ieee1394_init(void)
 {
 	int i, ret;
 
+	down(&khpsbpkt_sig); // initialize as locked
+
 	skb_queue_head_init(&hpsbpkt_queue);
 
 	/* non-fatal error */
--- linux/drivers/ieee1394/ieee1394_types.h.orig
+++ linux/drivers/ieee1394/ieee1394_types.h
@@ -28,7 +28,7 @@ do {						\
 	spin_lock_init(&(_tp)->lock);		\
 	(_tp)->next = 0;			\
 	(_tp)->allocations = 0;			\
-	sema_init(&(_tp)->count, 63);		\
+	sema_init_nocheck(&(_tp)->count, 63);	\
 } while (0)
 
 
--- linux/drivers/ieee1394/nodemgr.c.orig
+++ linux/drivers/ieee1394/nodemgr.c
@@ -1664,7 +1664,7 @@ static void nodemgr_add_host(struct hpsb
 
 	hi->host = host;
 	init_completion(&hi->exited);
-        sema_init(&hi->reset_sem, 0);
+        sema_init_nocheck(&hi->reset_sem, 0);
 
 	sprintf(hi->daemon_name, "knodemgrd_%d", host->id);
 
--- linux/drivers/video/console/vgacon.c.orig
+++ linux/drivers/video/console/vgacon.c
@@ -53,7 +53,7 @@
 #include <video/vga.h>
 #include <asm/io.h>
 
-static spinlock_t vga_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_RAW_SPINLOCK(vga_lock);
 static struct vgastate state;
 
 #define BLANK 0x0020
--- linux/drivers/video/console/fbcon.c.orig
+++ linux/drivers/video/console/fbcon.c
@@ -1028,7 +1028,6 @@ static void fbcon_clear(struct vc_data *
 {
 	struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]];
 	struct fbcon_ops *ops = info->fbcon_par;
-
 	struct display *p = &fb_display[vc->vc_num];
 	u_int y_break;
 
@@ -1057,10 +1056,11 @@ static void fbcon_putcs(struct vc_data *
 	struct display *p = &fb_display[vc->vc_num];
 	struct fbcon_ops *ops = info->fbcon_par;
 
-	if (!fbcon_is_inactive(vc, info))
+	if (!fbcon_is_inactive(vc, info)) {
 		ops->putcs(vc, info, s, count, real_y(p, ypos), xpos,
 			   get_color(vc, info, scr_readw(s), 1),
 			   get_color(vc, info, scr_readw(s), 0));
+	}
 }
 
 static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos)
@@ -2733,6 +2733,7 @@ const struct consw fb_con = {
 	.con_screen_pos 	= fbcon_screen_pos,
 	.con_getxy 		= fbcon_getxy,
 	.con_resize             = fbcon_resize,
+	.con_preemptible 	= 1,
 };
 
 static struct notifier_block fbcon_event_notifier = {
--- linux/drivers/media/video/saa7134/saa7134-tvaudio.c.orig
+++ linux/drivers/media/video/saa7134/saa7134-tvaudio.c
@@ -943,7 +943,6 @@ int saa7134_tvaudio_getstereo(struct saa
 
 int saa7134_tvaudio_init2(struct saa7134_dev *dev)
 {
-	DECLARE_MUTEX_LOCKED(sem);
 	int (*my_thread)(void *data) = NULL;
 
 	/* enable I2S audio output */
--- linux/drivers/media/dvb/dvb-core/dvb_frontend.c.orig
+++ linux/drivers/media/dvb/dvb-core/dvb_frontend.c
@@ -500,7 +500,7 @@ static void dvb_frontend_stop(struct dvb
 		printk("dvb_frontend_stop: thread PID %d already died\n",
 				fe->thread_pid);
 		/* make sure the mutex was not held by the thread */
-		init_MUTEX (&fe->sem);
+		sema_init_nocheck (&fe->sem, 1);
 		return;
 	}
 
@@ -831,10 +831,10 @@ int dvb_register_frontend(struct dvb_ada
 	if (down_interruptible (&frontend_mutex))
 		return -ERESTARTSYS;
 
-	init_MUTEX (&fe->sem);
+	sema_init_nocheck (&fe->sem, 1);
 	init_waitqueue_head (&fe->wait_queue);
 	init_waitqueue_head (&fe->events.wait_queue);
-	init_MUTEX (&fe->events.sem);
+	sema_init_nocheck (&fe->events.sem, 1);
 	fe->events.eventw = fe->events.eventr = 0;
 	fe->events.overflow = 0;
 	fe->dvb = dvb;
--- linux/drivers/acpi/osl.c.orig
+++ linux/drivers/acpi/osl.c
@@ -837,7 +837,7 @@ acpi_os_create_semaphore(
 		return_ACPI_STATUS (AE_NO_MEMORY);
 	memset(sem, 0, sizeof(struct semaphore));
 
-	sema_init(sem, initial_units);
+	sema_init_nocheck(sem, initial_units);
 
 	*handle = (acpi_handle*)sem;
 
--- linux/drivers/usb/core/hcd.c.orig
+++ linux/drivers/usb/core/hcd.c
@@ -333,7 +333,9 @@ static int rh_call_control (struct usb_h
 	u8		*ubuf = urb->transfer_buffer;
 	int		len = 0;
 	int		patch_wakeup = 0;
+#ifndef CONFIG_PREEMPT_RT
 	unsigned long	flags;
+#endif
 
 	cmd = (struct usb_ctrlrequest *) urb->setup_packet;
 	typeReq  = (cmd->bRequestType << 8) | cmd->bRequest;
@@ -468,9 +470,13 @@ error:
 	}
 
 	/* any errors get returned through the urb completion */
+#ifndef CONFIG_PREEMPT_RT
 	local_irq_save (flags);
+#endif
 	usb_hcd_giveback_urb (hcd, urb, NULL);
+#ifndef CONFIG_PREEMPT_RT
 	local_irq_restore (flags);
+#endif
 	return 0;
 }
 
@@ -520,15 +526,13 @@ static void rh_report_status (unsigned l
 	unsigned long	flags;
 
 	urb = (struct urb *) ptr;
-	local_irq_save (flags);
-	spin_lock (&urb->lock);
+	spin_lock_irqsave (&urb->lock, flags);
 
 	/* do nothing if the urb's been unlinked */
 	if (!urb->dev
 			|| urb->status != -EINPROGRESS
 			|| (hcd = urb->dev->bus->hcpriv) == 0) {
-		spin_unlock (&urb->lock);
-		local_irq_restore (flags);
+		spin_unlock_irqrestore (&urb->lock, flags);
 		return;
 	}
 
@@ -546,12 +550,12 @@ static void rh_report_status (unsigned l
 			mod_timer (&hcd->rh_timer, jiffies + HZ/4);
 	}
 	spin_unlock (&hcd_data_lock);
-	spin_unlock (&urb->lock);
 
 	/* local irqs are always blocked in completions */
 	if (length > 0)
 		usb_hcd_giveback_urb (hcd, urb, NULL);
-	local_irq_restore (flags);
+
+	spin_unlock_irqrestore (&urb->lock, flags);
 }
 
 /*-------------------------------------------------------------------------*/
@@ -577,17 +581,23 @@ static int rh_urb_enqueue (struct usb_hc
 
 static int usb_rh_urb_dequeue (struct usb_hcd *hcd, struct urb *urb)
 {
+#ifndef CONFIG_PREEMPT_RT
 	unsigned long	flags;
+#endif
 
 	/* note:  always a synchronous unlink */
 	if ((unsigned long) urb == hcd->rh_timer.data) {
 		del_timer_sync (&hcd->rh_timer);
 		hcd->rh_timer.data = 0;
 
+#ifndef CONFIG_PREEMPT_RT
 		local_irq_save (flags);
+#endif
 		urb->hcpriv = NULL;
 		usb_hcd_giveback_urb (hcd, urb, NULL);
+#ifndef CONFIG_PREEMPT_RT
 		local_irq_restore (flags);
+#endif
 
 	} else if (usb_pipeendpoint(urb->pipe) == 0) {
 		spin_lock_irq(&urb->lock);	/* from usb_kill_urb */
@@ -1333,9 +1343,10 @@ static void hcd_endpoint_disable (struct
 
 	WARN_ON (!HCD_IS_RUNNING (hcd->state) && hcd->state != USB_STATE_HALT);
 
-	local_irq_disable ();
-
 rescan:
+#ifndef CONFIG_PREEMPT_RT
+	local_irq_disable();
+#endif
 	/* (re)block new requests, as best we can */
 	if (endpoint & USB_DIR_IN)
 		udev->epmaxpacketin [epnum] = 0;
@@ -1343,7 +1354,7 @@ rescan:
 		udev->epmaxpacketout [epnum] = 0;
 
 	/* then kill any current requests */
-	spin_lock (&hcd_data_lock);
+	spin_lock_irq (&hcd_data_lock);
 	list_for_each_entry (urb, &dev->urb_list, urb_list) {
 		int	tmp = urb->pipe;
 
@@ -1362,13 +1373,13 @@ rescan:
 		if (urb->status != -EINPROGRESS)
 			continue;
 		usb_get_urb (urb);
-		spin_unlock (&hcd_data_lock);
+		spin_unlock_irq (&hcd_data_lock);
 
-		spin_lock (&urb->lock);
+		spin_lock_irq (&urb->lock);
 		tmp = urb->status;
 		if (tmp == -EINPROGRESS)
 			urb->status = -ESHUTDOWN;
-		spin_unlock (&urb->lock);
+		spin_unlock_irq (&urb->lock);
 
 		/* kick hcd unless it's already returning this */
 		if (tmp == -EINPROGRESS) {
@@ -1391,8 +1402,7 @@ rescan:
 		/* list contents may have changed */
 		goto rescan;
 	}
-	spin_unlock (&hcd_data_lock);
-	local_irq_enable ();
+	spin_unlock_irq (&hcd_data_lock);
 
 	/* synchronize with the hardware, so old configuration state
 	 * clears out immediately (and will be freed).
--- linux/drivers/block/loop.c.orig
+++ linux/drivers/block/loop.c
@@ -378,7 +378,7 @@ static void loop_add_bio(struct loop_dev
 		lo->lo_bio = lo->lo_biotail = bio;
 	spin_unlock_irqrestore(&lo->lo_lock, flags);
 
-	up(&lo->lo_bh_mutex);
+	complete(&lo->lo_bh_done);
 }
 
 /*
@@ -427,7 +427,7 @@ static int loop_make_request(request_que
 	return 0;
 err:
 	if (atomic_dec_and_test(&lo->lo_pending))
-		up(&lo->lo_bh_mutex);
+		complete(&lo->lo_bh_done);
 out:
 	bio_io_error(old_bio, old_bio->bi_size);
 	return 0;
@@ -495,12 +495,12 @@ static int loop_thread(void *data)
 	/*
 	 * up sem, we are running
 	 */
-	up(&lo->lo_sem);
+	complete(&lo->lo_done);
 
 	for (;;) {
-		down_interruptible(&lo->lo_bh_mutex);
+		wait_for_completion_interruptible(&lo->lo_bh_done);
 		/*
-		 * could be upped because of tear-down, not because of
+		 * could be completed because of tear-down, not because of
 		 * pending work
 		 */
 		if (!atomic_read(&lo->lo_pending))
@@ -521,7 +521,7 @@ static int loop_thread(void *data)
 			break;
 	}
 
-	up(&lo->lo_sem);
+	complete(&lo->lo_done);
 	return 0;
 }
 
@@ -731,7 +731,7 @@ static int loop_set_fd(struct loop_devic
 	set_blocksize(bdev, lo_blocksize);
 
 	kernel_thread(loop_thread, lo, CLONE_KERNEL);
-	down(&lo->lo_sem);
+	wait_for_completion(&lo->lo_done);
 	return 0;
 
  out_putf:
@@ -796,10 +796,10 @@ static int loop_clr_fd(struct loop_devic
 	spin_lock_irq(&lo->lo_lock);
 	lo->lo_state = Lo_rundown;
 	if (atomic_dec_and_test(&lo->lo_pending))
-		up(&lo->lo_bh_mutex);
+		complete(&lo->lo_bh_done);
 	spin_unlock_irq(&lo->lo_lock);
 
-	down(&lo->lo_sem);
+	wait_for_completion(&lo->lo_done);
 
 	lo->lo_backing_file = NULL;
 
@@ -1176,8 +1176,8 @@ int __init loop_init(void)
 		if (!lo->lo_queue)
 			goto out_mem4;
 		init_MUTEX(&lo->lo_ctl_mutex);
-		init_MUTEX_LOCKED(&lo->lo_sem);
-		init_MUTEX_LOCKED(&lo->lo_bh_mutex);
+		init_completion(&lo->lo_done);
+		init_completion(&lo->lo_bh_done);
 		lo->lo_number = i;
 		spin_lock_init(&lo->lo_lock);
 		disk->major = LOOP_MAJOR;
--- linux/drivers/block/ll_rw_blk.c.orig
+++ linux/drivers/block/ll_rw_blk.c
@@ -1214,7 +1214,9 @@ static int ll_merge_requests_fn(request_
  */
 void blk_plug_device(request_queue_t *q)
 {
+#ifndef CONFIG_PREEMPT_RT
 	WARN_ON(!irqs_disabled());
+#endif
 
 	/*
 	 * don't plug a stopped queue, it must be paired with blk_start_queue()
@@ -1235,7 +1237,9 @@ EXPORT_SYMBOL(blk_plug_device);
  */
 int blk_remove_plug(request_queue_t *q)
 {
+#ifndef CONFIG_PREEMPT_RT
 	WARN_ON(!irqs_disabled());
+#endif
 
 	if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
 		return 0;
--- linux/drivers/ide/ide-iops.c.orig
+++ linux/drivers/ide/ide-iops.c
@@ -767,13 +767,11 @@ int ide_driveid_update (ide_drive_t *dri
 		printk("%s: CHECK for good STATUS\n", drive->name);
 		return 0;
 	}
-	local_irq_save(flags);
-	SELECT_MASK(drive, 0);
 	id = kmalloc(SECTOR_WORDS*4, GFP_ATOMIC);
-	if (!id) {
-		local_irq_restore(flags);
+	if (!id)
 		return 0;
-	}
+	local_irq_save(flags);
+	SELECT_MASK(drive, 0);
 	ata_input_data(drive, id, SECTOR_WORDS);
 	(void) hwif->INB(IDE_STATUS_REG);	/* clear drive IRQ */
 	local_irq_enable();
--- linux/drivers/ide/ide.c.orig
+++ linux/drivers/ide/ide.c
@@ -175,7 +175,7 @@ static int system_bus_speed;	/* holds wh
 static int initializing;	/* set while initializing built-in drivers */
 
 DECLARE_MUTEX(ide_cfg_sem);
-spinlock_t ide_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+DEFINE_SPINLOCK(ide_lock);
 
 #ifdef CONFIG_BLK_DEV_IDEPCI
 static int ide_scan_direction; /* THIS was formerly 2.2.x pci=reverse */
--- linux/drivers/ide/ide-taskfile.c.orig
+++ linux/drivers/ide/ide-taskfile.c
@@ -271,7 +271,7 @@ static void ide_pio_sector(ide_drive_t *
 	ide_hwif_t *hwif = drive->hwif;
 	struct scatterlist *sg = hwif->sg_table;
 	struct page *page;
-#ifdef CONFIG_HIGHMEM
+#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_PREEMPT_RT)
 	unsigned long flags;
 #endif
 	unsigned int offset;
@@ -284,7 +284,7 @@ static void ide_pio_sector(ide_drive_t *
 	page = nth_page(page, (offset >> PAGE_SHIFT));
 	offset %= PAGE_SIZE;
 
-#ifdef CONFIG_HIGHMEM
+#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_PREEMPT_RT)
 	local_irq_save(flags);
 #endif
 	buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset;
@@ -304,7 +304,7 @@ static void ide_pio_sector(ide_drive_t *
 		taskfile_input_data(drive, buf, SECTOR_WORDS);
 
 	kunmap_atomic(buf, KM_BIO_SRC_IRQ);
-#ifdef CONFIG_HIGHMEM
+#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_PREEMPT_RT)
 	local_irq_restore(flags);
 #endif
 }
@@ -457,8 +457,10 @@ ide_startstop_t pre_task_out_intr (ide_d
 		return startstop;
 	}
 
+#ifndef CONFIG_PREEMPT_RT
 	if (!drive->unmask)
 		local_irq_disable();
+#endif
 
 	ide_set_handler(drive, &task_out_intr, WAIT_WORSTCASE, NULL);
 	ide_pio_datablock(drive, rq, 1);
--- linux/drivers/ide/legacy/hd.c.orig
+++ linux/drivers/ide/legacy/hd.c
@@ -156,11 +156,13 @@ else \
 
 
 #if (HD_DELAY > 0)
+
+#include <asm/i8253.h>
+
 unsigned long last_req;
 
 unsigned long read_timer(void)
 {
-        extern spinlock_t i8253_lock;
 	unsigned long t, flags;
 	int i;
 
--- linux/drivers/ide/ide-io.c.orig
+++ linux/drivers/ide/ide-io.c
@@ -115,6 +115,9 @@ static int __ide_end_request(ide_drive_t
 	int ret = 1;
 
 	BUG_ON(!(rq->flags & REQ_STARTED));
+	spin_unlock(&ide_lock);
+	if (drive->unmask)
+		local_irq_enable();
 
 	/*
 	 * if failfast is set on a request, override number of sectors and
@@ -136,6 +139,7 @@ static int __ide_end_request(ide_drive_t
 	}
 
 	if (!end_that_request_first(rq, uptodate, nr_sectors)) {
+		spin_lock_irq(&ide_lock);
 		add_disk_randomness(rq->rq_disk);
 
 		if (blk_rq_tagged(rq))
@@ -145,7 +149,8 @@ static int __ide_end_request(ide_drive_t
 		HWGROUP(drive)->rq = NULL;
 		end_that_request_last(rq);
 		ret = 0;
-	}
+	} else
+		spin_lock_irq(&ide_lock);
 	return ret;
 }
 
@@ -1072,7 +1077,9 @@ static void ide_do_request (ide_hwgroup_
 	ide_get_lock(ide_intr, hwgroup);
 
 	/* caller must own ide_lock */
+#ifndef CONFIG_PREEMPT_RT
 	BUG_ON(!irqs_disabled());
+#endif
 
 	while (!hwgroup->busy) {
 		hwgroup->busy = 1;
@@ -1330,7 +1337,7 @@ void ide_timer_expiry (unsigned long dat
 #endif /* DISABLE_IRQ_NOSYNC */
 			/* local CPU only,
 			 * as if we were handling an interrupt */
-			local_irq_disable();
+//			local_irq_disable();
 			if (hwgroup->poll_timeout != 0) {
 				startstop = handler(drive);
 			} else if (drive_is_ready(drive)) {
--- linux/include/net/protocol.h.orig
+++ linux/include/net/protocol.h
@@ -79,9 +79,11 @@ struct inet_protosw {
 
 extern struct net_protocol *inet_protocol_base;
 extern struct net_protocol *inet_protos[MAX_INET_PROTOS];
+extern rwlock_t inet_proto_lock;
 
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 extern struct inet6_protocol *inet6_protos[MAX_INET_PROTOS];
+extern rwlock_t inet6_proto_lock;
 #endif
 
 extern int	inet_add_protocol(struct net_protocol *prot, unsigned char num);
--- linux/include/net/sock.h.orig
+++ linux/include/net/sock.h
@@ -594,12 +594,12 @@ static __inline__ void sk_set_owner(stru
 /* Called with local bh disabled */
 static __inline__ void sock_prot_inc_use(struct proto *prot)
 {
-	prot->stats[smp_processor_id()].inuse++;
+	prot->stats[_smp_processor_id()].inuse++;
 }
 
 static __inline__ void sock_prot_dec_use(struct proto *prot)
 {
-	prot->stats[smp_processor_id()].inuse--;
+	prot->stats[_smp_processor_id()].inuse--;
 }
 
 /* About 10 seconds */
@@ -706,8 +706,8 @@ extern void FASTCALL(lock_sock(struct so
 extern void FASTCALL(release_sock(struct sock *sk));
 
 /* BH context may only use the following locking interface. */
-#define bh_lock_sock(__sk)	spin_lock(&((__sk)->sk_lock.slock))
-#define bh_unlock_sock(__sk)	spin_unlock(&((__sk)->sk_lock.slock))
+#define bh_lock_sock(__sk)	do { spin_lock(&((__sk)->sk_lock.slock)); } while (0)
+#define bh_unlock_sock(__sk)	do { spin_unlock(&((__sk)->sk_lock.slock)); } while (0)
 
 extern struct sock *		sk_alloc(int family, int priority, int zero_it,
 					 kmem_cache_t *slab);
--- linux/include/asm-generic/tlb-simple.h.orig
+++ linux/include/asm-generic/tlb-simple.h
@@ -0,0 +1,53 @@
+#ifndef _TLB_SIMPLE_H
+#define _TLB_SIMPLE_H
+
+/*
+ * Simple, preemptible TLB flush: no batching, the gather handle is the mm.
+ */
+
+# include <linux/config.h>
+# include <linux/swap.h>
+# include <asm/pgalloc.h>
+# include <asm/tlbflush.h>
+
+struct mmu_gather {
+	void *self;	/* never dereferenced in this header */
+};
+
+/*
+ * The mm pointer itself doubles as the tlb pointer, so we don't
+ * have to allocate anything in tlb_gather_mmu(); tlb_mm() casts it back:
+ */
+#define tlb_mm(tlb) ((struct mm_struct *)(tlb))
+
+static inline struct mmu_gather *
+tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
+{
+	return (struct mmu_gather *)mm;	/* full_mm_flush is ignored */
+}
+static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+{
+	free_page_and_swap_cache(page);	/* freed right away, not queued */
+}
+static inline void tlb_free(struct mmu_gather *tlb)
+{
+	if (tlb_mm(tlb)->rss)	/* only decrement a non-zero rss */
+		tlb_mm(tlb)->rss--;
+}
+
+# define tlb_remove_tlb_entry __tlb_remove_tlb_entry
+# define pmd_free_tlb __pmd_free_tlb
+# define pte_free_tlb __pte_free_tlb
+# define tlb_migrate_finish(mm) do { } while (0)
+# define tlb_is_full_mm(tlb) 1
+
+#define pgd_free_tlb(tlb, pgdp) __pgd_free_tlb(tlb, pgdp)
+
+static inline void
+tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
+{
+	flush_tlb_mm(tlb_mm(tlb));	/* start/end are not used here */
+	check_pgt_cache();
+}
+
+#endif /* _TLB_SIMPLE_H */
--- linux/include/asm-generic/tlb.h.orig
+++ linux/include/asm-generic/tlb.h
@@ -44,6 +44,11 @@ struct mmu_gather {
 	struct page *		pages[FREE_PTE_NR];
 };
 
+/*
+ * Some architectures might want to store mm in the tlb pointer itself.
+ */
+#define tlb_mm(tlb) ((tlb)->mm)
+
 /* Users of the generic TLB shootdown code must declare this storage space. */
 DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
 
@@ -53,7 +58,7 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_g
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *tlb = &per_cpu(mmu_gathers, smp_processor_id());
+	struct mmu_gather *tlb = &get_cpu_var(mmu_gathers);
 
 	tlb->mm = mm;
 
@@ -94,6 +99,7 @@ tlb_finish_mmu(struct mmu_gather *tlb, u
 		freed = rss;
 	mm->rss = rss - freed;
 	tlb_flush_mmu(tlb, start, end);
+	put_cpu_var(mmu_gathers);
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
@@ -105,6 +111,15 @@ tlb_is_full_mm(struct mmu_gather *tlb)
 	return tlb->fullmm;
 }
 
+/* tlb_free
+ *	this counts the number of pages we have to take off the RSS
+ *	at flush time.
+ */
+static inline void tlb_free(struct mmu_gather *tlb)
+{
+	tlb->freed++;
+}
+
 /* tlb_remove_page
  *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
  *	handling the additional races in SMP caused by other CPUs caching valid
--- linux/include/asm-generic/percpu.h.orig
+++ linux/include/asm-generic/percpu.h
@@ -10,11 +10,23 @@ extern unsigned long __per_cpu_offset[NR
 /* Separate out the type, so (int[3], foo) works. */
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_LOCKED(type, name) \
+    __attribute__((__section__(".data.percpu"))) spinlock_t per_cpu_lock__##name##_locked = SPIN_LOCK_UNLOCKED; \
+    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name##_locked
 
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]))
 #define __get_cpu_var(var) per_cpu(var, smp_processor_id())
 
+#define per_cpu_lock(var, cpu) \
+	(*RELOC_HIDE(&per_cpu_lock__##var##_locked, __per_cpu_offset[cpu]))
+#define per_cpu_var_locked(var, cpu) \
+		(*RELOC_HIDE(&per_cpu__##var##_locked, __per_cpu_offset[cpu]))
+#define __get_cpu_lock(var, cpu) \
+		per_cpu_lock(var, cpu)
+#define __get_cpu_var_locked(var, cpu) \
+		per_cpu_var_locked(var, cpu)
+
 /* A macro to avoid #include hell... */
 #define percpu_modcopy(pcpudst, src, size)			\
 do {								\
@@ -29,8 +41,14 @@ do {								\
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_LOCKED(type, name) \
+    spinlock_t per_cpu_lock__##name##_locked = SPIN_LOCK_UNLOCKED; \
+    __typeof__(type) per_cpu__##name##_locked
+
 #define per_cpu(var, cpu)			(*((void)cpu, &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
+#define __get_cpu_lock(var, cpu)		per_cpu_lock__##var##_locked
+#define __get_cpu_var_locked(var, cpu)		per_cpu__##var##_locked
 
 #endif	/* SMP */
 
--- linux/include/asm-generic/bug.h.orig
+++ linux/include/asm-generic/bug.h
@@ -25,10 +25,16 @@
 #ifndef HAVE_ARCH_WARN_ON
 #define WARN_ON(condition) do { \
 	if (unlikely((condition)!=0)) { \
-		printk("Badness in %s at %s:%d\n", __FUNCTION__, __FILE__, __LINE__); \
+		printk("%s/%d: BUG in %s at %s:%d\n", current->comm, current->pid,__FUNCTION__, __FILE__, __LINE__); \
 		dump_stack(); \
 	} \
 } while (0)
 #endif
 
+#ifdef CONFIG_PREEMPT_RT
+# define WARN_ON_RT(condition) WARN_ON(condition)
+#else
+# define WARN_ON_RT(condition) do { } while (0)
+#endif
+
 #endif
--- linux/include/linux/loop.h.orig
+++ linux/include/linux/loop.h
@@ -58,9 +58,9 @@ struct loop_device {
 	struct bio 		*lo_bio;
 	struct bio		*lo_biotail;
 	int			lo_state;
-	struct semaphore	lo_sem;
+	struct completion	lo_done;
+	struct completion	lo_bh_done;
 	struct semaphore	lo_ctl_mutex;
-	struct semaphore	lo_bh_mutex;
 	atomic_t		lo_pending;
 
 	request_queue_t		*lo_queue;
--- linux/include/linux/workqueue.h.orig
+++ linux/include/linux/workqueue.h
@@ -54,6 +54,8 @@ extern struct workqueue_struct *__create
 						    int singlethread);
 #define create_workqueue(name) __create_workqueue((name), 0)
 #define create_singlethread_workqueue(name) __create_workqueue((name), 1)
+extern void set_workqueue_prio(struct workqueue_struct *wq, int policy,
+				int rt_priority, int nice);
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
--- linux/include/linux/rwsem.h.orig
+++ linux/include/linux/rwsem.h
@@ -9,6 +9,10 @@
 
 #include <linux/linkage.h>
 
+#ifdef CONFIG_PREEMPT_RT
+# include <linux/rt_lock.h>
+#else
+
 #define RWSEM_DEBUG 0
 
 #ifdef __KERNEL__
@@ -112,4 +116,5 @@ static inline void downgrade_write(struc
 }
 
 #endif /* __KERNEL__ */
+#endif /* PREEMPT_RT */
 #endif /* _LINUX_RWSEM_H */
--- linux/include/linux/irq.h.orig
+++ linux/include/linux/irq.h
@@ -17,6 +17,7 @@
 #include <linux/cache.h>
 #include <linux/spinlock.h>
 #include <linux/cpumask.h>
+#include <linux/wait.h>
 
 #include <asm/irq.h>
 #include <asm/ptrace.h>
@@ -33,6 +34,15 @@
 #define IRQ_LEVEL	64	/* IRQ level triggered */
 #define IRQ_MASKED	128	/* IRQ masked - shouldn't be seen again */
 #define IRQ_PER_CPU	256	/* IRQ is per CPU */
+#define IRQ_NODELAY	512	/* IRQ must run immediately */
+
+/*
+ * Not used on any of the architectures, but feel free to provide
+ * your own per-arch one:
+ */
+#ifndef SA_NODELAY
+# define SA_NODELAY 0x01000000
+#endif
 
 /*
  * Interrupt controller descriptor. This is all we need
@@ -65,7 +75,10 @@ typedef struct irq_desc {
 	unsigned int depth;		/* nested irq disables */
 	unsigned int irq_count;		/* For detecting broken interrupts */
 	unsigned int irqs_unhandled;
-	spinlock_t lock;
+	struct task_struct *thread;
+	wait_queue_head_t wait_for_handler;
+	cycles_t timestamp;
+	raw_spinlock_t lock;
 } ____cacheline_aligned irq_desc_t;
 
 extern irq_desc_t irq_desc [NR_IRQS];
@@ -87,7 +100,13 @@ extern void note_interrupt(unsigned int 
 extern void report_bad_irq(unsigned int irq, irq_desc_t *desc, int action_ret);
 extern int can_request_irq(unsigned int irq, unsigned long irqflags);
 
+extern void early_init_hardirqs(void);
+extern void init_hardirqs(void);
 extern void init_irq_proc(void);
+extern cycles_t irq_timestamp(unsigned int irq);
+#else
+static inline void early_init_hardirqs(void) { }
+static inline void init_hardirqs(void) { }
 #endif
 
 extern hw_irq_controller no_irq_type;  /* needed in every arch ? */
--- linux/include/linux/percpu_counter.h.orig
+++ linux/include/linux/percpu_counter.h
@@ -15,7 +15,7 @@
 #ifdef CONFIG_SMP
 
 struct percpu_counter {
-	spinlock_t lock;
+	raw_spinlock_t lock;
 	long count;
 	long *counters;
 };
--- linux/include/linux/linkage.h.orig
+++ linux/include/linux/linkage.h
@@ -4,6 +4,8 @@
 #include <linux/config.h>
 #include <asm/linkage.h>
 
+#define notrace __attribute ((no_instrument_function))
+
 #ifdef __cplusplus
 #define CPP_ASMLINKAGE extern "C"
 #else
@@ -35,7 +37,7 @@
 
 #endif
 
-#define NORET_TYPE    /**/
+#define NORET_TYPE    /* */
 #define ATTRIB_NORET  __attribute__((noreturn))
 #define NORET_AND     noreturn,
 
--- linux/include/linux/interrupt.h.orig
+++ linux/include/linux/interrupt.h
@@ -41,7 +41,7 @@ struct irqaction {
 	void *dev_id;
 	struct irqaction *next;
 	int irq;
-	struct proc_dir_entry *dir;
+	struct proc_dir_entry *dir, *threaded;
 };
 
 extern irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs);
@@ -68,13 +68,18 @@ extern void enable_irq(unsigned int irq)
 # define save_and_cli(x)	local_irq_save(x)
 #endif
 
+#ifdef CONFIG_PREEMPT_RT
+# define local_bh_disable() do { } while (0)
+# define local_bh_enable() do { } while (0)
+# define __local_bh_enable() do { } while (0)
+#else
 /* SoftIRQ primitives.  */
-#define local_bh_disable() \
+# define local_bh_disable() \
 		do { add_preempt_count(SOFTIRQ_OFFSET); barrier(); } while (0)
-#define __local_bh_enable() \
+# define __local_bh_enable() \
 		do { barrier(); sub_preempt_count(SOFTIRQ_OFFSET); } while (0)
-
-extern void local_bh_enable(void);
+  extern void local_bh_enable(void);
+#endif
 
 /* PLEASE, avoid to allocate new softirqs, if you need not _really_ high
    frequency threaded job scheduling. For almost all the purposes
@@ -108,6 +113,7 @@ extern void softirq_init(void);
 #define __raise_softirq_irqoff(nr) do { local_softirq_pending() |= 1UL << (nr); } while (0)
 extern void FASTCALL(raise_softirq_irqoff(unsigned int nr));
 extern void FASTCALL(raise_softirq(unsigned int nr));
+extern void wakeup_irqd(void);
 
 
 /* Tasklets --- multithreaded analogue of BHs.
@@ -268,4 +274,33 @@ extern int probe_irq_off(unsigned long);
 extern unsigned int probe_irq_mask(unsigned long);	/* returns mask of ISA interrupts */
 #endif
 
+#ifdef CONFIG_PREEMPT_RT
+# define local_irq_disable_nort()	do { BUG_ON(in_interrupt()); } while (0)
+# define local_irq_enable_nort()	do { BUG_ON(in_interrupt()); } while (0)
+# define local_irq_save_nort(flags)	do { local_save_flags(flags); WARN_ON(in_interrupt()); } while (0)
+# define local_irq_restore_nort(flags)	do { (void)(flags); WARN_ON(in_interrupt()); } while (0)
+# define spin_lock_nort(lock)		do { } while (0)
+# define spin_unlock_nort(lock)		do { } while (0)
+# define spin_lock_bh_nort(lock)	do { } while (0)
+# define spin_unlock_bh_nort(lock)	do { } while (0)
+# define spin_lock_rt(lock)		spin_lock(lock)
+# define spin_unlock_rt(lock)		spin_unlock(lock)
+# define smp_processor_id_rt(cpu)	(cpu)
+# define in_atomic_rt()			(!oops_in_progress && \
+					  (in_atomic() || irqs_disabled()))
+#else
+# define local_irq_disable_nort()	local_irq_disable()
+# define local_irq_enable_nort()	local_irq_enable()
+# define local_irq_save_nort(flags)	local_irq_save(flags)
+# define local_irq_restore_nort(flags)	local_irq_restore(flags)
+# define spin_lock_rt(lock)		do { } while (0)
+# define spin_unlock_rt(lock)		do { } while (0)
+# define spin_lock_nort(lock)		spin_lock(lock)
+# define spin_unlock_nort(lock)		spin_unlock(lock)
+# define spin_lock_bh_nort(lock)	spin_lock_bh(lock)
+# define spin_unlock_bh_nort(lock)	spin_unlock_bh(lock)
+# define smp_processor_id_rt(cpu)	smp_processor_id()
+# define in_atomic_rt()			0
+#endif
+
 #endif
--- linux/include/linux/completion.h.orig
+++ linux/include/linux/completion.h
@@ -28,6 +28,8 @@ static inline void init_completion(struc
 }
 
 extern void FASTCALL(wait_for_completion(struct completion *));
+extern int FASTCALL(wait_for_completion_interruptible(struct completion *x));
+
 extern void FASTCALL(complete(struct completion *));
 extern void FASTCALL(complete_all(struct completion *));
 
--- linux/include/linux/radix-tree.h.orig
+++ linux/include/linux/radix-tree.h
@@ -19,6 +19,7 @@
 #ifndef _LINUX_RADIX_TREE_H
 #define _LINUX_RADIX_TREE_H
 
+#include <linux/config.h>
 #include <linux/preempt.h>
 #include <linux/types.h>
 
@@ -51,7 +52,18 @@ void *radix_tree_delete(struct radix_tre
 unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 			unsigned long first_index, unsigned int max_items);
+/*
+ * On a mutex based kernel we can freely schedule within the radix code:
+ */
+#ifdef CONFIG_PREEMPT_RT
+static inline int radix_tree_preload(int gfp_mask)
+{
+	return 0;
+}
+#else
 int radix_tree_preload(int gfp_mask);
+#endif
+
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
 			unsigned long index, int tag);
@@ -66,7 +78,9 @@ int radix_tree_tagged(struct radix_tree_
 
 static inline void radix_tree_preload_end(void)
 {
+#ifndef CONFIG_PREEMPT_RT
 	preempt_enable();
+#endif
 }
 
 #endif /* _LINUX_RADIX_TREE_H */
--- linux/include/linux/seqlock.h.orig
+++ linux/include/linux/seqlock.h
@@ -33,35 +33,58 @@
 typedef struct {
 	unsigned sequence;
 	spinlock_t lock;
-} seqlock_t;
+} __seqlock_t;
+
+typedef struct {
+	unsigned sequence;
+	raw_spinlock_t lock;
+} __raw_seqlock_t;
+
+#define seqlock_need_resched(seq) lock_need_resched(&(seq)->lock)
+
+#ifdef CONFIG_PREEMPT_RT
+typedef __seqlock_t seqlock_t;
+#else
+typedef __raw_seqlock_t seqlock_t;
+#endif
+
+typedef __raw_seqlock_t raw_seqlock_t;
 
 /*
  * These macros triggered gcc-3.x compile-time problems.  We think these are
  * OK now.  Be cautious.
  */
+#ifdef CONFIG_PREEMPT_RT
 #define SEQLOCK_UNLOCKED { 0, SPIN_LOCK_UNLOCKED }
 #define seqlock_init(x)	do { *(x) = (seqlock_t) SEQLOCK_UNLOCKED; } while (0)
-
+#else
+#define SEQLOCK_UNLOCKED { 0, RAW_SPIN_LOCK_UNLOCKED }
+#define seqlock_init(x)	do { *(x) = (seqlock_t) RAW_SEQLOCK_UNLOCKED; } while (0)
+#endif
+
+#define RAW_SEQLOCK_UNLOCKED { 0, RAW_SPIN_LOCK_UNLOCKED }
+#define raw_seqlock_init(x) \
+		do { *(x) = (raw_seqlock_t) RAW_SEQLOCK_UNLOCKED; } while (0)
 
 /* Lock out other writers and update the count.
  * Acts like a normal spin_lock/unlock.
  * Don't need preempt_disable() because that is in the spin_lock already.
  */
-static inline void write_seqlock(seqlock_t *sl)
+static inline void __write_seqlock(seqlock_t *sl)
 {
 	spin_lock(&sl->lock);
 	++sl->sequence;
 	smp_wmb();			
 }	
 
-static inline void write_sequnlock(seqlock_t *sl) 
+static inline void __write_sequnlock(seqlock_t *sl) 
 {
 	smp_wmb();
 	sl->sequence++;
 	spin_unlock(&sl->lock);
 }
 
-static inline int write_tryseqlock(seqlock_t *sl)
+static inline int __write_tryseqlock(seqlock_t *sl)
 {
 	int ret = spin_trylock(&sl->lock);
 
@@ -73,7 +96,7 @@ static inline int write_tryseqlock(seqlo
 }
 
 /* Start of read calculation -- fetch last complete writer token */
-static inline unsigned read_seqbegin(const seqlock_t *sl)
+static inline unsigned __read_seqbegin(const seqlock_t *sl)
 {
 	unsigned ret = sl->sequence;
 	smp_rmb();
@@ -88,13 +111,126 @@ static inline unsigned read_seqbegin(con
  *    
  * Using xor saves one conditional branch.
  */
-static inline int read_seqretry(const seqlock_t *sl, unsigned iv)
+static inline int __read_seqretry(seqlock_t *sl, unsigned iv)
+{
+	int ret;
+
+	smp_rmb();
+	ret = (iv & 1) | (sl->sequence ^ iv);
+	/*
+	 * If invalid then serialize with the writer, to make sure we
+	 * are not livelocking it:
+	 */
+	if (unlikely(ret)) {
+		unsigned long flags;
+		spin_lock_irqsave(&sl->lock, flags);
+		spin_unlock_irqrestore(&sl->lock, flags);
+	}
+	return ret;
+}
+
+static inline void __write_seqlock_raw(raw_seqlock_t *sl)
+{
+	spin_lock(&sl->lock);
+	++sl->sequence;
+	smp_wmb();			
+}	
+
+static inline void __write_sequnlock_raw(raw_seqlock_t *sl) 
+{
+	smp_wmb();
+	sl->sequence++;
+	spin_unlock(&sl->lock);
+}
+
+static inline int __write_tryseqlock_raw(raw_seqlock_t *sl)
+{
+	int ret = spin_trylock(&sl->lock);
+
+	if (ret) {
+		++sl->sequence;
+		smp_wmb();			
+	}
+	return ret;
+}
+
+static inline unsigned __read_seqbegin_raw(const raw_seqlock_t *sl)
+{
+	unsigned ret = sl->sequence;
+	smp_rmb();
+	return ret;
+}
+
+static inline int __read_seqretry_raw(const raw_seqlock_t *sl, unsigned iv)
 {
 	smp_rmb();
 	return (iv & 1) | (sl->sequence ^ iv);
 }
 
 
+extern int __bad_seqlock_type(void);
+
+#define PICK_SEQOP(op, lock)					\
+do {								\
+	if (TYPE_EQUAL((lock), seqlock_t))			\
+		op((seqlock_t *)(lock));			\
+	else if (TYPE_EQUAL(lock, raw_seqlock_t))		\
+		op##_raw((raw_seqlock_t *)(lock));		\
+	else __bad_seqlock_type();				\
+} while (0)
+
+#define PICK_SEQOP_RET(op, lock)				\
+({								\
+	unsigned int __ret;					\
+								\
+	if (TYPE_EQUAL((lock), seqlock_t))			\
+		__ret = op((seqlock_t *)(lock));		\
+	else if (TYPE_EQUAL(lock, raw_seqlock_t))		\
+		__ret = op##_raw((raw_seqlock_t *)(lock));	\
+	else __ret = __bad_seqlock_type();			\
+								\
+	__ret;							\
+})
+
+#define PICK_SEQOP_CONST_RET(op, lock)				\
+({								\
+	unsigned int __ret;					\
+								\
+	if (TYPE_EQUAL((lock), seqlock_t))			\
+		__ret = op((const seqlock_t *)(lock));		\
+	else if (TYPE_EQUAL(lock, raw_seqlock_t))		\
+		__ret = op##_raw((const raw_seqlock_t *)(lock));\
+	else __ret = __bad_seqlock_type();			\
+								\
+	__ret;							\
+})
+
+#define PICK_SEQOP2_CONST_RET(op, lock, arg)				\
+({									\
+	unsigned int __ret;						\
+									\
+	if (TYPE_EQUAL((lock), seqlock_t))				\
+		__ret = op((seqlock_t *)(lock), (arg));			\
+	else if (TYPE_EQUAL(lock, raw_seqlock_t))			\
+		__ret = op##_raw((const raw_seqlock_t *)(lock), (arg));	\
+	else __ret = __bad_seqlock_type();				\
+									\
+	__ret;								\
+})
+
+
+#define write_seqlock(sl)	PICK_SEQOP(__write_seqlock, sl)
+#define write_sequnlock(sl)	PICK_SEQOP(__write_sequnlock, sl)
+#define write_tryseqlock(sl)	PICK_SEQOP_RET(__write_tryseqlock, sl)
+#define read_seqbegin(sl)	PICK_SEQOP_CONST_RET(__read_seqbegin, sl)
+#define read_seqretry(sl, iv)	PICK_SEQOP2_CONST_RET(__read_seqretry, sl, iv)
+
+#define DECLARE_SEQLOCK(name) \
+	seqlock_t name __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED
+
+#define DECLARE_RAW_SEQLOCK(name) \
+	raw_seqlock_t name __cacheline_aligned_in_smp = RAW_SEQLOCK_UNLOCKED
+
 /*
  * Version using sequence counter only.
  * This can be used when code has its own mutex protecting the
@@ -145,30 +281,51 @@ static inline void write_seqcount_end(se
 	s->sequence++;
 }
 
+#define PICK_IRQOP(op, lock)					\
+do {								\
+	if (TYPE_EQUAL((lock), raw_seqlock_t))			\
+		op();						\
+	else if (TYPE_EQUAL((lock), seqlock_t))			\
+		{ /* nothing */ }				\
+	else __bad_seqlock_type();				\
+} while (0)
+
+#define PICK_IRQOP2(op, arg, lock)				\
+do {								\
+	if (TYPE_EQUAL((lock), raw_seqlock_t))			\
+		op(arg);					\
+	else if (TYPE_EQUAL(lock, seqlock_t))			\
+		{ /* nothing */ }				\
+	else __bad_seqlock_type();				\
+} while (0)
+
+
+
 /*
  * Possible sw/hw IRQ protected versions of the interfaces.
  */
 #define write_seqlock_irqsave(lock, flags)				\
-	do { local_irq_save(flags); write_seqlock(lock); } while (0)
+	do { PICK_IRQOP2(local_irq_save, flags, lock); write_seqlock(lock); } while (0)
 #define write_seqlock_irq(lock)						\
-	do { local_irq_disable();   write_seqlock(lock); } while (0)
+	do { PICK_IRQOP(local_irq_disable, lock); write_seqlock(lock); } while (0)
 #define write_seqlock_bh(lock)						\
-        do { local_bh_disable();    write_seqlock(lock); } while (0)
+        do { PICK_IRQOP(local_bh_disable, lock); write_seqlock(lock); } while (0)
 
 #define write_sequnlock_irqrestore(lock, flags)				\
-	do { write_sequnlock(lock); local_irq_restore(flags); } while(0)
+	do { write_sequnlock(lock); PICK_IRQOP2(local_irq_restore, flags, lock); preempt_check_resched(); } while(0)
 #define write_sequnlock_irq(lock)					\
-	do { write_sequnlock(lock); local_irq_enable(); } while(0)
+	do { write_sequnlock(lock); PICK_IRQOP(local_irq_enable, lock); preempt_check_resched(); } while(0)
 #define write_sequnlock_bh(lock)					\
-	do { write_sequnlock(lock); local_bh_enable(); } while(0)
+	do { write_sequnlock(lock); PICK_IRQOP(local_bh_enable, lock); } while(0)
 
 #define read_seqbegin_irqsave(lock, flags)				\
-	({ local_irq_save(flags);   read_seqbegin(lock); })
+	({ PICK_IRQOP2(local_irq_save, flags, lock); read_seqbegin(lock); })
 
 #define read_seqretry_irqrestore(lock, iv, flags)			\
 	({								\
 		int ret = read_seqretry(lock, iv);			\
-		local_irq_restore(flags);				\
+		PICK_IRQOP2(local_irq_restore, flags, lock);		\
+		preempt_check_resched(); 				\
 		ret;							\
 	})
 
--- linux/include/linux/profile.h.orig
+++ linux/include/linux/profile.h
@@ -7,10 +7,12 @@
 #include <linux/config.h>
 #include <linux/init.h>
 #include <linux/cpumask.h>
+#include <linux/kernel_stat.h>
 #include <asm/errno.h>
 
-#define CPU_PROFILING	1
-#define SCHED_PROFILING	2
+#define CPU_PROFILING		1
+#define SCHED_PROFILING		2
+#define PREEMPT_PROFILING	3
 
 struct proc_dir_entry;
 struct pt_regs;
@@ -30,6 +32,8 @@ enum profile_type {
 	PROFILE_MUNMAP
 };
 
+extern int prof_pid;
+
 #ifdef CONFIG_PROFILING
 
 struct notifier_block;
--- linux/include/linux/percpu.h.orig
+++ linux/include/linux/percpu.h
@@ -8,13 +8,28 @@
 
 /* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */
 #ifndef PERCPU_ENOUGH_ROOM
-#define PERCPU_ENOUGH_ROOM 32768
+#define PERCPU_ENOUGH_ROOM 65536
 #endif
 
 /* Must be an lvalue. */
 #define get_cpu_var(var) (*({ preempt_disable(); &__get_cpu_var(var); }))
 #define put_cpu_var(var) preempt_enable()
 
+/*
+ * Per-CPU data structures with an additional lock - useful for
+ * PREEMPT_RT code that wants to reschedule but also wants
+ * per-CPU data structures. 
+ *
+ * NOTE: on normal !PREEMPT_RT kernels these per-CPU variables
+ * are the same as the normal per-CPU variables.
+ */
+#define get_cpu_var_locked(var, cpu) \
+		(*({ spin_lock(&__get_cpu_lock(var, cpu)); \
+		&__get_cpu_var_locked(var, cpu); }))
+
+#define put_cpu_var_locked(var, cpu) \
+		 do { (void)cpu; spin_unlock(&__get_cpu_lock(var, cpu)); } while (0)
+
 #ifdef CONFIG_SMP
 
 struct percpu_data {
--- linux/include/linux/spinlock.h.orig
+++ linux/include/linux/spinlock.h
@@ -13,6 +13,8 @@
 #include <linux/kernel.h>
 #include <linux/stringify.h>
 
+#include <linux/rt_lock.h>
+
 #include <asm/processor.h>	/* for cpu relax */
 #include <asm/system.h>
 #ifdef CONFIG_KGDB
@@ -46,41 +48,43 @@
 #ifdef CONFIG_SMP
 #include <asm/spinlock.h>
 
-int __lockfunc _spin_trylock(spinlock_t *lock);
-int __lockfunc _read_trylock(rwlock_t *lock);
-int __lockfunc _write_trylock(rwlock_t *lock);
-
-void __lockfunc _spin_lock(spinlock_t *lock)	__acquires(spinlock_t);
-void __lockfunc _read_lock(rwlock_t *lock)	__acquires(rwlock_t);
-void __lockfunc _write_lock(rwlock_t *lock)	__acquires(rwlock_t);
-
-void __lockfunc _spin_unlock(spinlock_t *lock)	__releases(spinlock_t);
-void __lockfunc _read_unlock(rwlock_t *lock)	__releases(rwlock_t);
-void __lockfunc _write_unlock(rwlock_t *lock)	__releases(rwlock_t);
-
-unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)	__acquires(spinlock_t);
-unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)	__acquires(rwlock_t);
-unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)	__acquires(rwlock_t);
-
-void __lockfunc _spin_lock_irq(spinlock_t *lock)	__acquires(spinlock_t);
-void __lockfunc _spin_lock_bh(spinlock_t *lock)		__acquires(spinlock_t);
-void __lockfunc _read_lock_irq(rwlock_t *lock)		__acquires(rwlock_t);
-void __lockfunc _read_lock_bh(rwlock_t *lock)		__acquires(rwlock_t);
-void __lockfunc _write_lock_irq(rwlock_t *lock)		__acquires(rwlock_t);
-void __lockfunc _write_lock_bh(rwlock_t *lock)		__acquires(rwlock_t);
-
-void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)	__releases(spinlock_t);
-void __lockfunc _spin_unlock_irq(spinlock_t *lock)				__releases(spinlock_t);
-void __lockfunc _spin_unlock_bh(spinlock_t *lock)				__releases(spinlock_t);
-void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)	__releases(rwlock_t);
-void __lockfunc _read_unlock_irq(rwlock_t *lock)				__releases(rwlock_t);
-void __lockfunc _read_unlock_bh(rwlock_t *lock)					__releases(rwlock_t);
-void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)	__releases(rwlock_t);
-void __lockfunc _write_unlock_irq(rwlock_t *lock)				__releases(rwlock_t);
-void __lockfunc _write_unlock_bh(rwlock_t *lock)				__releases(rwlock_t);
-
-int __lockfunc _spin_trylock_bh(spinlock_t *lock);
-int __lockfunc generic_raw_read_trylock(rwlock_t *lock);
+int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock);
+int __lockfunc _raw_read_trylock(raw_rwlock_t *lock);
+int __lockfunc _raw_write_trylock(raw_rwlock_t *lock);
+
+void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)	__acquires(raw_spinlock_t);
+void __lockfunc _raw_read_lock(raw_rwlock_t *lock)	__acquires(raw_rwlock_t);
+void __lockfunc _raw_write_lock(raw_rwlock_t *lock)	__acquires(raw_rwlock_t);
+
+void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)	__releases(raw_spinlock_t);
+void __lockfunc _raw_read_unlock(raw_rwlock_t *lock)	__releases(raw_rwlock_t);
+void __lockfunc _raw_write_unlock(raw_rwlock_t *lock)	__releases(raw_rwlock_t);
+
+unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)	__acquires(raw_spinlock_t);
+unsigned long __lockfunc _raw_read_lock_irqsave(raw_rwlock_t *lock)	__acquires(raw_rwlock_t);
+unsigned long __lockfunc _raw_write_lock_irqsave(raw_rwlock_t *lock)	__acquires(raw_rwlock_t);
+
+void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)	__acquires(raw_spinlock_t);
+void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)		__acquires(raw_spinlock_t);
+void __lockfunc _raw_read_lock_irq(raw_rwlock_t *lock)		__acquires(raw_rwlock_t);
+void __lockfunc _raw_read_lock_bh(raw_rwlock_t *lock)		__acquires(raw_rwlock_t);
+void __lockfunc _raw_write_lock_irq(raw_rwlock_t *lock)		__acquires(raw_rwlock_t);
+void __lockfunc _raw_write_lock_bh(raw_rwlock_t *lock)		__acquires(raw_rwlock_t);
+
+void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)	__releases(raw_spinlock_t);
+void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)				__releases(raw_spinlock_t);
+void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)				__releases(raw_spinlock_t);
+void __lockfunc _raw_read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags)	__releases(raw_rwlock_t);
+void __lockfunc _raw_read_unlock_irq(raw_rwlock_t *lock)				__releases(raw_rwlock_t);
+void __lockfunc _raw_read_unlock_bh(raw_rwlock_t *lock)					__releases(raw_rwlock_t);
+void __lockfunc _raw_write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags)	__releases(raw_rwlock_t);
+void __lockfunc _raw_write_unlock_irq(raw_rwlock_t *lock)				__releases(raw_rwlock_t);
+void __lockfunc _raw_write_unlock_bh(raw_rwlock_t *lock)				__releases(raw_rwlock_t);
+
+int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock);
+int __lockfunc _raw_spin_trylock_irq(raw_spinlock_t *lock);
+int __lockfunc _raw_spin_trylock_irqsave(raw_spinlock_t *lock, unsigned long *flags);
+int __lockfunc generic_raw_read_trylock(raw_rwlock_t *lock);
 int in_lock_functions(unsigned long addr);
 
 #else
@@ -88,7 +92,7 @@ int in_lock_functions(unsigned long addr
 #define in_lock_functions(ADDR) 0
 
 #if !defined(CONFIG_PREEMPT) && !defined(CONFIG_DEBUG_SPINLOCK)
-# define _atomic_dec_and_lock(atomic,lock) atomic_dec_and_test(atomic)
+# define _atomic_dec_and_raw_spin_lock(atomic,lock) atomic_dec_and_test(atomic)
 # define ATOMIC_DEC_AND_LOCK
 #endif
 
@@ -127,7 +131,7 @@ typedef struct {
 		} \
 	} while(0)
 
-#define _raw_spin_lock(x)		\
+#define __raw_spin_lock(x)		\
 	do { \
 	 	CHECK_LOCK(x); \
 		if ((x)->lock&&(x)->babble) { \
@@ -158,7 +162,7 @@ typedef struct {
 
 /* without debugging, spin_trylock on UP always says
  * TRUE. --> printk if already locked. */
-#define _raw_spin_trylock(x) \
+#define __raw_spin_trylock(x) \
 	({ \
 	 	CHECK_LOCK(x); \
 		if ((x)->lock&&(x)->babble) { \
@@ -185,7 +189,7 @@ typedef struct {
 		}\
 	} while (0)
 
-#define _raw_spin_unlock(x) \
+#define __raw_spin_unlock(x) \
 	do { \
 	 	CHECK_LOCK(x); \
 		if (!(x)->lock&&(x)->babble) { \
@@ -196,330 +200,567 @@ typedef struct {
 		(x)->lock = 0; \
 	} while (0)
 #else
-/*
- * gcc versions before ~2.95 have a nasty bug with empty initializers.
- */
-#if (__GNUC__ > 2)
-  typedef struct { } spinlock_t;
-  #define SPIN_LOCK_UNLOCKED (spinlock_t) { }
-#else
-  typedef struct { int gcc_is_buggy; } spinlock_t;
-  #define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
-#endif
 
 /*
  * If CONFIG_SMP is unset, declare the _raw_* definitions as nops
  */
-#define spin_lock_init(lock)	do { (void)(lock); } while(0)
-#define _raw_spin_lock(lock)	do { (void)(lock); } while(0)
-#define spin_is_locked(lock)	((void)(lock), 0)
-#define _raw_spin_trylock(lock)	(((void)(lock), 1))
-#define spin_unlock_wait(lock)	(void)(lock);
-#define _raw_spin_unlock(lock) do { (void)(lock); } while(0)
+#define __raw_spin_lock_init(lock)	do { (void)(lock); } while(0)
+#define __raw_spin_lock(lock)		do { (void)(lock); } while(0)
+#define __raw_spin_is_locked(lock)	((void)(lock), 0)
+#define __raw_spin_trylock(lock)		(((void)(lock), 1))
+#define __raw_spin_unlock_wait(lock)	(void)(lock)
+#define __raw_spin_unlock(lock) 		do { (void)(lock); } while(0)
 #endif /* CONFIG_DEBUG_SPINLOCK */
 
 /* RW spinlocks: No debug version */
 
-#if (__GNUC__ > 2)
-  typedef struct { } rwlock_t;
-  #define RW_LOCK_UNLOCKED (rwlock_t) { }
-#else
-  typedef struct { int gcc_is_buggy; } rwlock_t;
-  #define RW_LOCK_UNLOCKED (rwlock_t) { 0 }
-#endif
+// typedef struct { } raw_rwlock_t;
+// #define __RAW_RW_LOCK_UNLOCKED { }
+// #define RAW_RW_LOCK_UNLOCKED (raw_rwlock_t) __RAW_RW_LOCK_UNLOCKED
+
+#define __raw_rwlock_init(lock)	do { (void)(lock); } while(0)
+#define __raw_read_lock(lock)	do { (void)(lock); } while(0)
+#define __raw_read_unlock(lock)	do { (void)(lock); } while(0)
+#define __raw_write_lock(lock)	do { (void)(lock); } while(0)
+#define __raw_write_unlock(lock)	do { (void)(lock); } while(0)
+#define __raw_read_trylock(lock) ({ (void)(lock); (1); })
+#define __raw_write_trylock(lock) ({ (void)(lock); (1); })
 
-#define rwlock_init(lock)	do { (void)(lock); } while(0)
-#define _raw_read_lock(lock)	do { (void)(lock); } while(0)
-#define _raw_read_unlock(lock)	do { (void)(lock); } while(0)
-#define _raw_write_lock(lock)	do { (void)(lock); } while(0)
-#define _raw_write_unlock(lock)	do { (void)(lock); } while(0)
-#define _raw_read_trylock(lock) ({ (void)(lock); (1); })
-#define _raw_write_trylock(lock) ({ (void)(lock); (1); })
-
-#define _spin_trylock(lock)	({preempt_disable(); _raw_spin_trylock(lock) ? \
+#define _raw_spin_trylock(lock)	({preempt_disable(); __raw_spin_trylock(lock) ? \
 				1 : ({preempt_enable(); 0;});})
 
-#define _read_trylock(lock)	({preempt_disable();_raw_read_trylock(lock) ? \
+#define _raw_read_trylock(lock)	({preempt_disable();__raw_read_trylock(lock) ? \
 				1 : ({preempt_enable(); 0;});})
 
-#define _write_trylock(lock)	({preempt_disable(); _raw_write_trylock(lock) ? \
+#define _raw_write_trylock(lock)	({preempt_disable(); __raw_write_trylock(lock) ? \
 				1 : ({preempt_enable(); 0;});})
 
-#define _spin_trylock_bh(lock)	({preempt_disable(); local_bh_disable(); \
-				_raw_spin_trylock(lock) ? \
+#define _raw_spin_trylock_bh(lock)	({preempt_disable(); local_bh_disable(); \
+				__raw_spin_trylock(lock) ? \
 				1 : ({preempt_enable(); local_bh_enable(); 0;});})
 
-#define _spin_lock(lock)	\
+#define _raw_spin_trylock_irq(lock) \
+({ \
+	local_irq_disable(); preempt_disable(); \
+	__raw_spin_trylock(lock) ? \
+	1 : ({ preempt_enable_no_resched(); local_irq_enable(); preempt_check_resched(); 0; }); \
+})
+
+#define _raw_spin_trylock_irqsave(lock, flags) \
+({ \
+	local_irq_save(*flags); preempt_disable(); \
+	__raw_spin_trylock(lock) ? \
+	1 : ({ preempt_enable_no_resched(); local_irq_restore(*flags); preempt_check_resched(); 0;}); \
+})
+
+#define _raw_spin_lock(lock)	\
 do { \
 	preempt_disable(); \
-	_raw_spin_lock(lock); \
+	__raw_spin_lock(lock); \
 	__acquire(lock); \
 } while(0)
 
-#define _write_lock(lock) \
+#define _raw_write_lock(lock) \
 do { \
 	preempt_disable(); \
-	_raw_write_lock(lock); \
+	__raw_write_lock(lock); \
 	__acquire(lock); \
 } while(0)
  
-#define _read_lock(lock)	\
+#define _raw_read_lock(lock)	\
 do { \
 	preempt_disable(); \
-	_raw_read_lock(lock); \
+	__raw_read_lock(lock); \
 	__acquire(lock); \
 } while(0)
 
-#define _spin_unlock(lock) \
+#define _raw_spin_unlock(lock) \
 do { \
-	_raw_spin_unlock(lock); \
+	__raw_spin_unlock(lock); \
 	preempt_enable(); \
 	__release(lock); \
 } while (0)
 
-#define _write_unlock(lock) \
+#define _raw_write_unlock(lock) \
 do { \
-	_raw_write_unlock(lock); \
+	__raw_write_unlock(lock); \
 	preempt_enable(); \
 	__release(lock); \
 } while(0)
 
-#define _read_unlock(lock) \
+#define _raw_read_unlock(lock) \
 do { \
-	_raw_read_unlock(lock); \
+	__raw_read_unlock(lock); \
 	preempt_enable(); \
 	__release(lock); \
 } while(0)
 
-#define _spin_lock_irqsave(lock, flags) \
-do {	\
-	local_irq_save(flags); \
+#define _raw_spin_lock_irqsave(lock) \
+({	unsigned long __flags; \
+	local_irq_save(__flags); \
 	preempt_disable(); \
-	_raw_spin_lock(lock); \
+	__raw_spin_lock(lock); \
 	__acquire(lock); \
-} while (0)
+	__flags; \
+})
 
-#define _spin_lock_irq(lock) \
+#define _raw_spin_lock_irq(lock) \
 do { \
 	local_irq_disable(); \
 	preempt_disable(); \
-	_raw_spin_lock(lock); \
+	__raw_spin_lock(lock); \
 	__acquire(lock); \
 } while (0)
 
-#define _spin_lock_bh(lock) \
+#define _raw_spin_lock_bh(lock) \
 do { \
 	local_bh_disable(); \
 	preempt_disable(); \
-	_raw_spin_lock(lock); \
+	__raw_spin_lock(lock); \
 	__acquire(lock); \
 } while (0)
 
-#define _read_lock_irqsave(lock, flags) \
-do {	\
-	local_irq_save(flags); \
+#define _raw_read_lock_irqsave(lock) \
+({	unsigned long __flags; \
+	local_irq_save(__flags); \
 	preempt_disable(); \
-	_raw_read_lock(lock); \
+	__raw_read_lock(lock); \
 	__acquire(lock); \
-} while (0)
+	__flags; \
+})
 
-#define _read_lock_irq(lock) \
+#define _raw_read_lock_irq(lock) \
 do { \
 	local_irq_disable(); \
 	preempt_disable(); \
-	_raw_read_lock(lock); \
+	__raw_read_lock(lock); \
 	__acquire(lock); \
 } while (0)
 
-#define _read_lock_bh(lock) \
+#define _raw_read_lock_bh(lock) \
 do { \
 	local_bh_disable(); \
 	preempt_disable(); \
-	_raw_read_lock(lock); \
+	__raw_read_lock(lock); \
 	__acquire(lock); \
 } while (0)
 
-#define _write_lock_irqsave(lock, flags) \
-do {	\
-	local_irq_save(flags); \
+#define _raw_write_lock_irqsave(lock) \
+({	unsigned long __flags; \
+	local_irq_save(__flags); \
 	preempt_disable(); \
-	_raw_write_lock(lock); \
+	__raw_write_lock(lock); \
 	__acquire(lock); \
-} while (0)
+	__flags; \
+})
 
-#define _write_lock_irq(lock) \
+#define _raw_write_lock_irq(lock) \
 do { \
 	local_irq_disable(); \
 	preempt_disable(); \
-	_raw_write_lock(lock); \
+	__raw_write_lock(lock); \
 	__acquire(lock); \
 } while (0)
 
-#define _write_lock_bh(lock) \
+#define _raw_write_lock_bh(lock) \
 do { \
 	local_bh_disable(); \
 	preempt_disable(); \
-	_raw_write_lock(lock); \
+	__raw_write_lock(lock); \
 	__acquire(lock); \
 } while (0)
 
-#define _spin_unlock_irqrestore(lock, flags) \
+#define _raw_spin_unlock_irqrestore(lock, flags) \
 do { \
-	_raw_spin_unlock(lock); \
+	__raw_spin_unlock(lock); \
+	preempt_enable_no_resched(); \
 	local_irq_restore(flags); \
-	preempt_enable(); \
+	preempt_check_resched(); \
 	__release(lock); \
 } while (0)
 
-#define _spin_unlock_irq(lock) \
+#define _raw_spin_unlock_irq(lock) \
 do { \
-	_raw_spin_unlock(lock); \
+	__raw_spin_unlock(lock); \
+	preempt_enable_no_resched(); \
 	local_irq_enable(); \
-	preempt_enable(); \
+	preempt_check_resched(); \
 	__release(lock); \
 } while (0)
 
-#define _spin_unlock_bh(lock) \
+#define _raw_spin_unlock_bh(lock) \
 do { \
-	_raw_spin_unlock(lock); \
+	__raw_spin_unlock(lock); \
 	preempt_enable(); \
 	local_bh_enable(); \
 	__release(lock); \
 } while (0)
 
-#define _write_unlock_bh(lock) \
+#define _raw_write_unlock_bh(lock) \
 do { \
-	_raw_write_unlock(lock); \
+	__raw_write_unlock(lock); \
 	preempt_enable(); \
 	local_bh_enable(); \
 	__release(lock); \
 } while (0)
 
-#define _read_unlock_irqrestore(lock, flags) \
+#define _raw_read_unlock_irqrestore(lock, flags) \
 do { \
-	_raw_read_unlock(lock); \
+	__raw_read_unlock(lock); \
+	preempt_enable_no_resched(); \
 	local_irq_restore(flags); \
-	preempt_enable(); \
+	preempt_check_resched(); \
 	__release(lock); \
 } while (0)
 
-#define _write_unlock_irqrestore(lock, flags) \
+#define _raw_write_unlock_irqrestore(lock, flags) \
 do { \
-	_raw_write_unlock(lock); \
+	__raw_write_unlock(lock); \
+	preempt_enable_no_resched(); \
 	local_irq_restore(flags); \
-	preempt_enable(); \
+	preempt_check_resched(); \
 	__release(lock); \
 } while (0)
 
-#define _read_unlock_irq(lock)	\
+#define _raw_read_unlock_irq(lock)	\
 do { \
-	_raw_read_unlock(lock);	\
+	__raw_read_unlock(lock);\
+	preempt_enable_no_resched(); \
 	local_irq_enable();	\
-	preempt_enable();	\
+	preempt_check_resched(); \
 	__release(lock); \
 } while (0)
 
-#define _read_unlock_bh(lock)	\
+#define _raw_read_unlock_bh(lock)	\
 do { \
-	_raw_read_unlock(lock);	\
+	__raw_read_unlock(lock);\
 	local_bh_enable();	\
 	preempt_enable();	\
 	__release(lock); \
 } while (0)
 
-#define _write_unlock_irq(lock)	\
+#define _raw_write_unlock_irq(lock)	\
 do { \
-	_raw_write_unlock(lock);	\
+	__raw_write_unlock(lock);\
+	preempt_enable_no_resched(); \
 	local_irq_enable();	\
-	preempt_enable();	\
+	preempt_check_resched(); \
 	__release(lock); \
 } while (0)
 
 #endif /* !SMP */
 
+extern int __bad_spinlock_type(void);
+
+/*
+ * The following ones are only implemented on PREEMPT_RT, but
+ * the type selection macros need the prototypes even though the
+ * functions never get called (and hence never need to be linked):
+ */
+extern void _spin_lock(spinlock_t *lock);
+extern void _spin_lock_bh(spinlock_t *lock);
+extern void _spin_lock_irq(spinlock_t *lock);
+extern unsigned long _spin_lock_irqsave(spinlock_t *lock);
+extern void _spin_unlock(spinlock_t *lock);
+extern void _spin_unlock_wait(spinlock_t *lock);
+extern void _spin_unlock_bh(spinlock_t *lock);
+extern void _spin_unlock_irq(spinlock_t *lock);
+extern void _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags);
+extern int _spin_trylock(spinlock_t *lock);
+extern int _spin_trylock_bh(spinlock_t *lock);
+extern int _spin_trylock_irq(spinlock_t *lock);
+extern int _spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
+extern int _spin_is_locked(spinlock_t *lock);
+extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
+extern void _spin_lock_init(spinlock_t *lock, char *name, char *file, int line);
+
+#define TYPE_EQUAL(lock, type) \
+		__builtin_types_compatible_p(typeof(lock), type *)
+
+#define PICK_OP(type, optype, op, lock)				\
+do {								\
+	if (TYPE_EQUAL((lock), type))				\
+		_raw_##optype##op((type *)(lock));		\
+	else if (TYPE_EQUAL(lock, spinlock_t))			\
+		_spin##op((spinlock_t *)(lock));		\
+	else __bad_spinlock_type();				\
+} while (0)
+
+#define PICK_OP_RET(type, optype, op, lock...)			\
+({								\
+	int __ret;						\
+								\
+	if (TYPE_EQUAL((lock), type))	  			\
+		__ret = _raw_##optype##op((type *)(lock));	\
+	else if (TYPE_EQUAL(lock, spinlock_t))			\
+		__ret = _spin##op((spinlock_t *)(lock));	\
+	else __ret = __bad_spinlock_type();			\
+								\
+	__ret;							\
+})
+
+#define PICK_OP2(type, optype, op, lock, flags)			\
+do {								\
+	if (TYPE_EQUAL((lock), type))				\
+		_raw_##optype##op((type *)(lock), flags);	\
+	else if (TYPE_EQUAL(lock, spinlock_t))			\
+		_spin##op((spinlock_t *)(lock), flags);		\
+	else __bad_spinlock_type();				\
+} while (0)
+
+#define PICK_OP2_RET(type, optype, op, lock, flags)		\
+({								\
+	int __ret;						\
+	/* every branch below must assign __ret: */		\
+	if (TYPE_EQUAL((lock), type))				\
+		__ret = _raw_##optype##op((type *)(lock), flags);\
+	else if (TYPE_EQUAL(lock, spinlock_t))			\
+		__ret = _spin##op((spinlock_t *)(lock), flags);	\
+	else __ret = __bad_spinlock_type();			\
+								\
+	__ret;							\
+})
+
+
+extern int _read_trylock(rwlock_t *rwlock);
+extern int _write_trylock(rwlock_t *rwlock);
+extern void _write_lock(rwlock_t *rwlock);
+extern void _read_lock(rwlock_t *rwlock);
+extern void _write_unlock(rwlock_t *rwlock);
+extern void _read_unlock(rwlock_t *rwlock);
+extern unsigned long _write_lock_irqsave(rwlock_t *rwlock);
+extern unsigned long _read_lock_irqsave(rwlock_t *rwlock);
+extern void _write_lock_irq(rwlock_t *rwlock);
+extern void _read_lock_irq(rwlock_t *rwlock);
+extern void _write_lock_bh(rwlock_t *rwlock);
+extern void _read_lock_bh(rwlock_t *rwlock);
+extern void _write_unlock_irq(rwlock_t *rwlock);
+extern void _read_unlock_irq(rwlock_t *rwlock);
+extern void _write_unlock_bh(rwlock_t *rwlock);
+extern void _read_unlock_bh(rwlock_t *rwlock);
+extern void _write_unlock_irqrestore(rwlock_t *rwlock, unsigned long flags);
+extern void _read_unlock_irqrestore(rwlock_t *rwlock, unsigned long flags);
+extern void _rwlock_init(rwlock_t *rwlock, char *name, char *file, int line);
+extern int _rwlock_is_locked(rwlock_t *rwlock);
+
+#define __PICK_RW_OP(type, optype, op, lock)				\
+do {									\
+	if (TYPE_EQUAL((lock), type))					\
+		_raw_##optype##op((type *)(lock));			\
+	else if (TYPE_EQUAL(lock, rwlock_t))				\
+		op((rwlock_t *)(lock)); /* plain op, nothing to paste */ \
+	else __bad_spinlock_type();					\
+} while (0)
+
+#define PICK_RW_OP(type, optype, op, lock)				\
+do {									\
+	if (TYPE_EQUAL((lock), type))					\
+		_raw_##optype##op((type *)(lock));			\
+	else if (TYPE_EQUAL(lock, rwlock_t))				\
+		_##optype##op((rwlock_t *)(lock));			\
+	else __bad_spinlock_type();					\
+} while (0)
+
+#define __PICK_RW_OP_RET(type, optype, op, lock...)			\
+({									\
+	int __ret;							\
+									\
+	if (TYPE_EQUAL((lock), type))	  				\
+		__ret = _raw_##optype##op((type *)(lock));		\
+	else if (TYPE_EQUAL(lock, rwlock_t))				\
+		__ret = _##optype##op((rwlock_t *)(lock));		\
+	else __ret = __bad_spinlock_type();				\
+									\
+	__ret;								\
+})
+
+#define PICK_RW_OP_RET(type, optype, op, lock...)			\
+({									\
+	unsigned long __ret;						\
+									\
+	if (TYPE_EQUAL((lock), type))	  				\
+		__ret = _raw_##optype##op((type *)(lock));		\
+	else if (TYPE_EQUAL(lock, rwlock_t))				\
+		__ret = _##optype##op((rwlock_t *)(lock));	\
+	else __ret = __bad_spinlock_type();				\
+									\
+	__ret;								\
+})
+
+#define PICK_RW_OP2(type, optype, op, lock, flags)			\
+do {									\
+	if (TYPE_EQUAL((lock), type))					\
+		_raw_##optype##op((type *)(lock), flags);		\
+	else if (TYPE_EQUAL(lock, rwlock_t))				\
+		_##optype##op((rwlock_t *)(lock), flags);	\
+	else __bad_spinlock_type();					\
+} while (0)
+
+#define _raw_spin_lock_init __raw_spin_lock_init
+
+#define PICK_OP_INIT(type, optype, op, lock)				\
+do {									\
+	if (TYPE_EQUAL((lock), type))					\
+		_raw_##optype##op((type *)(lock));			\
+	else if (TYPE_EQUAL(lock, spinlock_t))				\
+		_spin##op((spinlock_t *)(lock), #lock, __FILE__, __LINE__); \
+	else __bad_spinlock_type();					\
+} while (0)
+
+
+#define spin_lock_init(lock) \
+		PICK_OP_INIT(raw_spinlock_t, spin, _lock_init, lock)
+
+#define _raw_rwlock_init __raw_rwlock_init
+
+#define __PICK_RW_OP_INIT(type, optype, op, lock)			\
+do {									\
+	if (TYPE_EQUAL((lock), type))					\
+		_raw_##optype##op((type *)(lock));			\
+	else if (TYPE_EQUAL(lock, rwlock_t))				\
+		_##optype##op((rwlock_t *)(lock), #lock, __FILE__, __LINE__);\
+	else __bad_spinlock_type();					\
+} while (0)
+
+
+#define rwlock_init(lock) \
+		__PICK_RW_OP_INIT(raw_rwlock_t, rwlock, _init, lock)
+
+#define _raw_spin_is_locked __raw_spin_is_locked
+
+#define spin_is_locked(lock) \
+		PICK_OP_RET(raw_spinlock_t, spin, _is_locked, lock)
+
+#define _raw_rwlock_is_locked __raw_rwlock_is_locked
+
+#define rwlock_is_locked(lock) \
+		__PICK_RW_OP_RET(raw_rwlock_t, rwlock, _is_locked, lock)
+
+#define _raw_spin_unlock_wait __raw_spin_unlock_wait
+
+#define spin_unlock_wait(lock) \
+		PICK_OP(raw_spinlock_t, spin, _unlock_wait, lock)
 /*
  * Define the various spin_lock and rw_lock methods.  Note we define these
  * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various
  * methods are defined as nops in the case they are not required.
  */
-#define spin_trylock(lock)	__cond_lock(_spin_trylock(lock))
-#define read_trylock(lock)	__cond_lock(_read_trylock(lock))
-#define write_trylock(lock)	__cond_lock(_write_trylock(lock))
-
-#define spin_lock(lock)		_spin_lock(lock)
-#define write_lock(lock)	_write_lock(lock)
-#define read_lock(lock)		_read_lock(lock)
+// #define spin_trylock(lock)	_spin_trylock(lock)
+#define spin_trylock(lock)	__cond_lock(PICK_OP_RET(raw_spinlock_t, spin, _trylock, lock))
 
-#ifdef CONFIG_SMP
-#define spin_lock_irqsave(lock, flags)	flags = _spin_lock_irqsave(lock)
-#define read_lock_irqsave(lock, flags)	flags = _read_lock_irqsave(lock)
-#define write_lock_irqsave(lock, flags)	flags = _write_lock_irqsave(lock)
-#else
-#define spin_lock_irqsave(lock, flags)	_spin_lock_irqsave(lock, flags)
-#define read_lock_irqsave(lock, flags)	_read_lock_irqsave(lock, flags)
-#define write_lock_irqsave(lock, flags)	_write_lock_irqsave(lock, flags)
-#endif
-
-#define spin_lock_irq(lock)		_spin_lock_irq(lock)
-#define spin_lock_bh(lock)		_spin_lock_bh(lock)
-
-#define read_lock_irq(lock)		_read_lock_irq(lock)
-#define read_lock_bh(lock)		_read_lock_bh(lock)
+//#define read_trylock(lock)	_read_trylock(lock)
+#define read_trylock(lock)	__cond_lock(PICK_RW_OP_RET(raw_rwlock_t, read, _trylock, lock))
 
-#define write_lock_irq(lock)		_write_lock_irq(lock)
-#define write_lock_bh(lock)		_write_lock_bh(lock)
+//#define write_trylock(lock)	_write_trylock(lock)
+#define write_trylock(lock)	__cond_lock(PICK_RW_OP_RET(raw_rwlock_t, write, _trylock, lock))
 
-#define spin_unlock(lock)	_spin_unlock(lock)
-#define write_unlock(lock)	_write_unlock(lock)
-#define read_unlock(lock)	_read_unlock(lock)
+// #define spin_lock(lock)	_spin_lock(lock)
+#define spin_lock(lock)		PICK_OP(raw_spinlock_t, spin, _lock, lock)
 
-#define spin_unlock_irqrestore(lock, flags)	_spin_unlock_irqrestore(lock, flags)
-#define spin_unlock_irq(lock)		_spin_unlock_irq(lock)
-#define spin_unlock_bh(lock)		_spin_unlock_bh(lock)
+//#define write_lock(lock)	_write_lock(lock)
+#define write_lock(lock)	PICK_RW_OP(raw_rwlock_t, write, _lock, lock)
 
-#define read_unlock_irqrestore(lock, flags)	_read_unlock_irqrestore(lock, flags)
-#define read_unlock_irq(lock)			_read_unlock_irq(lock)
-#define read_unlock_bh(lock)			_read_unlock_bh(lock)
+// #define read_lock(lock)		_read_lock(lock)
+#define read_lock(lock)		PICK_RW_OP(raw_rwlock_t, read, _lock, lock)
 
-#define write_unlock_irqrestore(lock, flags)	_write_unlock_irqrestore(lock, flags)
-#define write_unlock_irq(lock)			_write_unlock_irq(lock)
-#define write_unlock_bh(lock)			_write_unlock_bh(lock)
+#ifdef CONFIG_SMP
+// #define spin_lock_irqsave(lock, flags)	flags = _spin_lock_irqsave(lock)
+// #define read_lock_irqsave(lock, flags)	flags = _read_lock_irqsave(lock)
+// #define write_lock_irqsave(lock, flags)	flags = _write_lock_irqsave(lock)
+#else
+// #define spin_lock_irqsave(lock, flags)	_spin_lock_irqsave(lock, flags)
+// #define read_lock_irqsave(lock, flags)	_read_lock_irqsave(lock, flags)
+// #define write_lock_irqsave(lock, flags)	_write_lock_irqsave(lock, flags)
+#endif
+
+# define spin_lock_irqsave(lock, flags) \
+	flags = PICK_OP_RET(raw_spinlock_t, spin, _lock_irqsave, lock)
+# define read_lock_irqsave(lock, flags) \
+	flags = PICK_RW_OP_RET(raw_rwlock_t, read, _lock_irqsave, lock)
+# define write_lock_irqsave(lock, flags) \
+	flags = PICK_RW_OP_RET(raw_rwlock_t, write, _lock_irqsave, lock)
+
+// #define spin_lock_irq(lock)	_spin_lock_irq(lock)
+// #define spin_lock_bh(lock)	_spin_lock_bh(lock)
+#define spin_lock_irq(lock)	PICK_OP(raw_spinlock_t, spin, _lock_irq, lock)
+#define spin_lock_bh(lock)	PICK_OP(raw_spinlock_t, spin, _lock_bh, lock)
+
+// #define read_lock_irq(lock)	_read_lock_irq(lock)
+// #define read_lock_bh(lock)	_read_lock_bh(lock)
+#define read_lock_irq(lock)	PICK_RW_OP(raw_rwlock_t, read, _lock_irq, lock)
+#define read_lock_bh(lock)	PICK_RW_OP(raw_rwlock_t, read, _lock_bh, lock)
+
+// #define write_lock_irq(lock)		_write_lock_irq(lock)
+// #define write_lock_bh(lock)		_write_lock_bh(lock)
+#define write_lock_irq(lock)	PICK_RW_OP(raw_rwlock_t, write, _lock_irq, lock)
+#define write_lock_bh(lock)	PICK_RW_OP(raw_rwlock_t, write, _lock_bh, lock)
+
+// #define spin_unlock(lock)	_spin_unlock(lock)
+// #define write_unlock(lock)	_write_unlock(lock)
+// #define read_unlock(lock)	_read_unlock(lock)
+#define spin_unlock(lock)	PICK_OP(raw_spinlock_t, spin, _unlock, lock)
+#define read_unlock(lock)	PICK_RW_OP(raw_rwlock_t, read, _unlock, lock)
+#define write_unlock(lock)	PICK_RW_OP(raw_rwlock_t, write, _unlock, lock)
+
+//#define spin_unlock_irqrestore(lock, flags)
+//		_spin_unlock_irqrestore(lock, flags)
+//#define spin_unlock_irq(lock)	_spin_unlock_irq(lock)
+//#define spin_unlock_bh(lock)	_spin_unlock_bh(lock)
+#define spin_unlock_irqrestore(lock, flags) \
+	PICK_OP2(raw_spinlock_t, spin, _unlock_irqrestore, lock, flags)
+#define spin_unlock_irq(lock)	PICK_OP(raw_spinlock_t, spin, _unlock_irq, lock)
+#define spin_unlock_bh(lock)	PICK_OP(raw_spinlock_t, spin, _unlock_bh, lock)
+
+// #define read_unlock_irqrestore(lock, flags)
+// 		_read_unlock_irqrestore(lock, flags)
+// #define read_unlock_irq(lock)	_read_unlock_irq(lock)
+// #define read_unlock_bh(lock)	_read_unlock_bh(lock)
+#define read_unlock_irqrestore(lock, flags) \
+		PICK_RW_OP2(raw_rwlock_t, read, _unlock_irqrestore, lock, flags)
+#define read_unlock_irq(lock) PICK_RW_OP(raw_rwlock_t, read, _unlock_irq, lock)
+#define read_unlock_bh(lock) PICK_RW_OP(raw_rwlock_t, read, _unlock_bh, lock)
+
+// #define write_unlock_irqrestore(lock, flags)
+// 	_write_unlock_irqrestore(lock, flags)
+// #define write_unlock_irq(lock)			_write_unlock_irq(lock)
+// #define write_unlock_bh(lock)			_write_unlock_bh(lock)
+#define write_unlock_irqrestore(lock, flags) \
+	PICK_RW_OP2(raw_rwlock_t, write, _unlock_irqrestore, lock, flags)
+#define write_unlock_irq(lock) PICK_RW_OP(raw_rwlock_t, write, _unlock_irq, lock)
+#define write_unlock_bh(lock) PICK_RW_OP(raw_rwlock_t, write, _unlock_bh, lock)
+
+// #define spin_trylock_bh(lock)	_spin_trylock_bh(lock)
+#define spin_trylock_bh(lock)	__cond_lock(PICK_OP_RET(raw_spinlock_t, spin, _trylock_bh, lock))
 
-#define spin_trylock_bh(lock)			__cond_lock(_spin_trylock_bh(lock))
+// #define spin_trylock_irq(lock)
 
-#define spin_trylock_irq(lock) \
-({ \
-	local_irq_disable(); \
-	_spin_trylock(lock) ? \
-	1 : ({local_irq_enable(); 0; }); \
-})
+#define spin_trylock_irq(lock)	__cond_lock(PICK_OP_RET(raw_spinlock_t, spin, _trylock_irq, lock))
 
-#define spin_trylock_irqsave(lock, flags) \
-({ \
-	local_irq_save(flags); \
-	_spin_trylock(lock) ? \
-	1 : ({local_irq_restore(flags); 0;}); \
-})
+// #define spin_trylock_irqsave(lock, flags)
 
-#ifdef CONFIG_LOCKMETER
-extern void _metered_spin_lock   (spinlock_t *lock);
-extern void _metered_spin_unlock (spinlock_t *lock);
-extern int  _metered_spin_trylock(spinlock_t *lock);
-extern void _metered_read_lock    (rwlock_t *lock);
-extern void _metered_read_unlock  (rwlock_t *lock);
-extern void _metered_write_lock   (rwlock_t *lock);
-extern void _metered_write_unlock (rwlock_t *lock);
-extern int  _metered_read_trylock (rwlock_t *lock);
-extern int  _metered_write_trylock(rwlock_t *lock);
-#endif
+#define spin_trylock_irqsave(lock, flags)	__cond_lock(PICK_OP2_RET(raw_spinlock_t, spin, _trylock_irqsave, lock, &flags))
 
 /* "lock on reference count zero" */
 #ifndef ATOMIC_DEC_AND_LOCK
-#include <asm/atomic.h>
-extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
+# include <asm/atomic.h>
+  extern int _atomic_dec_and_raw_spin_lock(atomic_t *atomic, raw_spinlock_t *lock);
 #endif
 
-#define atomic_dec_and_lock(atomic,lock) __cond_lock(_atomic_dec_and_lock(atomic,lock))
+#define atomic_dec_and_lock(atomic, lock)				\
+__cond_lock(({								\
+	int __ret;							\
+									\
+	if (TYPE_EQUAL(lock, raw_spinlock_t))				\
+		__ret = _atomic_dec_and_raw_spin_lock(atomic,		\
+					(raw_spinlock_t *)(lock));	\
+	else if (TYPE_EQUAL(lock, spinlock_t))				\
+		__ret = atomic_dec_and_spin_lock(atomic,		\
+					(spinlock_t *)(lock));		\
+	else __ret = __bad_spinlock_type();				\
+									\
+	__ret;								\
+}))
+
 
 /*
  *  bit-based spin_lock()
@@ -536,15 +777,10 @@ static inline void bit_spin_lock(int bit
 	 * busywait with less bus contention for a good time to
 	 * attempt to acquire the lock bit.
 	 */
-	preempt_disable();
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-	while (test_and_set_bit(bitnum, addr)) {
-		while (test_bit(bitnum, addr)) {
-			preempt_enable();
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
+	while (test_and_set_bit(bitnum, addr))
+		while (test_bit(bitnum, addr))
 			cpu_relax();
-			preempt_disable();
-		}
-	}
 #endif
 	__acquire(bitlock);
 }
@@ -554,12 +790,9 @@ static inline void bit_spin_lock(int bit
  */
 static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
 {
-	preempt_disable();	
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-	if (test_and_set_bit(bitnum, addr)) {
-		preempt_enable();
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
+	if (test_and_set_bit(bitnum, addr))
 		return 0;
-	}
 #endif
 	__acquire(bitlock);
 	return 1;
@@ -570,12 +803,11 @@ static inline int bit_spin_trylock(int b
  */
 static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
 {
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
 	BUG_ON(!test_bit(bitnum, addr));
 	smp_mb__before_clear_bit();
 	clear_bit(bitnum, addr);
 #endif
-	preempt_enable();
 	__release(bitlock);
 }
 
@@ -584,16 +816,29 @@ static inline void bit_spin_unlock(int b
  */
 static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
 {
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
 	return test_bit(bitnum, addr);
-#elif defined CONFIG_PREEMPT
-	return preempt_count();
 #else
 	return 1;
 #endif
 }
 
-#define DEFINE_SPINLOCK(x) spinlock_t x = SPIN_LOCK_UNLOCKED
-#define DEFINE_RWLOCK(x) rwlock_t x = RW_LOCK_UNLOCKED
+#define DEFINE_SPINLOCK(name) \
+	spinlock_t name __cacheline_aligned_in_smp = _SPIN_LOCK_UNLOCKED(name)
+
+#define DEFINE_RAW_SPINLOCK(name) \
+	raw_spinlock_t name __cacheline_aligned_in_smp = RAW_SPIN_LOCK_UNLOCKED
+
+#define DEFINE_RWLOCK(name) \
+	rwlock_t name __cacheline_aligned_in_smp = _RW_LOCK_UNLOCKED(name)
+
+#define DEFINE_RAW_RWLOCK(name) \
+	raw_rwlock_t name __cacheline_aligned_in_smp = RAW_RW_LOCK_UNLOCKED
+
+#ifndef CONFIG_PREEMPT_RT
+# define DECLARE_MUTEX_NOCHECK DECLARE_MUTEX
+# define sema_count(sem) atomic_read(&(sem)->count)
+# define sema_init_nocheck sema_init
+#endif
 
 #endif /* __LINUX_SPINLOCK_H */
--- linux/include/linux/init_task.h.orig
+++ linux/include/linux/init_task.h
@@ -38,7 +38,7 @@
 	.mm_users	= ATOMIC_INIT(2), 			\
 	.mm_count	= ATOMIC_INIT(1), 			\
 	.mmap_sem	= __RWSEM_INITIALIZER(name.mmap_sem),	\
-	.page_table_lock =  SPIN_LOCK_UNLOCKED, 		\
+	.page_table_lock = SPIN_LOCK_UNLOCKED, 			\
 	.mmlist		= LIST_HEAD_INIT(name.mmlist),		\
 	.cpu_vm_mask	= CPU_MASK_ALL,				\
 	.default_kioctx = INIT_KIOCTX(name.default_kioctx, name),	\
@@ -110,8 +110,10 @@ extern struct group_info init_groups;
 	.blocked	= {{0}},					\
 	.alloc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.proc_lock	= SPIN_LOCK_UNLOCKED,				\
-	.switch_lock	= SPIN_LOCK_UNLOCKED,				\
+	.switch_lock	= RAW_SPIN_LOCK_UNLOCKED,			\
 	.journal_info	= NULL,						\
+	.delayed_put	= LIST_HEAD_INIT(tsk.delayed_put),		\
+	.pi_waiters	= LIST_HEAD_INIT(tsk.pi_waiters),		\
 	.private_pages	= LIST_HEAD_INIT(tsk.private_pages),		\
 	.private_pages_count = 0,					\
 }
--- linux/include/linux/quota.h.orig
+++ linux/include/linux/quota.h
@@ -37,6 +37,7 @@
 
 #include <linux/errno.h>
 #include <linux/types.h>
+#include <linux/wait.h>
 #include <linux/spinlock.h>
 
 #define __DQUOT_VERSION__	"dquot_6.5.1"
--- linux/include/linux/netdevice.h.orig
+++ linux/include/linux/netdevice.h
@@ -680,6 +680,8 @@ extern int		dev_change_name(struct net_d
 extern int		dev_set_mtu(struct net_device *, int);
 extern void		dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
 
+extern rwlock_t ptype_lock;
+
 extern void		dev_init(void);
 
 extern int		netdev_nit;
--- linux/include/linux/pagemap.h.orig
+++ linux/include/linux/pagemap.h
@@ -121,20 +121,19 @@ DECLARE_PER_CPU(long, nr_pagecache_local
  * an offset in their per-cpu arena and will spill that into the
  * global count whenever the absolute value of the local count
  * exceeds the counter's threshold.
- *
- * MUST be protected from preemption.
- * current protection is mapping->page_lock.
  */
 static inline void pagecache_acct(int count)
 {
 	long *local;
 
+	preempt_disable();
 	local = &__get_cpu_var(nr_pagecache_local);
 	*local += count;
 	if (*local > PAGECACHE_ACCT_THRESHOLD || *local < -PAGECACHE_ACCT_THRESHOLD) {
 		atomic_add(*local, &nr_pagecache);
 		*local = 0;
 	}
+	preempt_enable();
 }
 
 #else
--- linux/include/linux/kernel.h.orig
+++ linux/include/linux/kernel.h
@@ -48,15 +48,23 @@ extern int console_printk[];
 
 struct completion;
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line);
-#define might_sleep() __might_sleep(__FILE__, __LINE__)
-#define might_sleep_if(cond) do { if (unlikely(cond)) might_sleep(); } while (0)
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+extern int cond_resched(void);
+# define might_resched() cond_resched()
 #else
-#define might_sleep() do {} while(0)
-#define might_sleep_if(cond) do {} while (0)
+# define might_resched() do { } while (0)
 #endif
 
+#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT)
+   void __might_sleep(char *file, int line);
+# define might_sleep() \
+	do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
+#else
+# define might_sleep() do { might_resched(); } while (0)
+#endif
+
+#define might_sleep_if(cond) do { if (unlikely(cond)) might_sleep(); } while (0)
+
 #define abs(x) ({				\
 		int __x = (x);			\
 		(__x < 0) ? -__x : __x;		\
@@ -105,6 +113,12 @@ asmlinkage int vprintk(const char *fmt, 
 asmlinkage int printk(const char * fmt, ...)
 	__attribute__ ((format (printf, 1, 2)));
 
+#ifdef CONFIG_PREEMPT_RT
+extern void zap_rt_locks(void);
+#else
+# define zap_rt_locks() do { } while (0)
+#endif
+
 unsigned long int_sqrt(unsigned long);
 
 static inline int __attribute_pure__ long_log2(unsigned long x)
@@ -145,6 +159,7 @@ extern void add_taint(unsigned);
 /* Values used for system_state */
 extern enum system_states {
 	SYSTEM_BOOTING,
+	SYSTEM_BOOTING_SCHEDULER_OK,
 	SYSTEM_RUNNING,
 	SYSTEM_HALT,
 	SYSTEM_POWER_OFF,
--- linux/include/linux/console.h.orig
+++ linux/include/linux/console.h
@@ -54,6 +54,7 @@ struct consw {
 	void	(*con_invert_region)(struct vc_data *, u16 *, int);
 	u16    *(*con_screen_pos)(struct vc_data *, int);
 	unsigned long (*con_getxy)(struct vc_data *, unsigned long, int *, int *);
+	int	con_preemptible; // can it reschedule from within printk?
 };
 
 extern const struct consw *conswitchp;
--- linux/include/linux/hardirq.h.orig
+++ linux/include/linux/hardirq.h
@@ -58,11 +58,13 @@
  * Are we doing bottom half or hardware interrupt processing?
  * Are we in a softirq context? Interrupt context?
  */
-#define in_irq()		(hardirq_count())
-#define in_softirq()		(softirq_count())
-#define in_interrupt()		(irq_count())
-
-#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
+#define in_irq()	(hardirq_count() || (current->flags & PF_HARDIRQ))
+#define in_softirq()	(softirq_count() || (current->flags & PF_SOFTIRQ))
+#define in_interrupt()	(irq_count())
+
+#if defined(CONFIG_PREEMPT) && \
+	!defined(CONFIG_PREEMPT_BKL) && \
+		!defined(CONFIG_PREEMPT_RT)
 # define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked())
 #else
 # define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != 0)
--- linux/include/linux/inetdevice.h.orig
+++ linux/include/linux/inetdevice.h
@@ -6,6 +6,7 @@
 #include <linux/if.h>
 #include <linux/netdevice.h>
 #include <linux/rcupdate.h>
+#include <linux/rtnetlink.h>
 #include <linux/timer.h>
 
 struct ipv4_devconf
@@ -145,11 +146,11 @@ in_dev_get(const struct net_device *dev)
 {
 	struct in_device *in_dev;
 
-	rcu_read_lock();
+	rcu_read_lock_down_read(&rtnl_sem);
 	in_dev = dev->ip_ptr;
 	if (in_dev)
 		atomic_inc(&in_dev->refcnt);
-	rcu_read_unlock();
+	rcu_read_unlock_up_read(&rtnl_sem);
 	return in_dev;
 }
 
--- linux/include/linux/device.h.orig
+++ linux/include/linux/device.h
@@ -102,7 +102,7 @@ struct device_driver {
 	char			* name;
 	struct bus_type		* bus;
 
-	struct semaphore	unload_sem;
+	struct completion	unload_done;
 	struct kobject		kobj;
 	struct list_head	devices;
 
--- linux/include/linux/rcupdate.h.orig
+++ linux/include/linux/rcupdate.h
@@ -190,6 +190,39 @@ static inline int rcu_pending(int cpu)
  */
 #define rcu_read_unlock()	preempt_enable()
 
+#define IGNORE_LOCK(op, lock)	do { (void)(lock); op(); } while (0)
+
+#ifdef CONFIG_PREEMPT_RT
+# define rcu_read_lock_spin(lock)	spin_lock(lock)
+# define rcu_read_unlock_spin(lock)	spin_unlock(lock)
+# define rcu_read_lock_read(lock)	read_lock(lock)
+# define rcu_read_unlock_read(lock)	read_unlock(lock)
+# define rcu_read_lock_bh_read(lock)	read_lock_bh(lock)
+# define rcu_read_unlock_bh_read(lock)	read_unlock_bh(lock)
+# define rcu_read_lock_down_read(rwsem)	down_read(rwsem)
+# define rcu_read_unlock_up_read(rwsem)	up_read(rwsem)
+# define rcu_read_lock_nort()		do { } while (0)
+# define rcu_read_unlock_nort()		do { } while (0)
+#else
+# define rcu_read_lock_spin(lock)	IGNORE_LOCK(rcu_read_lock, lock)
+# define rcu_read_unlock_spin(lock)	IGNORE_LOCK(rcu_read_unlock, lock)
+# define rcu_read_lock_read(lock)	IGNORE_LOCK(rcu_read_lock, lock)
+# define rcu_read_unlock_read(lock)	IGNORE_LOCK(rcu_read_unlock, lock)
+# define rcu_read_lock_down_read(rwsem)	IGNORE_LOCK(rcu_read_lock, rwsem)
+# define rcu_read_unlock_up_read(rwsem)	IGNORE_LOCK(rcu_read_unlock, rwsem)
+# define rcu_read_lock_nort()		rcu_read_lock()
+# define rcu_read_unlock_nort()		rcu_read_unlock()
+# define rcu_read_lock_bh_read(lock)	IGNORE_LOCK(rcu_read_lock_bh, lock)
+# define rcu_read_unlock_bh_read(lock)	IGNORE_LOCK(rcu_read_unlock_bh, lock)
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+# define rcu_read_lock_sem(lock)	down(lock)
+# define rcu_read_unlock_sem(lock)	up(lock)
+#else
+# define rcu_read_lock_sem(lock)	IGNORE_LOCK(rcu_read_lock, lock)
+# define rcu_read_unlock_sem(lock)	IGNORE_LOCK(rcu_read_unlock, lock)
+#endif
 /*
  * So where is rcu_write_lock()?  It does not exist, as there is no
  * way for writers to lock out RCU readers.  This is a feature, not
@@ -220,6 +253,13 @@ static inline int rcu_pending(int cpu)
  */
 #define rcu_read_unlock_bh()	local_bh_enable()
 
+#ifdef CONFIG_PREEMPT_RT
+# define rcu_read_lock_bh_spin(lock)	spin_lock(lock)
+# define rcu_read_unlock_bh_spin(lock)	spin_unlock(lock)
+#else
+# define rcu_read_lock_bh_spin(lock)	IGNORE_LOCK(rcu_read_lock_bh, lock)
+# define rcu_read_unlock_bh_spin(lock)	IGNORE_LOCK(rcu_read_unlock_bh, lock)
+#endif
 /**
  * rcu_dereference - fetch an RCU-protected pointer in an
  * RCU read-side critical section.  This pointer may later
--- linux/include/linux/genhd.h.orig
+++ linux/include/linux/genhd.h
@@ -135,18 +135,26 @@ struct gendisk {
  * variants disable/enable preemption.
  */
 #ifdef	CONFIG_SMP
-#define __disk_stat_add(gendiskp, field, addnd) 	\
-	(per_cpu_ptr(gendiskp->dkstats, smp_processor_id())->field += addnd)
+#define __disk_stat_add(gendiskp, field, addnd)			\
+do {								\
+	preempt_disable();					\
+	(per_cpu_ptr(gendiskp->dkstats,				\
+			smp_processor_id())->field += addnd);	\
+	preempt_enable();					\
+} while (0)
+
 
 #define disk_stat_read(gendiskp, field)					\
 ({									\
 	typeof(gendiskp->dkstats->field) res = 0;			\
 	int i;								\
+	preempt_disable();						\
 	for (i=0; i < NR_CPUS; i++) {					\
 		if (!cpu_possible(i))					\
 			continue;					\
 		res += per_cpu_ptr(gendiskp->dkstats, i)->field;	\
 	}								\
+	preempt_enable();						\
 	res;								\
 })
 
--- linux/include/linux/smp.h.orig
+++ linux/include/linux/smp.h
@@ -32,6 +32,11 @@ extern void smp_send_stop(void);
  */
 extern void smp_send_reschedule(int cpu);
 
+/*
+ * trigger a reschedule on all other CPUs:
+ */
+extern void smp_send_reschedule_allbutself(void);
+
 
 /*
  * Prepare machine for booting other CPUs.
@@ -104,6 +109,7 @@ void smp_prepare_boot_cpu(void);
 #define smp_call_function(func,info,retry,wait)	({ 0; })
 #define on_each_cpu(func,info,retry,wait)	({ func(info); 0; })
 static inline void smp_send_reschedule(int cpu) { }
+static inline void smp_send_reschedule_allbutself(void) { }
 #define num_booting_cpus()			1
 #define smp_prepare_boot_cpu()			do {} while (0)
 
--- linux/include/linux/preempt.h.orig
+++ linux/include/linux/preempt.h
@@ -9,14 +9,22 @@
 #include <linux/config.h>
 #include <linux/linkage.h>
 
-#ifdef CONFIG_DEBUG_PREEMPT
-  extern void fastcall add_preempt_count(int val);
-  extern void fastcall sub_preempt_count(int val);
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING)
+  extern void notrace add_preempt_count(int val);
+  extern void notrace sub_preempt_count(int val);
 #else
 # define add_preempt_count(val)	do { preempt_count() += (val); } while (0)
 # define sub_preempt_count(val)	do { preempt_count() -= (val); } while (0)
 #endif
 
+#ifdef CONFIG_CRITICAL_TIMING
+  extern void touch_critical_timing(void);
+  extern void stop_critical_timing(void);
+#else
+# define touch_critical_timing()	do { } while (0)
+# define stop_critical_timing()	do { } while (0)
+#endif
+
 #define inc_preempt_count() add_preempt_count(1)
 #define dec_preempt_count() sub_preempt_count(1)
 
@@ -25,6 +33,7 @@
 #ifdef CONFIG_PREEMPT
 
 asmlinkage void preempt_schedule(void);
+asmlinkage void preempt_schedule_irq(void);
 
 #define preempt_disable() \
 do { \
@@ -57,6 +66,8 @@ do { \
 #define preempt_enable()		do { } while (0)
 #define preempt_check_resched()		do { } while (0)
 
+#define preempt_schedule_irq()		do { } while (0)
+
 #endif
 
 #endif /* __LINUX_PREEMPT_H */
--- linux/include/linux/wait.h.orig
+++ linux/include/linux/wait.h
@@ -48,11 +48,13 @@ struct wait_bit_queue {
 	wait_queue_t wait;
 };
 
+#if 1
 struct __wait_queue_head {
 	spinlock_t lock;
 	struct list_head task_list;
 };
 typedef struct __wait_queue_head wait_queue_head_t;
+#endif
 
 
 /*
@@ -68,7 +70,7 @@ typedef struct __wait_queue_head wait_qu
 	wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)
 
 #define __WAIT_QUEUE_HEAD_INITIALIZER(name) {				\
-	.lock		= SPIN_LOCK_UNLOCKED,				\
+	.lock		= SPIN_LOCK_UNLOCKED,			\
 	.task_list	= { &(name).task_list, &(name).task_list } }
 
 #define DECLARE_WAIT_QUEUE_HEAD(name) \
@@ -79,7 +81,7 @@ typedef struct __wait_queue_head wait_qu
 
 static inline void init_waitqueue_head(wait_queue_head_t *q)
 {
-	q->lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&q->lock);
 	INIT_LIST_HEAD(&q->task_list);
 }
 
--- linux/include/linux/pagevec.h.orig
+++ linux/include/linux/pagevec.h
@@ -6,7 +6,7 @@
  */
 
 /* 14 pointers + two long's align the pagevec structure to a power of two */
-#define PAGEVEC_SIZE	14
+#define PAGEVEC_SIZE	8
 
 struct page;
 struct address_space;
--- linux/include/linux/smp_lock.h.orig
+++ linux/include/linux/smp_lock.h
@@ -20,6 +20,7 @@ extern void __lockfunc __release_kernel_
 		__release_kernel_lock();	\
 } while (0)
 
+
 /*
  * Non-SMP kernels will never block on the kernel lock,
  * so we are better off returning a constant zero from
@@ -47,7 +48,7 @@ extern void __lockfunc unlock_kernel(voi
 #define lock_kernel()				do { } while(0)
 #define unlock_kernel()				do { } while(0)
 #define release_kernel_lock(task)		do { } while(0)
-#define reacquire_kernel_lock(task)		0
+#define reacquire_kernel_lock(task)		do { } while(0)
 #define kernel_locked()				1
 
 #endif /* CONFIG_LOCK_KERNEL */
--- linux/include/linux/sched.h.orig
+++ linux/include/linux/sched.h
@@ -23,6 +23,121 @@
 #include <asm/mmu.h>
 #include <asm/cputime.h>
 
+#ifdef CONFIG_PREEMPT
+extern int kernel_preemption;
+#else
+# define kernel_preemption 0
+#endif
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+extern int voluntary_preemption;
+#else
+# define voluntary_preemption 0
+#endif
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+extern int softirq_preemption;
+#else
+# define softirq_preemption 0
+#endif
+#ifdef CONFIG_PREEMPT_HARDIRQS
+extern int hardirq_preemption;
+extern void direct_timer_interrupt(struct pt_regs *regs);
+#else
+# define hardirq_preemption 0
+# define direct_timer_interrupt(regs) do { } while (0)
+#endif
+
+#ifdef CONFIG_PREEMPT_BKL
+extern struct semaphore kernel_sem;
+#endif
+
+extern int debug_direct_keyboard;
+
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+  extern void deadlock_trace_off(void);
+#else
+# define deadlock_trace_off()			do { } while (0)
+#endif
+
+#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_LATENCY_TRACE)
+  extern void print_traces(struct task_struct *task);
+#else
+# define print_traces(task)			do { } while (0)
+#endif
+
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+ extern void check_no_held_locks(struct task_struct *task);
+ extern void show_all_locks(void);
+#else
+# define check_no_held_locks(task)		do { } while (0)
+# define show_all_locks()			do { } while (0)
+#endif
+
+#ifdef CONFIG_FRAME_POINTER
+# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
+#else
+# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+# define CALLER_ADDR1 0UL
+# define CALLER_ADDR2 0UL
+#endif
+
+#ifdef CONFIG_MCOUNT
+  extern void notrace mcount(void);
+#else
+# define mcount() do { } while (0)
+#endif
+
+#ifdef CONFIG_LATENCY_TRACE
+  extern int mcount_enabled, trace_enabled, trace_user_triggered,
+		trace_freerunning, trace_verbose, trace_print_at_crash,
+		trace_all_cpus;
+  extern void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3);
+  extern void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2);
+  extern void stop_trace(void);
+  extern void print_last_trace(void);
+  extern void nmi_trace(unsigned long eip, unsigned long parent_eip,
+			unsigned long flags);
+  extern long user_trace_start(void);
+  extern long user_trace_stop(void);
+  extern void trace_cmdline(void);
+#else
+# define mcount_enabled				0
+# define trace_enabled				0
+# define trace_user_triggered			0
+# define trace_freerunning			0
+# define trace_all_cpus				0
+# define trace_verbose				0
+# define trace_special(v1,v2,v3)		do { } while (0)
+# define trace_special_pid(pid,v1,v2)		do { } while (0)
+# define stop_trace()				do { } while (0)
+# define print_last_trace()			do { } while (0)
+# define nmi_trace(eip, parent_eip, flags)	do { } while (0)
+# define user_trace_start()			do { } while (0)
+# define user_trace_stop()			do { } while (0)
+# define trace_cmdline()			do { } while (0)
+#endif
+
+#ifdef CONFIG_WAKEUP_TIMING
+  extern int wakeup_timing;
+  extern void __trace_start_sched_wakeup(struct task_struct *p);
+  extern void trace_stop_sched_switched(struct task_struct *p);
+  extern void trace_change_sched_cpu(struct task_struct *p, int new_cpu);
+#else
+# define wakeup_timing 0
+# define __trace_start_sched_wakeup(p)		do { } while (0)
+# define trace_stop_sched_switched(p)		do { } while (0)
+# define trace_change_sched_cpu(p, cpu)		do { } while (0)
+#endif
+
+// #define PREEMPT_DIRECT
+
+#ifdef CONFIG_X86_LOCAL_APIC
+extern void nmi_show_all_regs(void);
+#else
+# define nmi_show_all_regs() do { } while (0)
+#endif
+
 #include <linux/smp.h>
 #include <linux/sem.h>
 #include <linux/signal.h>
@@ -108,12 +223,13 @@ extern unsigned long nr_iowait(void);
 #include <asm/processor.h>
 
 #define TASK_RUNNING		0
-#define TASK_INTERRUPTIBLE	1
-#define TASK_UNINTERRUPTIBLE	2
-#define TASK_STOPPED		4
-#define TASK_TRACED		8
-#define EXIT_ZOMBIE		16
-#define EXIT_DEAD		32
+#define TASK_RUNNING_MUTEX	1
+#define TASK_INTERRUPTIBLE	2
+#define TASK_UNINTERRUPTIBLE	4
+#define TASK_STOPPED		8
+#define TASK_TRACED		16
+#define EXIT_ZOMBIE		32
+#define EXIT_DEAD		64
 
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
@@ -184,6 +300,11 @@ extern int in_sched_functions(unsigned l
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
 extern signed long FASTCALL(schedule_timeout(signed long timeout));
 asmlinkage void schedule(void);
+/*
+ * This one can be called with interrupts disabled, only
+ * to be used by lowlevel arch code!
+ */
+extern void __sched __schedule(void);
 
 struct namespace;
 
@@ -242,6 +363,9 @@ struct mm_struct {
 	/* Architecture-specific MM context */
 	mm_context_t context;
 
+	/* realtime bits */
+	struct list_head	delayed_drop;
+
 	/* Token based thrashing protection. */
 	unsigned long swap_token_time;
 	char recent_pagein;
@@ -360,6 +484,7 @@ struct signal_struct {
 
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
 
+#define rt_prio(prio)		((prio) < MAX_RT_PRIO)
 #define rt_task(p)		(unlikely((p)->prio < MAX_RT_PRIO))
 
 /*
@@ -655,7 +780,7 @@ struct task_struct {
 /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
 	spinlock_t proc_lock;
 /* context-switch lock */
-	spinlock_t switch_lock;
+	raw_spinlock_t switch_lock;
 
 /* journalling filesystem info */
 	void *journal_info;
@@ -694,6 +819,24 @@ struct task_struct {
 	int cpuset_mems_generation;
 #endif
 
+#define MAX_PREEMPT_TRACE 16
+
+#ifdef CONFIG_PREEMPT_TRACE
+	unsigned long preempt_trace_eip[MAX_PREEMPT_TRACE];
+	unsigned long preempt_trace_parent_eip[MAX_PREEMPT_TRACE];
+#endif
+
+	/* realtime bits */
+	struct list_head delayed_put;
+	struct list_head pi_waiters;
+
+	/* RT deadlock detection and priority inheritance handling */
+	struct rt_mutex_waiter *blocked_on;
+
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+	void *last_kernel_lock;
+#endif
+
 	struct list_head private_pages;	/* per-process private pages */
 	int private_pages_count;
 };
@@ -717,10 +860,9 @@ static inline int pid_alive(struct task_
 }
 
 extern void free_task(struct task_struct *tsk);
-extern void __put_task_struct(struct task_struct *tsk);
-#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
-#define put_task_struct(tsk) \
-do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
+extern void get_task_struct(struct task_struct *tsk);
+extern void put_task_struct(struct task_struct *tsk);
+extern void put_task_struct_delayed(struct task_struct *tsk);
 
 /*
  * Per process flags
@@ -747,6 +889,9 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
+#define PF_SOFTIRQ	0x00800000      /* softirq context */
+#define PF_HARDIRQ	0x01000000      /* hardirq context */
+#define PF_NOSCHED	0x02000000      /* no voluntary scheduling */
 
 #ifdef CONFIG_SMP
 extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
@@ -774,6 +919,8 @@ extern int task_prio(const task_t *p);
 extern int task_nice(const task_t *p);
 extern int task_curr(const task_t *p);
 extern int idle_cpu(int cpu);
+extern void mutex_setprio(task_t *p, int prio);
+extern int mutex_getprio(task_t *p);
 
 void yield(void);
 
@@ -823,6 +970,7 @@ extern void do_timer(struct pt_regs *);
 
 extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
 extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+extern int FASTCALL(wake_up_process_mutex(struct task_struct * tsk));
 extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
 						unsigned long clone_flags));
 #ifdef CONFIG_SMP
@@ -919,12 +1067,20 @@ extern struct mm_struct * mm_alloc(void)
 
 /* mmdrop drops the mm and the page tables */
 extern void FASTCALL(__mmdrop(struct mm_struct *));
+extern void FASTCALL(__mmdrop_delayed(struct mm_struct *));
+
 static inline void mmdrop(struct mm_struct * mm)
 {
 	if (atomic_dec_and_test(&mm->mm_count))
 		__mmdrop(mm);
 }
 
+static inline void mmdrop_delayed(struct mm_struct * mm)
+{
+	if (atomic_dec_and_test(&mm->mm_count))
+		__mmdrop_delayed(mm);
+}
+
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
 /* Grab a reference to a task's mm, if it is not already going away */
@@ -1073,29 +1229,24 @@ static inline int signal_pending(struct 
 {
 	return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
 }
-  
-static inline int need_resched(void)
+
+static inline int _need_resched(void)
 {
 	return unlikely(test_thread_flag(TIF_NEED_RESCHED));
 }
 
-/*
- * cond_resched() and cond_resched_lock(): latency reduction via
- * explicit rescheduling in places that are safe. The return
- * value indicates whether a reschedule was done in fact.
- * cond_resched_lock() will drop the spinlock before scheduling,
- * cond_resched_softirq() will enable bhs before scheduling.
- */
-extern int cond_resched(void);
-extern int cond_resched_lock(spinlock_t * lock);
-extern int cond_resched_softirq(void);
+static inline int need_resched(void)
+{
+	touch_critical_timing();
+	return _need_resched();
+}
 
 /*
  * Does a critical section need to be broken due to another
  * task waiting?:
  */
 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
-# define need_lockbreak(lock) ((lock)->break_lock)
+# define need_lockbreak(lock) ({ int __need = ((lock)->break_lock); if (__need) (lock)->break_lock = 0; __need; })
 #else
 # define need_lockbreak(lock) 0
 #endif
@@ -1104,13 +1255,51 @@ extern int cond_resched_softirq(void);
  * Does a critical section need to be broken due to another
  * task waiting or preemption being signalled:
  */
-static inline int lock_need_resched(spinlock_t *lock)
+#define lock_need_resched(lock) \
+	unlikely(need_lockbreak(lock) || need_resched())
+
+static inline int softirq_need_resched(void)
 {
-	if (need_lockbreak(lock) || need_resched())
-		return 1;
+	if (softirq_preemption)
+		return need_resched();
 	return 0;
 }
 
+static inline int hardirq_need_resched(void)
+{
+	if (current->flags & PF_HARDIRQ)
+		return need_resched();
+	return 0;
+}
+
+/*
+ * cond_resched() and cond_resched_lock(): latency reduction via
+ * explicit rescheduling in places that are safe. The return
+ * value indicates whether a reschedule was done in fact.
+ * cond_resched_lock() will drop the spinlock before scheduling,
+ * cond_resched_softirq() will enable bhs before scheduling.
+ */
+extern int cond_resched(void);
+extern int __cond_resched_raw_spinlock(raw_spinlock_t *lock);
+extern int __cond_resched_spinlock(spinlock_t *spinlock);
+
+#define cond_resched_lock(lock) \
+({								\
+	int __ret;						\
+	/* pick the helper matching the lock argument's type */	\
+	if (TYPE_EQUAL((lock), raw_spinlock_t))			\
+		__ret = __cond_resched_raw_spinlock((raw_spinlock_t *)(lock));\
+	else if (TYPE_EQUAL((lock), spinlock_t))		\
+		__ret = __cond_resched_spinlock((spinlock_t *)(lock)); \
+	else __ret = __bad_spinlock_type();			\
+								\
+	__ret;							\
+})
+
+extern int cond_resched_softirq(void);
+extern int cond_resched_hardirq(void);
+extern int cond_resched_all(void);
+
 /* Reevaluate whether the task has signals pending delivery.
    This is required every time the blocked sigset_t changes.
    callers must hold sighand->siglock.  */
@@ -1132,6 +1321,7 @@ static inline unsigned int task_cpu(cons
 
 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
+	trace_change_sched_cpu(p, cpu);
 	p->thread_info->cpu = cpu;
 }
 
--- linux/include/linux/netfilter_ipv4/ip_conntrack.h.orig
+++ linux/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -315,7 +315,12 @@ struct ip_conntrack_stat
 	unsigned int expect_delete;
 };
 
-#define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++)
+#define CONNTRACK_STAT_INC(count) \
+do { \
+	preempt_disable(); \
+	__get_cpu_var(ip_conntrack_stat).count++; \
+	preempt_enable(); \
+} while (0)
 
 /* eg. PROVIDES_CONNTRACK(ftp); */
 #define PROVIDES_CONNTRACK(name)                        \
--- linux/include/linux/rtnetlink.h.orig
+++ linux/include/linux/rtnetlink.h
@@ -795,12 +795,12 @@ __rta_reserve(struct sk_buff *skb, int a
 
 extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change);
 
-extern struct semaphore rtnl_sem;
+extern struct rw_semaphore rtnl_sem;
 
-#define rtnl_shlock()		down(&rtnl_sem)
-#define rtnl_shlock_nowait()	down_trylock(&rtnl_sem)
+#define rtnl_shlock()		down_write(&rtnl_sem)
+#define rtnl_shlock_nowait()	(!down_write_trylock(&rtnl_sem))
 
-#define rtnl_shunlock()	do { up(&rtnl_sem); \
+#define rtnl_shunlock()	do { up_write(&rtnl_sem); \
 		             if (rtnl && rtnl->sk_receive_queue.qlen) \
 				     rtnl->sk_data_ready(rtnl, 0); \
 		        } while(0)
@@ -810,8 +810,8 @@ extern void rtnl_unlock(void);
 extern void rtnetlink_init(void);
 
 #define ASSERT_RTNL() do { \
-	if (unlikely(down_trylock(&rtnl_sem) == 0)) { \
-		up(&rtnl_sem); \
+	if (unlikely(down_write_trylock(&rtnl_sem) != 0)) { \
+		up_write(&rtnl_sem); \
 		printk(KERN_ERR "RTNL: assertion failed at %s (%d)\n", \
 		       __FILE__,  __LINE__); \
 		dump_stack(); \
--- linux/include/linux/rt_lock.h.orig
+++ linux/include/linux/rt_lock.h
@@ -0,0 +1,272 @@
+#ifndef __LINUX_RT_LOCK_H
+#define __LINUX_RT_LOCK_H
+
+#include <linux/config.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
+
+/*
+ * These are the basic SMP spinlocks, allowing only a single CPU anywhere.
+ * We use a generic definition on all architectures.
+ */
+
+#ifdef CONFIG_SMP
+typedef struct {
+	volatile unsigned long lock;
+# ifdef CONFIG_DEBUG_SPINLOCK
+	unsigned int magic;
+# endif
+# ifdef CONFIG_PREEMPT
+	unsigned int break_lock;
+# endif
+} raw_spinlock_t;
+#else
+  typedef struct { } raw_spinlock_t;
+# define __RAW_SPIN_LOCK_UNLOCKED { }
+# define RAW_SPIN_LOCK_UNLOCKED (raw_spinlock_t) __RAW_SPIN_LOCK_UNLOCKED
+#endif
+
+/*
+ * Read-write spinlocks, allowing multiple readers
+ * but only one writer.
+ */
+#ifdef CONFIG_SMP
+typedef struct {
+	volatile unsigned long lock;
+# ifdef CONFIG_DEBUG_SPINLOCK
+	unsigned magic;
+# endif
+# ifdef CONFIG_PREEMPT
+	unsigned int break_lock;
+# endif
+} raw_rwlock_t;
+#else
+  typedef struct { } raw_rwlock_t;
+# define __RAW_RW_LOCK_UNLOCKED { }
+# define RAW_RW_LOCK_UNLOCKED (raw_rwlock_t) __RAW_RW_LOCK_UNLOCKED
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+
+/*
+ * This is the core locking object used by PREEMPT_RT.
+ * This one handles all the logic necessary, the other locking
+ * objects (spinlocks, rwlocks, semaphores and rw-semaphores)
+ * all use this synchronization object internally:
+ */
+struct rt_mutex {
+	raw_spinlock_t		wait_lock;
+	struct list_head	wait_list;
+	struct task_struct	*owner;
+	int			owner_prio;
+# ifdef CONFIG_RT_DEADLOCK_DETECT
+	int			debug;
+	int			save_state;
+	struct list_head	held_list;
+	unsigned long		acquire_eip;
+	char 			*name, *file;
+	int			line;
+# endif
+};
+
+/*
+ * This is the control structure for tasks blocked on an
+ * RT mutex:
+ */
+struct rt_mutex_waiter {
+	struct rt_mutex *lock;
+	struct list_head list;
+	struct list_head pi_list;
+	struct task_struct *task;
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+	unsigned long eip;
+#endif
+};
+
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+# define ___RT_MUTEX_INITIALIZER(lockname) \
+	.wait_lock = RAW_SPIN_LOCK_UNLOCKED, \
+	.wait_list = LIST_HEAD_INIT((lockname).wait_list), \
+	.name = #lockname, .file = __FILE__, .line = __LINE__
+# define __RT_MUTEX_INITIALIZER(lockname) \
+	{ .debug = 1, ___RT_MUTEX_INITIALIZER(lockname) }
+# define __RT_MUTEX_INITIALIZER_NOCHECK(lockname) \
+	{ .debug = 0, ___RT_MUTEX_INITIALIZER(lockname) }
+#else
+# define __RT_MUTEX_INITIALIZER(lockname) /* unlocked rt_mutex, empty wait list */ \
+	{ .wait_lock = RAW_SPIN_LOCK_UNLOCKED, \
+	  .wait_list = LIST_HEAD_INIT((lockname).wait_list) }
+# define __RT_MUTEX_INITIALIZER_NOCHECK(lockname) \
+		__RT_MUTEX_INITIALIZER(lockname)
+#endif
+/*
+ * RW-semaphores are an RT mutex plus a reader-depth count.
+ *
+ * Note that the semantics are different from the usual
+ * Linux rw-sems, in PREEMPT_RT mode we do not allow
+ * multiple readers to hold the lock at once, we only allow
+ * a read-lock owner to read-lock recursively. This is
+ * better for latency, makes the implementation inherently
+ * fair and makes it simpler as well:
+ */
+struct rw_semaphore {
+	struct rt_mutex		lock;
+	int			read_depth;
+};
+
+/*
+ * rwlocks - an RW semaphore plus lock-break field:
+ */
+typedef struct {
+	struct rw_semaphore	lock;
+	unsigned int		break_lock;
+} rwlock_t;
+
+# ifdef CONFIG_RT_DEADLOCK_DETECT
+#  define __RW_LOCK_UNLOCKED /* rt_mutex fields for an unlocked rwlock */ \
+	.wait_lock = __RAW_SPIN_LOCK_UNLOCKED, .save_state = 1, \
+	.debug = 1, .file = __FILE__, .line = __LINE__
+#  define _RW_LOCK_UNLOCKED(lock) \
+	(rwlock_t) { { { __RW_LOCK_UNLOCKED, .name = #lock } } }
+#  define RW_LOCK_UNLOCKED \
+	(rwlock_t) { { { __RW_LOCK_UNLOCKED } } }
+# else
+#  define RW_LOCK_UNLOCKED (rwlock_t) \
+	{ { { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED } } }
+#  define _RW_LOCK_UNLOCKED(lock) RW_LOCK_UNLOCKED
+# endif
+#else /* !PREEMPT_RT */
+  typedef raw_rwlock_t rwlock_t;
+# define _RW_LOCK_UNLOCKED(lock)	RAW_RW_LOCK_UNLOCKED
+# define RW_LOCK_UNLOCKED		RAW_RW_LOCK_UNLOCKED
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+
+/*
+ * spinlocks - an RT mutex plus lock-break field:
+ */
+typedef struct {
+	struct rt_mutex lock;
+	unsigned int break_lock;
+} spinlock_t;
+
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+# define __SPIN_LOCK_UNLOCKED \
+	.wait_lock = __RAW_SPIN_LOCK_UNLOCKED, \
+	.save_state = 1, .debug = 1, .file = __FILE__, .line = __LINE__
+# define _SPIN_LOCK_UNLOCKED(lock) \
+	(spinlock_t) { { __SPIN_LOCK_UNLOCKED, .name = #lock } }
+# define SPIN_LOCK_UNLOCKED \
+	(spinlock_t) { { __SPIN_LOCK_UNLOCKED } }
+#else
+# define SPIN_LOCK_UNLOCKED \
+	(spinlock_t) { { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED } }
+# define _SPIN_LOCK_UNLOCKED(lock) SPIN_LOCK_UNLOCKED
+#endif
+#else /* !PREEMPT_RT */
+  typedef raw_spinlock_t spinlock_t;
+# define _SPIN_LOCK_UNLOCKED(lock)	RAW_SPIN_LOCK_UNLOCKED
+# define SPIN_LOCK_UNLOCKED		RAW_SPIN_LOCK_UNLOCKED
+#endif
+
+
+#ifdef CONFIG_PREEMPT_RT
+
+/*
+ * semaphores - an RT-mutex plus the semaphore count:
+ */
+struct semaphore {
+	atomic_t count;
+	struct rt_mutex lock;
+};
+
+/*
+ * Semaphores:
+ */
+#define DECLARE_MUTEX(name) \
+struct semaphore name = \
+	{ .count = { 1 }, .lock = __RT_MUTEX_INITIALIZER(name.lock) }
+
+#define DECLARE_MUTEX_NOCHECK(name) \
+struct semaphore name = \
+	{ .count = { 1 }, .lock = __RT_MUTEX_INITIALIZER_NOCHECK(name.lock) }
+
+/*
+ * DECLARE_MUTEX_LOCKED() is deprecated: very hard to initialize properly
+ * and it also often signals abuse of semaphores.
+ */
+
+extern void FASTCALL(__sema_init(struct semaphore *sem, int val, int debug, char *name, char *file, int line));
+
+#define sema_init(sem, val) \
+		__sema_init(sem, val, 1, #sem, __FILE__, __LINE__)
+#define sema_init_nocheck(sem, val) \
+		__sema_init(sem, val, 0, #sem, __FILE__, __LINE__)
+	
+extern void FASTCALL(__init_MUTEX(struct semaphore *sem, char *name, char *file, int line));
+extern void FASTCALL(__init_MUTEX_LOCKED(struct semaphore *sem, char *name, char *file, int line));
+#define init_MUTEX(sem) \
+		__init_MUTEX(sem, #sem, __FILE__, __LINE__)
+#define init_MUTEX_LOCKED(sem) \
+		__init_MUTEX_LOCKED(sem, #sem, __FILE__, __LINE__)
+extern void FASTCALL(down(struct semaphore * sem));
+extern int FASTCALL(down_interruptible(struct semaphore * sem));
+extern int FASTCALL(down_trylock(struct semaphore * sem));
+extern void FASTCALL(up(struct semaphore * sem));
+extern int FASTCALL(sem_is_locked(struct semaphore *sem));
+extern int FASTCALL(sema_count(struct semaphore * sem));
+
+
+#define __RWSEM_INITIALIZER(lockname) \
+	{ .lock = __RT_MUTEX_INITIALIZER(lockname.lock) }
+#define __RWSEM_INITIALIZER_NOCHECK(lockname) \
+	{ .lock = __RT_MUTEX_INITIALIZER_NOCHECK(lockname.lock) }
+
+#define DECLARE_RWSEM(lockname) \
+	struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
+
+extern void FASTCALL(__init_rwsem(struct rw_semaphore *rwsem, int mutex,
+				int debug, char *name, char *file, int line));
+
+#define init_rwsem(sem) __init_rwsem(sem, 0, 1, #sem, __FILE__, __LINE__)
+
+extern void FASTCALL(down_read(struct rw_semaphore *rwsem));
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+extern int FASTCALL(down_read_trylock(struct rw_semaphore *rwsem));
+
+/*
+ * lock for writing
+ */
+extern void FASTCALL(down_write(struct rw_semaphore *rwsem));
+extern int FASTCALL(down_write_interruptible(struct rw_semaphore *rwsem));
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+extern int FASTCALL(down_write_trylock(struct rw_semaphore *rwsem));
+
+/*
+ * release a read lock
+ */
+extern void FASTCALL(up_read(struct rw_semaphore *rwsem));
+
+/*
+ * release a write lock
+ */
+extern void FASTCALL(up_write(struct rw_semaphore *rwsem));
+
+/*
+ * downgrade write lock to read lock
+ */
+extern void FASTCALL(downgrade_write(struct rw_semaphore *rwsem));
+
+extern int FASTCALL(rwsem_is_locked(struct rw_semaphore *rwsem));
+
+#endif /* CONFIG_PREEMPT_RT */
+
+#endif
+
--- linux/include/asm-i386/tlb.h.orig
+++ linux/include/asm-i386/tlb.h
@@ -13,8 +13,17 @@
  * .. because we flush the whole mm when it
  * fills up.
  */
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+#define tlb_flush(tlb) flush_tlb_mm(tlb_mm(tlb))
 
-#include <asm-generic/tlb.h>
+/*
+ * The mutex based kernel can preempt anytime so the per-CPU
+ * gather structures don't really fit. Fortunately TLB flushing
+ * is really simple on x86 ...
+ */
+#ifndef CONFIG_PREEMPT_RT
+# include <asm-generic/tlb.h>
+#else
+# include <asm-generic/tlb-simple.h>
+#endif
 
 #endif
--- linux/include/asm-i386/tlbflush.h.orig
+++ linux/include/asm-i386/tlbflush.h
@@ -5,15 +5,32 @@
 #include <linux/mm.h>
 #include <asm/processor.h>
 
+/*
+ * TLB-flush needs to be nonpreemptible on PREEMPT_RT due to the
+ * following complex race scenario:
+ *
+ * if the current task is lazy-TLB and does a TLB flush and
+ * gets preempted after the movl %%cr3, %0 but before the
+ * movl %0, %%cr3 then its ->active_mm might change and it will
+ * install the wrong cr3 when it switches back. This is not a
+ * problem for the lazy-TLB task itself, but if the next task it
+ * switches to has an ->mm that is also the lazy-TLB task's
+ * new ->active_mm, then the scheduler will assume that cr3 is
+ * the new one, while we overwrote it with the old one. The result
+ * is the wrong cr3 in the new (non-lazy-TLB) task, which typically
+ * causes an infinite pagefault upon the next userspace access.
+ */
 #define __flush_tlb()							\
 	do {								\
 		unsigned int tmpreg;					\
 									\
+		preempt_disable();					\
 		__asm__ __volatile__(					\
 			"movl %%cr3, %0;              \n"		\
 			"movl %0, %%cr3;  # flush TLB \n"		\
 			: "=r" (tmpreg)					\
 			:: "memory");					\
+		preempt_enable();					\
 	} while (0)
 
 /*
@@ -24,6 +41,7 @@
 	do {								\
 		unsigned int tmpreg;					\
 									\
+		preempt_disable();					\
 		__asm__ __volatile__(					\
 			"movl %1, %%cr4;  # turn off PGE     \n"	\
 			"movl %%cr3, %0;                     \n"	\
@@ -33,6 +51,7 @@
 			: "r" (mmu_cr4_features & ~X86_CR4_PGE),	\
 			  "r" (mmu_cr4_features)			\
 			: "memory");					\
+		preempt_enable();					\
 	} while (0)
 
 extern unsigned long pgkern_mask;
@@ -85,6 +104,13 @@ extern unsigned long pgkern_mask;
 
 static inline void flush_tlb_mm(struct mm_struct *mm)
 {
+	/*
+	 * This is safe on PREEMPT_RT because if we preempt
+	 * right after the check but before the __flush_tlb(),
+	 * and if ->active_mm changes, then we might miss a
+	 * TLB flush, but that TLB flush happened already when
+	 * ->active_mm was changed:
+	 */
 	if (mm == current->active_mm)
 		__flush_tlb();
 }
--- linux/include/asm-i386/i8259.h.orig
+++ linux/include/asm-i386/i8259.h
@@ -7,7 +7,7 @@ extern unsigned int cached_irq_mask;
 #define cached_master_mask	(__byte(0, cached_irq_mask))
 #define cached_slave_mask	(__byte(1, cached_irq_mask))
 
-extern spinlock_t i8259A_lock;
+extern raw_spinlock_t i8259A_lock;
 
 extern void init_8259A(int auto_eoi);
 extern void enable_8259A_irq(unsigned int irq);
--- linux/include/asm-i386/system.h.orig
+++ linux/include/asm-i386/system.h
@@ -518,23 +518,36 @@ struct alt_instr { 
 
 #define set_wmb(var, value) do { var = value; wmb(); } while (0)
 
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+  extern void notrace trace_irqs_off(void);
+  extern void notrace trace_irqs_on(void);
+#else
+# define trace_irqs_off()		do { } while (0)
+# define trace_irqs_on()		do { } while (0)
+#endif
+
 /* interrupt control.. */
 #define local_save_flags(x)	do { typecheck(unsigned long,x); __asm__ __volatile__("pushfl ; popl %0":"=g" (x): /* no input */); } while (0)
-#define local_irq_restore(x) 	do { typecheck(unsigned long,x); __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory", "cc"); } while (0)
-#define local_irq_disable() 	__asm__ __volatile__("cli": : :"memory")
-#define local_irq_enable()	__asm__ __volatile__("sti": : :"memory")
+#define local_irq_restore(x) 	do { typecheck(unsigned long,x); if (irqs_disabled_flags(x)) trace_irqs_off(); else trace_irqs_on(); __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory", "cc"); } while (0) /* trace the state being restored: off if x has irqs disabled, on otherwise */
+#define local_irq_disable() 	do { __asm__ __volatile__("cli": : :"memory"); trace_irqs_off(); } while (0)
+#define local_irq_enable()	do { trace_irqs_on(); __asm__ __volatile__("sti": : :"memory"); } while (0)
 /* used in the idle loop; sti takes one instruction cycle to complete */
-#define safe_halt()		__asm__ __volatile__("sti; hlt": : :"memory")
+#define safe_halt()		do { trace_irqs_on(); __asm__ __volatile__("sti; hlt": : :"memory"); } while (0)
+
+#define irqs_disabled_flags(flags)	\
+({	/* nonzero when bit 9 (x86 EFLAGS.IF) of the saved flags is clear */ \
+	!((flags) & (1<<9));		\
+})
 
 #define irqs_disabled()			\
 ({					\
 	unsigned long flags;		\
 	local_save_flags(flags);	\
-	!(flags & (1<<9));		\
+	irqs_disabled_flags(flags);	\
 })
 
 /* For spinlocks etc */
-#define local_irq_save(x)	__asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory")
+#define local_irq_save(x)	do { __asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory"); trace_irqs_off(); } while (0)
 
 /*
  * disable hlt during certain critical i/o operations
--- linux/include/asm-i386/spinlock.h.orig
+++ linux/include/asm-i386/spinlock.h
@@ -5,25 +5,9 @@
 #include <asm/rwlock.h>
 #include <asm/page.h>
 #include <linux/config.h>
+#include <linux/list.h>
 #include <linux/compiler.h>
 
-asmlinkage int printk(const char * fmt, ...)
-	__attribute__ ((format (printf, 1, 2)));
-
-/*
- * Your basic SMP spinlocks, allowing only a single CPU anywhere
- */
-
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	unsigned magic;
-#endif
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
-
 #define SPINLOCK_MAGIC	0xdead4ead
 
 #ifdef CONFIG_DEBUG_SPINLOCK
@@ -32,9 +16,10 @@ typedef struct {
 #define SPINLOCK_MAGIC_INIT	/* */
 #endif
 
-#define SPIN_LOCK_UNLOCKED (spinlock_t) { 1 SPINLOCK_MAGIC_INIT }
+#define __RAW_SPIN_LOCK_UNLOCKED { 1 SPINLOCK_MAGIC_INIT }
+#define RAW_SPIN_LOCK_UNLOCKED (raw_spinlock_t) __RAW_SPIN_LOCK_UNLOCKED
 
-#define spin_lock_init(x)	do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
+#define __raw_spin_lock_init(x)	do { *(x) = RAW_SPIN_LOCK_UNLOCKED; } while(0)
 
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
@@ -43,8 +28,9 @@ typedef struct {
  * We make no fairness assumptions. They have a cost.
  */
 
-#define spin_is_locked(x)	(*(volatile signed char *)(&(x)->lock) <= 0)
-#define spin_unlock_wait(x)	do { barrier(); } while(spin_is_locked(x))
+#define __raw_spin_is_locked(x)	(*(volatile signed char *)(&(x)->lock) <= 0)
+#define __raw_spin_unlock_wait(x) \
+		do { barrier(); } while (__raw_spin_is_locked(x))
 
 #define spin_lock_string \
 	"\n1:\t" \
@@ -86,11 +72,11 @@ typedef struct {
 		:"=m" (lock->lock) : : "memory"
 
 
-static inline void _raw_spin_unlock(spinlock_t *lock)
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK
 	BUG_ON(lock->magic != SPINLOCK_MAGIC);
-	BUG_ON(!spin_is_locked(lock));
+	BUG_ON(!__raw_spin_is_locked(lock));
 #endif
 	__asm__ __volatile__(
 		spin_unlock_string
@@ -104,12 +90,12 @@ static inline void _raw_spin_unlock(spin
 		:"=q" (oldval), "=m" (lock->lock) \
 		:"0" (oldval) : "memory"
 
-static inline void _raw_spin_unlock(spinlock_t *lock)
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
 	char oldval = 1;
 #ifdef CONFIG_DEBUG_SPINLOCK
 	BUG_ON(lock->magic != SPINLOCK_MAGIC);
-	BUG_ON(!spin_is_locked(lock));
+	BUG_ON(!__raw_spin_is_locked(lock));
 #endif
 	__asm__ __volatile__(
 		spin_unlock_string
@@ -118,7 +104,7 @@ static inline void _raw_spin_unlock(spin
 
 #endif
 
-static inline int _raw_spin_trylock(spinlock_t *lock)
+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
 {
 	char oldval;
 	__asm__ __volatile__(
@@ -128,7 +114,7 @@ static inline int _raw_spin_trylock(spin
 	return oldval > 0;
 }
 
-static inline void _raw_spin_lock(spinlock_t *lock)
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK
 	if (unlikely(lock->magic != SPINLOCK_MAGIC)) {
@@ -141,7 +127,7 @@ static inline void _raw_spin_lock(spinlo
 		:"=m" (lock->lock) : : "memory");
 }
 
-static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
+static inline void __raw_spin_lock_flags (raw_spinlock_t *lock, unsigned long flags)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK
 	if (unlikely(lock->magic != SPINLOCK_MAGIC)) {
@@ -154,26 +140,6 @@ static inline void _raw_spin_lock_flags 
 		:"=m" (lock->lock) : "r" (flags) : "memory");
 }
 
-/*
- * Read-write spinlocks, allowing multiple readers
- * but only one writer.
- *
- * NOTE! it is quite common to have readers in interrupts
- * but no interrupt writers. For those circumstances we
- * can "mix" irq-safe locks - any writer needs to get a
- * irq-safe write-lock, but readers can get non-irqsafe
- * read-locks.
- */
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	unsigned magic;
-#endif
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
-
 #define RWLOCK_MAGIC	0xdeaf1eed
 
 #ifdef CONFIG_DEBUG_SPINLOCK
@@ -182,11 +148,12 @@ typedef struct {
 #define RWLOCK_MAGIC_INIT	/* */
 #endif
 
-#define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT }
+#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS RWLOCK_MAGIC_INIT }
+#define RAW_RW_LOCK_UNLOCKED (raw_rwlock_t) __RAW_RW_LOCK_UNLOCKED
 
-#define rwlock_init(x)	do { *(x) = RW_LOCK_UNLOCKED; } while(0)
+#define __raw_rwlock_init(x) do { *(x) = RAW_RW_LOCK_UNLOCKED; } while(0)
 
-#define rwlock_is_locked(x) ((x)->lock != RW_LOCK_BIAS)
+#define __raw_rwlock_is_locked(x) 	((x)->lock != RW_LOCK_BIAS)
 
 /*
  * On x86, we implement read-write locks as a 32-bit counter
@@ -199,7 +166,7 @@ typedef struct {
  */
 /* the spinlock helpers are in arch/i386/kernel/semaphore.c */
 
-static inline void _raw_read_lock(rwlock_t *rw)
+static inline void __raw_read_lock(raw_rwlock_t *rw)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK
 	BUG_ON(rw->magic != RWLOCK_MAGIC);
@@ -207,7 +174,7 @@ static inline void _raw_read_lock(rwlock
 	__build_read_lock(rw, "__read_lock_failed");
 }
 
-static inline void _raw_write_lock(rwlock_t *rw)
+static inline void __raw_write_lock(raw_rwlock_t *rw)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK
 	BUG_ON(rw->magic != RWLOCK_MAGIC);
@@ -215,10 +182,10 @@ static inline void _raw_write_lock(rwloc
 	__build_write_lock(rw, "__write_lock_failed");
 }
 
-#define _raw_read_unlock(rw)		asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
-#define _raw_write_unlock(rw)	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
+#define __raw_read_unlock(rw)		asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
+#define __raw_write_unlock(rw)	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
 
-static inline int _raw_read_trylock(rwlock_t *lock)
+static inline int __raw_read_trylock(raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	atomic_dec(count);
@@ -228,7 +195,7 @@ static inline int _raw_read_trylock(rwlo
 	return 0;
 }
 
-static inline int _raw_write_trylock(rwlock_t *lock)
+static inline int __raw_write_trylock(raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
--- linux/include/asm-i386/pgtable.h.orig
+++ linux/include/asm-i386/pgtable.h
@@ -34,7 +34,7 @@ extern unsigned long empty_zero_page[102
 extern pgd_t swapper_pg_dir[1024];
 extern kmem_cache_t *pgd_cache;
 extern kmem_cache_t *pmd_cache;
-extern spinlock_t pgd_lock;
+extern raw_spinlock_t pgd_lock;
 extern struct page *pgd_list;
 
 void pmd_ctor(void *, kmem_cache_t *, unsigned long);
--- linux/include/asm-i386/xor.h.orig
+++ linux/include/asm-i386/xor.h
@@ -862,7 +862,21 @@ static struct xor_block_template xor_blo
 #include <asm-generic/xor.h>
 
 #undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES				\
+/*
+ * MMX/SSE ops disable preemption for long periods of time,
+ * so on PREEMPT_RT use the register-based ops only:
+ */
+#ifdef CONFIG_PREEMPT_RT
+# define XOR_TRY_TEMPLATES				\
+	do {						\
+		xor_speed(&xor_block_8regs);		\
+		xor_speed(&xor_block_8regs_p);		\
+		xor_speed(&xor_block_32regs);		\
+		xor_speed(&xor_block_32regs_p);		\
+	} while (0)
+# define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST)
+#else
+# define XOR_TRY_TEMPLATES				\
 	do {						\
 		xor_speed(&xor_block_8regs);		\
 		xor_speed(&xor_block_8regs_p);		\
@@ -875,9 +889,10 @@ static struct xor_block_template xor_blo
 	                xor_speed(&xor_block_p5_mmx);	\
 	        }					\
 	} while (0)
-
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
+# define XOR_SELECT_TEMPLATE(FASTEST) \
 	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+#endif
+
--- linux/include/asm-i386/highmem.h.orig
+++ linux/include/asm-i386/highmem.h
@@ -69,14 +69,32 @@ extern void * FASTCALL(kmap_high(struct 
 extern void FASTCALL(kunmap_high(struct page *page));
 
 void *kmap(struct page *page);
+extern void kunmap_virt(void *ptr);
+extern struct page *kmap_to_page(void *ptr);
 void kunmap(struct page *page);
-char *kmap_atomic(struct page *page, enum km_type type);
-void kunmap_atomic(char *kvaddr, enum km_type type);
-char *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
-struct page *kmap_atomic_to_page(char *ptr);
+
+char *__kmap_atomic(struct page *page, enum km_type type);
+void __kunmap_atomic(char *kvaddr, enum km_type type);
+char *__kmap_atomic_pfn(unsigned long pfn, enum km_type type);
+struct page *__kmap_atomic_to_page(char *ptr);
 
 #define flush_cache_kmaps()	do { } while (0)
 
+/*
+ * on PREEMPT_RT kmap_atomic() is a wrapper that uses kmap():
+ */
+#ifdef CONFIG_PREEMPT_RT
+# define kmap_atomic(page, type)	kmap(page)
+# define kmap_atomic_pfn(pfn, type)	kmap(pfn_to_page(pfn))
+# define kunmap_atomic(kvaddr, type)	kunmap_virt(kvaddr)
+# define kmap_atomic_to_page(kvaddr)	kmap_to_page(kvaddr)
+#else
+# define kmap_atomic			__kmap_atomic
+# define kmap_atomic_pfn		__kmap_atomic_pfn
+# define kunmap_atomic			__kunmap_atomic
+# define kmap_atomic_to_page(kvaddr)	__kmap_atomic_to_page(kvaddr)
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_HIGHMEM_H */
--- linux/include/asm-i386/dma.h.orig
+++ linux/include/asm-i386/dma.h
@@ -135,7 +135,7 @@
 #define DMA_AUTOINIT	0x10
 
 
-extern spinlock_t  dma_spin_lock;
+extern spinlock_t dma_spin_lock;
 
 static __inline__ unsigned long claim_dma_lock(void)
 {
--- linux/include/asm-i386/i8253.h.orig
+++ linux/include/asm-i386/i8253.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_I8253_H__
+#define __ASM_I8253_H__
+
+extern raw_spinlock_t i8253_lock;
+
+#endif	/* __ASM_I8253_H__ */
--- linux/include/asm-i386/bug.h.orig
+++ linux/include/asm-i386/bug.h
@@ -11,10 +11,13 @@
 
 #ifdef CONFIG_DEBUG_BUGVERBOSE
 #define BUG()				\
+do {					\
+printk("BUG at %s:%d!\n", __FILE__, __LINE__); \
  __asm__ __volatile__(	"ud2\n"		\
 			"\t.word %c0\n"	\
 			"\t.long %c1\n"	\
-			 : : "i" (__LINE__), "i" (__FILE__))
+			 : : "i" (__LINE__), "i" (__FILE__)); \
+} while (0)
 #else
 #define BUG() __asm__ __volatile__("ud2\n")
 #endif
--- linux/include/asm-i386/mach-default/do_timer.h.orig
+++ linux/include/asm-i386/mach-default/do_timer.h
@@ -1,6 +1,7 @@
 /* defines for inline arch setup functions */
 
 #include <asm/apic.h>
+#include <asm/i8259.h>
 
 /**
  * do_timer_interrupt_hook - hook into timer tick
--- linux/include/asm-i386/semaphore.h.orig
+++ linux/include/asm-i386/semaphore.h
@@ -1,10 +1,12 @@
 #ifndef _I386_SEMAPHORE_H
 #define _I386_SEMAPHORE_H
 
+#include <linux/config.h>
 #include <linux/linkage.h>
 
-#ifdef __KERNEL__
-
+#ifdef CONFIG_PREEMPT_RT
+# include <linux/rt_lock.h>
+#else
 /*
  * SMP- and interrupt-safe semaphores..
  *
@@ -195,5 +197,7 @@ static inline void up(struct semaphore *
 		:"memory","ax");
 }
 
-#endif
+extern int FASTCALL(sem_is_locked(struct semaphore *sem));
+
+#endif /* CONFIG_PREEMPT_RT */
 #endif
--- linux/include/asm-i386/io_apic.h.orig
+++ linux/include/asm-i386/io_apic.h
@@ -16,11 +16,10 @@
 #ifdef CONFIG_PCI_MSI
 static inline int use_pci_vector(void)	{return 1;}
 static inline void disable_edge_ioapic_vector(unsigned int vector) { }
-static inline void mask_and_ack_level_ioapic_vector(unsigned int vector) { }
 static inline void end_edge_ioapic_vector (unsigned int vector) { }
 #define startup_level_ioapic	startup_level_ioapic_vector
 #define shutdown_level_ioapic	mask_IO_APIC_vector
-#define enable_level_ioapic	unmask_IO_APIC_vector
+#define enable_level_ioapic	enable_level_ioapic_vector
 #define disable_level_ioapic	mask_IO_APIC_vector
 #define mask_and_ack_level_ioapic mask_and_ack_level_ioapic_vector
 #define end_level_ioapic	end_level_ioapic_vector
@@ -35,11 +34,10 @@ static inline void end_edge_ioapic_vecto
 #else
 static inline int use_pci_vector(void)	{return 0;}
 static inline void disable_edge_ioapic_irq(unsigned int irq) { }
-static inline void mask_and_ack_level_ioapic_irq(unsigned int irq) { }
 static inline void end_edge_ioapic_irq (unsigned int irq) { }
 #define startup_level_ioapic	startup_level_ioapic_irq
 #define shutdown_level_ioapic	mask_IO_APIC_irq
-#define enable_level_ioapic	unmask_IO_APIC_irq
+#define enable_level_ioapic	enable_level_ioapic_irq
 #define disable_level_ioapic	mask_IO_APIC_irq
 #define mask_and_ack_level_ioapic mask_and_ack_level_ioapic_irq
 #define end_level_ioapic	end_level_ioapic_irq
--- linux/include/acpi/acpiosxf.h.orig
+++ linux/include/acpi/acpiosxf.h
@@ -59,7 +59,7 @@
 #define OSD_PRIORITY_MED            3
 #define OSD_PRIORITY_LO             4
 
-#define ACPI_NO_UNIT_LIMIT          ((u32) -1)
+#define ACPI_NO_UNIT_LIMIT          (INT_MAX/2)
 #define ACPI_MUTEX_SEM              1
 
 
--- linux/include/asm-x86_64/tlb.h.orig
+++ linux/include/asm-x86_64/tlb.h
@@ -1,13 +1,29 @@
-#ifndef TLB_H
-#define TLB_H 1
-
+#ifndef _X86_64_TLB_H
+#define _X86_64_TLB_H
 
+/*
+ * x64 doesn't need any special per-pte or
+ * per-vma handling..
+ */
 #define tlb_start_vma(tlb, vma) do { } while (0)
 #define tlb_end_vma(tlb, vma) do { } while (0)
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 
-#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+/*
+ * .. because we flush the whole mm when it
+ * fills up.
+ */
+#define tlb_flush(tlb) flush_tlb_mm(tlb_mm(tlb))
 
-#include <asm-generic/tlb.h>
+/*
+ * The mutex based kernel can preempt anytime so the per-CPU
+ * gather structures don't really fit. Fortunately TLB flushing
+ * is really simple on x64 ...
+ */
+#ifndef CONFIG_PREEMPT_RT
+# include <asm-generic/tlb.h>
+#else
+# include <asm-generic/tlb-simple.h>
+#endif
 
 #endif
--- linux/include/asm-x86_64/tlbflush.h.orig
+++ linux/include/asm-x86_64/tlbflush.h
@@ -9,11 +9,13 @@
 	do {								\
 		unsigned long tmpreg;					\
 									\
+		preempt_disable();					\
 		__asm__ __volatile__(					\
 			"movq %%cr3, %0;  # flush TLB \n"		\
 			"movq %0, %%cr3;              \n"		\
 			: "=r" (tmpreg)					\
 			:: "memory");					\
+		preempt_enable();					\
 	} while (0)
 
 /*
@@ -24,6 +26,7 @@
 	do {								\
 		unsigned long tmpreg;					\
 									\
+		preempt_disable();					\
 		__asm__ __volatile__(					\
 			"movq %1, %%cr4;  # turn off PGE     \n"	\
 			"movq %%cr3, %0;  # flush TLB        \n"	\
@@ -33,6 +36,7 @@
 			: "r" (mmu_cr4_features & ~X86_CR4_PGE),	\
 			  "r" (mmu_cr4_features)			\
 			: "memory");					\
+		preempt_enable();					\
 	} while (0)
 
 extern unsigned long pgkern_mask;
--- linux/include/asm-x86_64/system.h.orig
+++ linux/include/asm-x86_64/system.h
@@ -318,11 +318,21 @@ static inline unsigned long __cmpxchg(vo
 /* used in the idle loop; sti takes one instruction cycle to complete */
 #define safe_halt()		__asm__ __volatile__("sti; hlt": : :"memory")
 
+/*
+ * Test a flags word (as obtained via local_save_flags()): evaluates
+ * to nonzero when bit 9 is clear. The argument is parenthesized so
+ * that an expression argument is tested as a whole.
+ */
+#define irqs_disabled_flags(flags)	\
+({					\
+	!((flags) & (1<<9));		\
+})
+
 #define irqs_disabled()			\
 ({					\
 	unsigned long flags;		\
 	local_save_flags(flags);	\
-	!(flags & (1<<9));		\
+	irqs_disabled_flags(flags);	\
 })
 
 /* For spinlocks etc */
--- linux/include/asm-x86_64/proto.h.orig
+++ linux/include/asm-x86_64/proto.h
@@ -55,7 +55,7 @@ extern unsigned long end_pfn_map; 
 
 extern unsigned long cpu_initialized;
 
-extern void show_trace(unsigned long * rsp);
+extern void show_trace(struct task_struct *task, unsigned long * rsp);
 extern void show_registers(struct pt_regs *regs);
 
 extern void exception_table_check(void);
--- linux/include/asm-x86_64/percpu.h.orig
+++ linux/include/asm-x86_64/percpu.h
@@ -17,11 +17,23 @@
 /* Separate out the type, so (int[3], foo) works. */
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
-
+#define DEFINE_PER_CPU_LOCKED(type, name) \
+    __attribute__((__section__(".data.percpu"))) spinlock_t per_cpu_lock__##name##_locked = SPIN_LOCK_UNLOCKED; \
+    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name##_locked
+ 
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
 #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
 
+#define per_cpu_lock(var, cpu) \
+	(*RELOC_HIDE(&per_cpu_lock__##var##_locked, __per_cpu_offset(cpu)))
+#define per_cpu_var_locked(var, cpu) \
+		(*RELOC_HIDE(&per_cpu__##var##_locked, __per_cpu_offset(cpu)))
+#define __get_cpu_lock(var, cpu) \
+		per_cpu_lock(var, cpu)
+#define __get_cpu_var_locked(var, cpu) \
+		per_cpu_var_locked(var, cpu)
+
 /* A macro to avoid #include hell... */
 #define percpu_modcopy(pcpudst, src, size)			\
 do {								\
@@ -39,8 +51,14 @@ extern void setup_per_cpu_areas(void);
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_LOCKED(type, name) \
+	spinlock_t per_cpu_lock__##name##_locked = SPIN_LOCK_UNLOCKED; \
+	__typeof__(type) per_cpu__##name##_locked
+
 #define per_cpu(var, cpu)			(*((void)cpu, &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
+#define __get_cpu_lock(var, cpu)		per_cpu_lock__##var##_locked
+#define __get_cpu_var_locked(var, cpu)		per_cpu__##var##_locked
 
 #endif	/* SMP */
 
--- linux/include/asm-x86_64/spinlock.h.orig
+++ linux/include/asm-x86_64/spinlock.h
@@ -5,23 +5,7 @@
 #include <asm/rwlock.h>
 #include <asm/page.h>
 #include <linux/config.h>
-
-extern int printk(const char * fmt, ...)
-	__attribute__ ((format (printf, 1, 2)));
-
-/*
- * Your basic SMP spinlocks, allowing only a single CPU anywhere
- */
-
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	unsigned magic;
-#endif
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} spinlock_t;
+#include <linux/list.h>
 
 #define SPINLOCK_MAGIC	0xdead4ead
 
@@ -31,9 +15,10 @@ typedef struct {
 #define SPINLOCK_MAGIC_INIT	/* */
 #endif
 
-#define SPIN_LOCK_UNLOCKED (spinlock_t) { 1 SPINLOCK_MAGIC_INIT }
+#define __RAW_SPIN_LOCK_UNLOCKED { 1 SPINLOCK_MAGIC_INIT }
+#define RAW_SPIN_LOCK_UNLOCKED (raw_spinlock_t) { 1 SPINLOCK_MAGIC_INIT }
 
-#define spin_lock_init(x)	do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
+#define __raw_spin_lock_init(x) do { *(x) = RAW_SPIN_LOCK_UNLOCKED; } while(0)
 
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
@@ -42,8 +27,9 @@ typedef struct {
  * We make no fairness assumptions. They have a cost.
  */
 
-#define spin_is_locked(x)	(*(volatile signed char *)(&(x)->lock) <= 0)
-#define spin_unlock_wait(x)	do { barrier(); } while(spin_is_locked(x))
+#define __raw_spin_is_locked(x) (*(volatile signed char *)(&(x)->lock) <= 0)
+#define __raw_spin_unlock_wait(x) \
+		do { barrier(); } while (__raw_spin_is_locked(x))
 
 #define spin_lock_string \
 	"\n1:\t" \
@@ -85,11 +71,11 @@ typedef struct {
 		:"=m" (lock->lock) : : "memory"
 
 
-static inline void _raw_spin_unlock(spinlock_t *lock)
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK
 	BUG_ON(lock->magic != SPINLOCK_MAGIC);
-	BUG_ON(!spin_is_locked(lock));
+	BUG_ON(!__raw_spin_is_locked(lock));
 #endif
 	__asm__ __volatile__(
 		spin_unlock_string
@@ -103,12 +89,12 @@ static inline void _raw_spin_unlock(spin
 		:"=q" (oldval), "=m" (lock->lock) \
 		:"0" (oldval) : "memory"
 
-static inline void _raw_spin_unlock(spinlock_t *lock)
+static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
 	char oldval = 1;
 #ifdef CONFIG_DEBUG_SPINLOCK
 	BUG_ON(lock->magic != SPINLOCK_MAGIC);
-	BUG_ON(!spin_is_locked(lock));
+	BUG_ON(!__raw_spin_is_locked(lock));
 #endif
 	__asm__ __volatile__(
 		spin_unlock_string
@@ -117,7 +103,7 @@ static inline void _raw_spin_unlock(spin
 
 #endif
 
-static inline int _raw_spin_trylock(spinlock_t *lock)
+static inline int __raw_spin_trylock(raw_spinlock_t *lock)
 {
 	char oldval;
 	__asm__ __volatile__(
@@ -127,7 +113,7 @@ static inline int _raw_spin_trylock(spin
 	return oldval > 0;
 }
 
-static inline void _raw_spin_lock(spinlock_t *lock)
+static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK
 	if (lock->magic != SPINLOCK_MAGIC) {
@@ -140,7 +126,7 @@ static inline void _raw_spin_lock(spinlo
 		:"=m" (lock->lock) : : "memory");
 }
 
-static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags)
+static inline void __raw_spin_lock_flags (raw_spinlock_t *lock, unsigned long flags)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK
 	__label__ here;
@@ -154,27 +140,6 @@ here:
 		:"=m" (lock->lock) : "r" (flags) : "memory");
 }
 
-
-/*
- * Read-write spinlocks, allowing multiple readers
- * but only one writer.
- *
- * NOTE! it is quite common to have readers in interrupts
- * but no interrupt writers. For those circumstances we
- * can "mix" irq-safe locks - any writer needs to get a
- * irq-safe write-lock, but readers can get non-irqsafe
- * read-locks.
- */
-typedef struct {
-	volatile unsigned int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
-	unsigned magic;
-#endif
-#ifdef CONFIG_PREEMPT
-	unsigned int break_lock;
-#endif
-} rwlock_t;
-
 #define RWLOCK_MAGIC	0xdeaf1eed
 
 #ifdef CONFIG_DEBUG_SPINLOCK
@@ -183,11 +148,12 @@ typedef struct {
 #define RWLOCK_MAGIC_INIT	/* */
 #endif
 
-#define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT }
+#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS RWLOCK_MAGIC_INIT }
+#define RAW_RW_LOCK_UNLOCKED (raw_rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT }
 
-#define rwlock_init(x)	do { *(x) = RW_LOCK_UNLOCKED; } while(0)
+#define __raw_rwlock_init(x)	do { *(x) = RAW_RW_LOCK_UNLOCKED; } while(0)
 
-#define rwlock_is_locked(x) ((x)->lock != RW_LOCK_BIAS)
+#define __raw_rwlock_is_locked(x)	((x)->lock != RW_LOCK_BIAS)
 
 /*
  * On x86, we implement read-write locks as a 32-bit counter
@@ -200,7 +166,7 @@ typedef struct {
  */
 /* the spinlock helpers are in arch/i386/kernel/semaphore.c */
 
-static inline void _raw_read_lock(rwlock_t *rw)
+static inline void __raw_read_lock(raw_rwlock_t *rw)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK
 	BUG_ON(rw->magic != RWLOCK_MAGIC);
@@ -208,7 +174,7 @@ static inline void _raw_read_lock(rwlock
 	__build_read_lock(rw, "__read_lock_failed");
 }
 
-static inline void _raw_write_lock(rwlock_t *rw)
+static inline void __raw_write_lock(raw_rwlock_t *rw)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK
 	BUG_ON(rw->magic != RWLOCK_MAGIC);
@@ -216,10 +182,10 @@ static inline void _raw_write_lock(rwloc
 	__build_write_lock(rw, "__write_lock_failed");
 }
 
-#define _raw_read_unlock(rw)		asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
-#define _raw_write_unlock(rw)	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
+#define __raw_read_unlock(rw)		asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory")
+#define __raw_write_unlock(rw)	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory")
 
-static inline int _raw_read_trylock(rwlock_t *lock)
+static inline int __raw_read_trylock(raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	atomic_dec(count);
@@ -229,7 +195,7 @@ static inline int _raw_read_trylock(rwlo
 	return 0;
 }
 
-static inline int _raw_write_trylock(rwlock_t *lock)
+static inline int __raw_write_trylock(raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
--- linux/include/asm-x86_64/vsyscall.h.orig
+++ linux/include/asm-x86_64/vsyscall.h
@@ -45,14 +45,14 @@ extern struct timespec __xtime;
 extern volatile unsigned long __jiffies;
 extern unsigned long __wall_jiffies;
 extern struct timezone __sys_tz;
-extern seqlock_t __xtime_lock;
+extern raw_seqlock_t __xtime_lock;
 
 /* kernel space (writeable) */
 extern struct vxtime_data vxtime;
 extern unsigned long wall_jiffies;
 extern struct timezone sys_tz;
 extern int sysctl_vsyscall;
-extern seqlock_t xtime_lock;
+extern raw_seqlock_t xtime_lock;
 
 #define ARCH_HAVE_XTIME_LOCK 1
 
--- linux/include/asm-x86_64/semaphore.h.orig
+++ linux/include/asm-x86_64/semaphore.h
@@ -1,10 +1,15 @@
 #ifndef _X86_64_SEMAPHORE_H
 #define _X86_64_SEMAPHORE_H
 
+#include <linux/config.h>
 #include <linux/linkage.h>
 
 #ifdef __KERNEL__
 
+#ifdef CONFIG_PREEMPT_RT
+# include <linux/rt_lock.h>
+#else
+
 /*
  * SMP- and interrupt-safe semaphores..
  *
@@ -192,5 +197,6 @@ static inline void up(struct semaphore *
 		:"D" (sem)
 		:"memory");
 }
+#endif /* CONFIG_PREEMPT_RT */
 #endif /* __KERNEL__ */
 #endif
--- linux/include/asm-x86_64/io_apic.h.orig
+++ linux/include/asm-x86_64/io_apic.h
@@ -16,11 +16,10 @@
 #ifdef CONFIG_PCI_MSI
 static inline int use_pci_vector(void)	{return 1;}
 static inline void disable_edge_ioapic_vector(unsigned int vector) { }
-static inline void mask_and_ack_level_ioapic_vector(unsigned int vector) { }
 static inline void end_edge_ioapic_vector (unsigned int vector) { }
 #define startup_level_ioapic	startup_level_ioapic_vector
 #define shutdown_level_ioapic	mask_IO_APIC_vector
-#define enable_level_ioapic	unmask_IO_APIC_vector
+#define enable_level_ioapic	enable_level_ioapic_vector
 #define disable_level_ioapic	mask_IO_APIC_vector
 #define mask_and_ack_level_ioapic mask_and_ack_level_ioapic_vector
 #define end_level_ioapic	end_level_ioapic_vector
@@ -35,11 +34,10 @@ static inline void end_edge_ioapic_vecto
 #else
 static inline int use_pci_vector(void)	{return 0;}
 static inline void disable_edge_ioapic_irq(unsigned int irq) { }
-static inline void mask_and_ack_level_ioapic_irq(unsigned int irq) { }
 static inline void end_edge_ioapic_irq (unsigned int irq) { }
 #define startup_level_ioapic	startup_level_ioapic_irq
 #define shutdown_level_ioapic	mask_IO_APIC_irq
-#define enable_level_ioapic	unmask_IO_APIC_irq
+#define enable_level_ioapic	enable_level_ioapic_irq
 #define disable_level_ioapic	mask_IO_APIC_irq
 #define mask_and_ack_level_ioapic mask_and_ack_level_ioapic_irq
 #define end_level_ioapic	end_level_ioapic_irq
@@ -218,4 +216,6 @@ extern int assign_irq_vector(int irq);
 
 void enable_NMI_through_LVT0 (void * dummy);
 
+extern raw_spinlock_t i8259A_lock;
+
 #endif
--- linux/security/selinux/avc.c.orig
+++ linux/security/selinux/avc.c
@@ -125,6 +125,8 @@ unsigned int avc_cache_threshold = AVC_D
 DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats) = { 0 };
 #endif
 
+static DEFINE_SPINLOCK(avc_lock);
+
 static struct avc_cache avc_cache;
 static struct avc_callback_node *avc_callbacks;
 static kmem_cache_t *avc_node_cachep;
@@ -243,7 +245,7 @@ int avc_get_hash_stats(char *page)
 	int i, chain_len, max_chain_len, slots_used;
 	struct avc_node *node;
 
-	rcu_read_lock();
+	rcu_read_lock_spin(&avc_lock);
 
 	slots_used = 0;
 	max_chain_len = 0;
@@ -258,7 +260,7 @@ int avc_get_hash_stats(char *page)
 		}
 	}
 
-	rcu_read_unlock();
+	rcu_read_unlock_spin(&avc_lock);
 
 	return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n"
 			 "longest chain: %d\n",
@@ -827,7 +829,7 @@ static int avc_update_cache(u32 event, u
 	struct avc_node *node;
 	int i;
 
-	rcu_read_lock();
+	rcu_read_lock_spin(&avc_lock);
 
 	if (ssid == SECSID_WILD || tsid == SECSID_WILD) {
 		/* apply to all matching nodes */
@@ -846,7 +848,7 @@ static int avc_update_cache(u32 event, u
 		avc_update_node(event, perms, ssid, tsid, tclass);
 	}
 
-	rcu_read_unlock();
+	rcu_read_unlock_spin(&avc_lock);
 
 	return 0;
 }
@@ -1051,15 +1053,15 @@ int avc_has_perm_noaudit(u32 ssid, u32 t
 	int rc = 0;
 	u32 denied;
 
-	rcu_read_lock();
+	rcu_read_lock_spin(&avc_lock);
 
 	node = avc_lookup(ssid, tsid, tclass, requested);
 	if (!node) {
-		rcu_read_unlock();
+		rcu_read_unlock_spin(&avc_lock);
 		rc = security_compute_av(ssid,tsid,tclass,requested,&entry.avd);
 		if (rc)
 			goto out;
-		rcu_read_lock();
+		rcu_read_lock_spin(&avc_lock);
 		node = avc_insert(ssid,tsid,tclass,&entry);
 	}
 
@@ -1079,7 +1081,7 @@ int avc_has_perm_noaudit(u32 ssid, u32 t
 						ssid,tsid,tclass);
 	}
 
-	rcu_read_unlock();
+	rcu_read_unlock_spin(&avc_lock);
 out:
 	return rc;
 }
--- linux/security/selinux/netif.c.orig
+++ linux/security/selinux/netif.c
@@ -45,7 +45,7 @@ struct sel_netif
 
 static u32 sel_netif_total;
 static LIST_HEAD(sel_netif_list);
-static spinlock_t sel_netif_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(sel_netif_lock);
 static struct list_head sel_netif_hash[SEL_NETIF_HASH_SIZE];
 
 static inline u32 sel_netif_hasfn(struct net_device *dev)
@@ -132,17 +132,17 @@ static struct sel_netif *sel_netif_looku
 
 	nsec->dev = dev;
 	
-	spin_lock_bh(&sel_netif_lock);
+	spin_lock_bh_nort(&sel_netif_lock);
 	
 	netif = sel_netif_find(dev);
 	if (netif) {
-		spin_unlock_bh(&sel_netif_lock);
+		spin_unlock_bh_nort(&sel_netif_lock);
 		kfree(new);
 		goto out;
 	}
 	
 	ret = sel_netif_insert(new);
-	spin_unlock_bh(&sel_netif_lock);
+	spin_unlock_bh_nort(&sel_netif_lock);
 	
 	if (ret) {
 		kfree(new);
@@ -182,15 +182,15 @@ int sel_netif_sids(struct net_device *de
 	int ret = 0;
 	struct sel_netif *netif;
 
-	rcu_read_lock();
+	rcu_read_lock_spin(&sel_netif_lock);
 	netif = sel_netif_lookup(dev);
 	if (IS_ERR(netif)) {
-		rcu_read_unlock();
+		rcu_read_unlock_spin(&sel_netif_lock);
 		ret = sel_netif_sids_slow(dev, if_sid, msg_sid);
 		goto out;
 	}
 	sel_netif_assign_sids(netif->nsec.if_sid, netif->nsec.msg_sid, if_sid, msg_sid);
-	rcu_read_unlock();
+	rcu_read_unlock_spin(&sel_netif_lock);
 out:
 	return ret;
 }
--- linux/Makefile.orig
+++ linux/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 10
-EXTRAVERSION =-rc3-mm1
+EXTRAVERSION =-rc3-mm1-V0.7.33-04
 NAME=Woozy Numbat
 
 # *DOCUMENTATION*
@@ -517,10 +517,14 @@ CFLAGS		+= $(call add-align,CONFIG_CC_AL
 CFLAGS		+= $(call add-align,CONFIG_CC_ALIGN_LOOPS,-loops)
 CFLAGS		+= $(call add-align,CONFIG_CC_ALIGN_JUMPS,-jumps)
 
-ifdef CONFIG_FRAME_POINTER
-CFLAGS		+= -fno-omit-frame-pointer
+ifdef CONFIG_MCOUNT
+CFLAGS		+= -pg -fno-omit-frame-pointer
 else
-CFLAGS		+= -fomit-frame-pointer
+ ifdef CONFIG_FRAME_POINTER
+ CFLAGS		+= -fno-omit-frame-pointer
+ else
+ CFLAGS		+= -fomit-frame-pointer
+ endif
 endif
 
 ifdef CONFIG_DEBUG_INFO
--- linux/lib/dec_and_lock.c.orig
+++ linux/lib/dec_and_lock.c
@@ -27,14 +27,14 @@
  */
 
 #ifndef ATOMIC_DEC_AND_LOCK
-int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
+int _atomic_dec_and_raw_spin_lock(atomic_t *atomic, raw_spinlock_t *lock)
 {
-	spin_lock(lock);
+	_raw_spin_lock(lock);
 	if (atomic_dec_and_test(atomic))
 		return 1;
-	spin_unlock(lock);
+	_raw_spin_unlock(lock);
 	return 0;
 }
 
-EXPORT_SYMBOL(_atomic_dec_and_lock);
+EXPORT_SYMBOL(_atomic_dec_and_raw_spin_lock);
 #endif
--- linux/lib/radix-tree.c.orig
+++ linux/lib/radix-tree.c
@@ -103,6 +103,8 @@ radix_tree_node_free(struct radix_tree_n
 	kmem_cache_free(radix_tree_node_cachep, node);
 }
 
+#ifndef CONFIG_PREEMPT_RT
+
 /*
  * Load up this CPU's radix_tree_node buffer with sufficient objects to
  * ensure that the addition of a single element in the tree cannot fail.  On
@@ -135,6 +137,8 @@ out:
 }
 EXPORT_SYMBOL(radix_tree_preload);
 
+#endif
+
 static inline void tag_set(struct radix_tree_node *node, int tag, int offset)
 {
 	if (!test_bit(offset, &node->tags[tag][0]))
--- linux/lib/inflate.c.orig
+++ linux/lib/inflate.c
@@ -304,7 +304,7 @@ STATIC int INIT huft_build(
   register struct huft *q;      /* points to current table */
   struct huft r;                /* table entry for structure assignment */
   struct huft *u[BMAX];         /* table stack */
-  unsigned v[N_MAX];            /* values in order of bit length */
+  unsigned *v;                  /* values in order of bit length */
   register int w;               /* bits before this table == (l * h) */
   unsigned x[BMAX+1];           /* bit offsets, then code stack */
   unsigned *xp;                 /* pointer into x */
@@ -313,6 +313,10 @@ STATIC int INIT huft_build(
 
 DEBG("huft1 ");
 
+  /* allocate new table */
+  v = (unsigned *)malloc(sizeof(unsigned)*N_MAX);
+  if (!v)
+    return 3;             /* not enough memory */
   /* Generate counts for each bit length */
   memzero(c, sizeof(c));
   p = b;  i = n;
@@ -326,6 +330,7 @@ DEBG("huft1 ");
   {
     *t = (struct huft *)NULL;
     *m = 0;
+    free(v);
     return 0;
   }
 
@@ -351,10 +356,14 @@ DEBG("huft3 ");
 
   /* Adjust last length count to fill out codes, if needed */
   for (y = 1 << j; j < i; j++, y <<= 1)
-    if ((y -= c[j]) < 0)
+    if ((y -= c[j]) < 0) {
+      free(v);
       return 2;                 /* bad input: more codes than bits */
-  if ((y -= c[i]) < 0)
+    }
+  if ((y -= c[i]) < 0) {
+    free(v);
     return 2;
+  }
   c[i] += y;
 
 DEBG("huft4 ");
@@ -426,6 +435,7 @@ DEBG1("3 ");
         {
           if (h)
             huft_free(u[0]);
+          free(v);
           return 3;             /* not enough memory */
         }
 DEBG1("4 ");
@@ -489,6 +499,7 @@ DEBG("h6f ");
 
 DEBG("huft7 ");
 
+  free(v);
   /* Return true (1) if we were given an incomplete table */
   return y != 0 && g != 1;
 }
--- linux/lib/kernel_lock.c.orig
+++ linux/lib/kernel_lock.c
@@ -15,7 +15,7 @@
 /*
  * Debugging check.
  */
-unsigned int smp_processor_id(void)
+unsigned int notrace smp_processor_id(void)
 {
 	unsigned long preempt_count = preempt_count();
 	int this_cpu = __smp_processor_id();
@@ -90,22 +90,25 @@ DECLARE_MUTEX(kernel_sem);
  * about recursion, both due to the down() and due to the enabling of
  * preemption. schedule() will re-check the preemption flag after
  * reacquiring the semaphore.
+ *
+ * Called with interrupts disabled.
  */
 int __lockfunc __reacquire_kernel_lock(void)
 {
 	struct task_struct *task = current;
 	int saved_lock_depth = task->lock_depth;
 
+	local_irq_enable();
 	BUG_ON(saved_lock_depth < 0);
 
 	task->lock_depth = -1;
-	preempt_enable_no_resched();
 
 	down(&kernel_sem);
 
-	preempt_disable();
 	task->lock_depth = saved_lock_depth;
 
+	local_irq_disable();
+
 	return 0;
 }
 
@@ -122,11 +125,15 @@ void __lockfunc lock_kernel(void)
 	struct task_struct *task = current;
 	int depth = task->lock_depth + 1;
 
-	if (likely(!depth))
+	if (likely(!depth)) {
 		/*
 		 * No recursion worries - we set up lock_depth _after_
 		 */
 		down(&kernel_sem);
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+		current->last_kernel_lock = __builtin_return_address(0);
+#endif
+	}
 
 	task->lock_depth = depth;
 }
@@ -137,8 +144,12 @@ void __lockfunc unlock_kernel(void)
 
 	BUG_ON(task->lock_depth < 0);
 
-	if (likely(--task->lock_depth < 0))
+	if (likely(--task->lock_depth == -1)) {
+#ifdef CONFIG_RT_DEADLOCK_DETECT
+		current->last_kernel_lock = NULL;
+#endif
 		up(&kernel_sem);
+	}
 }
 
 #else
@@ -153,7 +164,7 @@ void __lockfunc unlock_kernel(void)
  *
  * Don't use in new code.
  */
-static spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(kernel_flag);
 
 
 /*
@@ -171,38 +182,46 @@ static spinlock_t kernel_flag __cachelin
  */
 int __lockfunc __reacquire_kernel_lock(void)
 {
-	while (!_raw_spin_trylock(&kernel_flag)) {
-		if (test_thread_flag(TIF_NEED_RESCHED))
-			return -EAGAIN;
+	local_irq_enable();
+	while (!__raw_spin_trylock(&kernel_flag)) {
+		if (test_thread_flag(TIF_NEED_RESCHED)) {
+			/*
+			 * Bail out in the same irqs-off state as
+			 * the success path below.
+			 */
+			local_irq_disable();
+			return -EAGAIN;
+		}
 		cpu_relax();
 	}
+	local_irq_disable();
 	preempt_disable();
 	return 0;
 }
 
 void __lockfunc __release_kernel_lock(void)
 {
-	_raw_spin_unlock(&kernel_flag);
+	__raw_spin_unlock(&kernel_flag);
 	preempt_enable_no_resched();
 }
 
 /*
  * These are the BKL spinlocks - we try to be polite about preemption. 
  * If SMP is not on (ie UP preemption), this all goes away because the
- * _raw_spin_trylock() will always succeed.
+ * __raw_spin_trylock() will always succeed.
  */
 #ifdef CONFIG_PREEMPT
 static inline void __lock_kernel(void)
 {
 	preempt_disable();
-	if (unlikely(!_raw_spin_trylock(&kernel_flag))) {
+	if (unlikely(!__raw_spin_trylock(&kernel_flag))) {
 		/*
 		 * If preemption was disabled even before this
 		 * was called, there's nothing we can be polite
 		 * about - just spin.
 		 */
 		if (preempt_count() > 1) {
-			_raw_spin_lock(&kernel_flag);
+			__raw_spin_lock(&kernel_flag);
 			return;
 		}
 
@@ -215,7 +234,7 @@ static inline void __lock_kernel(void)
 			while (spin_is_locked(&kernel_flag))
 				cpu_relax();
 			preempt_disable();
-		} while (!_raw_spin_trylock(&kernel_flag));
+		} while (!__raw_spin_trylock(&kernel_flag));
 	}
 }
 
@@ -226,13 +245,13 @@ static inline void __lock_kernel(void)
  */
 static inline void __lock_kernel(void)
 {
-	_raw_spin_lock(&kernel_flag);
+	__raw_spin_lock(&kernel_flag);
 }
 #endif
 
 static inline void __unlock_kernel(void)
 {
-	_raw_spin_unlock(&kernel_flag);
+	__raw_spin_unlock(&kernel_flag);
 	preempt_enable();
 }
 
--- linux/lib/Kconfig.RT.orig
+++ linux/lib/Kconfig.RT
@@ -0,0 +1,139 @@
+
+choice
+	prompt "Preemption Mode"
+	default PREEMPT_RT
+
+config PREEMPT_NONE
+	bool "No Forced Preemption (Server)"
+	help
+	  This is the traditional Linux preemption model geared towards
+	  throughput. It will still provide good latencies most of the
+	  time but there are no guarantees and occasional long delays
+	  are possible.
+
+	  Select this option if you are building a kernel for a server or
+	  scientific/computation system, or if you want to maximize the
+	  raw processing power of the kernel, irrespective of scheduling
+	  latencies.
+
+config PREEMPT_VOLUNTARY
+	bool "Voluntary Kernel Preemption (Desktop)"
+	help
+	  This option reduces the latency of the kernel by adding more
+	  "explicit preemption points" to the kernel code. These new
+	  preemption points have been selected to minimize the maximum
+	  latency of rescheduling, providing faster application reactions,
+	  at the cost of slightly lower throughput.
+
+	  This allows reaction to interactive events by allowing a
+	  low priority process to voluntarily preempt itself even if it
+	  is in kernel mode executing a system call. This allows
+	  applications to run more 'smoothly' even when the system is
+	  under load.
+
+	  Select this if you are building a kernel for a desktop system.
+
+config PREEMPT_DESKTOP
+	bool "Preemptible Kernel (Low-Latency Desktop)"
+	help
+	  This option reduces the latency of the kernel by making
+	  all kernel code that is not executing in a critical section
+	  preemptible.  This allows reaction to interactive events by
+	  permitting a low priority process to be preempted involuntarily
+	  even if it is in kernel mode executing a system call and would
+	  otherwise not be about to reach a preemption point.  This allows
+	  applications to run more 'smoothly' even when the system is
+	  under load, at the cost of slightly lower throughput and a
+	  slight runtime overhead to kernel code.
+
+	  (According to profiles, when this mode is selected then even
+	  during kernel-intense workloads the system is in an immediately
+	  preemptible state more than 50% of the time.)
+
+	  Select this if you are building a kernel for a desktop or
+	  embedded system with latency requirements in the milliseconds
+	  range.
+
+config PREEMPT_RT
+	bool "Complete Preemption (Real-Time)"
+	select PREEMPT_SOFTIRQS
+	select PREEMPT_HARDIRQS
+	help
+	  This option further reduces the scheduling latency of the
+	  kernel by replacing almost every spinlock used by the kernel
+	  with preemptible mutexes and thus making all but the most
+	  critical kernel code involuntarily preemptible. The remaining
+	  handful of lowlevel non-preemptible codepaths are short and
+	  have a deterministic latency of a couple of tens of
+	  microseconds (depending on the hardware).  This also allows
+	  applications to run more 'smoothly' even when the system is
+	  under load, at the cost of lower throughput and runtime
+	  overhead to kernel code.
+
+	  (According to profiles, when this mode is selected then even
+	  during kernel-intense workloads the system is in an immediately
+	  preemptible state more than 95% of the time.)
+
+	  Select this if you are building a kernel for a desktop,
+	  embedded or real-time system with guaranteed latency
+	  requirements of 100 usecs or lower.
+
+endchoice
+
+config PREEMPT
+	bool
+	default y
+	depends on PREEMPT_DESKTOP || PREEMPT_RT
+
+config PREEMPT_SOFTIRQS
+	bool "Thread Softirqs"
+	default n
+#	depends on PREEMPT
+	help
+	  This option reduces the latency of the kernel by 'threading'
+          soft interrupts. This means that all softirqs will execute
+          in softirqd's context. While this helps latency, it can also
+          reduce performance.
+
+          The threading of softirqs can also be controlled via
+          /proc/sys/kernel/softirq_preemption runtime flag and the
+          softirq-preempt=0/1 boot-time option.
+
+	  Say N if you are unsure.
+
+config PREEMPT_HARDIRQS
+	bool "Thread Hardirqs"
+	default n
+#	depends on PREEMPT
+	help
+	  This option reduces the latency of the kernel by 'threading'
+          hardirqs. This means that all (or selected) hardirqs will run
+          in their own kernel thread context. While this helps latency,
+          this feature can also reduce performance.
+
+          The threading of hardirqs can also be controlled via the
+          /proc/sys/kernel/hardirq_preemption runtime flag and the
+          hardirq-preempt=0/1 boot-time option. Per-irq threading can
+          be enabled/disabled via the /proc/irq/<IRQ>/<handler>/threaded
+          runtime flags.
+
+	  Say N if you are unsure.
+
+config SPINLOCK_BKL
+	bool "Old-Style Big Kernel Lock"
+	depends on (PREEMPT || SMP) && !PREEMPT_RT
+	default n
+	help
+	  This option increases the latency of the kernel by making the
+	  big kernel lock spinlock-based (which is bad for latency).
+	  However, enable this option if you see any problems to revert
+	  back to the traditional spinlock BKL design.
+
+	  Say Y here if you are building a kernel for a desktop system.
+	  Say N if you are unsure.
+
+config PREEMPT_BKL
+	bool
+	depends on PREEMPT_RT || !SPINLOCK_BKL
+	default y
+
--- linux/arch/i386/Kconfig.debug.orig
+++ linux/arch/i386/Kconfig.debug
@@ -18,6 +18,7 @@ config EARLY_PRINTK
 config DEBUG_STACKOVERFLOW
 	bool "Check for stack overflows"
 	depends on DEBUG_KERNEL
+	default y
 
 config KPROBES
 	bool "Kprobes"
@@ -32,6 +33,7 @@ config KPROBES
 config DEBUG_STACK_USAGE
 	bool "Stack utilization instrumentation"
 	depends on DEBUG_KERNEL
+	default y
 	help
 	  Enables the display of the minimum amount of free stack which each
 	  task has ever had available in the sysrq-T and sysrq-P debug output.
--- linux/lib/Kconfig.debug.orig
+++ linux/lib/Kconfig.debug
@@ -56,11 +56,14 @@ config DEBUG_PREEMPT
 	  If you say Y here then the kernel will use a debug variant of the
 	  commonly used smp_processor_id() function and will print warnings
 	  if kernel code uses it in a preemption-unsafe way. Also, the kernel
-	  will detect preemption count underflows.
+	  will detect preemption count underflows and will trace critical
+	  section entries and print that info when an illegal sleep happens.
 
+# broken, disable for now
 config DEBUG_SPINLOCK
 	bool "Spinlock debugging"
-	depends on DEBUG_KERNEL && (ALPHA || ARM || X86 || IA64 || M32R || MIPS || PARISC || PPC32 || (SUPERH && !SUPERH64) || SPARC32 || SPARC64 || USERMODE || X86_64)
+	depends on 0 && (DEBUG_KERNEL && (ALPHA || ARM || X86 || IA64 || M32R || MIPS || PARISC || PPC32 || (SUPERH && !SUPERH64) || SPARC32 || SPARC64 || USERMODE || X86_64))
+	default n
 	help
 	  Say Y here and build SMP to catch missing spinlock initialization
 	  and certain other kinds of spinlock errors commonly made.  This is
@@ -69,11 +72,101 @@ config DEBUG_SPINLOCK
 
 config DEBUG_SPINLOCK_SLEEP
 	bool "Sleep-inside-spinlock checking"
-	depends on DEBUG_KERNEL && (X86 || IA64 || M32R || MIPS || PPC32 || PPC64 || ARCH_S390 || SPARC32 || SPARC64 || USERMODE)
+	depends on DEBUG_KERNEL && !DEBUG_PREEMPT && (X86 || IA64 || M32R || MIPS || PPC32 || PPC64 || ARCH_S390 || SPARC32 || SPARC64 || USERMODE)
 	help
 	  If you say Y here, various routines which may sleep will become very
 	  noisy if they are called with a spinlock held.
 
+config WAKEUP_TIMING
+	bool "Wakeup latency timing"
+	default y
+	help
+	  This option measures the time spent from a highprio thread being
+	  woken up to it getting scheduled on a CPU, with microsecond
+	  accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be (re-)started at runtime via:
+
+	      echo 0 > /proc/sys/kernel/preempt_max_latency
+
+config PREEMPT_TRACE
+	bool
+	default y
+	depends on DEBUG_PREEMPT
+
+config CRITICAL_PREEMPT_TIMING
+	bool "Non-preemptible critical section latency timing"
+	default n
+	depends on PREEMPT
+	help
+	  This option measures the time spent in preempt-off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be (re-)started at runtime via:
+
+	      echo 0 > /proc/sys/kernel/preempt_max_latency
+
+	  (Note that kernel size and overhead increase with this option
+	  enabled. This option and the irqs-off timing option can be
+	  used together or separately.)
+
+config CRITICAL_IRQSOFF_TIMING
+	bool "Interrupts-off critical section latency timing"
+	default n
+	help
+	  This option measures the time spent in irqs-off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be (re-)started at runtime via:
+
+	      echo 0 > /proc/sys/kernel/preempt_max_latency
+
+	  (Note that kernel size and overhead increase with this option
+	  enabled. This option and the preempt-off timing option can be
+	  used together or separately.)
+
+config CRITICAL_TIMING
+	bool
+	default y
+	depends on CRITICAL_PREEMPT_TIMING || CRITICAL_IRQSOFF_TIMING
+
+config LATENCY_TIMING
+	bool
+	default y
+	depends on WAKEUP_TIMING || CRITICAL_TIMING
+
+config LATENCY_TRACE
+	bool "Latency tracing"
+	default n
+	depends on LATENCY_TIMING
+	help
+	  This option enables a kernel tracing mechanism that will track
+	  precise function-call granularity kernel execution during
+	  wakeup paths or critical sections.  When this option is enabled
+	  then the last maximum latency timing event's full trace can be
+	  found in /proc/latency_trace, in a human-readable (or rather as
+	  some would say, in a kernel-developer-readable) form.
+
+	  (Note that kernel size and overhead increase noticeably
+	  with this option enabled.)
+
+config MCOUNT
+	bool
+	depends on LATENCY_TRACE
+	default y
+
+config RT_DEADLOCK_DETECT
+	bool "Automatic mutex/rwsem deadlock detection"
+	depends on PREEMPT_RT
+	default y
+	help
+	  This allows semaphores, rw-semaphores, and spinlock/rwlock
+	  mutexes to be traced for purposes of automatic deadlock
+	  detection.
+
 config DEBUG_KOBJECT
 	bool "kobject debugging"
 	depends on DEBUG_KERNEL
@@ -119,12 +212,17 @@ config DEBUG_INFO
         bugs back to the UML developers, say N, otherwise say Y.
 
 if !X86_64
-config FRAME_POINTER
+config USE_FRAME_POINTER
 	bool "Compile the kernel with frame pointers"
-	depends on X86 || CRIS || M68KNOMMU
+	depends on (X86 || CRIS || M68KNOMMU) && !MCOUNT
 	help
 	  If you say Y here the resulting kernel image will be slightly larger
 	  and slower, but it will give very useful debugging information.
 	  If you don't debug the kernel, you can say N, but we may not be able
 	  to solve problems without frame pointers.
 endif
+
+config FRAME_POINTER
+	bool
+	depends on USE_FRAME_POINTER || MCOUNT
+	default y