--- linux/net/core/pktgen.c.orig +++ linux/net/core/pktgen.c @@ -268,7 +268,7 @@ static struct net_device *setup_inject(s if (strlen(info->src_min) == 0) { struct in_device *in_dev; - rcu_read_lock(); + rcu_read_lock_down_read(&rtnl_sem); in_dev = __in_dev_get(odev); if (in_dev) { if (in_dev->ifa_list) { @@ -276,7 +276,7 @@ static struct net_device *setup_inject(s info->saddr_max = info->saddr_min; } } - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); } else { info->saddr_min = in_aton(info->src_min); --- linux/net/core/netfilter.c.orig +++ linux/net/core/netfilter.c @@ -47,7 +47,7 @@ static DECLARE_MUTEX(nf_sockopt_mutex); struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; static LIST_HEAD(nf_sockopts); -static spinlock_t nf_hook_lock = SPIN_LOCK_UNLOCKED; +static rwlock_t nf_hook_lock = RW_LOCK_UNLOCKED; /* * A queue handler may be registered for each protocol. Each is protected by @@ -64,13 +64,13 @@ int nf_register_hook(struct nf_hook_ops { struct list_head *i; - spin_lock_bh(&nf_hook_lock); + write_lock_bh(&nf_hook_lock); list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { if (reg->priority < ((struct nf_hook_ops *)i)->priority) break; } list_add_rcu(&reg->list, i->prev); - spin_unlock_bh(&nf_hook_lock); + write_unlock_bh(&nf_hook_lock); synchronize_net(); return 0; @@ -78,9 +78,9 @@ int nf_register_hook(struct nf_hook_ops void nf_unregister_hook(struct nf_hook_ops *reg) { - spin_lock_bh(&nf_hook_lock); + write_lock_bh(&nf_hook_lock); list_del_rcu(&reg->list); - spin_unlock_bh(&nf_hook_lock); + write_unlock_bh(&nf_hook_lock); synchronize_net(); } @@ -504,8 +504,15 @@ int nf_hook_slow(int pf, unsigned int ho unsigned int verdict; int ret = 0; + /* + * PREEMPT_RT semantics: different-type read-locks + * dont nest that easily: + */ +// rcu_read_lock_read(&ptype_lock); + /* We may already have this, but read-locks nest anyway */ - rcu_read_lock(); + // FIXME, HACK: complex locking dependencies here ...
+// rcu_read_lock_read(&nf_hook_lock); #ifdef CONFIG_NETFILTER_DEBUG if (skb->nf_debug & (1 << hook)) { @@ -536,7 +543,9 @@ int nf_hook_slow(int pf, unsigned int ho break; } - rcu_read_unlock(); +// rcu_read_unlock_read(&nf_hook_lock); +// rcu_read_unlock_read(&ptype_lock); + return ret; } @@ -546,7 +555,8 @@ void nf_reinject(struct sk_buff *skb, st struct list_head *elem = &info->elem->list; struct list_head *i; - rcu_read_lock(); +// rcu_read_lock_read(&ptype_lock); +// rcu_read_lock_read(&nf_hook_lock); /* Release those devices we held, or Alexey will kill me. */ if (info->indev) dev_put(info->indev); @@ -600,7 +610,8 @@ void nf_reinject(struct sk_buff *skb, st goto next_hook; break; } - rcu_read_unlock(); +// rcu_read_unlock_read(&nf_hook_lock); +// rcu_read_unlock_read(&ptype_lock); if (verdict == NF_DROP) kfree_skb(skb); @@ -744,7 +755,7 @@ EXPORT_SYMBOL(skb_ip_make_writable); static nf_logfn *nf_logging[NPROTO]; /* = NULL */ static int reported = 0; -static spinlock_t nf_log_lock = SPIN_LOCK_UNLOCKED; +static rwlock_t nf_log_lock = RW_LOCK_UNLOCKED; int nf_log_register(int pf, nf_logfn *logfn) { @@ -752,21 +763,21 @@ int nf_log_register(int pf, nf_logfn *lo /* Any setup of logging members must be done before * substituting pointer. */ - spin_lock(&nf_log_lock); + write_lock(&nf_log_lock); if (!nf_logging[pf]) { rcu_assign_pointer(nf_logging[pf], logfn); ret = 0; } - spin_unlock(&nf_log_lock); + write_unlock(&nf_log_lock); return ret; } void nf_log_unregister(int pf, nf_logfn *logfn) { - spin_lock(&nf_log_lock); + write_lock(&nf_log_lock); if (nf_logging[pf] == logfn) nf_logging[pf] = NULL; - spin_unlock(&nf_log_lock); + write_unlock(&nf_log_lock); /* Give time to concurrent readers. 
*/ synchronize_net(); @@ -783,7 +794,7 @@ void nf_log_packet(int pf, char prefix[NF_LOG_PREFIXLEN]; nf_logfn *logfn; - rcu_read_lock(); + rcu_read_lock_read(&nf_log_lock); logfn = rcu_dereference(nf_logging[pf]); if (logfn) { va_start(args, fmt); @@ -796,7 +807,7 @@ void nf_log_packet(int pf, "no backend logging module loaded in!\n"); reported++; } - rcu_read_unlock(); + rcu_read_unlock_read(&nf_log_lock); } EXPORT_SYMBOL(nf_log_register); EXPORT_SYMBOL(nf_log_unregister); --- linux/net/core/netpoll.c.orig +++ linux/net/core/netpoll.c @@ -80,7 +80,9 @@ void netpoll_poll(struct netpoll *np) return; /* Process pending work on NIC */ + WARN_ON_RT(irqs_disabled()); np->dev->poll_controller(np->dev); + WARN_ON_RT(irqs_disabled()); /* If scheduling is stopped, tickle NAPI bits */ spin_lock_irqsave(&netpoll_poll_lock, flags); @@ -119,25 +121,28 @@ static void refill_skbs(void) static void zap_completion_queue(void) { - unsigned long flags; struct softnet_data *sd = &get_cpu_var(softnet_data); + struct sk_buff *clist = NULL; + unsigned long flags; if (sd->completion_queue) { - struct sk_buff *clist; - local_irq_save(flags); clist = sd->completion_queue; sd->completion_queue = NULL; local_irq_restore(flags); - - while (clist != NULL) { - struct sk_buff *skb = clist; - clist = clist->next; - __kfree_skb(skb); - } } + /* + * Took the list private, can drop our softnet + * reference: + */ put_cpu_var(softnet_data); + + while (clist != NULL) { + struct sk_buff *skb = clist; + clist = clist->next; + __kfree_skb(skb); + } } static struct sk_buff * find_skb(struct netpoll *np, int len, int reserve) @@ -189,7 +194,7 @@ repeat: } spin_lock(&np->dev->xmit_lock); - np->dev->xmit_lock_owner = smp_processor_id(); + np->dev->xmit_lock_owner = _smp_processor_id(); /* * network drivers do not expect to be called if the queue is @@ -608,18 +613,18 @@ int netpoll_setup(struct netpoll *np) memcpy(np->local_mac, ndev->dev_addr, 6); if (!np->local_ip) { - rcu_read_lock(); + 
rcu_read_lock_down_read(&rtnl_sem); in_dev = __in_dev_get(ndev); if (!in_dev || !in_dev->ifa_list) { - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); printk(KERN_ERR "%s: no IP address for %s, aborting\n", np->name, np->dev_name); goto release; } np->local_ip = ntohl(in_dev->ifa_list->ifa_local); - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n", np->name, HIPQUAD(np->local_ip)); } --- linux/net/core/dev.c.orig +++ linux/net/core/dev.c @@ -154,7 +154,7 @@ * 86DD IPv6 */ -static spinlock_t ptype_lock = SPIN_LOCK_UNLOCKED; +DEFINE_RWLOCK(ptype_lock); static struct list_head ptype_base[16]; /* 16 way hashed list */ static struct list_head ptype_all; /* Taps */ @@ -272,7 +272,7 @@ void dev_add_pack(struct packet_type *pt { int hash; - spin_lock_bh(&ptype_lock); + write_lock_bh(&ptype_lock); if (pt->type == htons(ETH_P_ALL)) { netdev_nit++; list_add_rcu(&pt->list, &ptype_all); @@ -280,7 +280,7 @@ void dev_add_pack(struct packet_type *pt hash = ntohs(pt->type) & 15; list_add_rcu(&pt->list, &ptype_base[hash]); } - spin_unlock_bh(&ptype_lock); + write_unlock_bh(&ptype_lock); } extern void linkwatch_run_queue(void); @@ -305,7 +305,7 @@ void __dev_remove_pack(struct packet_typ struct list_head *head; struct packet_type *pt1; - spin_lock_bh(&ptype_lock); + write_lock_bh(&ptype_lock); if (pt->type == htons(ETH_P_ALL)) { netdev_nit--; @@ -322,7 +322,7 @@ void __dev_remove_pack(struct packet_typ printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); out: - spin_unlock_bh(&ptype_lock); + write_unlock_bh(&ptype_lock); } /** * dev_remove_pack - remove packet handler @@ -1034,7 +1034,7 @@ void dev_queue_xmit_nit(struct sk_buff * struct packet_type *ptype; net_timestamp(&skb->stamp); - rcu_read_lock(); +// rcu_read_lock_read(&ptype_lock); list_for_each_entry_rcu(ptype, &ptype_all, list) { /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) @@ -1066,7 +1066,7 @@ void 
dev_queue_xmit_nit(struct sk_buff * ptype->func(skb2, skb->dev, ptype); } } - rcu_read_unlock(); +// rcu_read_unlock_read(&ptype_lock); } /* @@ -1228,6 +1228,8 @@ int dev_queue_xmit(struct sk_buff *skb) struct Qdisc *q; int rc = -ENOMEM; +// rcu_read_lock_read(&ptype_lock); // for PREEMPT_RT + if (skb_shinfo(skb)->frag_list && !(dev->features & NETIF_F_FRAGLIST) && __skb_linearize(skb, GFP_ATOMIC)) @@ -1299,10 +1301,16 @@ int dev_queue_xmit(struct sk_buff *skb) Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { - int cpu = smp_processor_id(); /* ok because BHs are off */ + int cpu = _smp_processor_id(); /* ok because BHs are off */ + /* + * No need to check for recursion with threaded interrupts: + */ +#ifdef CONFIG_PREEMPT_RT + if (1) { +#else if (dev->xmit_lock_owner != cpu) { - +#endif HARD_TX_LOCK(dev, cpu); if (!netif_queue_stopped(dev)) { @@ -1333,9 +1341,11 @@ int dev_queue_xmit(struct sk_buff *skb) out_kfree_skb: kfree_skb(skb); +// rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT return rc; out: local_bh_enable(); +// rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT return rc; } @@ -1515,6 +1525,7 @@ static void net_tx_action(struct softirq { struct softnet_data *sd = &__get_cpu_var(softnet_data); +// rcu_read_lock_read(&ptype_lock); // for PREEMPT_RT if (sd->completion_queue) { struct sk_buff *clist; @@ -1529,6 +1540,13 @@ static void net_tx_action(struct softirq BUG_TRAP(!atomic_read(&skb->users)); __kfree_skb(skb); + /* + * Safe to reschedule - the list is private + * at this point. + */ +// rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT + cond_resched_all(); +// rcu_read_lock_read(&ptype_lock); // for PREEMPT_RT } } @@ -1551,10 +1569,17 @@ static void net_tx_action(struct softirq qdisc_run(dev); spin_unlock(&dev->queue_lock); } else { - netif_schedule(dev); + /* + * Dont re-kick the queue here, it will cause + * excessive scheduling of ksoftirqd due + * to retry. 
When the queue is released + * it will be completed anyway. + */ +// netif_schedule(dev); } } } +// rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT } static __inline__ int deliver_skb(struct sk_buff *skb, @@ -1653,7 +1678,7 @@ int netif_receive_skb(struct sk_buff *sk pt_prev = NULL; - rcu_read_lock(); +// rcu_read_lock_read(&ptype_lock); #ifdef CONFIG_NET_CLS_ACT if (skb->tc_verd & TC_NCLS) { @@ -1715,7 +1740,7 @@ ncls: } out: - rcu_read_unlock(); +// rcu_read_unlock_read(&ptype_lock); return ret; } @@ -1769,11 +1794,12 @@ job_done: static void net_rx_action(struct softirq_action *h) { - struct softnet_data *queue = &__get_cpu_var(softnet_data); + struct softnet_data *queue; unsigned long start_time = jiffies; int budget = netdev_max_backlog; local_irq_disable(); + queue = &__get_cpu_var(softnet_data); while (!list_empty(&queue->poll_list)) { struct net_device *dev; @@ -1783,10 +1809,16 @@ static void net_rx_action(struct softirq local_irq_enable(); + if (unlikely(cond_resched_all())) { + local_irq_disable(); + continue; + } dev = list_entry(queue->poll_list.next, struct net_device, poll_list); +// rcu_read_lock_read(&ptype_lock); if (dev->quota <= 0 || dev->poll(dev, &budget)) { +// rcu_read_unlock_read(&ptype_lock); local_irq_disable(); list_del(&dev->poll_list); list_add_tail(&dev->poll_list, &queue->poll_list); @@ -1796,6 +1828,7 @@ static void net_rx_action(struct softirq dev->quota = dev->weight; } else { dev_put(dev); +// rcu_read_unlock_read(&ptype_lock); local_irq_disable(); } @@ -1808,8 +1841,10 @@ out: return; softnet_break: + preempt_disable(); __get_cpu_var(netdev_rx_stat).time_squeeze++; __raise_softirq_irqoff(NET_RX_SOFTIRQ); + preempt_enable(); goto out; } --- linux/net/core/rtnetlink.c.orig +++ linux/net/core/rtnetlink.c @@ -51,7 +51,7 @@ #include #include -DECLARE_MUTEX(rtnl_sem); +DECLARE_RWSEM(rtnl_sem); void rtnl_lock(void) { @@ -608,7 +608,7 @@ static void rtnetlink_rcv(struct sock *s kfree_skb(skb); } - up(&rtnl_sem); + 
up_write(&rtnl_sem); netdev_run_todo(); } while (rtnl && rtnl->sk_receive_queue.qlen); --- linux/net/bridge/br_device.c.orig +++ linux/net/bridge/br_device.c @@ -40,7 +40,7 @@ int br_dev_xmit(struct sk_buff *skb, str skb->mac.raw = skb->data; skb_pull(skb, ETH_HLEN); - rcu_read_lock(); + rcu_read_lock_spin(&br->hash_lock); if (dest[0] & 1) br_flood_deliver(br, skb, 0); else if ((dst = __br_fdb_get(br, dest)) != NULL) @@ -48,7 +48,7 @@ int br_dev_xmit(struct sk_buff *skb, str else br_flood_deliver(br, skb, 0); - rcu_read_unlock(); + rcu_read_unlock_spin(&br->hash_lock); return 0; } --- linux/net/bridge/br_ioctl.c.orig +++ linux/net/bridge/br_ioctl.c @@ -122,7 +122,7 @@ static int old_dev_ioctl(struct net_devi struct __bridge_info b; memset(&b, 0, sizeof(struct __bridge_info)); - rcu_read_lock(); + rcu_read_lock_spin(&br->hash_lock); memcpy(&b.designated_root, &br->designated_root, 8); memcpy(&b.bridge_id, &br->bridge_id, 8); b.root_path_cost = br->root_path_cost; @@ -141,7 +141,7 @@ static int old_dev_ioctl(struct net_devi b.tcn_timer_value = br_timer_value(&br->tcn_timer); b.topology_change_timer_value = br_timer_value(&br->topology_change_timer); b.gc_timer_value = br_timer_value(&br->gc_timer); - rcu_read_unlock(); + rcu_read_unlock_spin(&br->hash_lock); if (copy_to_user((void __user *)args[1], &b, sizeof(b))) return -EFAULT; @@ -219,9 +219,9 @@ static int old_dev_ioctl(struct net_devi struct __port_info p; struct net_bridge_port *pt; - rcu_read_lock(); + rcu_read_lock_spin(&br->lock); if ((pt = br_get_port(br, args[2])) == NULL) { - rcu_read_unlock(); + rcu_read_unlock_spin(&br->lock); return -EINVAL; } @@ -239,7 +239,7 @@ static int old_dev_ioctl(struct net_devi p.forward_delay_timer_value = br_timer_value(&pt->forward_delay_timer); p.hold_timer_value = br_timer_value(&pt->hold_timer); - rcu_read_unlock(); + rcu_read_unlock_spin(&br->lock); if (copy_to_user((void __user *)args[1], &p, sizeof(p))) return -EFAULT; --- linux/net/bridge/br_fdb.c.orig +++ 
linux/net/bridge/br_fdb.c @@ -211,11 +211,11 @@ struct net_bridge_fdb_entry *br_fdb_get( { struct net_bridge_fdb_entry *fdb; - rcu_read_lock(); + rcu_read_lock_spin(&br->hash_lock); fdb = __br_fdb_get(br, addr); if (fdb) atomic_inc(&fdb->use_count); - rcu_read_unlock(); + rcu_read_unlock_spin(&br->hash_lock); return fdb; } @@ -247,7 +247,7 @@ int br_fdb_fillbuf(struct net_bridge *br memset(buf, 0, maxnum*sizeof(struct __fdb_entry)); - rcu_read_lock(); + rcu_read_lock_spin(&br->hash_lock); for (i = 0; i < BR_HASH_SIZE; i++) { hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) { if (num >= maxnum) @@ -273,7 +273,7 @@ int br_fdb_fillbuf(struct net_bridge *br } out: - rcu_read_unlock(); + rcu_read_unlock_spin(&br->hash_lock); return num; } --- linux/net/sched/sch_generic.c.orig +++ linux/net/sched/sch_generic.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -95,6 +96,7 @@ int qdisc_restart(struct net_device *dev struct Qdisc *q = dev->qdisc; struct sk_buff *skb; +// rcu_read_lock_read(&ptype_lock); // for PREEMPT_RT /* Dequeue packet */ if ((skb = q->dequeue(q)) != NULL) { unsigned nolock = (dev->features & NETIF_F_LLTX); @@ -108,6 +110,10 @@ int qdisc_restart(struct net_device *dev * will be requeued. */ if (!nolock) { +#ifdef CONFIG_PREEMPT_RT + spin_lock(&dev->xmit_lock); + dev->xmit_lock_owner = _smp_processor_id(); +#else if (!spin_trylock(&dev->xmit_lock)) { collision: /* So, someone grabbed the driver. */ @@ -117,17 +123,19 @@ int qdisc_restart(struct net_device *dev it by checking xmit owner and drop the packet when deadloop is detected. 
*/ - if (dev->xmit_lock_owner == smp_processor_id()) { + if (dev->xmit_lock_owner == _smp_processor_id()) { kfree_skb(skb); if (net_ratelimit()) printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); +// rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT return -1; } __get_cpu_var(netdev_rx_stat).cpu_collision++; goto requeue; } /* Remember that the driver is grabbed by us. */ - dev->xmit_lock_owner = smp_processor_id(); + dev->xmit_lock_owner = _smp_processor_id(); +#endif } { @@ -139,18 +147,34 @@ int qdisc_restart(struct net_device *dev if (netdev_nit) dev_queue_xmit_nit(skb, dev); + WARN_ON_RT(irqs_disabled()); ret = dev->hard_start_xmit(skb, dev); +#ifdef CONFIG_PREEMPT_RT + if (irqs_disabled()) { + if (printk_ratelimit()) + print_symbol("network driver disabled interrupts: %s\n", (unsigned long)dev->hard_start_xmit); + local_irq_enable(); + } +#endif if (ret == NETDEV_TX_OK) { if (!nolock) { dev->xmit_lock_owner = -1; spin_unlock(&dev->xmit_lock); } spin_lock(&dev->queue_lock); +// rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT return -1; } if (ret == NETDEV_TX_LOCKED && nolock) { spin_lock(&dev->queue_lock); +#ifdef CONFIG_PREEMPT_RT + preempt_disable(); + __get_cpu_var(netdev_rx_stat).cpu_collision++; + preempt_enable(); + goto requeue; +#else goto collision; +#endif } } @@ -177,8 +201,10 @@ int qdisc_restart(struct net_device *dev requeue: q->ops->requeue(skb, q); netif_schedule(dev); +// rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT return 1; } +// rcu_read_unlock_read(&ptype_lock); // for PREEMPT_RT return q->q.qlen; } --- linux/net/ipv6/protocol.c.orig +++ linux/net/ipv6/protocol.c @@ -40,14 +40,14 @@ #include struct inet6_protocol *inet6_protos[MAX_INET_PROTOS]; -static spinlock_t inet6_proto_lock = SPIN_LOCK_UNLOCKED; +rwlock_t inet6_proto_lock = RW_LOCK_UNLOCKED; int inet6_add_protocol(struct inet6_protocol *prot, unsigned char protocol) { int ret, hash = protocol & (MAX_INET_PROTOS - 1); - 
spin_lock_bh(&inet6_proto_lock); + write_lock_bh(&inet6_proto_lock); if (inet6_protos[hash]) { ret = -1; @@ -56,7 +56,7 @@ int inet6_add_protocol(struct inet6_prot ret = 0; } - spin_unlock_bh(&inet6_proto_lock); + write_unlock_bh(&inet6_proto_lock); return ret; } @@ -69,7 +69,7 @@ int inet6_del_protocol(struct inet6_prot { int ret, hash = protocol & (MAX_INET_PROTOS - 1); - spin_lock_bh(&inet6_proto_lock); + write_lock_bh(&inet6_proto_lock); if (inet6_protos[hash] != prot) { ret = -1; @@ -78,7 +78,7 @@ int inet6_del_protocol(struct inet6_prot ret = 0; } - spin_unlock_bh(&inet6_proto_lock); + write_unlock_bh(&inet6_proto_lock); synchronize_net(); --- linux/net/ipv6/af_inet6.c.orig +++ linux/net/ipv6/af_inet6.c @@ -94,7 +94,7 @@ atomic_t inet6_sock_nr; * build a new socket. */ static struct list_head inetsw6[SOCK_MAX]; -static spinlock_t inetsw6_lock = SPIN_LOCK_UNLOCKED; +static rwlock_t inetsw6_lock = RW_LOCK_UNLOCKED; static void inet6_sock_destruct(struct sock *sk) { @@ -127,7 +127,7 @@ static int inet6_create(struct socket *s /* Look for the requested type/protocol pair. 
*/ answer = NULL; - rcu_read_lock(); + rcu_read_lock_read(&inetsw6_lock); list_for_each_rcu(p, &inetsw6[sock->type]) { answer = list_entry(p, struct inet_protosw, list); @@ -162,7 +162,7 @@ static int inet6_create(struct socket *s answer_prot = answer->prot; answer_no_check = answer->no_check; answer_flags = answer->flags; - rcu_read_unlock(); + rcu_read_unlock_read(&inetsw6_lock); BUG_TRAP(answer_prot->slab != NULL); @@ -242,7 +242,7 @@ static int inet6_create(struct socket *s out: return rc; out_rcu_unlock: - rcu_read_unlock(); + rcu_read_unlock_read(&inetsw6_lock); goto out; } @@ -564,7 +564,7 @@ inet6_register_protosw(struct inet_proto int protocol = p->protocol; struct list_head *last_perm; - spin_lock_bh(&inetsw6_lock); + write_lock_bh(&inetsw6_lock); if (p->type >= SOCK_MAX) goto out_illegal; @@ -595,7 +595,7 @@ inet6_register_protosw(struct inet_proto */ list_add_rcu(&p->list, last_perm); out: - spin_unlock_bh(&inetsw6_lock); + write_unlock_bh(&inetsw6_lock); return; out_permanent: @@ -618,9 +618,9 @@ inet6_unregister_protosw(struct inet_pro "Attempt to unregister permanent protocol %d.\n", p->protocol); } else { - spin_lock_bh(&inetsw6_lock); + write_lock_bh(&inetsw6_lock); list_del_rcu(&p->list); - spin_unlock_bh(&inetsw6_lock); + write_unlock_bh(&inetsw6_lock); synchronize_net(); } --- linux/net/ipv6/ndisc.c.orig +++ linux/net/ipv6/ndisc.c @@ -289,17 +289,17 @@ static int ndisc_constructor(struct neig struct neigh_parms *parms; int is_multicast = ipv6_addr_is_multicast(addr); - rcu_read_lock(); + rcu_read_lock_read(&addrconf_lock); in6_dev = in6_dev_get(dev); if (in6_dev == NULL) { - rcu_read_unlock(); + rcu_read_unlock_read(&addrconf_lock); return -EINVAL; } parms = in6_dev->nd_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); - rcu_read_unlock(); + rcu_read_unlock_read(&addrconf_lock); neigh->type = is_multicast ? 
RTN_MULTICAST : RTN_UNICAST; if (dev->hard_header == NULL) { --- linux/net/ipv6/icmp.c.orig +++ linux/net/ipv6/icmp.c @@ -537,11 +537,11 @@ static void icmpv6_notify(struct sk_buff hash = nexthdr & (MAX_INET_PROTOS - 1); - rcu_read_lock(); + rcu_read_lock_read(&inet6_proto_lock); ipprot = rcu_dereference(inet6_protos[hash]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, NULL, type, code, inner_offset, info); - rcu_read_unlock(); + rcu_read_unlock_read(&inet6_proto_lock); read_lock(&raw_v6_lock); if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) { --- linux/net/ipv6/ip6_input.c.orig +++ linux/net/ipv6/ip6_input.c @@ -156,7 +156,7 @@ static inline int ip6_input_finish(struc skb->h.raw += (skb->h.raw[1]+1)<<3; } - rcu_read_lock(); + rcu_read_lock_read(&raw_v6_lock); resubmit: if (!pskb_pull(skb, skb->h.raw - skb->data)) goto discard; @@ -205,12 +205,12 @@ resubmit: kfree_skb(skb); } } - rcu_read_unlock(); + rcu_read_unlock_read(&raw_v6_lock); return 0; discard: IP6_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); - rcu_read_unlock(); + rcu_read_unlock_read(&raw_v6_lock); kfree_skb(skb); return 0; } --- linux/net/packet/af_packet.c.orig +++ linux/net/packet/af_packet.c @@ -393,7 +393,6 @@ static int packet_sendmsg_spkt(struct ki /* * Now send it */ - dev_queue_xmit(skb); dev_put(dev); return(len); --- linux/net/sunrpc/sched.c.orig +++ linux/net/sunrpc/sched.c @@ -959,8 +959,6 @@ void rpc_killall_tasks(struct rpc_clnt * spin_unlock(&rpc_sched_lock); } -static DECLARE_MUTEX_LOCKED(rpciod_running); - static void rpciod_killall(void) { unsigned long flags; --- linux/net/sunrpc/clnt.c.orig +++ linux/net/sunrpc/clnt.c @@ -231,7 +231,8 @@ rpc_shutdown_client(struct rpc_clnt *cln clnt->cl_oneshot = 0; clnt->cl_dead = 0; rpc_killall_tasks(clnt); - sleep_on_timeout(&destroy_wait, 1*HZ); + wait_event_timeout(destroy_wait, + atomic_read(&clnt->cl_users) <= 0, 1*HZ); } if (atomic_read(&clnt->cl_users) < 0) { --- linux/net/ipv4/devinet.c.orig +++ linux/net/ipv4/devinet.c @@
-214,16 +214,16 @@ static void inetdev_destroy(struct in_de int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b) { - rcu_read_lock(); + rcu_read_lock_down_read(&rtnl_sem); for_primary_ifa(in_dev) { if (inet_ifa_match(a, ifa)) { if (!b || inet_ifa_match(b, ifa)) { - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); return 1; } } } endfor_ifa(in_dev); - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); return 0; } @@ -772,7 +772,7 @@ u32 inet_select_addr(const struct net_de u32 addr = 0; struct in_device *in_dev; - rcu_read_lock(); + rcu_read_lock_down_read(&rtnl_sem); in_dev = __in_dev_get(dev); if (!in_dev) goto no_in_dev; @@ -788,7 +788,7 @@ u32 inet_select_addr(const struct net_de addr = ifa->ifa_local; } endfor_ifa(in_dev); no_in_dev: - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); if (addr) goto out; @@ -798,7 +798,7 @@ no_in_dev: in dev_base list. */ read_lock(&dev_base_lock); - rcu_read_lock(); + rcu_read_lock_down_read(&rtnl_sem); for (dev = dev_base; dev; dev = dev->next) { if ((in_dev = __in_dev_get(dev)) == NULL) continue; @@ -812,8 +812,8 @@ no_in_dev: } endfor_ifa(in_dev); } out_unlock_both: + rcu_read_unlock_up_read(&rtnl_sem); read_unlock(&dev_base_lock); - rcu_read_unlock(); out: return addr; } @@ -868,16 +868,16 @@ u32 inet_confirm_addr(const struct net_d struct in_device *in_dev; if (dev) { - rcu_read_lock(); + rcu_read_lock_down_read(&rtnl_sem); if ((in_dev = __in_dev_get(dev))) addr = confirm_addr_indev(in_dev, dst, local, scope); - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); return addr; } read_lock(&dev_base_lock); - rcu_read_lock(); + rcu_read_lock_down_read(&rtnl_sem); for (dev = dev_base; dev; dev = dev->next) { if ((in_dev = __in_dev_get(dev))) { addr = confirm_addr_indev(in_dev, dst, local, scope); @@ -885,7 +885,7 @@ u32 inet_confirm_addr(const struct net_d break; } } - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); read_unlock(&dev_base_lock); return addr; @@ -1054,9 +1054,9 @@ 
static int inet_dump_ifaddr(struct sk_bu continue; if (idx > s_idx) s_ip_idx = 0; - rcu_read_lock(); + rcu_read_lock_down_read(&rtnl_sem); if ((in_dev = __in_dev_get(dev)) == NULL) { - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); continue; } @@ -1067,11 +1067,11 @@ static int inet_dump_ifaddr(struct sk_bu if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWADDR) <= 0) { - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); goto done; } } - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); } done: @@ -1125,11 +1125,11 @@ void inet_forward_change(void) read_lock(&dev_base_lock); for (dev = dev_base; dev; dev = dev->next) { struct in_device *in_dev; - rcu_read_lock(); + rcu_read_lock_down_read(&rtnl_sem); in_dev = __in_dev_get(dev); if (in_dev) in_dev->cnf.forwarding = on; - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); } read_unlock(&dev_base_lock); --- linux/net/ipv4/arp.c.orig +++ linux/net/ipv4/arp.c @@ -237,17 +237,17 @@ static int arp_constructor(struct neighb neigh->type = inet_addr_type(addr); - rcu_read_lock(); + rcu_read_lock_down_read(&rtnl_sem); in_dev = rcu_dereference(__in_dev_get(dev)); if (in_dev == NULL) { - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); return -EINVAL; } parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); if (dev->hard_header == NULL) { neigh->nud_state = NUD_NOARP; --- linux/net/ipv4/ip_input.c.orig +++ linux/net/ipv4/ip_input.c @@ -213,7 +213,7 @@ static inline int ip_local_deliver_finis /* Point into the IP datagram, just past the header. 
*/ skb->h.raw = skb->data; - rcu_read_lock(); + rcu_read_lock_read(&inet_proto_lock); { /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ int protocol = skb->nh.iph->protocol; @@ -258,7 +258,7 @@ static inline int ip_local_deliver_finis } } out: - rcu_read_unlock(); + rcu_read_unlock_read(&inet_proto_lock); return 0; } --- linux/net/ipv4/tcp_timer.c.orig +++ linux/net/ipv4/tcp_timer.c @@ -210,6 +210,7 @@ static void tcp_delack_timer(unsigned lo struct sock *sk = (struct sock*)data; struct tcp_opt *tp = tcp_sk(sk); +// rcu_read_lock_read(&ptype_lock); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ @@ -263,6 +264,7 @@ out: sk_stream_mem_reclaim(sk); out_unlock: bh_unlock_sock(sk); +// rcu_read_unlock_read(&ptype_lock); sock_put(sk); } @@ -421,6 +423,7 @@ static void tcp_write_timer(unsigned lon struct tcp_opt *tp = tcp_sk(sk); int event; +// rcu_read_lock_read(&ptype_lock); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later */ @@ -452,6 +455,7 @@ static void tcp_write_timer(unsigned lon out: sk_stream_mem_reclaim(sk); out_unlock: +// rcu_read_unlock_read(&ptype_lock); bh_unlock_sock(sk); sock_put(sk); } @@ -577,6 +581,7 @@ static void tcp_keepalive_timer (unsigne __u32 elapsed; /* Only process if socket is not in use. */ +// rcu_read_lock_read(&ptype_lock); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. 
*/ @@ -646,6 +651,7 @@ death: out: bh_unlock_sock(sk); +// rcu_read_unlock_read(&ptype_lock); sock_put(sk); } --- linux/net/ipv4/fib_frontend.c.orig +++ linux/net/ipv4/fib_frontend.c @@ -172,13 +172,13 @@ int fib_validate_source(u32 src, u32 dst int ret; no_addr = rpf = 0; - rcu_read_lock(); + rcu_read_lock_down_read(&rtnl_sem); in_dev = __in_dev_get(dev); if (in_dev) { no_addr = in_dev->ifa_list == NULL; rpf = IN_DEV_RPFILTER(in_dev); } - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); if (in_dev == NULL) goto e_inval; --- linux/net/ipv4/netfilter/ip_tables.c.orig +++ linux/net/ipv4/netfilter/ip_tables.c @@ -110,7 +110,11 @@ struct ipt_table_info static LIST_HEAD(ipt_target); static LIST_HEAD(ipt_match); static LIST_HEAD(ipt_tables); -#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) +/* + * Use atomic add because on PREEMPT_RT the same table might + * be used on two CPUs at once: + */ +#define ADD_COUNTER(c,b,p) do { atomic_add((b), (atomic_t *)(&(c).bcnt)); atomic_add((p), (atomic_t *)(&(c).pcnt)); } while(0) #ifdef CONFIG_SMP #define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) @@ -289,8 +293,17 @@ ipt_do_table(struct sk_buff **pskb, read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + /* + * on a PREEMPT_RT kernel the task could schedule + * off and smp_processor_id() is not safe. So we take + * the current value of the CPU and use that table. We + * only update the counters while read-locking the table + * and dont change the rules so the possibility of the + * same table being used by two tasks at once is not a + * problem. 
+ */ table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, smp_processor_id()); + + TABLE_OFFSET(table->private, _smp_processor_id()); e = get_entry(table_base, table->private->hook_entry[hook]); #ifdef CONFIG_NETFILTER_DEBUG @@ -298,7 +311,7 @@ ipt_do_table(struct sk_buff **pskb, if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) { printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n", - smp_processor_id(), + _smp_processor_id(), table->name, &((struct ipt_entry *)table_base)->comefrom, ((struct ipt_entry *)table_base)->comefrom); --- linux/net/ipv4/protocol.c.orig +++ linux/net/ipv4/protocol.c @@ -49,7 +49,7 @@ #include struct net_protocol *inet_protos[MAX_INET_PROTOS]; -static spinlock_t inet_proto_lock = SPIN_LOCK_UNLOCKED; +DEFINE_RWLOCK(inet_proto_lock); /* * Add a protocol handler to the hash tables @@ -61,14 +61,14 @@ int inet_add_protocol(struct net_protoco hash = protocol & (MAX_INET_PROTOS - 1); - spin_lock_bh(&inet_proto_lock); + write_lock_bh(&inet_proto_lock); if (inet_protos[hash]) { ret = -1; } else { inet_protos[hash] = prot; ret = 0; } - spin_unlock_bh(&inet_proto_lock); + write_unlock_bh(&inet_proto_lock); return ret; } @@ -83,14 +83,14 @@ int inet_del_protocol(struct net_protoco hash = protocol & (MAX_INET_PROTOS - 1); - spin_lock_bh(&inet_proto_lock); + write_lock_bh(&inet_proto_lock); if (inet_protos[hash] == prot) { inet_protos[hash] = NULL; ret = 0; } else { ret = -1; } - spin_unlock_bh(&inet_proto_lock); + write_unlock_bh(&inet_proto_lock); synchronize_net(); --- linux/net/ipv4/route.c.orig +++ linux/net/ipv4/route.c @@ -196,7 +196,7 @@ __u8 ip_tos2prio[16] = { struct rt_hash_bucket { struct rtable *chain; - spinlock_t lock; + rwlock_t lock; } __attribute__((__aligned__(8))); static struct rt_hash_bucket *rt_hash_table; @@ -226,11 +226,11 @@ static struct rtable *rt_cache_get_first struct rt_cache_iter_state *st = seq->private; for (st->bucket = 
rt_hash_mask; st->bucket >= 0; --st->bucket) { - rcu_read_lock_bh(); + rcu_read_lock_bh_read(&rt_hash_table[st->bucket].lock); r = rt_hash_table[st->bucket].chain; if (r) break; - rcu_read_unlock_bh(); + rcu_read_unlock_bh_read(&rt_hash_table[st->bucket].lock); } return r; } @@ -241,10 +241,10 @@ static struct rtable *rt_cache_get_next( r = r->u.rt_next; while (!r) { - rcu_read_unlock_bh(); + rcu_read_unlock_bh_read(&rt_hash_table[st->bucket].lock); if (--st->bucket < 0) break; - rcu_read_lock_bh(); + rcu_read_lock_bh_read(&rt_hash_table[st->bucket].lock); r = rt_hash_table[st->bucket].chain; } return r; @@ -279,8 +279,10 @@ static void *rt_cache_seq_next(struct se static void rt_cache_seq_stop(struct seq_file *seq, void *v) { + struct rt_cache_iter_state *st = rcu_dereference(seq->private); + if (v && v != SEQ_START_TOKEN) - rcu_read_unlock_bh(); + rcu_read_unlock_bh_read(&rt_hash_table[st->bucket].lock); } static int rt_cache_seq_show(struct seq_file *seq, void *v) @@ -527,7 +529,7 @@ static void rt_check_expire(unsigned lon i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; - spin_lock(&rt_hash_table[i].lock); + write_lock(&rt_hash_table[i].lock); while ((rth = *rthp) != NULL) { if (rth->u.dst.expires) { /* Entry is expired even if it is in use */ @@ -546,7 +548,7 @@ static void rt_check_expire(unsigned lon *rthp = rth->u.rt_next; rt_free(rth); } - spin_unlock(&rt_hash_table[i].lock); + write_unlock(&rt_hash_table[i].lock); /* Fallback loop breaker. 
*/ if (time_after(jiffies, now)) @@ -569,11 +571,12 @@ static void rt_run_flush(unsigned long d get_random_bytes(&rt_hash_rnd, 4); for (i = rt_hash_mask; i >= 0; i--) { - spin_lock_bh(&rt_hash_table[i].lock); + write_lock_bh(&rt_hash_table[i].lock); rth = rt_hash_table[i].chain; if (rth) rt_hash_table[i].chain = NULL; - spin_unlock_bh(&rt_hash_table[i].lock); + write_unlock_bh(&rt_hash_table[i].lock); + cond_resched_all(); for (; rth; rth = next) { next = rth->u.rt_next; @@ -582,7 +585,7 @@ static void rt_run_flush(unsigned long d } } -static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(rt_flush_lock); void rt_cache_flush(int delay) { @@ -703,7 +706,7 @@ static int rt_garbage_collect(void) k = (k + 1) & rt_hash_mask; rthp = &rt_hash_table[k].chain; - spin_lock_bh(&rt_hash_table[k].lock); + write_lock_bh(&rt_hash_table[k].lock); while ((rth = *rthp) != NULL) { if (!rt_may_expire(rth, tmo, expire)) { tmo >>= 1; @@ -714,7 +717,7 @@ static int rt_garbage_collect(void) rt_free(rth); goal--; } - spin_unlock_bh(&rt_hash_table[k].lock); + write_unlock_bh(&rt_hash_table[k].lock); if (goal <= 0) break; } @@ -791,7 +794,7 @@ restart: rthp = &rt_hash_table[hash].chain; - spin_lock_bh(&rt_hash_table[hash].lock); + write_lock_bh(&rt_hash_table[hash].lock); while ((rth = *rthp) != NULL) { if (compare_keys(&rth->fl, &rt->fl)) { /* Put it first */ @@ -812,7 +815,7 @@ restart: rth->u.dst.__use++; dst_hold(&rth->u.dst); rth->u.dst.lastuse = now; - spin_unlock_bh(&rt_hash_table[hash].lock); + write_unlock_bh(&rt_hash_table[hash].lock); rt_drop(rt); *rp = rth; @@ -853,7 +856,7 @@ restart: if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { int err = arp_bind_neighbour(&rt->u.dst); if (err) { - spin_unlock_bh(&rt_hash_table[hash].lock); + write_unlock_bh(&rt_hash_table[hash].lock); if (err != -ENOBUFS) { rt_drop(rt); @@ -894,14 +897,14 @@ restart: } #endif rt_hash_table[hash].chain = rt; - spin_unlock_bh(&rt_hash_table[hash].lock); + 
write_unlock_bh(&rt_hash_table[hash].lock); *rp = rt; return 0; } void rt_bind_peer(struct rtable *rt, int create) { - static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(rt_peer_lock); struct inet_peer *peer; peer = inet_getpeer(rt->rt_dst, create); @@ -925,7 +928,7 @@ void rt_bind_peer(struct rtable *rt, int */ static void ip_select_fb_ident(struct iphdr *iph) { - static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(ip_fb_id_lock); static u32 ip_fallback_id; u32 salt; @@ -961,7 +964,7 @@ static void rt_del(unsigned hash, struct { struct rtable **rthp; - spin_lock_bh(&rt_hash_table[hash].lock); + write_lock_bh(&rt_hash_table[hash].lock); ip_rt_put(rt); for (rthp = &rt_hash_table[hash].chain; *rthp; rthp = &(*rthp)->u.rt_next) @@ -970,7 +973,7 @@ static void rt_del(unsigned hash, struct rt_free(rt); break; } - spin_unlock_bh(&rt_hash_table[hash].lock); + write_unlock_bh(&rt_hash_table[hash].lock); } void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, @@ -1009,7 +1012,7 @@ void ip_rt_redirect(u32 old_gw, u32 dadd rthp=&rt_hash_table[hash].chain; - rcu_read_lock(); + rcu_read_lock_read(&rt_hash_table[hash].lock); while ((rth = rcu_dereference(*rthp)) != NULL) { struct rtable *rt; @@ -1030,7 +1033,7 @@ void ip_rt_redirect(u32 old_gw, u32 dadd break; dst_hold(&rth->u.dst); - rcu_read_unlock(); + rcu_read_unlock_read(&rt_hash_table[hash].lock); rt = dst_alloc(&ipv4_dst_ops); if (rt == NULL) { @@ -1082,7 +1085,7 @@ void ip_rt_redirect(u32 old_gw, u32 dadd ip_rt_put(rt); goto do_next; } - rcu_read_unlock(); + rcu_read_unlock_read(&rt_hash_table[hash].lock); do_next: ; } @@ -1263,7 +1266,7 @@ unsigned short ip_rt_frag_needed(struct for (i = 0; i < 2; i++) { unsigned hash = rt_hash_code(daddr, skeys[i], tos); - rcu_read_lock(); + rcu_read_lock_read(&rt_hash_table[hash].lock); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->u.rt_next)) { if (rth->fl.fl4_dst == daddr && @@ -1301,7 
+1304,7 @@ unsigned short ip_rt_frag_needed(struct } } } - rcu_read_unlock(); + rcu_read_unlock_read(&rt_hash_table[hash].lock); } return est_mtu ? : new_mtu; } @@ -1823,7 +1826,7 @@ int ip_route_input(struct sk_buff *skb, tos &= IPTOS_RT_MASK; hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos); - rcu_read_lock(); + rcu_read_lock_read(&rt_hash_table[hash].lock); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->u.rt_next)) { if (rth->fl.fl4_dst == daddr && @@ -1838,13 +1841,13 @@ int ip_route_input(struct sk_buff *skb, dst_hold(&rth->u.dst); rth->u.dst.__use++; RT_CACHE_STAT_INC(in_hit); - rcu_read_unlock(); + rcu_read_unlock_read(&rt_hash_table[hash].lock); skb->dst = (struct dst_entry*)rth; return 0; } RT_CACHE_STAT_INC(in_hlist_search); } - rcu_read_unlock(); + rcu_read_unlock_read(&rt_hash_table[hash].lock); /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing @@ -1860,7 +1863,7 @@ int ip_route_input(struct sk_buff *skb, if (MULTICAST(daddr)) { struct in_device *in_dev; - rcu_read_lock(); + rcu_read_lock_down_read(&rtnl_sem); if ((in_dev = __in_dev_get(dev)) != NULL) { int our = ip_check_mc(in_dev, daddr, saddr, skb->nh.iph->protocol); @@ -1869,12 +1872,12 @@ int ip_route_input(struct sk_buff *skb, || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) #endif ) { - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); return ip_route_input_mc(skb, daddr, saddr, tos, dev, our); } } - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); return -EINVAL; } return ip_route_input_slow(skb, daddr, saddr, tos, dev); @@ -2184,7 +2187,7 @@ int __ip_route_output_key(struct rtable hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos); - rcu_read_lock_bh(); + rcu_read_lock_read(&rt_hash_table[hash].lock); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->u.rt_next)) { if (rth->fl.fl4_dst == 
flp->fl4_dst && @@ -2200,13 +2203,13 @@ int __ip_route_output_key(struct rtable dst_hold(&rth->u.dst); rth->u.dst.__use++; RT_CACHE_STAT_INC(out_hit); - rcu_read_unlock_bh(); + rcu_read_unlock_read(&rt_hash_table[hash].lock); *rp = rth; return 0; } RT_CACHE_STAT_INC(out_hlist_search); } - rcu_read_unlock_bh(); + rcu_read_unlock_read(&rt_hash_table[hash].lock); return ip_route_output_slow(rp, flp); } @@ -2421,7 +2424,7 @@ int ip_rt_dump(struct sk_buff *skb, str if (h < s_h) continue; if (h > s_h) s_idx = 0; - rcu_read_lock_bh(); + rcu_read_lock_read(&rt_hash_table[h].lock); for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; rt = rcu_dereference(rt->u.rt_next), idx++) { if (idx < s_idx) @@ -2431,12 +2434,12 @@ int ip_rt_dump(struct sk_buff *skb, str cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) { dst_release(xchg(&skb->dst, NULL)); - rcu_read_unlock_bh(); + rcu_read_unlock_read(&rt_hash_table[h].lock); goto done; } dst_release(xchg(&skb->dst, NULL)); } - rcu_read_unlock_bh(); + rcu_read_unlock_read(&rt_hash_table[h].lock); } done: @@ -2755,7 +2758,7 @@ int __init ip_rt_init(void) rt_hash_mask--; for (i = 0; i <= rt_hash_mask; i++) { - spin_lock_init(&rt_hash_table[i].lock); + rwlock_init(&rt_hash_table[i].lock); rt_hash_table[i].chain = NULL; } --- linux/net/ipv4/inetpeer.c.orig +++ linux/net/ipv4/inetpeer.c @@ -70,7 +70,7 @@ */ /* Exported for inet_getid inline function. */ -spinlock_t inet_peer_idlock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(inet_peer_idlock); static kmem_cache_t *peer_cachep; @@ -95,7 +95,7 @@ int inet_peer_maxttl = 10 * 60 * HZ; /* /* Exported for inet_putpeer inline function. 
*/ struct inet_peer *inet_peer_unused_head, **inet_peer_unused_tailp = &inet_peer_unused_head; -spinlock_t inet_peer_unused_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(inet_peer_unused_lock); #define PEER_MAX_CLEANUP_WORK 30 static void peer_check_expire(unsigned long dummy); --- linux/net/ipv4/af_inet.c.orig +++ linux/net/ipv4/af_inet.c @@ -125,7 +125,7 @@ extern void ip_mc_drop_socket(struct soc * build a new socket. */ static struct list_head inetsw[SOCK_MAX]; -static spinlock_t inetsw_lock = SPIN_LOCK_UNLOCKED; +static rwlock_t inetsw_lock = RW_LOCK_UNLOCKED; /* New destruction routine */ @@ -242,7 +242,7 @@ static int inet_create(struct socket *so /* Look for the requested type/protocol pair. */ answer = NULL; - rcu_read_lock(); + rcu_read_lock_read(&inetsw_lock); list_for_each_rcu(p, &inetsw[sock->type]) { answer = list_entry(p, struct inet_protosw, list); @@ -276,7 +276,7 @@ static int inet_create(struct socket *so answer_prot = answer->prot; answer_no_check = answer->no_check; answer_flags = answer->flags; - rcu_read_unlock(); + rcu_read_unlock_read(&inetsw_lock); BUG_TRAP(answer_prot->slab != NULL); @@ -345,7 +345,7 @@ static int inet_create(struct socket *so out: return err; out_rcu_unlock: - rcu_read_unlock(); + rcu_read_unlock_read(&inetsw_lock); goto out; } @@ -902,7 +902,7 @@ void inet_register_protosw(struct inet_p int protocol = p->protocol; struct list_head *last_perm; - spin_lock_bh(&inetsw_lock); + write_lock_bh(&inetsw_lock); if (p->type >= SOCK_MAX) goto out_illegal; @@ -933,7 +933,7 @@ void inet_register_protosw(struct inet_p */ list_add_rcu(&p->list, last_perm); out: - spin_unlock_bh(&inetsw_lock); + write_unlock_bh(&inetsw_lock); synchronize_net(); @@ -958,9 +958,9 @@ void inet_unregister_protosw(struct inet "Attempt to unregister permanent protocol %d.\n", p->protocol); } else { - spin_lock_bh(&inetsw_lock); + write_lock_bh(&inetsw_lock); list_del_rcu(&p->list); - spin_unlock_bh(&inetsw_lock); + write_unlock_bh(&inetsw_lock); 
synchronize_net(); } --- linux/net/ipv4/tcp_minisocks.c.orig +++ linux/net/ipv4/tcp_minisocks.c @@ -417,7 +417,7 @@ static void tcp_twkill(unsigned long); #define TCP_TWKILL_QUOTA 100 static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS]; -static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(tw_death_lock); static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0); static void twkill_work(void *); static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL); @@ -512,7 +512,7 @@ static void twkill_work(void *dummy) continue; while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) { - if (need_resched()) { + if (softirq_need_resched()) { spin_unlock_bh(&tw_death_lock); schedule(); spin_lock_bh(&tw_death_lock); --- linux/net/ipv4/tcp_ipv4.c.orig +++ linux/net/ipv4/tcp_ipv4.c @@ -1015,6 +1015,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 return; } +// rcu_read_lock_read(&ptype_lock); bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. 
@@ -1132,6 +1133,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 out: bh_unlock_sock(sk); +// rcu_read_unlock_read(&ptype_lock); sock_put(sk); } @@ -1789,6 +1791,7 @@ process: skb->dev = NULL; +// rcu_read_lock_read(&ptype_lock); bh_lock_sock(sk); ret = 0; if (!sock_owned_by_user(sk)) { @@ -1797,6 +1800,7 @@ process: } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); +// rcu_read_unlock_read(&ptype_lock); sock_put(sk); --- linux/net/ipv4/icmp.c.orig +++ linux/net/ipv4/icmp.c @@ -701,11 +701,11 @@ static void icmp_unreach(struct sk_buff } read_unlock(&raw_v4_lock); - rcu_read_lock(); + rcu_read_lock_read(&inet_proto_lock); ipprot = rcu_dereference(inet_protos[hash]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, info); - rcu_read_unlock(); + rcu_read_unlock_read(&inet_proto_lock); out: return; @@ -883,7 +883,7 @@ static void icmp_address_reply(struct sk in_dev = in_dev_get(dev); if (!in_dev) goto out; - rcu_read_lock(); + rcu_read_lock_down_read(&rtnl_sem); if (in_dev->ifa_list && IN_DEV_LOG_MARTIANS(in_dev) && IN_DEV_FORWARD(in_dev)) { @@ -903,7 +903,7 @@ static void icmp_address_reply(struct sk NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src)); } } - rcu_read_unlock(); + rcu_read_unlock_up_read(&rtnl_sem); in_dev_put(in_dev); out:; } --- linux/net/ipv4/ip_output.c.orig +++ linux/net/ipv4/ip_output.c @@ -1302,6 +1302,7 @@ void ip_send_reply(struct sock *sk, stru Note that it uses the fact, that this function is called with locally disabled BH and that sk cannot be already spinlocked. 
*/ +// rcu_read_lock_read(&ptype_lock); bh_lock_sock(sk); inet->tos = skb->nh.iph->tos; sk->sk_priority = skb->priority; @@ -1316,6 +1317,7 @@ void ip_send_reply(struct sock *sk, stru } bh_unlock_sock(sk); +// rcu_read_unlock_read(&ptype_lock); ip_rt_put(rt); } --- linux/net/802/psnap.c.orig +++ linux/net/802/psnap.c @@ -55,7 +55,7 @@ static int snap_rcv(struct sk_buff *skb, .type = __constant_htons(ETH_P_SNAP), }; - rcu_read_lock(); + rcu_read_lock_spin(&snap_lock); proto = find_snap_client(skb->h.raw); if (proto) { /* Pass the frame on. */ @@ -68,7 +68,7 @@ static int snap_rcv(struct sk_buff *skb, rc = 1; } - rcu_read_unlock(); + rcu_read_unlock_spin(&snap_lock); return rc; } --- linux/sound/core/oss/pcm_oss.c.orig +++ linux/sound/core/oss/pcm_oss.c @@ -1918,7 +1918,7 @@ static int snd_pcm_oss_release(struct in return 0; } -static inline int _snd_pcm_oss_ioctl(struct inode *inode, struct file *file, +static inline int snd_pcm_oss_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { snd_pcm_oss_file_t *pcm_oss_file; @@ -2078,17 +2078,6 @@ static inline int _snd_pcm_oss_ioctl(str return -EINVAL; } -/* FIXME: need to unlock BKL to allow preemption */ -static int snd_pcm_oss_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) -{ - int err; - unlock_kernel(); - err = _snd_pcm_oss_ioctl(inode, file, cmd, arg); - lock_kernel(); - return err; -} - static ssize_t snd_pcm_oss_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { snd_pcm_oss_file_t *pcm_oss_file; @@ -2119,9 +2108,7 @@ static ssize_t snd_pcm_oss_write(struct substream = pcm_oss_file->streams[SNDRV_PCM_STREAM_PLAYBACK]; if (substream == NULL) return -ENXIO; - up(&file->f_dentry->d_inode->i_sem); result = snd_pcm_oss_write1(substream, buf, count); - down(&file->f_dentry->d_inode->i_sem); #ifdef OSS_DEBUG printk("pcm_oss: write %li bytes (wrote %li bytes)\n", (long)count, (long)result); #endif @@ -2415,7 +2402,7 @@ static 
struct file_operations snd_pcm_os .open = snd_pcm_oss_open, .release = snd_pcm_oss_release, .poll = snd_pcm_oss_poll, - .ioctl = snd_pcm_oss_ioctl, + .unlocked_ioctl = snd_pcm_oss_ioctl, .mmap = snd_pcm_oss_mmap, }; --- linux/sound/core/oss/mixer_oss.c.orig +++ linux/sound/core/oss/mixer_oss.c @@ -359,16 +359,10 @@ static int snd_mixer_oss_ioctl1(snd_mixe return -ENXIO; } -/* FIXME: need to unlock BKL to allow preemption */ int snd_mixer_oss_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { - int err; - /* FIXME: need to unlock BKL to allow preemption */ - unlock_kernel(); - err = snd_mixer_oss_ioctl1((snd_mixer_oss_file_t *) file->private_data, cmd, arg); - lock_kernel(); - return err; + return snd_mixer_oss_ioctl1((snd_mixer_oss_file_t *) file->private_data, cmd, arg); } int snd_mixer_oss_ioctl_card(snd_card_t *card, unsigned int cmd, unsigned long arg) @@ -393,7 +387,7 @@ static struct file_operations snd_mixer_ .owner = THIS_MODULE, .open = snd_mixer_oss_open, .release = snd_mixer_oss_release, - .ioctl = snd_mixer_oss_ioctl, + .unlocked_ioctl = snd_mixer_oss_ioctl, }; static snd_minor_t snd_mixer_oss_reg = --- linux/sound/core/pcm_lib.c.orig +++ linux/sound/core/pcm_lib.c @@ -133,6 +133,7 @@ static void xrun(snd_pcm_substream_t *su snd_pcm_stop(substream, SNDRV_PCM_STATE_XRUN); #ifdef CONFIG_SND_DEBUG if (substream->pstr->xrun_debug) { + user_trace_stop(); snd_printd(KERN_DEBUG "XRUN: pcmC%dD%d%c\n", substream->pcm->card->number, substream->pcm->device, --- linux/sound/core/control.c.orig +++ linux/sound/core/control.c @@ -1021,7 +1021,7 @@ static int snd_ctl_set_power_state(snd_c } #endif -static inline int _snd_ctl_ioctl(struct inode *inode, struct file *file, +static inline int snd_ctl_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { snd_ctl_file_t *ctl; @@ -1095,17 +1095,6 @@ static inline int _snd_ctl_ioctl(struct return -ENOTTY; } -/* FIXME: need to unlock BKL to allow preemption */ 
-static int snd_ctl_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) -{ - int err; - unlock_kernel(); - err = _snd_ctl_ioctl(inode, file, cmd, arg); - lock_kernel(); - return err; -} - static ssize_t snd_ctl_read(struct file *file, char __user *buffer, size_t count, loff_t * offset) { snd_ctl_file_t *ctl; @@ -1241,7 +1230,7 @@ static struct file_operations snd_ctl_f_ .open = snd_ctl_open, .release = snd_ctl_release, .poll = snd_ctl_poll, - .ioctl = snd_ctl_ioctl, + .unlocked_ioctl = snd_ctl_ioctl, .fasync = snd_ctl_fasync, }; --- linux/sound/core/seq/oss/seq_oss.c.orig +++ linux/sound/core/seq/oss/seq_oss.c @@ -181,14 +181,10 @@ static int odev_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { seq_oss_devinfo_t *dp; - int err; + dp = file->private_data; snd_assert(dp != NULL, return -EIO); - /* FIXME: need to unlock BKL to allow preemption */ - unlock_kernel(); - err = snd_seq_oss_ioctl(dp, cmd, arg); - lock_kernel(); - return err; + return snd_seq_oss_ioctl(dp, cmd, arg); } @@ -213,7 +209,7 @@ static struct file_operations seq_oss_f_ .open = odev_open, .release = odev_release, .poll = odev_poll, - .ioctl = odev_ioctl, + .unlocked_ioctl = odev_ioctl, }; static snd_minor_t seq_oss_reg = { --- linux/sound/core/seq/seq_clientmgr.c.orig +++ linux/sound/core/seq/seq_clientmgr.c @@ -2135,15 +2135,10 @@ static int snd_seq_ioctl(struct inode *i unsigned int cmd, unsigned long arg) { client_t *client = (client_t *) file->private_data; - int err; snd_assert(client != NULL, return -ENXIO); - /* FIXME: need to unlock BKL to allow preemption */ - unlock_kernel(); - err = snd_seq_do_ioctl(client, cmd, (void __user *) arg); - lock_kernel(); - return err; + return snd_seq_do_ioctl(client, cmd, (void __user *) arg); } @@ -2462,7 +2457,7 @@ static struct file_operations snd_seq_f_ .open = snd_seq_open, .release = snd_seq_release, .poll = snd_seq_poll, - .ioctl = snd_seq_ioctl, + .unlocked_ioctl = snd_seq_ioctl, 
}; static snd_minor_t snd_seq_reg = --- linux/sound/core/hwdep.c.orig +++ linux/sound/core/hwdep.c @@ -232,7 +232,7 @@ static int snd_hwdep_dsp_load(snd_hwdep_ return 0; } -static inline int _snd_hwdep_ioctl(struct inode *inode, struct file * file, +static inline int snd_hwdep_ioctl(struct inode *inode, struct file * file, unsigned int cmd, unsigned long arg) { snd_hwdep_t *hw = file->private_data; @@ -252,17 +252,6 @@ static inline int _snd_hwdep_ioctl(struc return -ENOTTY; } -/* FIXME: need to unlock BKL to allow preemption */ -static int snd_hwdep_ioctl(struct inode *inode, struct file * file, - unsigned int cmd, unsigned long arg) -{ - int err; - unlock_kernel(); - err = _snd_hwdep_ioctl(inode, file, cmd, arg); - lock_kernel(); - return err; -} - static int snd_hwdep_mmap(struct file * file, struct vm_area_struct * vma) { snd_hwdep_t *hw = file->private_data; @@ -328,7 +317,7 @@ static struct file_operations snd_hwdep_ .open = snd_hwdep_open, .release = snd_hwdep_release, .poll = snd_hwdep_poll, - .ioctl = snd_hwdep_ioctl, + .unlocked_ioctl = snd_hwdep_ioctl, .mmap = snd_hwdep_mmap, }; --- linux/sound/core/pcm_native.c.orig +++ linux/sound/core/pcm_native.c @@ -2644,36 +2644,26 @@ static int snd_pcm_playback_ioctl(struct unsigned int cmd, unsigned long arg) { snd_pcm_file_t *pcm_file; - int err; pcm_file = file->private_data; if (((cmd >> 8) & 0xff) != 'A') return -ENOTTY; - /* FIXME: need to unlock BKL to allow preemption */ - unlock_kernel(); - err = snd_pcm_playback_ioctl1(pcm_file->substream, cmd, (void __user *)arg); - lock_kernel(); - return err; + return snd_pcm_playback_ioctl1(pcm_file->substream, cmd, (void __user *)arg); } static int snd_pcm_capture_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { snd_pcm_file_t *pcm_file; - int err; pcm_file = file->private_data; if (((cmd >> 8) & 0xff) != 'A') return -ENOTTY; - /* FIXME: need to unlock BKL to allow preemption */ - unlock_kernel(); - err = 
snd_pcm_capture_ioctl1(pcm_file->substream, cmd, (void __user *)arg); - lock_kernel(); - return err; + return snd_pcm_capture_ioctl1(pcm_file->substream, cmd, (void __user *)arg); } int snd_pcm_kernel_playback_ioctl(snd_pcm_substream_t *substream, @@ -3318,7 +3308,7 @@ static struct file_operations snd_pcm_f_ .open = snd_pcm_open, .release = snd_pcm_release, .poll = snd_pcm_playback_poll, - .ioctl = snd_pcm_playback_ioctl, + .unlocked_ioctl = snd_pcm_playback_ioctl, .mmap = snd_pcm_mmap, .fasync = snd_pcm_fasync, }; @@ -3330,7 +3320,7 @@ static struct file_operations snd_pcm_f_ .open = snd_pcm_open, .release = snd_pcm_release, .poll = snd_pcm_capture_poll, - .ioctl = snd_pcm_capture_ioctl, + .unlocked_ioctl = snd_pcm_capture_ioctl, .mmap = snd_pcm_mmap, .fasync = snd_pcm_fasync, }; --- linux/sound/core/timer.c.orig +++ linux/sound/core/timer.c @@ -1657,7 +1657,7 @@ static int snd_timer_user_continue(struc return (err = snd_timer_continue(tu->timeri)) < 0 ? err : 0; } -static inline int _snd_timer_user_ioctl(struct inode *inode, struct file *file, +static inline int snd_timer_user_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { snd_timer_user_t *tu; @@ -1705,17 +1705,6 @@ static inline int _snd_timer_user_ioctl( return -ENOTTY; } -/* FIXME: need to unlock BKL to allow preemption */ -static int snd_timer_user_ioctl(struct inode *inode, struct file * file, - unsigned int cmd, unsigned long arg) -{ - int err; - unlock_kernel(); - err = _snd_timer_user_ioctl(inode, file, cmd, arg); - lock_kernel(); - return err; -} - static int snd_timer_user_fasync(int fd, struct file * file, int on) { snd_timer_user_t *tu; @@ -1814,7 +1803,7 @@ static struct file_operations snd_timer_ .open = snd_timer_user_open, .release = snd_timer_user_release, .poll = snd_timer_user_poll, - .ioctl = snd_timer_user_ioctl, + .unlocked_ioctl = snd_timer_user_ioctl, .fasync = snd_timer_user_fasync, }; --- linux/sound/core/info.c.orig +++ linux/sound/core/info.c @@ 
-448,7 +448,7 @@ static unsigned int snd_info_entry_poll( return mask; } -static inline int _snd_info_entry_ioctl(struct inode *inode, struct file *file, +static inline int snd_info_entry_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { snd_info_private_data_t *data; @@ -469,17 +469,6 @@ static inline int _snd_info_entry_ioctl( return -ENOTTY; } -/* FIXME: need to unlock BKL to allow preemption */ -static int snd_info_entry_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) -{ - int err; - unlock_kernel(); - err = _snd_info_entry_ioctl(inode, file, cmd, arg); - lock_kernel(); - return err; -} - static int snd_info_entry_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_dentry->d_inode; @@ -508,7 +497,7 @@ static struct file_operations snd_info_e .read = snd_info_entry_read, .write = snd_info_entry_write, .poll = snd_info_entry_poll, - .ioctl = snd_info_entry_ioctl, + .unlocked_ioctl = snd_info_entry_ioctl, .mmap = snd_info_entry_mmap, .open = snd_info_entry_open, .release = snd_info_entry_release, --- linux/sound/core/rawmidi.c.orig +++ linux/sound/core/rawmidi.c @@ -133,7 +133,8 @@ int snd_rawmidi_drain_output(snd_rawmidi err = 0; runtime->drain = 1; while (runtime->avail < runtime->buffer_size) { - timeout = interruptible_sleep_on_timeout(&runtime->sleep, 10 * HZ); + timeout = wait_event_interruptible_timeout(runtime->sleep, + runtime->avail >= runtime->buffer_size, 10 * HZ); if (signal_pending(current)) { err = -ERESTARTSYS; break; @@ -673,7 +674,7 @@ static int snd_rawmidi_input_status(snd_ return 0; } -static inline int _snd_rawmidi_ioctl(struct inode *inode, struct file *file, +static inline int snd_rawmidi_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { snd_rawmidi_file_t *rfile; @@ -784,17 +785,6 @@ static inline int _snd_rawmidi_ioctl(str return -ENOTTY; } -/* FIXME: need to unlock BKL to allow preemption */ -static
int snd_rawmidi_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) -{ - int err; - unlock_kernel(); - err = _snd_rawmidi_ioctl(inode, file, cmd, arg); - lock_kernel(); - return err; -} - int snd_rawmidi_control_ioctl(snd_card_t * card, snd_ctl_file_t * control, unsigned int cmd, unsigned long arg) { @@ -1345,7 +1335,7 @@ static struct file_operations snd_rawmid .open = snd_rawmidi_open, .release = snd_rawmidi_release, .poll = snd_rawmidi_poll, - .ioctl = snd_rawmidi_ioctl, + .unlocked_ioctl = snd_rawmidi_ioctl, }; static snd_minor_t snd_rawmidi_reg = --- linux/fs/xfs/linux-2.6/mutex.h.orig +++ linux/fs/xfs/linux-2.6/mutex.h @@ -44,8 +44,8 @@ #define MUTEX_DEFAULT 0x0 typedef struct semaphore mutex_t; -#define mutex_init(lock, type, name) sema_init(lock, 1) -#define mutex_destroy(lock) sema_init(lock, -99) +#define mutex_init(lock, type, name) sema_init_nocheck(lock, 1) +#define mutex_destroy(lock) sema_init_nocheck(lock, -99) #define mutex_lock(lock, num) down(lock) #define mutex_trylock(lock) (down_trylock(lock) ? 
0 : 1) #define mutex_unlock(lock) up(lock) --- linux/fs/xfs/linux-2.6/sema.h.orig +++ linux/fs/xfs/linux-2.6/sema.h @@ -43,9 +43,9 @@ typedef struct semaphore sema_t; -#define init_sema(sp, val, c, d) sema_init(sp, val) -#define initsema(sp, val) sema_init(sp, val) -#define initnsema(sp, val, name) sema_init(sp, val) +#define init_sema(sp, val, c, d) sema_init_nocheck(sp, val) +#define initsema(sp, val) sema_init_nocheck(sp, val) +#define initnsema(sp, val, name) sema_init_nocheck(sp, val) #define psema(sp, b) down(sp) #define vsema(sp) up(sp) #define valusema(sp) (atomic_read(&(sp)->count)) --- linux/fs/proc/array.c.orig +++ linux/fs/proc/array.c @@ -129,17 +129,19 @@ static inline char * task_name(struct ta */ static const char *task_state_array[] = { "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "T (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)" /* 32 */ + "M (running-mutex)", /* 1 */ + "S (sleeping)", /* 2 */ + "D (disk sleep)", /* 4 */ + "T (stopped)", /* 8 */ + "T (tracing stop)", /* 16 */ + "Z (zombie)", /* 32 */ + "X (dead)" /* 64 */ }; static inline const char * get_task_state(struct task_struct *tsk) { unsigned int state = (tsk->state & (TASK_RUNNING | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE | TASK_STOPPED | --- linux/fs/proc/proc_misc.c.orig +++ linux/fs/proc/proc_misc.c @@ -397,6 +397,41 @@ static int show_stat(struct seq_file *p, nr_running(), nr_iowait()); +#ifdef CONFIG_PREEMPT_RT + { + unsigned long nr_uninterruptible_cpu(int cpu); + extern int pi_walk, pi_null, pi_prio; + extern int rt_overload_schedule, + rt_overload_wakeup, rt_overload_pulled; + unsigned long rt_nr_running_cpu(int cpu); + extern atomic_t rt_overload; + + int i; + + seq_printf(p, "rt_overload_schedule: %d\n", + rt_overload_schedule); + seq_printf(p, "rt_overload_wakeup: %d\n", + rt_overload_wakeup); + seq_printf(p, "rt_overload_pulled: %d\n", + rt_overload_pulled); + seq_printf(p, 
"pi_null: %d\n", pi_null); + seq_printf(p, "pi_prio: %d\n", pi_prio); + seq_printf(p, "pi_walk: %d\n", pi_walk); + seq_printf(p, "nr_running(): %ld\n", + nr_running()); + seq_printf(p, "nr_uninterruptible(): %ld\n", + nr_uninterruptible()); + for_each_cpu(i) + seq_printf(p, "nr_uninterruptible(%d): %ld\n", + i, nr_uninterruptible_cpu(i)); + for_each_cpu(i) + seq_printf(p, "rt_nr_running(%d): %ld\n", + i, rt_nr_running_cpu(i)); + seq_printf(p, "rt_overload: %d\n", atomic_read(&rt_overload)); + + } +#endif + return 0; } @@ -513,6 +548,20 @@ static int execdomains_read_proc(char *p return proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_LATENCY_TRACE +extern struct seq_operations latency_trace_op; +static int latency_trace_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &latency_trace_op); +} +static struct file_operations proc_latency_trace_operations = { + .open = latency_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + #ifdef CONFIG_MAGIC_SYSRQ /* * writing 'C' to /proc/sysrq-trigger is like sysrq-C @@ -592,6 +641,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_SCHEDSTATS create_seq_entry("schedstat", 0, &proc_schedstat_operations); #endif +#ifdef CONFIG_LATENCY_TRACE + create_seq_entry("latency_trace", 0, &proc_latency_trace_operations); +#endif #ifdef CONFIG_PROC_KCORE proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); if (proc_root_kcore) { --- linux/fs/proc/task_mmu.c.orig +++ linux/fs/proc/task_mmu.c @@ -125,8 +125,10 @@ static void *m_start(struct seq_file *m, down_read(&mm->mmap_sem); map = mm->mmap; - while (l-- && map) + while (l-- && map) { map = map->vm_next; + cond_resched(); + } if (!map) { up_read(&mm->mmap_sem); mmput(mm); --- linux/fs/nfsd/nfssvc.c.orig +++ linux/fs/nfsd/nfssvc.c @@ -281,6 +281,7 @@ out: /* Release the thread */ svc_exit_thread(rqstp); + unlock_kernel(); /* Release module */ module_put_and_exit(0); } --- 
linux/fs/reiser4/log.c.orig +++ linux/fs/reiser4/log.c @@ -231,7 +231,7 @@ lock_log(reiser4_log_file * log) while (log->long_term) { /* sleep on a semaphore */ struct __wlink link; - sema_init(&link.sema, 0); + sema_init_nocheck(&link.sema, 0); list_add(&link.link, &log->wait); spin_unlock(&log->lock); --- linux/fs/reiser4/plugin/space/bitmap.c.orig +++ linux/fs/reiser4/plugin/space/bitmap.c @@ -636,7 +636,7 @@ init_bnode(struct bitmap_node *bnode, { xmemset(bnode, 0, sizeof (struct bitmap_node)); - sema_init(&bnode->sema, 1); + sema_init_nocheck(&bnode->sema, 1); atomic_set(&bnode->loaded, 0); } --- linux/fs/reiser4/lock.c.orig +++ linux/fs/reiser4/lock.c @@ -1184,7 +1184,7 @@ init_lock_stack(lock_stack * owner /* po requestors_list_clean(owner); spin_stack_init(owner); owner->curpri = 1; - sema_init(&owner->sema, 0); + sema_init_nocheck(&owner->sema, 0); } /* Initializes lock object. */ @@ -1308,7 +1308,7 @@ prepare_to_sleep(lock_stack * owner) if (0) { - NOTE-NIKITA: I commented call to sema_init() out hoping + NOTE-NIKITA: I commented call to sema_init_nocheck() out hoping that it is the reason or thread sleeping in down(&owner->sema) without any other thread running. @@ -1317,7 +1317,7 @@ prepare_to_sleep(lock_stack * owner) longterm_lock_znode() would have to iterate its loop once more. 
spin_lock_stack(owner); - sema_init(&owner->sema, 0); + sema_init_nocheck(&owner->sema, 0); spin_unlock_stack(owner); } */ --- linux/fs/reiser4/init_super.c.orig +++ linux/fs/reiser4/init_super.c @@ -63,8 +63,8 @@ _INIT_(sinfo) ON_DEBUG(INIT_LIST_HEAD(&sbinfo->all_jnodes)); ON_DEBUG(spin_lock_init(&sbinfo->all_guard)); - sema_init(&sbinfo->delete_sema, 1); - sema_init(&sbinfo->flush_sema, 1); + sema_init_nocheck(&sbinfo->delete_sema, 1); + sema_init_nocheck(&sbinfo->flush_sema, 1); spin_super_init(sbinfo); spin_super_eflush_init(sbinfo); --- linux/fs/reiser4/flush_queue.c.orig +++ linux/fs/reiser4/flush_queue.c @@ -108,7 +108,7 @@ init_fq(flush_queue_t * fq) capture_list_init(ATOM_FQ_LIST(fq)); - sema_init(&fq->io_sem, 0); + sema_init_nocheck(&fq->io_sem, 0); spin_fq_init(fq); } --- linux/fs/reiser4/search.c.orig +++ linux/fs/reiser4/search.c @@ -1174,7 +1174,6 @@ cbk_node_lookup(cbk_handle * h /* search assert("vs-361", h->level > h->stop_level); if (handle_eottl(h, &result)) { - /**/ assert("vs-1674", result == LOOKUP_DONE || result == LOOKUP_REST); return result; } @@ -1241,7 +1240,7 @@ cbk_cache_scan_slots(cbk_handle * h /* c * */ - rcu_read_lock(); + rcu_read_lock_nort(); read_lock_cbk_cache(cache); slot = cbk_cache_list_prev(cbk_cache_list_front(&cache->lru)); while (1) { @@ -1278,7 +1277,7 @@ cbk_cache_scan_slots(cbk_handle * h /* c if (unlikely(result == 0 && ZF_ISSET(node, JNODE_RIP))) result = -ENOENT; - rcu_read_unlock(); + rcu_read_unlock_nort(); if (result != 0) { h->result = CBK_COORD_NOTFOUND; --- linux/fs/reiser4/vfs_ops.c.orig +++ linux/fs/reiser4/vfs_ops.c @@ -433,7 +433,7 @@ init_once(void *obj /* pointer to new in inode_init_once(&info->vfs_inode); readdir_list_init(get_readdir_list(&info->vfs_inode)); init_rwsem(&info->p.coc_sem); - sema_init(&info->p.loading, 1); + sema_init_nocheck(&info->p.loading, 1); ON_DEBUG(info->p.nr_jnodes = 0); INIT_RADIX_TREE(jnode_tree_by_reiser4_inode(&info->p), GFP_ATOMIC); ON_DEBUG(info->p.captured_eflushed = 0); 
--- linux/fs/reiser4/txnmgr.c.orig +++ linux/fs/reiser4/txnmgr.c @@ -366,7 +366,7 @@ txnmgr_init(txn_mgr * mgr) atom_list_init(&mgr->atoms_list); spin_txnmgr_init(mgr); - sema_init(&mgr->commit_semaphore, 1); + sema_init_nocheck(&mgr->commit_semaphore, 1); } /* Free transaction manager. */ --- linux/fs/reiser4/entd.c.orig +++ linux/fs/reiser4/entd.c @@ -312,7 +312,7 @@ void write_page_by_ent (struct page * pa spin_unlock(&ent->guard); return; } - sema_init(&rq.sem, 0); + sema_init_nocheck(&rq.sem, 0); wbq_list_push_back(&ent->wbq_list, &rq); ent->nr_synchronous_requests ++; spin_unlock(&ent->guard); --- linux/fs/jbd/commit.c.orig +++ linux/fs/jbd/commit.c @@ -333,7 +333,7 @@ write_out_data: jbd_unlock_bh_state(bh); } put_bh(bh); - cond_resched_lock(&journal->j_list_lock); +// cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); --- linux/fs/pipe.c.orig +++ linux/fs/pipe.c @@ -160,8 +160,14 @@ pipe_readv(struct file *filp, const stru wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } + /* + * Hack: we turn off atime updates for -RT kernels. + * Who uses them on pipes anyway? + */ +#ifndef CONFIG_PREEMPT_RT if (ret > 0) file_accessed(filp); +#endif return ret; } @@ -254,8 +260,14 @@ pipe_writev(struct file *filp, const str wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); } + /* + * Hack: we turn off atime updates for -RT kernels. + * Who uses them on pipes anyway? 
+ */ +#ifndef CONFIG_PREEMPT_RT if (ret > 0) inode_update_time(inode, 1); /* mtime and ctime */ +#endif return ret; } --- linux/fs/lockd/svc.c.orig +++ linux/fs/lockd/svc.c @@ -49,7 +49,7 @@ static pid_t nlmsvc_pid; int nlmsvc_grace_period; unsigned long nlmsvc_timeout; -static DECLARE_MUTEX_LOCKED(lockd_start); +static DECLARE_WAIT_QUEUE_HEAD(lockd_start); static DECLARE_WAIT_QUEUE_HEAD(lockd_exit); /* @@ -112,7 +112,7 @@ lockd(struct svc_rqst *rqstp) * Let our maker know we're running. */ nlmsvc_pid = current->pid; - up(&lockd_start); + wake_up(&lockd_start); daemonize("lockd"); @@ -233,6 +233,7 @@ lockd_up(void) printk(KERN_WARNING "lockd_up: no pid, %d users??\n", nlmsvc_users); + error = -ENOMEM; serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE); if (!serv) { @@ -261,8 +262,15 @@ lockd_up(void) "lockd_up: create thread failed, error=%d\n", error); goto destroy_and_out; } - down(&lockd_start); - + /* + * Wait for the lockd process to start, but since we're holding + * the lockd semaphore, we can't wait around forever ... + */ + if (wait_event_interruptible_timeout(lockd_start, + nlmsvc_pid != 0, HZ) <= 0) { + printk(KERN_WARNING + "lockd_down: lockd failed to start\n"); + } /* * Note: svc_serv structures have an initial use count of 1, * so we exit through here on both success and failure. @@ -302,16 +310,12 @@ lockd_down(void) * Wait for the lockd process to exit, but since we're holding * the lockd semaphore, we can't wait around forever ... 
*/ - clear_thread_flag(TIF_SIGPENDING); - interruptible_sleep_on_timeout(&lockd_exit, HZ); - if (nlmsvc_pid) { + if (wait_event_interruptible_timeout(lockd_exit, + nlmsvc_pid == 0, HZ) <= 0) { printk(KERN_WARNING "lockd_down: lockd failed to exit, clearing pid\n"); nlmsvc_pid = 0; } - spin_lock_irq(&current->sighand->siglock); - recalc_sigpending(); - spin_unlock_irq(&current->sighand->siglock); out: up(&nlmsvc_sema); } --- linux/fs/fcntl.c.orig +++ linux/fs/fcntl.c @@ -473,7 +473,8 @@ static void send_sigio_to_task(struct ta break; /* fall-through: fall back on the old plain SIGIO signal */ case 0: - send_group_sig_info(SIGIO, SEND_SIG_PRIV, p); + // we hold the tasklist lock already: + group_send_sig_info(SIGIO, SEND_SIG_PRIV, p); } } @@ -507,7 +508,7 @@ static void send_sigurg_to_task(struct t struct fown_struct *fown) { if (sigio_perm(p, fown, SIGURG)) - send_group_sig_info(SIGURG, SEND_SIG_PRIV, p); + group_send_sig_info(SIGURG, SEND_SIG_PRIV, p); } int send_sigurg(struct fown_struct *fown) --- linux/fs/exec.c.orig +++ linux/fs/exec.c @@ -550,11 +550,16 @@ static int exec_mmap(struct mm_struct *m mm_release(tsk, old_mm); task_lock(tsk); + + local_irq_disable(); // FIXME active_mm = tsk->active_mm; + activate_mm(active_mm, mm); tsk->mm = mm; tsk->active_mm = mm; - activate_mm(active_mm, mm); + local_irq_enable(); + task_unlock(tsk); + arch_pick_mmap_layout(mm); if (old_mm) { if (active_mm != old_mm) BUG(); --- linux/fs/aio.c.orig +++ linux/fs/aio.c @@ -573,9 +573,11 @@ void use_mm(struct mm_struct *mm) tsk->flags |= PF_BORROWED_MM; active_mm = tsk->active_mm; atomic_inc(&mm->mm_count); + local_irq_disable(); // FIXME + activate_mm(active_mm, mm); tsk->mm = mm; tsk->active_mm = mm; - activate_mm(active_mm, mm); + local_irq_enable(); task_unlock(tsk); mmdrop(active_mm); --- linux/fs/dcache.c.orig +++ linux/fs/dcache.c @@ -37,8 +37,8 @@ int sysctl_vfs_cache_pressure = 100; -spinlock_t dcache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; -seqlock_t rename_lock
__cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; +DEFINE_SPINLOCK(dcache_lock); +DECLARE_SEQLOCK(rename_lock); EXPORT_SYMBOL(dcache_lock); @@ -997,7 +997,7 @@ struct dentry * __d_lookup(struct dentry struct dentry *found = NULL; struct hlist_node *node; - rcu_read_lock(); + rcu_read_lock_spin(&dcache_lock); hlist_for_each_rcu(node, head) { struct dentry *dentry; @@ -1044,7 +1044,7 @@ struct dentry * __d_lookup(struct dentry next: spin_unlock(&dentry->d_lock); } - rcu_read_unlock(); + rcu_read_unlock_spin(&dcache_lock); return found; } @@ -1479,17 +1479,25 @@ int is_subdir(struct dentry * new_dentry { int result; struct dentry * saved = new_dentry; +#ifndef CONFIG_PREEMPT_RT unsigned long seq; +#endif result = 0; /* need rcu_readlock to protect against the d_parent trashing due to * d_move */ +#ifdef CONFIG_PREEMPT_RT + write_seqlock(&rename_lock); +#else rcu_read_lock(); +#endif do { /* for restarting inner loop in case of seq retry */ new_dentry = saved; +#ifndef CONFIG_PREEMPT_RT seq = read_seqbegin(&rename_lock); +#endif for (;;) { if (new_dentry != old_dentry) { struct dentry * parent = new_dentry->d_parent; @@ -1501,8 +1509,13 @@ int is_subdir(struct dentry * new_dentry result = 1; break; } +#ifdef CONFIG_PREEMPT_RT + } while (0); + write_sequnlock(&rename_lock); +#else } while (read_seqretry(&rename_lock, seq)); rcu_read_unlock(); +#endif return result; } --- linux/fs/ioctl.c.orig +++ linux/fs/ioctl.c @@ -93,10 +93,8 @@ asmlinkage long sys_ioctl(unsigned int f int block; int res; - if (!S_ISREG(inode->i_mode)) { - error = -ENOTTY; - goto done; - } + if (!S_ISREG(inode->i_mode)) + break; /* do we support this mess? 
*/ if (!mapping->a_ops->bmap) { error = -EINVAL; @@ -116,19 +114,15 @@ asmlinkage long sys_ioctl(unsigned int f goto done; } case FIGETBSZ: - if (!S_ISREG(inode->i_mode)) { - error = -ENOTTY; - goto done; - } + if (!S_ISREG(inode->i_mode)) + break; error = -EBADF; if (inode->i_sb) error = put_user(inode->i_sb->s_blocksize, p); goto done; case FIONREAD: - if (!S_ISREG(inode->i_mode)) { - error = -ENOTTY; - goto done; - } + if (!S_ISREG(inode->i_mode)) + break; error = put_user(i_size_read(inode) - filp->f_pos, p); goto done; } --- linux/mm/slab.c.orig +++ linux/mm/slab.c @@ -560,9 +560,9 @@ static inline void ** ac_entry(struct ar return (void**)(ac+1); } -static inline struct array_cache *ac_data(kmem_cache_t *cachep) +static inline struct array_cache *ac_data(kmem_cache_t *cachep, int cpu) { - return cachep->array[smp_processor_id()]; + return cachep->array[cpu]; } static kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags) @@ -802,21 +802,22 @@ void __init kmem_cache_init(void) /* 4) Replace the bootstrap head arrays */ { void * ptr; + int cpu = smp_processor_id(); ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - local_irq_disable(); - BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); - memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init)); - cache_cache.array[smp_processor_id()] = ptr; - local_irq_enable(); + local_irq_disable_nort(); + BUG_ON(ac_data(&cache_cache, cpu) != &initarray_cache.cache); + memcpy(ptr, ac_data(&cache_cache, cpu), sizeof(struct arraycache_init)); + cache_cache.array[cpu] = ptr; + local_irq_enable_nort(); ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); - local_irq_disable(); - BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache); - memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep), + local_irq_disable_nort(); + BUG_ON(ac_data(malloc_sizes[0].cs_cachep, cpu) != &initarray_generic.cache); + memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep, cpu), sizeof(struct 
arraycache_init)); - malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr; - local_irq_enable(); + malloc_sizes[0].cs_cachep->array[cpu] = ptr; + local_irq_enable_nort(); } /* 5) resize the head arrays to their final sizes */ @@ -1174,6 +1175,7 @@ kmem_cache_create (const char *name, siz { size_t left_over, slab_size; kmem_cache_t *cachep = NULL; + int cpu = _smp_processor_id(); /* * Sanity checks... these are all serious usage bugs. @@ -1400,16 +1402,16 @@ next: * the cache that's used by kmalloc(24), otherwise * the creation of further caches will BUG(). */ - cachep->array[smp_processor_id()] = &initarray_generic.cache; + cachep->array[cpu] = &initarray_generic.cache; g_cpucache_up = PARTIAL; } else { - cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL); + cachep->array[cpu] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL); } - BUG_ON(!ac_data(cachep)); - ac_data(cachep)->avail = 0; - ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; - ac_data(cachep)->batchcount = 1; - ac_data(cachep)->touched = 0; + BUG_ON(!ac_data(cachep, cpu)); + ac_data(cachep, cpu)->avail = 0; + ac_data(cachep, cpu)->limit = BOOT_CPUCACHE_ENTRIES; + ac_data(cachep, cpu)->batchcount = 1; + ac_data(cachep, cpu)->touched = 0; cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; cachep->free_limit = (1+num_online_cpus())*cachep->batchcount @@ -1463,7 +1465,9 @@ EXPORT_SYMBOL(kmem_cache_create); #if DEBUG static void check_irq_off(void) { +#ifndef CONFIG_PREEMPT_RT BUG_ON(!irqs_disabled()); +#endif } static void check_irq_on(void) @@ -1505,22 +1509,39 @@ static void smp_call_function_all_cpus(v static void drain_array_locked(kmem_cache_t* cachep, struct array_cache *ac, int force); -static void do_drain(void *arg) +static void do_drain_cpu(kmem_cache_t *cachep, int cpu) { - kmem_cache_t *cachep = (kmem_cache_t*)arg; struct array_cache *ac; check_irq_off(); - ac = ac_data(cachep); + spin_lock(&cachep->spinlock); + ac = ac_data(cachep, cpu); 
free_block(cachep, &ac_entry(ac)[0], ac->avail); - spin_unlock(&cachep->spinlock); ac->avail = 0; + spin_unlock(&cachep->spinlock); +} + +#ifndef CONFIG_PREEMPT_RT +/* + * Executes in an IRQ context: + */ +static void do_drain(void *arg) +{ + do_drain_cpu((kmem_cache_t*)arg, smp_processor_id()); } +#endif static void drain_cpu_caches(kmem_cache_t *cachep) { +#ifndef CONFIG_PREEMPT_RT smp_call_function_all_cpus(do_drain, cachep); +#else + int cpu; + + for_each_online_cpu(cpu) + do_drain_cpu(cachep, cpu); +#endif check_irq_on(); spin_lock_irq(&cachep->spinlock); if (cachep->lists.shared) @@ -1789,7 +1810,7 @@ static int cache_grow (kmem_cache_t * ca spin_unlock(&cachep->spinlock); if (local_flags & __GFP_WAIT) - local_irq_enable(); + local_irq_enable_nort(); /* * The test for missing atomic flag is performed here, rather than @@ -1813,7 +1834,7 @@ static int cache_grow (kmem_cache_t * ca cache_init_objs(cachep, slabp, ctor_flags); if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_irq_disable_nort(); check_irq_off(); spin_lock(&cachep->spinlock); @@ -1827,7 +1848,7 @@ opps1: kmem_freepages(cachep, objp); failed: if (local_flags & __GFP_WAIT) - local_irq_disable(); + local_irq_disable_nort(); return 0; } @@ -1953,14 +1974,14 @@ bad: #define check_slabp(x,y) do { } while(0) #endif -static void* cache_alloc_refill(kmem_cache_t* cachep, int flags) +static void* cache_alloc_refill(kmem_cache_t* cachep, int flags, int cpu) { int batchcount; struct kmem_list3 *l3; struct array_cache *ac; check_irq_off(); - ac = ac_data(cachep); + ac = ac_data(cachep, cpu); retry: batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -1973,7 +1994,7 @@ retry: l3 = list3_data(cachep); BUG_ON(ac->avail > 0); - spin_lock(&cachep->spinlock); + spin_lock_nort(&cachep->spinlock); if (l3->shared) { struct array_cache *shared_array = l3->shared; if (shared_array->avail) { @@ -2031,14 +2052,17 @@ retry: must_grow: l3->free_objects -= ac->avail; alloc_done: - 
spin_unlock(&cachep->spinlock); + spin_unlock_nort(&cachep->spinlock); if (unlikely(!ac->avail)) { int x; + spin_unlock_rt(&cachep->spinlock); x = cache_grow(cachep, flags, -1); - + + spin_lock_rt(&cachep->spinlock); // cache_grow can reenable interrupts, then ac could change. - ac = ac_data(cachep); + cpu = smp_processor_id_rt(cpu); + ac = ac_data(cachep, cpu); if (!x && ac->avail == 0) // no objects in sight? abort return NULL; @@ -2107,23 +2131,26 @@ cache_alloc_debugcheck_after(kmem_cache_ static inline void * __cache_alloc (kmem_cache_t *cachep, int flags) { + int cpu = _smp_processor_id(); unsigned long save_flags; void* objp; struct array_cache *ac; cache_alloc_debugcheck_before(cachep, flags); - local_irq_save(save_flags); - ac = ac_data(cachep); + local_irq_save_nort(save_flags); + spin_lock_rt(&cachep->spinlock); + ac = ac_data(cachep, cpu); if (likely(ac->avail)) { STATS_INC_ALLOCHIT(cachep); ac->touched = 1; objp = ac_entry(ac)[--ac->avail]; } else { STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags); + objp = cache_alloc_refill(cachep, flags, cpu); } - local_irq_restore(save_flags); + spin_unlock_rt(&cachep->spinlock); + local_irq_restore_nort(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0)); return objp; } @@ -2193,7 +2220,7 @@ static void cache_flusharray (kmem_cache BUG_ON(!batchcount || batchcount > ac->avail); #endif check_irq_off(); - spin_lock(&cachep->spinlock); + spin_lock_nort(&cachep->spinlock); if (cachep->lists.shared) { struct array_cache *shared_array = cachep->lists.shared; int max = shared_array->limit-shared_array->avail; @@ -2228,7 +2255,7 @@ free_done: STATS_SET_FREEABLE(cachep, i); } #endif - spin_unlock(&cachep->spinlock); + spin_unlock_nort(&cachep->spinlock); ac->avail -= batchcount; memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount], sizeof(void*)*ac->avail); @@ -2243,20 +2270,22 @@ free_done: */ static inline void __cache_free (kmem_cache_t *cachep, void* 
objp) { - struct array_cache *ac = ac_data(cachep); + int cpu = _smp_processor_id(); + struct array_cache *ac = ac_data(cachep, cpu); check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + spin_lock_rt(&cachep->spinlock); if (likely(ac->avail < ac->limit)) { STATS_INC_FREEHIT(cachep); ac_entry(ac)[ac->avail++] = objp; - return; } else { STATS_INC_FREEMISS(cachep); cache_flusharray(cachep, ac); ac_entry(ac)[ac->avail++] = objp; } + spin_unlock_rt(&cachep->spinlock); } /** @@ -2358,12 +2387,12 @@ void *kmem_cache_alloc_node(kmem_cache_t } spin_unlock_irq(&cachep->spinlock); - local_irq_disable(); + local_irq_disable_nort(); if (!cache_grow(cachep, GFP_KERNEL, nodeid)) { - local_irq_enable(); + local_irq_enable_nort(); return NULL; } - local_irq_enable(); + local_irq_enable_nort(); } got_slabp: /* found one: allocate object */ @@ -2503,9 +2532,9 @@ void kmem_cache_free (kmem_cache_t *cach { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); __cache_free(cachep, objp); - local_irq_restore(flags); + local_irq_restore_nort(flags); } EXPORT_SYMBOL(kmem_cache_free); @@ -2545,11 +2574,11 @@ void kfree (const void *objp) if (!objp) return; - local_irq_save(flags); + local_irq_save_nort(flags); kfree_debugcheck(objp); c = GET_PAGE_CACHE(virt_to_page(objp)); __cache_free(c, (void*)objp); - local_irq_restore(flags); + local_irq_restore_nort(flags); } EXPORT_SYMBOL(kfree); @@ -2590,13 +2619,17 @@ struct ccupdate_struct { struct array_cache *new[NR_CPUS]; }; +/* + * Executes in IRQ context: + */ static void do_ccupdate_local(void *info) { struct ccupdate_struct *new = (struct ccupdate_struct *)info; struct array_cache *old; +// WARN_ON(!in_interrupt()); check_irq_off(); - old = ac_data(new->cachep); + old = ac_data(new->cachep, smp_processor_id()); new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; new->new[smp_processor_id()] = old; @@ -2704,6 +2737,10 @@ static void enable_cpucache (kmem_cache_ if 
(limit > 32) limit = 32; #endif +#ifdef CONFIG_PREEMPT + if (limit > 16) + limit = 16; +#endif err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", @@ -2743,11 +2780,12 @@ static void drain_array_locked(kmem_cach */ static void cache_reap(void *unused) { + int cpu = _smp_processor_id(); struct list_head *walk; if (down_trylock(&cache_chain_sem)) { /* Give up. Setup the next iteration. */ - schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); + schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + cpu); return; } @@ -2766,7 +2804,7 @@ static void cache_reap(void *unused) spin_lock_irq(&searchp->spinlock); - drain_array_locked(searchp, ac_data(searchp), 0); + drain_array_locked(searchp, ac_data(searchp, cpu), 0); if(time_after(searchp->lists.next_reap, jiffies)) goto next_unlock; @@ -2810,7 +2848,7 @@ next: check_irq_on(); up(&cache_chain_sem); /* Setup the next iteration */ - schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); + schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC+cpu); } #ifdef CONFIG_PROC_FS @@ -3031,10 +3069,10 @@ unsigned int ksize(const void *objp) unsigned int size = 0; if (likely(objp != NULL)) { - local_irq_save(flags); + local_irq_save_nort(flags); c = GET_PAGE_CACHE(virt_to_page(objp)); size = kmem_cache_size(c); - local_irq_restore(flags); + local_irq_restore_nort(flags); } return size; --- linux/mm/highmem.c.orig +++ linux/mm/highmem.c @@ -240,11 +240,11 @@ static void bounce_copy_vec(struct bio_v unsigned long flags; unsigned char *vto; - local_irq_save(flags); + local_irq_save_nort(flags); vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); memcpy(vto + to->bv_offset, vfrom, to->bv_len); kunmap_atomic(vto, KM_BOUNCE_READ); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #else /* CONFIG_HIGHMEM */ --- linux/mm/page_alloc.c.orig +++ 
linux/mm/page_alloc.c @@ -442,6 +442,7 @@ static struct page *__rmqueue(struct zon return NULL; } +#if !defined(CONFIG_PREEMPT_RT) /* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. @@ -466,6 +467,7 @@ static int rmqueue_bulk(struct zone *zon spin_unlock_irqrestore(&zone->lock, flags); return allocated; } +#endif #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) @@ -550,6 +552,7 @@ static void zone_statistics(struct zonel #endif } +#if !defined(CONFIG_PREEMPT_RT) /* * Free a 0-order page */ @@ -577,15 +580,32 @@ static void fastcall free_hot_cold_page( local_irq_restore(flags); put_cpu(); } +#endif +/* + * On PREEMPT_RT we use a simple solution for the time being, + * per-CPU allocation is disabled. + */ void fastcall free_hot_page(struct page *page) { +#if defined(CONFIG_PREEMPT_RT) + if (PageAnon(page)) + page->mapping = NULL; + __free_pages_ok(page, 0); +#else free_hot_cold_page(page, 0); +#endif } void fastcall free_cold_page(struct page *page) { +#ifdef CONFIG_PREEMPT_RT + if (PageAnon(page)) + page->mapping = NULL; + __free_pages_ok(page, 0); +#else free_hot_cold_page(page, 1); +#endif } static inline struct list_head *get_per_thread_pages(void) @@ -690,6 +710,7 @@ buffered_rmqueue(struct zone *zone, int { unsigned long flags; struct page *page = NULL; +#if !defined(CONFIG_PREEMPT_RT) int cold = !!(gfp_flags & __GFP_COLD); if (order == 0) { @@ -708,6 +729,7 @@ buffered_rmqueue(struct zone *zone, int local_irq_restore(flags); put_cpu(); } +#endif if (page == NULL) { spin_lock_irqsave(&zone->lock, flags); @@ -963,8 +985,15 @@ void __pagevec_free(struct pagevec *pvec { int i = pagevec_count(pvec); - while (--i >= 0) + while (--i >= 0) { +#if defined(CONFIG_PREEMPT_RT) + if (PageAnon(pvec->pages[i])) + pvec->pages[i]->mapping = NULL; + __free_pages_ok(pvec->pages[i], 0); +#else free_hot_cold_page(pvec->pages[i], 
pvec->cold); +#endif + } } fastcall void __free_pages(struct page *page, unsigned int order) --- linux/mm/swap.c.orig +++ linux/mm/swap.c @@ -136,39 +136,45 @@ EXPORT_SYMBOL(mark_page_accessed); * lru_cache_add: add a page to the page lists * @page: the page to add */ -static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; -static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; +static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_add_pvecs) = { 0, }; +static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_add_active_pvecs) = { 0, }; void fastcall lru_cache_add(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + int cpu = _smp_processor_id(); + struct pagevec *pvec = &get_cpu_var_locked(lru_add_pvecs, cpu); page_cache_get(page); if (!pagevec_add(pvec, page)) __pagevec_lru_add(pvec); - put_cpu_var(lru_add_pvecs); + put_cpu_var_locked(lru_add_pvecs, cpu); } void fastcall lru_cache_add_active(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); + int cpu = _smp_processor_id(); + struct pagevec *pvec = &get_cpu_var_locked(lru_add_active_pvecs, cpu); page_cache_get(page); if (!pagevec_add(pvec, page)) __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_active_pvecs); + put_cpu_var_locked(lru_add_active_pvecs, cpu); } void lru_add_drain(void) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + int cpu = _smp_processor_id(); + struct pagevec *pvec; + pvec = &get_cpu_var_locked(lru_add_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &__get_cpu_var(lru_add_active_pvecs); + put_cpu_var_locked(lru_add_pvecs, cpu); + + pvec = &get_cpu_var_locked(lru_add_active_pvecs, cpu); if (pagevec_count(pvec)) __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_pvecs); + put_cpu_var_locked(lru_add_active_pvecs, cpu); } /* --- linux/mm/rmap.c.orig +++ linux/mm/rmap.c @@ -190,8 +190,8 @@ void __init anon_vma_init(void) */ static struct anon_vma *page_lock_anon_vma(struct page *page) { - 
struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; + struct anon_vma *anon_vma; rcu_read_lock(); anon_mapping = (unsigned long) page->mapping; @@ -201,10 +201,13 @@ static struct anon_vma *page_lock_anon_v goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + rcu_read_unlock(); // FIXME: hack spin_lock(&anon_vma->lock); + + return anon_vma; out: rcu_read_unlock(); - return anon_vma; + return NULL; } /* --- linux/mm/mmap.c.orig +++ linux/mm/mmap.c @@ -1512,7 +1512,7 @@ static void free_pgtables(struct mmu_gat unsigned long first = start & PGDIR_MASK; unsigned long last = end + PGDIR_SIZE - 1; unsigned long start_pml4_index, start_pgd_index; - struct mm_struct *mm = tlb->mm; + struct mm_struct *mm = tlb_mm(tlb); if (!prev) { prev = mm->mmap; --- linux/mm/memory.c.orig +++ linux/mm/memory.c @@ -115,7 +115,7 @@ static inline void free_one_pmd(struct m page = pmd_page(*dir); pmd_clear(dir); dec_page_state(nr_page_table_pages); - tlb->mm->nr_ptes--; + tlb_mm(tlb)->nr_ptes--; pte_free_tlb(tlb, page); } @@ -183,7 +183,7 @@ void clear_page_range(struct mmu_gather unsigned long end) { int i; - pml4_t *pml4 = tlb->mm->pml4; + pml4_t *pml4 = tlb_mm(tlb)->pml4; unsigned long next; for (i = pml4_index(addr); i <= pml4_index(end-1); i++) { @@ -523,10 +523,10 @@ static void zap_pte_range(struct mmu_gat if (pte_dirty(pte)) set_page_dirty(page); if (PageAnon(page)) - tlb->mm->anon_rss--; + tlb_mm(tlb)->anon_rss--; else if (pte_young(pte)) mark_page_accessed(page); - tlb->freed++; + tlb_free(tlb); page_remove_rmap(page); tlb_remove_page(tlb, page); continue; --- linux/kernel/rt.c.orig +++ linux/kernel/rt.c @@ -0,0 +1,1689 @@ +/* + * kernel/rt.c + * + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar + * + * lock debugging, locking tree, deadlock detection: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). 
+ * + * Includes portions of the generic R/W semaphore implementation from: + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli + * - Derived also from comments by Linus + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This flag is good for debugging the PI code - it makes all tasks + * in the system fall under PI handling. Normally only SCHED_FIFO/RR + * tasks are PI-handled: + */ +//#define ALL_TASKS_PI + +/* + * We need a global lock for priority inheritance handling. + * This is only for the slow path, but still, we might want + * to optimize it later to be more scalable. + */ +static __cacheline_aligned_in_smp raw_spinlock_t pi_lock = + RAW_SPIN_LOCK_UNLOCKED; + +#ifdef CONFIG_RT_DEADLOCK_DETECT +/* + * We need a global lock when we walk through the multi-process + * lock tree... + */ +static raw_spinlock_t trace_lock = RAW_SPIN_LOCK_UNLOCKED; + +static LIST_HEAD(held_locks); + +/* + * deadlock detection flag. 
We turn it off when we detect + * the first problem because we dont want to recurse back + * into the tracing code when doing error printk or + * executing a BUG(): + */ +static int trace_on = 1; + +void deadlock_trace_off(void) +{ + trace_on = 0; +} + +#define trace_lock_irq(lock) \ + do { \ + local_irq_disable(); \ + if (trace_on) \ + spin_lock(lock); \ + } while (0) + +#define trace_unlock(lock) \ + do { \ + if (trace_on) \ + spin_unlock(lock); \ + } while (0) + +#define trace_unlock_irq(lock) \ + do { \ + if (trace_on) \ + spin_unlock(lock); \ + local_irq_enable(); \ + preempt_check_resched(); \ + } while (0) + +#define trace_lock_irqsave(lock, flags) \ + do { \ + local_irq_save(flags); \ + if (trace_on) \ + spin_lock(lock); \ + } while (0) + +#define trace_unlock_irqrestore(lock, flags) \ + do { \ + if (trace_on) \ + spin_unlock(lock); \ + local_irq_restore(flags); \ + preempt_check_resched(); \ + } while (0) + +#define TRACE_OFF() \ +do { \ + if (trace_on) { \ + trace_on = 0; \ + console_verbose(); \ + spin_unlock(&trace_lock); \ + } \ +} while (0) + +#define TRACE_BUG() \ +do { \ + TRACE_OFF(); \ + BUG(); \ +} while (0) + +#define TRACE_WARN_ON(c) \ +do { \ + if (c) { \ + TRACE_OFF(); \ + WARN_ON(1); \ + } \ +} while (0) + +#else +# define trace_lock_irq(lock) local_irq_disable() +# define trace_lock_irqsave(lock, flags) local_irq_save(flags) +# define trace_unlock(lock) do { } while (0) + +# define trace_unlock_irq(lock) \ + do { local_irq_enable(); preempt_check_resched(); } while (0) + +# define trace_unlock_irqrestore(lock, flags) \ + do { local_irq_restore(flags); preempt_check_resched(); } while (0) + +# define TRACE_BUG() do { } while (0) +# define TRACE_WARN_ON(c) do { } while (0) +# define TRACE_OFF() do { } while (0) +#endif /* CONFIG_RT_DEADLOCK_DETECT */ + +#define TRACE_BUG_ON(c) do { if (c) TRACE_BUG(); } while (0) + +/* + * Unlock these on crash: + */ +void zap_rt_locks(void) +{ + spin_lock_init(&pi_lock); +#ifdef CONFIG_RT_DEADLOCK_DETECT + 
spin_lock_init(&trace_lock); +#endif +} + +#ifdef CONFIG_RT_DEADLOCK_DETECT + +static void printk_task(struct task_struct *p) +{ + if (p) + printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); + else + printk(""); +} + +static void printk_task_short(struct task_struct *p) +{ + if (p) + printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); + else + printk(""); +} + +static void printk_lock(struct rt_mutex *lock, int print_owner) +{ + if (lock->name) + printk(" [%p] {%s}\n", + lock, lock->name); + else + printk(" [%p] {%s:%d}\n", + lock, lock->file, lock->line); + + if (print_owner && lock->owner) { + printk(".. held by: "); + printk_task(lock->owner); + printk("\n"); + } + if (lock->owner) { + printk("... acquired at: "); + print_symbol("%s\n", lock->acquire_eip); + } +} + +static void printk_waiter(struct rt_mutex_waiter *w) +{ + printk("-------------------------\n"); + printk("| waiter struct %p:\n", w); + printk("| w->task:\n"); + printk_task(w->task); + printk("\n| lock:\n"); + printk_lock(w->lock, 1); + printk("| blocked at: "); + print_symbol("%s\n", w->eip); + printk("-------------------------\n"); +} + +static void show_task_locks(struct task_struct *p) +{ + switch (p->state) { + case TASK_RUNNING: printk("R"); break; + case TASK_INTERRUPTIBLE: printk("s"); break; + case TASK_UNINTERRUPTIBLE: printk("D"); break; + case TASK_STOPPED: printk("T"); break; + case EXIT_ZOMBIE: printk("Z"); break; + case EXIT_DEAD: printk("X"); break; + default: printk("?"); break; + } + printk_task(p); + if (p->blocked_on) { + struct rt_mutex *lock = p->blocked_on->lock; + + printk(" blocked on:"); + printk_lock(lock, 1); + } else + printk(" (not blocked)\n"); +} + +static void show_held_locks(struct task_struct *filter) +{ + struct list_head *curr, *cursor = NULL; + struct rt_mutex *lock; + struct task_struct *p; + unsigned long flags; + int count = 0; + + printk("\n"); + if (filter) { + printk("------------------------------\n"); + printk("| showing all locks held by: | 
("); + printk_task_short(filter); + printk("):\n"); + printk("------------------------------\n"); + } else { + printk("---------------------------\n"); + printk("| showing all locks held: |\n"); + printk("---------------------------\n"); + } + + /* + * Play safe and acquire the global trace lock. We + * cannot printk with that lock held so we iterate + * very carefully: + */ +next: + trace_lock_irqsave(&trace_lock, flags); + list_for_each(curr, &held_locks) { + if (cursor && curr != cursor) + continue; + lock = list_entry(curr, struct rt_mutex, held_list); + p = lock->owner; + if (filter && (p != filter)) + continue; + count++; + cursor = curr->next; + trace_unlock_irqrestore(&trace_lock, flags); + + printk("\n#%03d: ", count); + printk_lock(lock, filter ? 0 : 1); + goto next; + } + trace_unlock_irqrestore(&trace_lock, flags); +} + +void show_all_locks(void) +{ + struct task_struct *g, *p; + int count = 10; + int unlock = 1; + + printk("\nshowing all tasks:\n"); + + /* + * Here we try to get the tasklist_lock as hard as possible, + * if not successful after 2 seconds we ignore it (but keep + * trying). This is to enable a debug printout even if a + * tasklist_lock-holding task deadlocks or crashes. + */ +retry: + if (!read_trylock(&tasklist_lock)) { + if (count == 10) + printk("hm, tasklist_lock locked, retrying... 
"); + if (count) { + count--; + printk(" #%d", 10-count); + mdelay(200); + goto retry; + } + printk(" ignoring it.\n"); + unlock = 0; + } + if (count != 10) + printk(" locked it.\n"); + + do_each_thread(g, p) { + show_task_locks(p); + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; + } while_each_thread(g, p); + + show_held_locks(NULL); + printk("=============================================\n\n"); + + if (unlock) + read_unlock(&tasklist_lock); +} + +static int check_deadlock(struct rt_mutex *lock, int recursive, + unsigned long eip) +{ + struct rt_mutex *lockblk; + struct task_struct *task; + + if (!trace_on) + return 0; + /* + * Special-case: the BKL self-releases at schedule() + * time so it can never deadlock: + */ + if (lock == &kernel_sem.lock) + return 0; + task = lock->owner; + if (!task) + return 0; + lockblk = NULL; + if (task->blocked_on) + lockblk = task->blocked_on->lock; + if (current == task) { + TRACE_OFF(); + if (recursive) + return 1; + printk("\n==========================================\n"); + printk( "[ BUG: lock recursion deadlock detected! |\n"); + printk( "------------------------------------------\n"); + printk("already locked: "); + printk_lock(lock, 1); + show_held_locks(task); + printk("\n-{current task's backtrace}----------------->\n"); + dump_stack(); + show_all_locks(); + printk("[ turning off deadlock detection. Please report this trace. ]\n\n"); + local_irq_disable(); + return 0; + } + /* + * Skip the BKL: + */ + if (lockblk == &kernel_sem.lock) + return 0; + if (lockblk && check_deadlock(lockblk, 1, eip)) { + printk("\n============================================\n"); + printk( "[ BUG: circular locking deadlock detected! 
]\n"); + printk( "--------------------------------------------\n"); + printk("%s/%d is deadlocking current task %s/%d\n\n", + task->comm, task->pid, current->comm, current->pid); + printk("\n1) %s/%d is trying to acquire this lock:\n", + current->comm, current->pid); + printk_lock(lock, 1); + + printk("... trying at: "); + print_symbol("%s\n", eip); + + printk("\n2) %s/%d is blocked on this lock:\n", + task->comm, task->pid); + printk_lock(lockblk, 1); + + show_held_locks(current); + show_held_locks(task); + + printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); + show_stack(task, NULL); + printk("\n%s/%d's [current] stackdump:\n\n", + current->comm, current->pid); + dump_stack(); + show_all_locks(); + printk("[ turning off deadlock detection. Please report this trace. ]\n\n"); + local_irq_disable(); + return 0; + } + return 0; +} + +void check_no_held_locks(struct task_struct *task) +{ + struct list_head *curr, *next, *cursor = NULL; + struct rt_mutex *lock; + struct rt_mutex_waiter *w; + struct task_struct *p; + unsigned long flags; + + if (!trace_on) + return; +restart: + trace_lock_irqsave(&trace_lock, flags); + list_for_each_safe(curr, next, &held_locks) { + if (cursor && curr != cursor) + continue; + lock = list_entry(curr, struct rt_mutex, held_list); + p = lock->owner; + if (p != task) + continue; + cursor = next; + list_del_init(curr); + trace_unlock_irqrestore(&trace_lock, flags); + + if (lock == &kernel_sem.lock) { + printk("BUG: %s/%d, BKL held at task exit time!\n", + current->comm, current->pid); + printk("BKL acquired at: "); + print_symbol("%s\n", + (unsigned long) current->last_kernel_lock); + } else + printk("BUG: %s/%d, lock held at task exit time!\n", + current->comm, current->pid); + printk_lock(lock, 1); + if (lock->owner != task) + printk("exiting task is not even the owner??\n"); + goto restart; + } + spin_lock(&pi_lock); + list_for_each(curr, &task->pi_waiters) { + w = list_entry(curr, struct rt_mutex_waiter, pi_list); + 
TRACE_OFF(); + spin_unlock(&pi_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + printk("hm, PI interest held at exit time? Task:\n"); + printk_task(task); + printk_waiter(w); + return; + } + spin_unlock(&pi_lock); + trace_unlock_irqrestore(&trace_lock, flags); +} + +#endif + +#if defined(ALL_TASKS_PI) && defined(CONFIG_RT_DEADLOCK_DETECT) + +static void +check_pi_list_present(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, + struct task_struct *old_owner) +{ + struct rt_mutex_waiter *w; + struct list_head *curr; + + TRACE_WARN_ON(list_empty(&waiter->pi_list)); + TRACE_WARN_ON(lock->owner); + + list_for_each(curr, &old_owner->pi_waiters) { + w = list_entry(curr, struct rt_mutex_waiter, pi_list); + if (w == waiter) + goto ok; + } + TRACE_WARN_ON(1); +ok: +} + +static void +check_pi_list_empty(struct rt_mutex *lock, struct task_struct *old_owner) +{ + struct rt_mutex_waiter *w; + struct list_head *curr; + + list_for_each(curr, &old_owner->pi_waiters) { + w = list_entry(curr, struct rt_mutex_waiter, pi_list); + if (w->lock == lock) { + TRACE_OFF(); + printk("hm, PI interest but no waiter? 
Old owner:\n"); + printk_waiter(w); + printk("\n"); + TRACE_WARN_ON(1); + return; + } + } +} + +#else + +static inline void +check_pi_list_present(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, + struct task_struct *old_owner) +{ +} + +static inline void +check_pi_list_empty(struct rt_mutex *lock, struct task_struct *old_owner) +{ +} + +#endif + +/* + * Move PI waiters of this lock to the new owner: + */ +static void +change_owner(struct rt_mutex *lock, struct task_struct *old_owner, + struct task_struct *new_owner) +{ + struct list_head *curr, *next; + struct rt_mutex_waiter *w; + int requeued = 0, sum = 0; + + if (old_owner == new_owner) + return; + list_for_each_safe(curr, next, &old_owner->pi_waiters) { + w = list_entry(curr, struct rt_mutex_waiter, pi_list); + if (w->lock == lock) { + list_del_init(curr); + list_add_tail(curr, &new_owner->pi_waiters); + requeued++; + } + sum++; + } + trace_special(sum, requeued, 0); +} + +int pi_walk, pi_null, pi_prio; + +static void pi_setprio(struct rt_mutex *lock, struct task_struct *p, int prio) +{ + if (unlikely(!p->pid)) { + pi_null++; + return; + } + +#ifdef CONFIG_RT_DEADLOCK_DETECT + pi_prio++; + if (p->policy != SCHED_NORMAL && prio > mutex_getprio(p)) { + TRACE_OFF(); + + printk("huh? 
(%d->%d??)\n", p->prio, prio); + printk("owner:\n"); + printk_task(p); + printk("\ncurrent:\n"); + printk_task(current); + printk("\nlock:\n"); + printk_lock(lock, 1); + dump_stack(); + local_irq_disable(); + } +#endif + /* + * If the task is blocked on some other task then boost that + * other task (or tasks) too: + */ + for (;;) { + struct rt_mutex_waiter *w = p->blocked_on; + int was_rt = rt_task(p); + + mutex_setprio(p, prio); + if (!w) + break; + /* + * If the task is blocked on a lock, and we just made + * it RT, then register the task in the PI list and + * requeue it to the head of the wait list: + */ + lock = w->lock; + TRACE_BUG_ON(!lock); + TRACE_BUG_ON(!lock->owner); + if (rt_task(p) && list_empty(&w->pi_list)) { + TRACE_BUG_ON(was_rt); + list_add_tail(&w->pi_list, &lock->owner->pi_waiters); + list_del(&w->list); + list_add(&w->list, &lock->wait_list); + } + /* + * If the task is blocked on a lock, and we just restored + * it from RT to non-RT then unregister the task from + * the PI list and requeue it to the tail of the wait + * list: + * + * (TODO: this can be unfair to SCHED_NORMAL tasks if they + * get PI handled.) 
+ */ + if (!rt_task(p) && !list_empty(&w->pi_list)) { + TRACE_BUG_ON(!was_rt); + list_del(&w->pi_list); + list_del(&w->list); + list_add_tail(&w->list, &lock->wait_list); + } + + pi_walk++; + + p = lock->owner; + TRACE_BUG_ON(!p); + /* + * If the dependee is already higher-prio then + * no need to boost it, and all further tasks down + * the dependency chain are already boosted: + */ + if (p->prio <= prio) + break; + } +} + +static void +task_blocks_on_lock(struct rt_mutex_waiter *waiter, struct task_struct *task, + struct rt_mutex *lock, unsigned long eip) +{ +#ifdef CONFIG_RT_DEADLOCK_DETECT + if (lock->debug) + check_deadlock(lock, 0, eip); + /* mark the current thread as blocked on the lock */ + waiter->eip = eip; +#endif + task->blocked_on = waiter; + waiter->lock = lock; + waiter->task = task; + INIT_LIST_HEAD(&waiter->pi_list); + /* + * Add SCHED_NORMAL tasks to the end of the waitqueue (FIFO): + */ +#ifndef ALL_TASKS_PI + if (!rt_task(task)) { + list_add_tail(&waiter->list, &lock->wait_list); + return; + } +#endif + spin_lock(&pi_lock); + list_add_tail(&waiter->pi_list, &lock->owner->pi_waiters); + /* + * Add RT tasks to the head: + */ + list_add(&waiter->list, &lock->wait_list); + /* + * If the waiter has higher priority than the owner + * then temporarily boost the owner: + */ + if (task->prio < lock->owner->prio) + pi_setprio(lock, lock->owner, task->prio); + spin_unlock(&pi_lock); +} + +/* + * initialise the lock: + */ +static void __init_rt_mutex(struct rt_mutex *lock, int save_state, int debug, + char *name, char *file, int line) +{ + lock->owner = NULL; + spin_lock_init(&lock->wait_lock); + INIT_LIST_HEAD(&lock->wait_list); +#ifdef CONFIG_RT_DEADLOCK_DETECT + lock->save_state = save_state; + lock->debug = debug; + INIT_LIST_HEAD(&lock->held_list); + lock->name = name; + lock->file = file; + lock->line = line; +#endif +} + +void fastcall __init_rwsem(struct rw_semaphore *rwsem, int save_state, + int debug, char *name, char *file, int line) +{ + 
__init_rt_mutex(&rwsem->lock, save_state, debug, name, file, line); + rwsem->read_depth = 0; +} +EXPORT_SYMBOL(__init_rwsem); + +static void set_new_owner(struct rt_mutex *lock, struct task_struct *old_owner, + struct task_struct *new_owner, unsigned long eip) +{ + if (new_owner) + trace_special_pid(new_owner->pid, new_owner->prio, 0); + if (old_owner) + change_owner(lock, old_owner, new_owner); + lock->owner = new_owner; + lock->owner_prio = new_owner->prio; +#ifdef CONFIG_RT_DEADLOCK_DETECT + if (lock->debug) { + TRACE_WARN_ON(!list_empty(&lock->held_list)); + list_add_tail(&lock->held_list, &held_locks); + } + lock->acquire_eip = eip; +#endif +} + +/* + * handle the lock release when processes blocked on it that can now run + * - the spinlock must be held by the caller + */ +static inline struct task_struct * pick_new_owner(struct rt_mutex *lock, + struct task_struct *old_owner, int save_state, + unsigned long eip) +{ + struct rt_mutex_waiter *w, *waiter = NULL; + struct task_struct *new_owner; + struct list_head *curr; + + /* + * Get the highest prio one: + * + * (same-prio RT tasks go FIFO) + */ + list_for_each(curr, &lock->wait_list) { + w = list_entry(curr, struct rt_mutex_waiter, list); + trace_special_pid(w->task->pid, w->task->prio, 0); + /* + * Break out upon meeting the first non-RT-prio + * task - we inserted them to the tail, so if we + * see the first one the rest is SCHED_NORMAL too: + */ + if (!rt_task(w->task)) + break; + if (!waiter || w->task->prio <= waiter->task->prio) + waiter = w; + } + + /* + * If no RT waiter then pick the first one: + */ + if (!waiter) + waiter = list_entry(lock->wait_list.next, + struct rt_mutex_waiter, list); + trace_special_pid(waiter->task->pid, waiter->task->prio, 0); + +#ifdef ALL_TASKS_PI + check_pi_list_present(lock, waiter, old_owner); +#endif + new_owner = waiter->task; + list_del_init(&waiter->list); + + list_del_init(&waiter->pi_list); + + set_new_owner(lock, old_owner, new_owner, eip); + /* Don't touch waiter 
after ->task has been NULLed */ + mb(); + waiter->task = NULL; + new_owner->blocked_on = NULL; + TRACE_WARN_ON(save_state != lock->save_state); + + return new_owner; +} + +static inline void init_lists(struct rt_mutex *lock) +{ + // we have to do this until the static initializers get fixed: + if (!lock->wait_list.prev && !lock->wait_list.next) + INIT_LIST_HEAD(&lock->wait_list); +#ifdef CONFIG_RT_DEADLOCK_DETECT + if (!lock->held_list.prev && !lock->held_list.next) + INIT_LIST_HEAD(&lock->held_list); +#endif +} + +/* + * lock it semaphore-style: no worries about missed wakeups. + */ +static void __sched __down(struct rt_mutex *lock, unsigned long eip) +{ + struct task_struct *task = current; + unsigned long flags, nosched_flag; + struct rt_mutex_waiter waiter; + + trace_lock_irqsave(&trace_lock, flags); + TRACE_BUG_ON(!irqs_disabled()); + spin_lock(&lock->wait_lock); + + init_lists(lock); + + if (!lock->owner) { + /* granted */ + TRACE_WARN_ON(!list_empty(&lock->wait_list)); + spin_lock(&pi_lock); + set_new_owner(lock, NULL, task, eip); + spin_unlock(&pi_lock); + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + return; + } + + set_task_state(task, TASK_UNINTERRUPTIBLE); + + task_blocks_on_lock(&waiter, task, lock, eip); + + TRACE_BUG_ON(!irqs_disabled()); + /* we don't need to touch the lock struct anymore */ + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + might_sleep(); + + nosched_flag = current->flags & PF_NOSCHED; +#ifdef CONFIG_RT_DEADLOCK_DETECT + if (!lock->debug) +#endif + current->flags &= ~PF_NOSCHED; + + /* wait to be given the lock */ + for (;;) { + if (!waiter.task) + break; + schedule(); + set_task_state(task, TASK_UNINTERRUPTIBLE); + } + current->flags |= nosched_flag; + task->state = TASK_RUNNING; +} + +/* + * get a write lock on the rw-semaphore + */ +void fastcall __sched down_write(struct rw_semaphore *rwsem) +{ + __down(&rwsem->lock, CALLER_ADDR0); +} 
+EXPORT_SYMBOL(down_write); + +/* + * get a read lock on the rw-semaphore + */ +void fastcall __sched down_read(struct rw_semaphore *rwsem) +{ + /* + * Read locks within the write lock succeed. + */ + if (rwsem->lock.owner == current) { + rwsem->read_depth++; + return; + } + return __down(&rwsem->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(down_read); + +/* + * lock it mutex-style: this variant is very careful not to + * miss any non-mutex wakeups. + * + * The wakeup side uses wake_up_process_mutex, which, combined with + * the xchg code of this function is a transparent sleep/wakeup + * mechanism nested within any existing sleep/wakeup mechanism. This + * enables the seemless use of arbitrary (blocking) spinlocks within + * sleep/wakeup event loops. + */ +static void __sched __down_mutex(struct rt_mutex *lock, unsigned long eip) +{ + unsigned long state, saved_state, nosched_flag; + struct task_struct *task = current; + struct rt_mutex_waiter waiter; + int got_wakeup = 0; + + might_sleep(); + + trace_lock_irq(&trace_lock); + TRACE_BUG_ON(!irqs_disabled()); + spin_lock(&lock->wait_lock); + + init_lists(lock); + + if (!lock->owner) { + /* granted */ + TRACE_WARN_ON(!list_empty(&lock->wait_list)); + spin_lock(&pi_lock); + set_new_owner(lock, NULL, task, eip); + spin_unlock(&pi_lock); + spin_unlock(&lock->wait_lock); + trace_unlock_irq(&trace_lock); + + return; + } + + task_blocks_on_lock(&waiter, task, lock, eip); + + TRACE_BUG_ON(!irqs_disabled()); + /* + * Here we save whatever state the task was in originally, + * we'll restore it at the end of the function and we'll + * take any intermediate wakeup into account as well, + * independently of the mutex sleep/wakeup mechanism: + */ + saved_state = xchg(&task->state, TASK_UNINTERRUPTIBLE); + + /* we don't need to touch the lock struct anymore */ + spin_unlock(&lock->wait_lock); + trace_unlock(&trace_lock); + + nosched_flag = current->flags & PF_NOSCHED; +#ifdef CONFIG_RT_DEADLOCK_DETECT + if (!lock->debug) +#endif + 
current->flags &= ~PF_NOSCHED; + + /* wait to be given the lock */ + for (;;) { + unsigned long saved_flags = current->flags & PF_NOSCHED; + + if (!waiter.task) + break; + local_irq_enable(); + current->flags &= ~PF_NOSCHED; + schedule(); + current->flags |= saved_flags; + local_irq_disable(); + state = xchg(&task->state, TASK_UNINTERRUPTIBLE); + if (state == TASK_RUNNING) + got_wakeup = 1; + } + /* + * Only set the task's state to TASK_RUNNING if it got + * a non-mutex wakeup. We keep the original state otherwise. + * A mutex wakeup changes the task's state to TASK_RUNNING_MUTEX, + * not TASK_RUNNING - hence we can differenciate between the two + * cases: + */ + state = xchg(&task->state, saved_state); + if (state == TASK_RUNNING) + got_wakeup = 1; + if (got_wakeup) + task->state = TASK_RUNNING; + local_irq_enable(); + preempt_check_resched(); + + current->flags |= nosched_flag; +} + +/* + * TODO: push this into __down_mutex() + * + * BKL users expect the BKL to be held across spinlock/rwlock-acquire. + * Save and clear it, this will cause the scheduler to not drop the + * BKL semaphore if we end up scheduling: + */ +#define SAVE_BKL(ACTION) \ +{ \ + struct task_struct *task = current; \ + unsigned int saved_lock_depth; \ + \ + saved_lock_depth = task->lock_depth; \ + task->lock_depth = -1; \ + \ + might_sleep(); \ + ACTION; \ + \ + task->lock_depth = saved_lock_depth; \ +} + + +static void __sched down_write_mutex(struct rw_semaphore *rwsem, + unsigned long eip) +{ + SAVE_BKL(__down_mutex(&rwsem->lock, eip)); +} + +static void __sched down_read_mutex(struct rw_semaphore *rwsem, + unsigned long eip) +{ + /* + * Read locks within the write lock succeed. 
+ */ + if (rwsem->lock.owner == current) { + rwsem->read_depth++; + return; + } + SAVE_BKL(__down_mutex(&rwsem->lock, eip)); +} + +/* + * get a lock - interruptible + */ +static int __sched __down_interruptible(struct rt_mutex *lock, + unsigned long eip) +{ + struct task_struct *task = current; + unsigned long flags, nosched_flag; + struct rt_mutex_waiter waiter; + int ret; + + trace_lock_irqsave(&trace_lock, flags); + TRACE_BUG_ON(!irqs_disabled()); + spin_lock(&lock->wait_lock); + + init_lists(lock); + + if (!lock->owner) { + /* granted */ + TRACE_WARN_ON(!list_empty(&lock->wait_list)); + spin_lock(&pi_lock); + set_new_owner(lock, NULL, task, eip); + spin_unlock(&pi_lock); + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + return 0; + } + + set_task_state(task, TASK_INTERRUPTIBLE); + + task_blocks_on_lock(&waiter, task, lock, eip); + + TRACE_BUG_ON(!irqs_disabled()); + /* we don't need to touch the lock struct anymore */ + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + might_sleep(); + + nosched_flag = current->flags & PF_NOSCHED; +#ifdef CONFIG_RT_DEADLOCK_DETECT + if (!lock->debug) +#endif + current->flags &= ~PF_NOSCHED; + + ret = 0; + /* wait to be given the lock */ + for (;;) { + if (signal_pending(current)) { + /* + * Remove ourselves from the wait list if we + * didnt get the lock - else return success: + */ + trace_lock_irq(&trace_lock); + spin_lock(&lock->wait_lock); + if (waiter.task) { + list_del_init(&waiter.list); + /* + * Just remove ourselves from the PI list. + * (No big problem if our PI effect lingers + * a bit - owner will restore prio.) 
+ */ + spin_lock(&pi_lock); + list_del_init(&waiter.pi_list); + spin_unlock(&pi_lock); + ret = -EINTR; + } + spin_unlock(&lock->wait_lock); + trace_unlock_irq(&trace_lock); + break; + } + if (!waiter.task) + break; + schedule(); + set_task_state(task, TASK_INTERRUPTIBLE); + } + + task->state = TASK_RUNNING; + current->flags |= nosched_flag; + + return ret; +} + +int fastcall __sched down_write_interruptible(struct rw_semaphore *rwsem) +{ + return __down_interruptible(&rwsem->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(down_write_interruptible); + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +static int __down_trylock(struct rt_mutex *lock, unsigned long eip) +{ + struct task_struct *task = current; + unsigned long flags; + int ret = 0; + + trace_lock_irqsave(&trace_lock, flags); + TRACE_BUG_ON(!irqs_disabled()); + spin_lock(&lock->wait_lock); + + init_lists(lock); + + if (!lock->owner) { + /* granted */ + TRACE_WARN_ON(!list_empty(&lock->wait_list)); + spin_lock(&pi_lock); + set_new_owner(lock, NULL, task, eip); + spin_unlock(&pi_lock); + ret = 1; + } + + spin_unlock(&lock->wait_lock); + trace_unlock_irqrestore(&trace_lock, flags); + + return ret; +} + +int fastcall down_write_trylock(struct rw_semaphore *rwsem) +{ + return __down_trylock(&rwsem->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(down_write_trylock); + +/* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +int fastcall down_read_trylock(struct rw_semaphore *rwsem) +{ + /* + * Read locks within the self-held write lock succeed. + */ + if (rwsem->lock.owner == current) { + rwsem->read_depth++; + return 1; + } + return __down_trylock(&rwsem->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(down_read_trylock); + +static int down_write_trylock_mutex(struct rw_semaphore *rwsem) +{ + return __down_trylock(&rwsem->lock, CALLER_ADDR0); +} + +static int down_read_trylock_mutex(struct rw_semaphore *rwsem) +{ + /* + * Read locks within the self-held write lock succeed. 
+ */ + if (rwsem->lock.owner == current) { + rwsem->read_depth++; + return 1; + } + return __down_trylock(&rwsem->lock, CALLER_ADDR0); +} + +/* + * release the lock: + */ +static void __up_mutex(struct rt_mutex *lock, int save_state, unsigned long eip) +{ + struct task_struct *old_owner, *new_owner; + struct rt_mutex_waiter *w; + struct list_head *curr; + unsigned long flags; + int prio; + + TRACE_WARN_ON(save_state != lock->save_state); + + trace_lock_irqsave(&trace_lock, flags); + TRACE_BUG_ON(!irqs_disabled()); + spin_lock(&lock->wait_lock); + TRACE_BUG_ON(!lock->wait_list.prev && !lock->wait_list.next); + +#ifdef CONFIG_RT_DEADLOCK_DETECT + if (lock->debug) { + TRACE_WARN_ON(list_empty(&lock->held_list)); + list_del_init(&lock->held_list); + } +#endif + spin_lock(&pi_lock); + + old_owner = lock->owner; +#ifdef ALL_TASKS_PI + if (list_empty(&lock->wait_list)) + check_pi_list_empty(lock, old_owner); +#endif + lock->owner = NULL; + new_owner = NULL; + if (!list_empty(&lock->wait_list)) + new_owner = pick_new_owner(lock, old_owner, save_state, eip); + + /* + * If the owner got priority-boosted then restore it + * to the previous priority (or to the next highest prio + * waiter's priority): + */ + prio = mutex_getprio(old_owner); + list_for_each(curr, &old_owner->pi_waiters) { + w = list_entry(curr, struct rt_mutex_waiter, pi_list); + if (w->task->prio < prio) + prio = w->task->prio; + trace_special_pid(w->task->pid, w->task->prio, 0); + } + if (prio != old_owner->prio) + pi_setprio(lock, old_owner, prio); + if (new_owner) { + if (save_state) + wake_up_process_mutex(new_owner); + else + wake_up_process(new_owner); + } + spin_unlock(&pi_lock); + spin_unlock(&lock->wait_lock); + +#ifdef PREEMPT_DIRECT + trace_unlock(&trace_lock); + /* + * Common place where preemption is requested - if we can + * reschedule then do it here without enabling interrupts + * again (and lengthening latency): + */ + if (need_resched() && !irqs_disabled_flags(flags) && !preempt_count()) + 
preempt_schedule_irq(); + local_irq_restore(flags); +#else + trace_unlock_irqrestore(&trace_lock, flags); +#endif + /* no need to check for preempt here - we just handled it */ +} + +/* + * Do owner check too: + */ +void fastcall up_write(struct rw_semaphore *rwsem) +{ + WARN_ON(rwsem->lock.owner != current); + BUG_ON(rwsem->read_depth); + __up_mutex(&rwsem->lock, 0, CALLER_ADDR0); +} +EXPORT_SYMBOL(up_write); + +static void _up_write(struct rw_semaphore *rwsem, unsigned long eip) +{ + WARN_ON(rwsem->lock.owner != current); + BUG_ON(rwsem->read_depth); + __up_mutex(&rwsem->lock, 0, eip); +} + +void fastcall up_write_mutex(struct rw_semaphore *rwsem, unsigned long eip) +{ + TRACE_WARN_ON(rwsem->lock.save_state != 1); + WARN_ON(rwsem->lock.owner != current); + BUG_ON(rwsem->read_depth); + __up_mutex(&rwsem->lock, 1, eip); +} + +/* + * release a read lock on the semaphore + */ +void fastcall up_read(struct rw_semaphore *rwsem) +{ + /* + * Read locks within the self-held write lock succeed. + */ + if (rwsem->lock.owner == current && rwsem->read_depth) { + rwsem->read_depth--; + return; + } + return _up_write(rwsem, CALLER_ADDR0); +} +EXPORT_SYMBOL(up_read); + +void fastcall up_read_mutex(struct rw_semaphore *rwsem, unsigned long eip) +{ + TRACE_WARN_ON(rwsem->lock.save_state != 1); + /* + * Read locks within the self-held write lock succeed. 
+ */ + if (rwsem->lock.owner == current && rwsem->read_depth) { + rwsem->read_depth--; + return; + } + return up_write_mutex(rwsem, eip); +} + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void fastcall downgrade_write(struct rw_semaphore *rwsem) +{ + BUG(); +} +EXPORT_SYMBOL(downgrade_write); + +static int rt_mutex_is_locked(struct rt_mutex *lock) +{ + int ret; + + mb(); + ret = lock->owner != NULL; + + return ret; +} + +int fastcall rwsem_is_locked(struct rw_semaphore *rwsem) +{ + return rt_mutex_is_locked(&rwsem->lock); +} +EXPORT_SYMBOL(rwsem_is_locked); + +static void _down_mutex(struct rt_mutex *lock, unsigned long eip) +{ + TRACE_WARN_ON(lock->save_state != 1); + __down_mutex(lock, eip); +} + +void fastcall __sema_init(struct semaphore *sem, int val, int debug, + char *name, char *file, int line) +{ + atomic_set(&sem->count, val); + switch (val) { + case 0: + __init_rt_mutex(&sem->lock, 0, debug, name, file, line); + __down(&sem->lock, CALLER_ADDR0); + break; + default: + __init_rt_mutex(&sem->lock, 0, debug, name, file, line); + break; + } +} +EXPORT_SYMBOL(__sema_init); + +void fastcall __init_MUTEX(struct semaphore *sem, char *name, char *file, + int line) +{ + __sema_init(sem, 1, 1, name, file, line); +} +EXPORT_SYMBOL(__init_MUTEX); + +/* + * We initialize them to nodebug because mutexes that are initialized + * locked are almost always used for completion purposes, not genuine + * locking: + */ +void fastcall __init_MUTEX_LOCKED(struct semaphore *sem, char *name, + char *file, int line) +{ + __sema_init(sem, 0, 0, name, file, line); +} +EXPORT_SYMBOL(__init_MUTEX_LOCKED); + +static int down_trylock_mutex(struct rt_mutex *lock, unsigned long eip) +{ + TRACE_WARN_ON(lock->save_state != 1); + return __down_trylock(lock, eip); +} + +void fastcall up_mutex(struct rt_mutex *lock, unsigned long eip) +{ + TRACE_WARN_ON(lock->save_state != 1); + WARN_ON(lock->owner != current); + __up_mutex(lock, 
1, eip); +} + +/* + * Linux Semaphores implemented via RT-mutexes. + * + * In the down() variants we use the mutex as the semaphore blocking + * object: we always acquire it, decrease the counter and keep the lock + * locked if we did the 1->0 transition. The next down() will then block. + * + * In the up() path we atomically increase the counter and do the + * unlock if we were the one doing the 0->1 transition. + */ + +static inline void __down_complete(struct semaphore *sem, unsigned long eip) +{ + int count = atomic_dec_return(&sem->count); + + TRACE_WARN_ON(sem->lock.save_state != 0); + WARN_ON(count < 0); + + if (count > 0) + __up_mutex(&sem->lock, 0, eip); +} + +void fastcall down(struct semaphore *sem) +{ + TRACE_WARN_ON(sem->lock.save_state != 0); + __down(&sem->lock, CALLER_ADDR0); + __down_complete(sem, CALLER_ADDR0); +} +EXPORT_SYMBOL(down); + +int fastcall down_interruptible(struct semaphore *sem) +{ + int ret; + + TRACE_WARN_ON(sem->lock.save_state != 0); + ret = __down_interruptible(&sem->lock, CALLER_ADDR0); + if (ret) + return ret; + __down_complete(sem, CALLER_ADDR0); + return 0; +} +EXPORT_SYMBOL(down_interruptible); + +/* + * try to down the semaphore, 0 on success and 1 on failure. (inverted) + */ +int fastcall down_trylock(struct semaphore *sem) +{ + TRACE_WARN_ON(sem->lock.save_state != 0); + /* + * Here we are a tiny bit different from ordinary Linux semaphores, + * because we can get 'transient' locking-failures when say a + * process decreases the count from 9 to 8 and locks/releases the + * embedded mutex internally. 
It would be quite complex to remove + * these transient failures so lets try it the simple way first: + */ + if (__down_trylock(&sem->lock, CALLER_ADDR0)) { + __down_complete(sem, CALLER_ADDR0); + return 0; + } + return 1; +} +EXPORT_SYMBOL(down_trylock); + +void fastcall up(struct semaphore *sem) +{ + int count; + + TRACE_WARN_ON(sem->lock.save_state != 0); + /* + * Disable preemption to make sure a highprio trylock-er cannot + * preempt us here and get into an infinite loop: + */ + preempt_disable(); + count = atomic_inc_return(&sem->count); + /* + * If we did the 0 -> 1 transition then we are the ones to unlock it: + */ + if (count == 1) + __up_mutex(&sem->lock, 0, CALLER_ADDR0); + preempt_enable(); +} +EXPORT_SYMBOL(up); + +int fastcall sem_is_locked(struct semaphore *sem) +{ + TRACE_WARN_ON(sem->lock.save_state != 0); + return rt_mutex_is_locked(&sem->lock); +} +EXPORT_SYMBOL(sem_is_locked); + +int fastcall sema_count(struct semaphore *sem) +{ + TRACE_WARN_ON(sem->lock.save_state != 0); + return atomic_read(&sem->count); +} +EXPORT_SYMBOL(sema_count); + +/* + * Spinlock wrappers: + */ + +static void __spin_lock(spinlock_t *lock, unsigned long eip) +{ + SAVE_BKL(_down_mutex(&lock->lock, eip)); +} + +void _spin_lock(spinlock_t *spin) +{ + __spin_lock(spin, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_lock); + +void _spin_lock_bh(spinlock_t *spin) +{ + __spin_lock(spin, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_lock_bh); + +void _spin_lock_irq(spinlock_t *spin) +{ + __spin_lock(spin, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_lock_irq); + +unsigned long _spin_lock_irqsave(spinlock_t *spin) +{ + unsigned long flags; + + __spin_lock(spin, CALLER_ADDR0); + local_save_flags(flags); + + return flags; +} +EXPORT_SYMBOL(_spin_lock_irqsave); + +void _spin_unlock(spinlock_t *lock) +{ + up_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_unlock); + +void _spin_unlock_wait(spinlock_t *lock) +{ + do { + barrier(); + } while (_spin_is_locked(lock)); +} 
+EXPORT_SYMBOL(_spin_unlock_wait); + +void _spin_unlock_bh(spinlock_t *lock) +{ + up_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_unlock_bh); + +void _spin_unlock_irq(spinlock_t *lock) +{ + up_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_unlock_irq); + +void _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +{ + up_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_unlock_irqrestore); + +int _spin_trylock(spinlock_t *lock) +{ + return down_trylock_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_trylock); + +int _spin_trylock_bh(spinlock_t *lock) +{ + return down_trylock_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_trylock_bh); + +int _spin_trylock_irq(spinlock_t *lock) +{ + return down_trylock_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_trylock_irq); + +int _spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) +{ + local_save_flags(*flags); + return down_trylock_mutex(&lock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_spin_trylock_irqsave); + +int _spin_is_locked(spinlock_t *lock) +{ + return rt_mutex_is_locked(&lock->lock); +} +EXPORT_SYMBOL(_spin_is_locked); + +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock) +{ + __spin_lock(lock, CALLER_ADDR0); + if (atomic_dec_and_test(atomic)) + return 1; + _spin_unlock(lock); + + return 0; +} +EXPORT_SYMBOL(atomic_dec_and_spin_lock); + +void _spin_lock_init(spinlock_t *lock, char *name, char *file, int line) +{ + __init_rt_mutex(&lock->lock, 1, 1, name, file, line); +} +EXPORT_SYMBOL(_spin_lock_init); + + +/* + * RW-lock wrappers: + */ +int _read_trylock(rwlock_t *rwlock) +{ + return down_read_trylock_mutex(&rwlock->lock); +} +EXPORT_SYMBOL(_read_trylock); + +int _write_trylock(rwlock_t *rwlock) +{ + return down_write_trylock_mutex(&rwlock->lock); +} +EXPORT_SYMBOL(_write_trylock); + +void _write_lock(rwlock_t *rwlock) +{ + down_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_lock); + +void 
_read_lock(rwlock_t *rwlock) +{ + down_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_lock); + +void _write_unlock(rwlock_t *rwlock) +{ + up_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_unlock); + +void _read_unlock(rwlock_t *rwlock) +{ + up_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_unlock); + +unsigned long _write_lock_irqsave(rwlock_t *rwlock) +{ + unsigned long flags; + + down_write_mutex(&rwlock->lock, CALLER_ADDR0); + + local_save_flags(flags); + return flags; +} +EXPORT_SYMBOL(_write_lock_irqsave); + +unsigned long _read_lock_irqsave(rwlock_t *rwlock) +{ + unsigned long flags; + + down_read_mutex(&rwlock->lock, CALLER_ADDR0); + + local_save_flags(flags); + return flags; +} +EXPORT_SYMBOL(_read_lock_irqsave); + +void _write_lock_irq(rwlock_t *rwlock) +{ + down_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_lock_irq); + +void _read_lock_irq(rwlock_t *rwlock) +{ + down_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_lock_irq); + +void _write_lock_bh(rwlock_t *rwlock) +{ + down_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_lock_bh); + +void _read_lock_bh(rwlock_t *rwlock) +{ + down_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_lock_bh); + +void _write_unlock_irq(rwlock_t *rwlock) +{ + up_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_unlock_irq); + +void _read_unlock_irq(rwlock_t *rwlock) +{ + up_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_unlock_irq); + +void _write_unlock_bh(rwlock_t *rwlock) +{ + up_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_unlock_bh); + +void _read_unlock_bh(rwlock_t *rwlock) +{ + up_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_unlock_bh); + +void _write_unlock_irqrestore(rwlock_t *rwlock, + unsigned long flags) +{ + up_write_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_write_unlock_irqrestore); + +void 
_read_unlock_irqrestore(rwlock_t *rwlock, unsigned long flags) +{ + up_read_mutex(&rwlock->lock, CALLER_ADDR0); +} +EXPORT_SYMBOL(_read_unlock_irqrestore); + +void _rwlock_init(rwlock_t *rwlock, char *name, char *file, int line) +{ + __init_rwsem(&rwlock->lock, 1, 1, name, file, line); +} +EXPORT_SYMBOL(_rwlock_init); + +int _rwlock_is_locked(rwlock_t *rwlock) +{ + return rwsem_is_locked(&rwlock->lock); +} +EXPORT_SYMBOL(_rwlock_is_locked); + --- linux/kernel/time.c.orig +++ linux/kernel/time.c @@ -98,6 +98,20 @@ asmlinkage long sys_stime(time_t __user asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz) { +#ifdef CONFIG_LATENCY_TRACE + if (!tv && ((long)tz == 1)) + return user_trace_start(); + if (!tv && !tz) + return user_trace_stop(); +#endif + if (((long)tv == 1) && ((long)tz == 1)) { + current->flags |= PF_NOSCHED; + return 0; + } + if (((long)tv == 1) && ((long)tz == 0)) { + current->flags &= ~PF_NOSCHED; + return 0; + } if (likely(tv != NULL)) { struct timeval ktv; do_gettimeofday(&ktv); --- linux/kernel/exit.c.orig +++ linux/kernel/exit.c @@ -47,8 +47,11 @@ static void __unhash_process(struct task if (thread_group_leader(p)) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); - if (p->pid) + if (p->pid) { + preempt_disable(); __get_cpu_var(process_counts)--; + preempt_enable(); + } } REMOVE_LINKS(p); @@ -372,8 +375,10 @@ static inline void close_files(struct fi while (set) { if (set & 1) { struct file * file = xchg(&files->fd[i], NULL); - if (file) + if (file) { filp_close(file, files); + cond_resched(); + } } i++; set >>= 1; @@ -503,9 +508,11 @@ static inline void __exit_mm(struct task if (mm != tsk->active_mm) BUG(); /* more a memory barrier than a real lock */ task_lock(tsk); + preempt_disable(); // FIXME tsk->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); + preempt_enable(); task_unlock(tsk); mmput(mm); } @@ -776,10 +783,6 @@ static void exit_notify(struct task_stru /* If the process is dead, 
release it - nobody will wait for it */ if (state == EXIT_DEAD) release_task(tsk); - - /* PF_DEAD causes final put_task_struct after we schedule. */ - preempt_disable(); - tsk->flags |= PF_DEAD; } fastcall NORET_TYPE void do_exit(long code) @@ -838,12 +841,18 @@ fastcall NORET_TYPE void do_exit(long co mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; #endif - - BUG_ON(!(current->flags & PF_DEAD)); - schedule(); - BUG(); - /* Avoid "noreturn function does return". */ - for (;;) ; + check_no_held_locks(tsk); + /* PF_DEAD causes final put_task_struct after we schedule. */ +again: + local_irq_disable(); + tsk->flags |= PF_DEAD; + __schedule(); + printk(KERN_ERR "BUG: dead task %s:%d back from the grave!\n", + current->comm, current->pid); + printk(KERN_ERR ".... flags: %08lx, count: %d, state: %08lx\n", + current->flags, atomic_read(&current->usage), current->state); + printk(KERN_ERR ".... trying again ...\n"); + goto again; } NORET_TYPE void complete_and_exit(struct completion *comp, long code) @@ -867,8 +876,21 @@ task_t fastcall *next_thread(const task_ if (!p->sighand) BUG(); if (!spin_is_locked(&p->sighand->siglock) && - !rwlock_is_locked(&tasklist_lock)) + !rwlock_is_locked(&tasklist_lock)) { +#ifdef CONFIG_PREEMPT_RT +#if 0 + printk("hm #1, siglock: %d. tasklist_lock: %d.\n", + atomic_read(&p->sighand->siglock.lock.count), + tasklist_lock.lock.activity); + spin_lock(&tasklist_lock.lock.wait_lock); + spin_unlock(&tasklist_lock.lock.wait_lock); + printk("hm #2, siglock: %d. 
tasklist_lock: %d.\n", + atomic_read(&p->sighand->siglock.lock.count), + tasklist_lock.lock.activity); +#endif +#endif BUG(); + } #endif return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID); } @@ -1348,6 +1370,7 @@ repeat: list_for_each(_p,&tsk->children) { p = list_entry(_p,struct task_struct,sibling); + BUG_ON(!atomic_read(&p->usage)); ret = eligible_child(pid, options, p); if (!ret) continue; --- linux/kernel/printk.c.orig +++ linux/kernel/printk.c @@ -78,7 +78,7 @@ static int console_locked; * It is also used in interesting ways to provide interlocking in * release_console_sem(). */ -static DEFINE_SPINLOCK(logbuf_lock); +static DEFINE_RAW_SPINLOCK(logbuf_lock); static char __log_buf[__LOG_BUF_LEN]; static char *log_buf = __log_buf; @@ -390,10 +390,12 @@ static void __call_console_drivers(unsig { struct console *con; + touch_critical_timing(); for (con = console_drivers; con; con = con->next) { if ((con->flags & CON_ENABLED) && con->write) con->write(con, &LOG_BUF(start), end - start); } + touch_critical_timing(); } /* @@ -497,6 +499,7 @@ static void zap_locks(void) spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ init_MUTEX(&console_sem); + zap_rt_locks(); } /* @@ -651,8 +654,17 @@ void release_console_sem(void) } console_locked = 0; console_may_schedule = 0; - up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); + up(&console_sem); + /* + * On PREEMPT_RT kernels __wake_up may sleep, so wake syslogd + * up only if we are in a preemptible section. We normally dont + * printk from non-preemptible sections so this is for the emergency + * case only. 
+ */ +#ifdef CONFIG_PREEMPT_RT + if (!in_atomic() && !irqs_disabled()) +#endif if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) wake_up_interruptible(&log_wait); } @@ -875,7 +887,7 @@ void tty_write_message(struct tty_struct */ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { - static DEFINE_SPINLOCK(ratelimit_lock); + static DEFINE_RAW_SPINLOCK(ratelimit_lock); static unsigned long toks = 10*5*HZ; static unsigned long last_msg; static int missed; --- linux/kernel/sched.c.orig +++ linux/kernel/sched.c @@ -4,6 +4,7 @@ * Kernel scheduler and related syscalls * * Copyright (C) 1991-2002 Linus Torvalds + * Copyright (C) 2004 Red Hat, Inc., Ingo Molnar * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe @@ -16,6 +17,7 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2004-10-13 Real-Time Preemption support by Ingo Molnar */ #include @@ -48,6 +50,7 @@ #include #include #include +#include #include #include @@ -186,6 +189,7 @@ static unsigned int task_timeslice(task_ typedef struct runqueue runqueue_t; struct prio_array { + runqueue_t *rq; unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; @@ -199,7 +203,7 @@ struct prio_array { * acquire operations must be ordered by ascending &runqueue. 
*/ struct runqueue { - spinlock_t lock; + raw_spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because @@ -207,6 +211,9 @@ struct runqueue { */ unsigned long nr_running; #ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT_RT + unsigned long rt_nr_running; +#endif unsigned long cpu_load; #endif unsigned long long nr_switches; @@ -291,12 +298,18 @@ static DEFINE_PER_CPU(struct runqueue, r #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#ifdef CONFIG_PREEMPT_RT +# ifdef prepare_arch_switch +# error FIXME +# endif +#endif + /* * Default context-switch locking: */ #ifndef prepare_arch_switch # define prepare_arch_switch(rq, next) do { } while (0) -# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) +# define _finish_arch_switch(rq, next) spin_unlock(&(rq)->lock) # define task_running(rq, p) ((rq)->curr == (p)) #endif @@ -563,6 +576,33 @@ static inline void sched_info_switch(tas #define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS */ +int rt_overload_schedule, rt_overload_wakeup, rt_overload_pulled; + +__cacheline_aligned_in_smp atomic_t rt_overload; + +static inline void inc_rt_tasks(task_t *p, runqueue_t *rq) +{ +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + if (rt_task(p)) { + rq->rt_nr_running++; + if (rq->rt_nr_running == 2) + atomic_inc(&rt_overload); + } +#endif +} + +static inline void dec_rt_tasks(task_t *p, runqueue_t *rq) +{ +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + if (rt_task(p)) { + WARN_ON(!rq->rt_nr_running); + rq->rt_nr_running--; + if (rq->rt_nr_running == 1) + atomic_dec(&rt_overload); + } +#endif +} + /* * Adding/removing a task to/from a priority array: */ @@ -572,15 +612,21 @@ static void dequeue_task(struct task_str list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); + dec_rt_tasks(p, array->rq); } static void enqueue_task(struct task_struct *p, prio_array_t *array) { + if (p->flags 
& PF_DEAD) { + printk("BUG: %s/%d: dead task enqueued!\n", p->comm, p->pid); + dump_stack(); + } sched_info_queued(p); list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; + inc_rt_tasks(p, array->rq); } /* @@ -619,13 +665,11 @@ static inline void enqueue_task_head(str * * Both properties are important to certain workloads. */ -static int effective_prio(task_t *p) + +static inline int __effective_prio(task_t *p) { int bonus, prio; - if (rt_task(p)) - return p->prio; - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; @@ -636,22 +680,52 @@ static int effective_prio(task_t *p) return prio; } +static int effective_prio(task_t *p) +{ + if (rt_task(p)) + return p->prio; + return __effective_prio(p); +} + +static inline void trace_start_sched_wakeup(task_t *p, runqueue_t *rq) +{ + if (TASK_PREEMPTS_CURR(p, rq) && (p != rq->curr)) + __trace_start_sched_wakeup(p); +} + /* * __activate_task - move a task to the runqueue. */ static inline void __activate_task(task_t *p, runqueue_t *rq) { + trace_special_pid(p->pid, p->prio, rq->nr_running); enqueue_task(p, rq->active); rq->nr_running++; } /* + * __activate_task_after - move a task to the runqueue, + * to execute after a specific task. + */ +static inline +void __activate_task_after(task_t *p, task_t *parent, runqueue_t *rq) +{ + // FIXME: to head rather? + list_add_tail(&p->run_list, &parent->run_list); + p->array = parent->array; + p->array->nr_active++; + rq->nr_running++; + inc_rt_tasks(p, rq); +} + +/* * __activate_idle_task - move idle task to the _front_ of runqueue. */ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { enqueue_task_head(p, rq->active); rq->nr_running++; + WARN_ON(rt_task(p)); } static void recalc_task_prio(task_t *p, unsigned long long now) @@ -984,7 +1058,7 @@ static inline int wake_idle(int cpu, tas * * returns failure only if the task is already active. 
*/ -static int try_to_wake_up(task_t * p, unsigned int state, int sync) +static int try_to_wake_up(task_t * p, unsigned int state, int sync, int mutex) { int cpu, this_cpu, success = 0; unsigned long flags; @@ -996,12 +1070,25 @@ static int try_to_wake_up(task_t * p, un int new_cpu; #endif +#ifdef CONFIG_PREEMPT_RT + /* + * sync wakeups can increase wakeup latencies: + */ + sync = 0; +#endif + rq = task_rq_lock(p, &flags); schedstat_inc(rq, ttwu_cnt); old_state = p->state; if (!(old_state & state)) goto out; + if (p->flags & PF_DEAD) { + printk("BUG: %s/%d: dead task woken up!\n", p->comm, p->pid); + dump_stack(); + goto out; + } + if (p->array) goto out_running; @@ -1086,6 +1173,16 @@ out_set_cpu: this_cpu = smp_processor_id(); cpu = task_cpu(p); + } else { + /* + * If a newly woken up RT task cannot preempt the + * current (RT) task then try to find another + * CPU it can preempt: + */ + if (rt_task(p) && !TASK_PREEMPTS_CURR(p, rq)) { + smp_send_reschedule_allbutself(); + rt_overload_wakeup++; + } } out_activate: @@ -1112,27 +1209,62 @@ out_activate: if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } + trace_start_sched_wakeup(p, rq); + if (rq->curr && p && rq && _need_resched()) + trace_special_pid(p->pid, p->prio, rq->curr->prio); success = 1; out_running: - p->state = TASK_RUNNING; + if (mutex) + p->state = TASK_RUNNING_MUTEX; + else + p->state = TASK_RUNNING; out: - task_rq_unlock(rq, &flags); +#ifdef PREEMPT_DIRECT + spin_unlock(&rq->lock); + /* + * Common place where preemption is requested - if we can + * reschedule then do it here without enabling interrupts + * again (and lengthening latency): + */ + if (_need_resched() && !irqs_disabled_flags(flags) && !preempt_count()) + preempt_schedule_irq(); + local_irq_restore(flags); +#else + spin_unlock_irqrestore(&rq->lock, flags); +#endif + /* no need to check for preempt here - we just handled it */ return success; } int fastcall wake_up_process(task_t * p) { - return try_to_wake_up(p, TASK_STOPPED | 
TASK_TRACED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE, 0, 0); + mcount(); + return ret; } EXPORT_SYMBOL(wake_up_process); +int fastcall wake_up_process_mutex(task_t * p) +{ + int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | + TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE | + TASK_UNINTERRUPTIBLE, 0, 1); + mcount(); + return ret; +} + +EXPORT_SYMBOL(wake_up_process_mutex); + int fastcall wake_up_state(task_t *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + int ret = try_to_wake_up(p, state | TASK_RUNNING_MUTEX, 0, 0); + mcount(); + return ret; } #ifdef CONFIG_SMP @@ -1239,15 +1371,16 @@ void fastcall wake_up_new_task(task_t * __activate_task(p, rq); else { p->prio = current->prio; - list_add_tail(&p->run_list, &current->run_list); - p->array = current->array; - p->array->nr_active++; - rq->nr_running++; + __activate_task_after(p, current, rq); } set_need_resched(); - } else + trace_start_sched_wakeup(p, rq); + } else { /* Run child last */ __activate_task(p, rq); + if (rt_task(p) && TASK_PREEMPTS_CURR(p, rq)) + set_need_resched(); + } /* * We skip the following code due to cpu == this_cpu * @@ -1326,13 +1459,14 @@ void fastcall sched_exit(task_t * p) * details.) */ static void finish_task_switch(task_t *prev) - __releases(rq->lock) + __releases(this_rq->lock) { - runqueue_t *rq = this_rq(); - struct mm_struct *mm = rq->prev_mm; + int this_cpu = smp_processor_id(); + runqueue_t *this_rq = cpu_rq(this_cpu); + struct mm_struct *mm = this_rq->prev_mm; unsigned long prev_task_flags; - rq->prev_mm = NULL; + this_rq->prev_mm = NULL; /* * A task struct has one reference for the use as "current". 
@@ -1346,11 +1480,28 @@ static void finish_task_switch(task_t *p * Manfred Spraul */ prev_task_flags = prev->flags; - finish_arch_switch(rq, prev); + +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + /* + * If we pushed an RT task off the runqueue, + * then kick other CPUs, they might run it: + */ + if (unlikely(rt_task(current) && prev->array && rt_task(prev))) { + rt_overload_schedule++; + smp_send_reschedule_allbutself(); + } +#endif + _finish_arch_switch(this_rq, prev); + + trace_stop_sched_switched(current); + /* + * Delay the final freeing of the mm or task, so that we dont have + * to do complex work from within the scheduler: + */ if (mm) - mmdrop(mm); + mmdrop_delayed(mm); if (unlikely(prev_task_flags & PF_DEAD)) - put_task_struct(prev); + put_task_struct_delayed(prev); } /** @@ -1360,7 +1511,11 @@ static void finish_task_switch(task_t *p asmlinkage void schedule_tail(task_t *prev) __releases(rq->lock) { + preempt_disable(); // TODO: move this to fork setup finish_task_switch(prev); + preempt_enable_no_resched(); + local_irq_enable(); + preempt_check_resched(); if (current->set_child_tid) put_user(current->pid, current->set_child_tid); @@ -1389,6 +1544,8 @@ task_t * context_switch(runqueue_t *rq, rq->prev_mm = oldmm; } + trace_cmdline(); + /* Here we just switch the register state and the stack. 
*/ switch_to(prev, next, prev); @@ -1429,6 +1586,21 @@ unsigned long nr_uninterruptible(void) return sum; } +unsigned long nr_uninterruptible_cpu(int cpu) +{ + return cpu_rq(cpu)->nr_uninterruptible; +} + +unsigned long rt_nr_running_cpu(int cpu) +{ +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + return cpu_rq(cpu)->rt_nr_running; +#else + return 0; +#endif +} + + unsigned long long nr_context_switches(void) { unsigned long long i, sum = 0; @@ -1510,6 +1682,100 @@ static void double_lock_balance(runqueue } } +#ifdef CONFIG_PREEMPT_RT + +static task_t * pick_rt_task(runqueue_t *src_rq, int this_cpu) +{ + struct list_head *head, *curr; + prio_array_t *array; + task_t *tmp; + int idx; + + WARN_ON(!spin_is_locked(&src_rq->lock)); + /* + * Only consider the active array - we are looking for + * RT tasks. Must have 2 tasks at least: + */ + array = src_rq->active; + if (unlikely(array->nr_active < 2)) + return NULL; + + idx = sched_find_first_bit(array->bitmap); +next_in_bitmap: + /* + * Only non-RT tasks available - abort the search: + */ + if (idx >= MAX_RT_PRIO) + return NULL; + + head = array->queue + idx; + curr = head->next; +next_in_queue: + tmp = list_entry(curr, task_t, run_list); + /* + * Return the highest-prio non-running RT task (if task + * may run on this CPU): + */ + if (!task_running(src_rq, tmp) && + cpu_isset(this_cpu, tmp->cpus_allowed)) + return tmp; + + curr = curr->next; + if (curr != head) + goto next_in_queue; + + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); + goto next_in_bitmap; +} + +/* + * Pull RT tasks from other CPUs in the RT-overload + * case. Interrupts are disabled, local rq is locked. 
+ */ +static void pull_rt_tasks(runqueue_t *this_rq, int this_cpu) +{ + runqueue_t *src_rq; + task_t *p; + int cpu; + + WARN_ON(!irqs_disabled()); + + for_each_online_cpu(cpu) { + if (cpu == this_cpu) + continue; + src_rq = cpu_rq(cpu); + if (src_rq->rt_nr_running <= 1) + continue; + + double_lock_balance(this_rq, src_rq); + + p = pick_rt_task(src_rq, this_cpu); + + if (p /* && TASK_PREEMPTS_CURR(p, this_rq) */ ) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->array); + rt_overload_pulled++; + + set_task_cpu(p, this_cpu); + + p->timestamp = p->timestamp - + src_rq->timestamp_last_tick + + this_rq->timestamp_last_tick; + deactivate_task(p, src_rq); + activate_task(p, this_rq, 0); + /* + * We continue with the search, just in + * case there's an even higher prio task + * in another runqueue. + */ + } + spin_unlock(&src_rq->lock); + } +} + +#endif + + /* * find_idlest_cpu - find the least busy runqueue. */ @@ -2271,7 +2537,9 @@ static inline void account_it_virt(struc cputime_gt(cputime, cputime_zero)) { if (cputime_ge(cputime, it_virt)) { it_virt = cputime_add(it_virt, p->it_virt_incr); +#if 0 send_sig(SIGVTALRM, p, 1); +#endif } it_virt = cputime_sub(it_virt, cputime); p->it_virt_value = it_virt; @@ -2291,7 +2559,9 @@ static void account_it_prof(struct task_ cputime_gt(cputime, cputime_zero)) { if (cputime_ge(cputime, it_prof)) { it_prof = cputime_add(it_prof, p->it_prof_incr); +#if 0 send_sig(SIGPROF, p, 1); +#endif } it_prof = cputime_sub(it_prof, cputime); p->it_prof_value = it_prof; @@ -2313,12 +2583,18 @@ static void check_rlimit(struct task_str if (unlikely(cputime_gt(total, tmp))) { /* Send SIGXCPU every second. */ tmp = cputime_sub(total, cputime); - if (cputime_to_secs(tmp) < cputime_to_secs(total)) + if (cputime_to_secs(tmp) < cputime_to_secs(total)) { +#if 0 send_sig(SIGXCPU, p, 1); +#endif + } /* and SIGKILL when we go over max.. 
*/ tmp = jiffies_to_cputime(p->signal->rlim[RLIMIT_CPU].rlim_max); - if (cputime_gt(total, tmp)) + if (cputime_gt(total, tmp)) { +#if 0 send_sig(SIGKILL, p, 1); +#endif + } } } @@ -2413,6 +2689,8 @@ void scheduler_tick(void) runqueue_t *rq = this_rq(); task_t *p = current; + BUG_ON(!irqs_disabled()); + rq->timestamp_last_tick = sched_clock(); if (p == rq->idle) { @@ -2434,6 +2712,8 @@ void scheduler_tick(void) * priority until it either goes to sleep or uses up its * timeslice. This makes it possible for interactive tasks * to use up their timeslices at their highest priority levels. + * + * Priority-boosted SCHED_NORMAL tasks may go here too. */ if (rt_task(p)) { /* @@ -2622,42 +2902,51 @@ static inline int dependent_sleeper(int } #endif -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_LATENCY_TRACE) && defined(CONFIG_RT_DEADLOCK_DETECT) -void fastcall add_preempt_count(int val) +static void trace_array(prio_array_t *array) { - /* - * Underflow? - */ - BUG_ON(((int)preempt_count() < 0)); - preempt_count() += val; - /* - * Spinlock count overflowing soon? - */ - BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); + int i; + task_t *p; + struct list_head *head, *tmp; + + for (i = 0; i < MAX_PRIO; i++) { + head = array->queue + i; + if (list_empty(head)) { + WARN_ON(test_bit(i, array->bitmap)); + continue; + } + WARN_ON(!test_bit(i, array->bitmap)); + list_for_each(tmp, head) { + p = list_entry(tmp, task_t, run_list); + trace_special_pid(p->pid, p->prio, + p->policy == SCHED_NORMAL ? + p->static_prio : + MAX_USER_RT_PRIO - p->rt_priority); + } + } } -EXPORT_SYMBOL(add_preempt_count); -void fastcall sub_preempt_count(int val) +static inline void trace_all_runnable_tasks(runqueue_t *rq) +{ + if (trace_enabled) { + trace_array(rq->active); + trace_array(rq->expired); + } +} + +#else + +static inline void trace_all_runnable_tasks(runqueue_t *rq) { - /* - * Underflow? 
- */ - BUG_ON(val > preempt_count()); - /* - * Is the spinlock portion underflowing? - */ - BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); - preempt_count() -= val; } -EXPORT_SYMBOL(sub_preempt_count); #endif /* - * schedule() is the main scheduler function. + * __schedule() is the main scheduler function. */ -asmlinkage void __sched schedule(void) +void __sched __schedule(void) { long *switch_count; task_t *prev, *next; @@ -2668,26 +2957,24 @@ asmlinkage void __sched schedule(void) unsigned long run_time; int cpu, idx; + WARN_ON(system_state == SYSTEM_BOOTING); /* - * Test if we are atomic. Since do_exit() needs to call into - * schedule() atomically, we ignore that path for now. - * Otherwise, whine if we are scheduling when we should not be. - */ - if (likely(!(current->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)))) { - if (unlikely(in_atomic())) { - printk(KERN_ERR "scheduling while atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), current->pid); - dump_stack(); - } + * Test if we are atomic. + */ + if (unlikely(in_atomic())) { + stop_trace(); + printk(KERN_ERR "BUG: scheduling while atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + print_symbol("caller is %s\n", + (long)__builtin_return_address(0)); + dump_stack(); } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -need_resched: - preempt_disable(); + preempt_disable(); // FIXME: disable irqs here prev = current; release_kernel_lock(prev); -need_resched_nonpreemptible: rq = this_rq(); /* @@ -2695,7 +2982,7 @@ need_resched_nonpreemptible: * Remove this check after it has been exercised a bit. */ if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + printk(KERN_ERR "BUG: scheduling from the idle thread!\n"); dump_stack(); } @@ -2712,16 +2999,17 @@ need_resched_nonpreemptible: */ run_time /= (CURRENT_BONUS(prev) ? 
: 1); + cpu = smp_processor_id(); spin_lock_irq(&rq->lock); - if (unlikely(prev->flags & PF_DEAD)) - prev->state = EXIT_DEAD; /* * if entering off of a kernel preemption go straight * to picking the next task. */ - switch_count = &prev->nivcsw; - if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; // TODO: temporary - to see it in vmstat + + if ((prev->state & ~TASK_RUNNING_MUTEX) && + !(preempt_count() & PREEMPT_ACTIVE)) { switch_count = &prev->nvcsw; if (unlikely((prev->state & TASK_INTERRUPTIBLE) && unlikely(signal_pending(prev)))) @@ -2732,8 +3020,23 @@ need_resched_nonpreemptible: deactivate_task(prev, rq); } } + if (preempt_count() & PREEMPT_ACTIVE) + sub_preempt_count(PREEMPT_ACTIVE); + if (unlikely(prev->flags & PF_DEAD)) { + if (prev->state != TASK_RUNNING) { + printk("prev->state: %ld != TASK_RUNNING??\n", + prev->state); + WARN_ON(1); + } else + deactivate_task(prev, rq); + prev->state = EXIT_DEAD; + } + +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + if (unlikely(atomic_read(&rt_overload))) + pull_rt_tasks(rq, cpu); +#endif - cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { go_idle: idle_balance(cpu, rq); @@ -2805,6 +3108,8 @@ switch_tasks: prev->sleep_avg = 0; prev->timestamp = prev->last_ran = now; + trace_all_runnable_tasks(rq); + sched_info_switch(prev, next); if (likely(prev != next)) { next->timestamp = now; @@ -2815,22 +3120,79 @@ switch_tasks: prepare_arch_switch(rq, next); prev = context_switch(rq, prev, next); barrier(); - + if (prev && current) + trace_special_pid(prev->pid, prev->prio, current->prio); finish_task_switch(prev); - } else - spin_unlock_irq(&rq->lock); + preempt_enable_no_resched(); + } else { + trace_stop_sched_switched(next); + preempt_enable_no_resched(); + spin_unlock(&rq->lock); + } - prev = current; - if (unlikely(reacquire_kernel_lock(prev) < 0)) - goto need_resched_nonpreemptible; - preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) - 
goto need_resched; + reacquire_kernel_lock(current); +} + +/* + * schedule() is the main scheduler function. + */ +asmlinkage void __sched schedule(void) +{ + WARN_ON(system_state == SYSTEM_BOOTING); + /* + * Test if we have interrupts disabled. + */ + if (unlikely(irqs_disabled())) { + stop_trace(); + printk(KERN_ERR "BUG: scheduling with irqs disabled: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), current->pid); + print_symbol("caller is %s\n", + (long)__builtin_return_address(0)); + dump_stack(); + } + if (unlikely(current->flags & PF_NOSCHED)) { + current->flags &= ~PF_NOSCHED; + printk(KERN_ERR "%s:%d userspace BUG: scheduling in user-atomic context!\n", current->comm, current->pid); + dump_stack(); + send_sig(SIGUSR2, current, 1); + } + do { + __schedule(); + } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); + local_irq_enable(); // TODO: do sti; ret } EXPORT_SYMBOL(schedule); #ifdef CONFIG_PREEMPT + +int kernel_preemption = 1; + +static int __init preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) { + if (kernel_preemption) { + printk("turning off kernel preemption!\n"); + kernel_preemption = 0; + } + return 1; + } + if (!strncmp(str, "on", 2)) { + if (!kernel_preemption) { + printk("turning on kernel preemption!\n"); + kernel_preemption = 1; + } + return 1; + } + get_option(&str, &kernel_preemption); + + return 1; +} + +__setup("preempt=", preempt_setup); + + /* * this is is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt @@ -2843,6 +3205,9 @@ asmlinkage void __sched preempt_schedule struct task_struct *task = current; int saved_lock_depth; #endif + + if (!kernel_preemption) + return; /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. 
@@ -2851,6 +3216,7 @@ asmlinkage void __sched preempt_schedule return; need_resched: + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); /* * We keep the big kernel semaphore locked, but we @@ -2861,25 +3227,72 @@ need_resched: saved_lock_depth = task->lock_depth; task->lock_depth = -1; #endif - schedule(); + __schedule(); #ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; #endif - sub_preempt_count(PREEMPT_ACTIVE); /* we could miss a preemption opportunity between schedule and now */ barrier(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; + local_irq_enable(); } EXPORT_SYMBOL(preempt_schedule); + +/* + * this is the entry point for the IRQ return path. Called with + * interrupts disabled. To avoid infinite irq-entry recursion problems + * with fast-paced IRQ sources we do all of this carefully to never + * enable interrupts again. + */ +asmlinkage void __sched preempt_schedule_irq(void) +{ + struct thread_info *ti = current_thread_info(); +#ifdef CONFIG_PREEMPT_BKL + struct task_struct *task = current; + int saved_lock_depth; +#endif + + if (!kernel_preemption) + return; + /* + * If there is a non-zero preempt_count then just return. 
+ * (interrupts are disabled) + */ + if (unlikely(ti->preempt_count)) + return; + +need_resched: + add_preempt_count(PREEMPT_ACTIVE); + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ +#ifdef CONFIG_PREEMPT_BKL + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; +#endif + __schedule(); + local_irq_disable(); +#ifdef CONFIG_PREEMPT_BKL + task->lock_depth = saved_lock_depth; +#endif + + /* we could miss a preemption opportunity between schedule and now */ + barrier(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; +} + #endif /* CONFIG_PREEMPT */ int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { task_t *p = curr->task; - return try_to_wake_up(p, mode, sync); + return try_to_wake_up(p, mode | TASK_RUNNING_MUTEX, sync, 0); } EXPORT_SYMBOL(default_wake_function); @@ -2990,6 +3403,13 @@ void fastcall complete_all(struct comple } EXPORT_SYMBOL(complete_all); +unsigned int fastcall completion_done(struct completion *x) +{ + return x->done; +} +EXPORT_SYMBOL(completion_done); + + void fastcall __sched wait_for_completion(struct completion *x) { might_sleep(); @@ -3012,6 +3432,101 @@ void fastcall __sched wait_for_completio } EXPORT_SYMBOL(wait_for_completion); +unsigned long fastcall __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + timeout = schedule_timeout(timeout); + if (!timeout) + goto out; + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; + spin_unlock_irq(&x->wait.lock); +out: + return timeout; +} +EXPORT_SYMBOL(wait_for_completion_timeout); + 
+int fastcall __sched wait_for_completion_interruptible(struct completion *x) +{ + int ret = 0; + + might_sleep(); + + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + goto out; + } + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; +out: + spin_unlock_irq(&x->wait.lock); + + return ret; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +unsigned long fastcall __sched +wait_for_completion_interruptible_timeout(struct completion *x, + unsigned long timeout) +{ + might_sleep(); + + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + if (signal_pending(current)) { + timeout = -ERESTARTSYS; + goto out_unlock; + } + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + timeout = schedule_timeout(timeout); + if (!timeout) + goto out; + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; +out_unlock: + spin_unlock_irq(&x->wait.lock); +out: + return timeout; +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + + #define SLEEP_ON_VAR \ unsigned long flags; \ wait_queue_t wait; \ @@ -3237,6 +3752,65 @@ static void __setscheduler(struct task_s p->prio = p->static_prio; } +int mutex_getprio(task_t *p) +{ + int prio; + + if (p->policy != SCHED_NORMAL) + prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + else + prio = __effective_prio(p); + trace_special_pid(p->pid, p->prio, prio); + return prio; +} + +/* + * Used by the PREEMPT_RT code to implement + * priority inheritance logic: + */ +void mutex_setprio(task_t *p, int prio) +{ + unsigned long 
flags; + prio_array_t *array; + runqueue_t *rq; + int oldprio, prev_resched; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; + array = p->array; + if (array) + dequeue_task(p, array); + p->prio = prio; + + trace_special_pid(p->pid, oldprio, prio); + prev_resched = _need_resched(); + if (array) { + /* + * If changing to an RT priority then queue it + * in the active array! + */ + if (rt_task(p)) + array = rq->active; + enqueue_task(p, array); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + trace_special(prev_resched, _need_resched(), 0); + + task_rq_unlock(rq, &flags); +} + /* * setscheduler - change the scheduling policy and/or RT priority of a thread. */ @@ -3602,21 +4176,28 @@ asmlinkage long sys_sched_yield(void) * no need to preempt or enable interrupts: */ __release(rq->lock); - _raw_spin_unlock(&rq->lock); + __raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); - schedule(); + __schedule(); + local_irq_enable(); + preempt_check_resched(); return 0; } -static inline void __cond_resched(void) +static void __cond_resched(void) { + if (system_state == SYSTEM_BOOTING || !current->pid) + return; + if (preempt_count() & PREEMPT_ACTIVE) + return; do { + local_irq_disable(); add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + __schedule(); } while (need_resched()); + local_irq_enable(); } int __sched cond_resched(void) @@ -3638,7 +4219,7 @@ EXPORT_SYMBOL(cond_resched); * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). 
*/ -int cond_resched_lock(spinlock_t * lock) +int __cond_resched_raw_spinlock(raw_spinlock_t *lock) { #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) if (lock->break_lock) { @@ -3649,7 +4230,7 @@ int cond_resched_lock(spinlock_t * lock) } #endif if (need_resched()) { - _raw_spin_unlock(lock); + __raw_spin_unlock(lock); preempt_enable_no_resched(); __cond_resched(); spin_lock(lock); @@ -3658,23 +4239,104 @@ int cond_resched_lock(spinlock_t * lock) return 0; } -EXPORT_SYMBOL(cond_resched_lock); +EXPORT_SYMBOL(__cond_resched_raw_spinlock); + +#ifdef CONFIG_PREEMPT_RT +int __cond_resched_spinlock(spinlock_t *lock) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) + if (lock->break_lock) { + lock->break_lock = 0; + _spin_unlock(lock); + __cond_resched(); + _spin_lock(lock); + } +#endif + return 0; +} + +EXPORT_SYMBOL(__cond_resched_spinlock); + +#endif + + +/* + * Preempt a softirq context if necessary: + */ int __sched cond_resched_softirq(void) { +#ifndef CONFIG_PREEMPT_RT BUG_ON(!in_softirq()); - if (need_resched()) { + if (softirq_need_resched()) { __local_bh_enable(); __cond_resched(); local_bh_disable(); return 1; } +#endif return 0; } EXPORT_SYMBOL(cond_resched_softirq); +/* + * Preempt a hardirq context if necessary: + */ +int cond_resched_hardirq(void) +{ + unsigned long flags; + + BUG_ON(!in_irq()); + if (hardirq_need_resched()) { + local_save_flags(flags); + irq_exit(); + __cond_resched(); + local_irq_restore(flags); + irq_enter(); + return 1; + } + return 0; +} + +EXPORT_SYMBOL(cond_resched_hardirq); + +/* + * Preempt any context: + */ +int cond_resched_all(void) +{ + if (hardirq_count()) + return cond_resched_hardirq(); + if (softirq_count()) + return cond_resched_softirq(); + return cond_resched(); +} + +EXPORT_SYMBOL(cond_resched_all); + +#ifdef CONFIG_PREEMPT_VOLUNTARY + +int voluntary_preemption = 1; + +EXPORT_SYMBOL(voluntary_preemption); + +static int __init voluntary_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + 
voluntary_preemption = 0; + else + get_option(&str, &voluntary_preemption); + if (!voluntary_preemption) + printk("turning off voluntary preemption!\n"); + + return 1; +} + +__setup("voluntary-preempt=", voluntary_preempt_setup); + +#endif /** * yield - yield the current processor to other threads. @@ -3828,23 +4490,27 @@ static void show_task(task_t * p) unsigned long free = 0; static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; - printk("%-13.13s ", p->comm); + printk("%-13.13s [%p]", p->comm, p); state = p->state ? __ffs(p->state) + 1 : 0; if (state < ARRAY_SIZE(stat_nam)) printk(stat_nam[state]); else printk("?"); #if (BITS_PER_LONG == 32) - if (state == TASK_RUNNING) + if (0 && (state == TASK_RUNNING)) printk(" running "); else printk(" %08lX ", thread_saved_pc(p)); #else - if (state == TASK_RUNNING) + if (0 && (state == TASK_RUNNING)) printk(" running task "); else printk(" %016lx ", thread_saved_pc(p)); #endif + if (task_curr(p)) + printk("[curr] "); + else if (p->array) + printk("[on rq] "); #ifdef CONFIG_DEBUG_STACK_USAGE { unsigned long * n = (unsigned long *) (p->thread_info+1); @@ -3878,6 +4544,7 @@ static void show_task(task_t * p) void show_state(void) { task_t *g, *p; + int do_unlock = 1; #if (BITS_PER_LONG == 32) printk("\n" @@ -3888,7 +4555,16 @@ void show_state(void) " sibling\n"); printk(" task PC pid father child younger older\n"); #endif +#ifdef CONFIG_PREEMPT_RT + if (!read_trylock(&tasklist_lock)) { + printk("hm, tasklist_lock write-locked.\n"); + printk("ignoring ...\n"); + do_unlock = 0; + } +#else read_lock(&tasklist_lock); +#endif + do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow @@ -3898,7 +4574,9 @@ void show_state(void) show_task(p); } while_each_thread(g, p); - read_unlock(&tasklist_lock); + if (do_unlock) + read_unlock(&tasklist_lock); + show_all_locks(); } void __devinit init_idle(task_t *idle, int cpu) @@ -3918,7 +4596,9 @@ void __devinit init_idle(task_t *idle, i 
spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ -#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) +#if defined(CONFIG_PREEMPT) && \ + !defined(CONFIG_PREEMPT_BKL) && \ + !defined(CONFIG_PREEMPT_RT) idle->thread_info->preempt_count = (idle->lock_depth >= 0); #else idle->thread_info->preempt_count = 0; @@ -4004,12 +4684,13 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); * So we race with normal scheduler movements, but that's OK, as long * as the task is no longer on this CPU. */ -static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { runqueue_t *rq_dest, *rq_src; + int ret = 0; if (unlikely(cpu_is_offline(dest_cpu))) - return; + return 0; rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); @@ -4022,7 +4703,9 @@ static void __migrate_task(struct task_s if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; + WARN_ON(p == rq_src->curr); set_task_cpu(p, dest_cpu); + if (p->array) { /* * Sync timestamp with rq_dest's before activating. @@ -4036,10 +4719,13 @@ static void __migrate_task(struct task_s activate_task(p, rq_dest, 0); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); + ret = 1; } out: double_rq_unlock(rq_src, rq_dest); + + return ret; } /* @@ -4820,6 +5506,7 @@ void __init sched_init(void) for (j = 0; j < 2; j++) { array = rq->arrays + j; + array->rq = rq; for (k = 0; k < MAX_PRIO; k++) { INIT_LIST_HEAD(array->queue + k); __clear_bit(k, array->bitmap); @@ -4835,6 +5522,9 @@ void __init sched_init(void) atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); +#ifdef CONFIG_PREEMPT_RT + printk("Real-Time Preemption Support (c) Ingo Molnar\n"); +#endif /* * Make us the idle thread. 
Technically, schedule() should not be * called from this thread, however somewhere below it might be, @@ -4844,7 +5534,7 @@ void __init sched_init(void) init_idle(current, smp_processor_id()); } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT) void __might_sleep(char *file, int line) { #if defined(in_atomic) @@ -4852,13 +5542,17 @@ void __might_sleep(char *file, int line) if ((in_atomic() || irqs_disabled()) && system_state == SYSTEM_RUNNING && !oops_in_progress) { + if (debug_direct_keyboard && hardirq_count()) + return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; - printk(KERN_ERR "Debug: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); + stop_trace(); + printk(KERN_ERR "BUG: sleeping function called from invalid" + " context %s(%d) at %s:%d\n", + current->comm, current->pid, file, line); + printk("in_atomic():%d [%08x], irqs_disabled():%d\n", + in_atomic(), preempt_count(), irqs_disabled()); dump_stack(); } #endif --- linux/kernel/fork.c.orig +++ linux/kernel/fork.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #include @@ -63,6 +65,16 @@ DEFINE_PER_CPU(unsigned long, process_co EXPORT_SYMBOL(tasklist_lock); +/* + * Delayed mmdrop/put_task_struct. In the PREEMPT_RT case we + * dont want to do this from the scheduling context. 
+ */ +static DEFINE_PER_CPU(struct task_struct *, desched_task); + +static DEFINE_PER_CPU(struct list_head, delayed_put_list); +static DEFINE_PER_CPU(struct list_head, delayed_drop_list); + + int nr_processes(void) { int cpu; @@ -89,6 +101,8 @@ EXPORT_SYMBOL(free_task); void __put_task_struct(struct task_struct *tsk) { + BUG_ON(atomic_read(&tsk->usage)); + WARN_ON(!(tsk->flags & PF_DEAD)); WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); @@ -103,8 +117,29 @@ void __put_task_struct(struct task_struc free_task(tsk); } +void put_task_struct(struct task_struct *tsk) +{ + BUG_ON(!atomic_read(&tsk->usage)); + + if (!atomic_dec_and_test(&tsk->usage)) + return; + __put_task_struct(tsk); +} + +EXPORT_SYMBOL(put_task_struct); + +void get_task_struct(struct task_struct *tsk) +{ + BUG_ON(!atomic_read(&tsk->usage)); + atomic_inc(&tsk->usage); +} + +EXPORT_SYMBOL(get_task_struct); + void __init fork_init(unsigned long mempages) { + int i; + #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES @@ -130,6 +165,11 @@ void __init fork_init(unsigned long memp init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; + + for (i = 0; i < NR_CPUS; i++) { + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i)); + INIT_LIST_HEAD(&per_cpu(delayed_put_list, i)); + } } static struct task_struct *dup_task_struct(struct task_struct *orig) @@ -305,6 +345,7 @@ static struct mm_struct * mm_init(struct rwlock_init(&mm->ioctx_list_lock); mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); + INIT_LIST_HEAD(&mm->delayed_drop); mm->free_area_cache = TASK_UNMAPPED_BASE; if (likely(!mm_alloc_pgd(mm))) { @@ -905,6 +946,9 @@ static task_t *copy_process(unsigned lon goto bad_fork_cleanup; } #endif + INIT_LIST_HEAD(&p->delayed_put); + INIT_LIST_HEAD(&p->pi_waiters); + 
p->blocked_on = NULL; /* not blocked yet */ p->tgid = p->pid; if (clone_flags & CLONE_THREAD) @@ -978,8 +1022,10 @@ static task_t *copy_process(unsigned lon * another CPU - so we re-copy it here and set the child's CPU to * the parent's CPU. This avoids alot of nasty races. */ + preempt_disable(); p->cpus_allowed = current->cpus_allowed; set_task_cpu(p, smp_processor_id()); + preempt_enable(); /* * Check for pending SIGKILL! The new thread should not be allowed @@ -1037,8 +1083,11 @@ static task_t *copy_process(unsigned lon if (thread_group_leader(p)) { attach_pid(p, PIDTYPE_PGID, process_group(p)); attach_pid(p, PIDTYPE_SID, p->signal->session); - if (p->pid) + if (p->pid) { + preempt_disable(); __get_cpu_var(process_counts)++; + preempt_enable(); + } } nr_threads++; @@ -1232,3 +1281,173 @@ void __init proc_caches_init(void) sizeof(struct mm_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); } + +static int put_task_complete(void) +{ + struct list_head *head; + int ret = 0; + + head = &get_cpu_var(delayed_put_list); + while (!list_empty(head)) { + struct task_struct *task = list_entry(head->next, + struct task_struct, delayed_put); + list_del(&task->delayed_put); + put_cpu_var(delayed_put_list); + + __put_task_struct(task); + ret = 1; + + head = &get_cpu_var(delayed_put_list); + } + put_cpu_var(delayed_put_list); + + return ret; +} + +/* + * We dont want to do complex work from the scheduler, thus + * we delay the work to a per-CPU worker thread: + */ +void fastcall __put_task_struct_delayed(struct task_struct *task) +{ + struct task_struct *desched_task; + struct list_head *head; + + head = &get_cpu_var(delayed_put_list); + list_add_tail(&task->delayed_put, head); + desched_task = __get_cpu_var(desched_task); + if (desched_task) + wake_up_process(desched_task); + put_cpu_var(delayed_put_list); +} + +void put_task_struct_delayed(struct task_struct *tsk) +{ + BUG_ON(!atomic_read(&tsk->usage)); + + if (!atomic_dec_and_test(&tsk->usage)) + return; + 
__put_task_struct_delayed(tsk); +} + +static int mmdrop_complete(void) +{ + struct list_head *head; + int ret = 0; + + head = &get_cpu_var(delayed_drop_list); + while (!list_empty(head)) { + struct mm_struct *mm = list_entry(head->next, + struct mm_struct, delayed_drop); + list_del(&mm->delayed_drop); + put_cpu_var(delayed_drop_list); + + __mmdrop(mm); + ret = 1; + + head = &get_cpu_var(delayed_drop_list); + } + put_cpu_var(delayed_drop_list); + + return ret; +} + +/* + * We dont want to do complex work from the scheduler, thus + * we delay the work to a per-CPU worker thread: + */ +void fastcall __mmdrop_delayed(struct mm_struct *mm) +{ + struct task_struct *desched_task; + struct list_head *head; + + head = &get_cpu_var(delayed_drop_list); + list_add_tail(&mm->delayed_drop, head); + desched_task = __get_cpu_var(desched_task); + if (desched_task) + wake_up_process(desched_task); + put_cpu_var(delayed_drop_list); +} + +static int desched_thread(void * __bind_cpu) +{ + printk("desched thread %ld started up.\n", (long) __bind_cpu); + + set_user_nice(current, -10); + current->flags |= PF_NOFREEZE | PF_SOFTIRQ; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + int ret; + + ret = put_task_complete(); + ret |= mmdrop_complete(); + if (ret) + continue; + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __devinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + printk("desched cpu_callback %ld/%p\n", action, hcpu); + switch (action) { + case CPU_UP_PREPARE: + + BUG_ON(per_cpu(desched_task, hotcpu)); + INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu)); + INIT_LIST_HEAD(&per_cpu(delayed_put_list, hotcpu)); + p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu); + if (IS_ERR(p)) { + printk("desched_thread for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + 
per_cpu(desched_task, hotcpu) = p; + kthread_bind(p, hotcpu); + break; + case CPU_ONLINE: + + wake_up_process(per_cpu(desched_task, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + + /* Unbind so it can run. Fall thru. */ + kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id()); + case CPU_DEAD: + + p = per_cpu(desched_task, hotcpu); + per_cpu(desched_task, hotcpu) = NULL; + kthread_stop(p); + takeover_tasklets(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +__init int spawn_desched_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + + printk("spawn_desched_task(%p)\n", cpu); + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + return 0; +} + --- linux/kernel/irq/proc.c.orig +++ linux/kernel/irq/proc.c @@ -7,9 +7,12 @@ */ #include +#include #include #include +#include "internals.h" + static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; #ifdef CONFIG_SMP @@ -63,37 +66,6 @@ static int irq_affinity_write_proc(struc #endif -#define MAX_NAMELEN 128 - -static int name_unique(unsigned int irq, struct irqaction *new_action) -{ - struct irq_desc *desc = irq_desc + irq; - struct irqaction *action; - - for (action = desc->action ; action; action = action->next) - if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) - return 0; - return 1; -} - -void register_handler_proc(unsigned int irq, struct irqaction *action) -{ - char name [MAX_NAMELEN]; - - if (!irq_dir[irq] || action->dir || !action->name || - !name_unique(irq, action)) - return; - - memset(name, 0, MAX_NAMELEN); - snprintf(name, MAX_NAMELEN, "%s", action->name); - - /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, irq_dir[irq]); -} - -#undef MAX_NAMELEN - #define MAX_NAMELEN 10 void register_irq_proc(unsigned 
int irq) @@ -133,10 +105,96 @@ void register_irq_proc(unsigned int irq) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { + if (action->threaded) + remove_proc_entry(action->threaded->name, action->dir); if (action->dir) remove_proc_entry(action->dir->name, irq_dir[irq]); } +#ifndef CONFIG_PREEMPT_RT + +static int threaded_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + return sprintf(page, "%c\n", + ((struct irqaction *)data)->flags & SA_NODELAY ? '0' : '1'); +} + +static int threaded_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + int c; + struct irqaction *action = data; + irq_desc_t *desc = irq_desc + action->irq; + + if (get_user(c, buffer)) + return -EFAULT; + if (c != '0' && c != '1') + return -EINVAL; + + spin_lock_irq(&desc->lock); + + if (c == '0') + action->flags |= SA_NODELAY; + if (c == '1') + action->flags &= ~SA_NODELAY; + recalculate_desc_flags(desc); + + spin_unlock_irq(&desc->lock); + + return count; +} + +#endif + +#define MAX_NAMELEN 128 + +static int name_unique(unsigned int irq, struct irqaction *new_action) +{ + struct irq_desc *desc = irq_desc + irq; + struct irqaction *action; + + for (action = desc->action ; action; action = action->next) + if ((action != new_action) && action->name && + !strcmp(new_action->name, action->name)) + return 0; + return 1; +} + +void register_handler_proc(unsigned int irq, struct irqaction *action) +{ + char name [MAX_NAMELEN]; + + if (!irq_dir[irq] || action->dir || !action->name || + !name_unique(irq, action)) + return; + + memset(name, 0, MAX_NAMELEN); + snprintf(name, MAX_NAMELEN, "%s", action->name); + + /* create /proc/irq/1234/handler/ */ + action->dir = proc_mkdir(name, irq_dir[irq]); + if (!action->dir) + return; +#ifndef CONFIG_PREEMPT_RT + { + struct proc_dir_entry *entry; + /* create /proc/irq/1234/handler/threaded */ + entry = create_proc_entry("threaded", 0600, action->dir); + if (!entry) + 
return; + entry->nlink = 1; + entry->data = (void *)action; + entry->read_proc = threaded_read_proc; + entry->write_proc = threaded_write_proc; + action->threaded = entry; + } +#endif +} + +#undef MAX_NAMELEN + + void init_irq_proc(void) { int i; @@ -146,6 +204,9 @@ void init_irq_proc(void) if (!root_irq_dir) return; + /* create /proc/irq/prof_cpu_mask */ + create_prof_cpu_mask(root_irq_dir); + /* * Create entries for all existing IRQs. */ --- linux/kernel/irq/manage.c.orig +++ linux/kernel/irq/manage.c @@ -7,8 +7,10 @@ */ #include -#include #include +#include +#include +#include #include #include "internals.h" @@ -28,8 +30,12 @@ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_desc + irq; - while (desc->status & IRQ_INPROGRESS) - cpu_relax(); + if (hardirq_preemption && !(desc->status & IRQ_NODELAY)) + wait_event(desc->wait_for_handler, + !(desc->status & IRQ_INPROGRESS)); + else + while (desc->status & IRQ_INPROGRESS) + cpu_relax(); } EXPORT_SYMBOL(synchronize_irq); @@ -125,6 +131,21 @@ void enable_irq(unsigned int irq) EXPORT_SYMBOL(enable_irq); /* + * If any action has SA_NODELAY then turn IRQ_NODELAY on: + */ +void recalculate_desc_flags(struct irq_desc *desc) +{ + struct irqaction *action; + + desc->status &= ~IRQ_NODELAY; + for (action = desc->action ; action; action = action->next) + if (action->flags & SA_NODELAY) + desc->status |= IRQ_NODELAY; +} + +static int start_irq_thread(int irq, struct irq_desc *desc); + +/* * Internal function that tells the architecture code whether a * particular irq has been exclusively allocated or is available * for driver use. 
@@ -179,6 +200,9 @@ int setup_irq(unsigned int irq, struct i rand_initialize_irq(irq); } + if (!(new->flags & SA_NODELAY)) + if (start_irq_thread(irq, desc)) + return -ENOMEM; /* * The following block of code has to be executed atomically */ @@ -201,6 +225,11 @@ int setup_irq(unsigned int irq, struct i *p = new; + /* + * Propagate any possible SA_NODELAY flag into IRQ_NODELAY: + */ + recalculate_desc_flags(desc); + if (!shared) { desc->depth = 0; desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | @@ -214,7 +243,7 @@ int setup_irq(unsigned int irq, struct i new->irq = irq; register_irq_proc(irq); - new->dir = NULL; + new->dir = new->threaded = NULL; register_handler_proc(irq, new); return 0; @@ -264,6 +293,7 @@ int teardown_irq(unsigned int irq, struc else desc->handler->disable(irq); } + recalculate_desc_flags(desc); spin_unlock_irqrestore(&desc->lock,flags); unregister_handler_proc(irq, action); @@ -388,3 +418,175 @@ int request_irq(unsigned int irq, EXPORT_SYMBOL(request_irq); +#ifdef CONFIG_PREEMPT_HARDIRQS + +int hardirq_preemption = 1; + +EXPORT_SYMBOL(hardirq_preemption); + +/* + * Real-Time Preemption depends on hardirq threading: + */ +#ifndef CONFIG_PREEMPT_RT + +static int __init hardirq_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + hardirq_preemption = 0; + else + get_option(&str, &hardirq_preemption); + if (!hardirq_preemption) + printk("turning off hardirq preemption!\n"); + + return 1; +} + +__setup("hardirq-preempt=", hardirq_preempt_setup); + +#endif + +static void do_hardirq(struct irq_desc *desc) +{ + struct irqaction * action; + unsigned int irq = desc - irq_desc; + + local_irq_disable(); + + if (desc->status & IRQ_INPROGRESS) { + action = desc->action; + spin_lock(&desc->lock); + for (;;) { + irqreturn_t action_ret = 0; + + if (action) { + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, NULL,action); + local_irq_enable(); + cond_resched_all(); + spin_lock_irq(&desc->lock); + } + if (!noirqdebug) + 
note_interrupt(irq, desc, action_ret); + if (likely(!(desc->status & IRQ_PENDING))) + break; + desc->status &= ~IRQ_PENDING; + } + desc->status &= ~IRQ_INPROGRESS; + /* + * The ->end() handler has to deal with interrupts which got + * disabled while the handler was running. + */ + desc->handler->end(irq); + spin_unlock(&desc->lock); + } + local_irq_enable(); + if (waitqueue_active(&desc->wait_for_handler)) + wake_up(&desc->wait_for_handler); +} + +extern asmlinkage void __do_softirq(void); + +static int curr_irq_prio = 49; + +static int do_irqd(void * __desc) +{ + struct sched_param param = { 0, }; + struct irq_desc *desc = __desc; +#ifdef CONFIG_SMP + int irq = desc - irq_desc; + cpumask_t mask; + + mask = cpumask_of_cpu(any_online_cpu(irq_affinity[irq])); + set_cpus_allowed(current, mask); +#endif + current->flags |= PF_NOFREEZE | PF_HARDIRQ; + + /* + * Scale irq thread priorities from prio 50 to prio 25 + */ + param.sched_priority = curr_irq_prio; + if (param.sched_priority > 25) + curr_irq_prio = param.sched_priority - 1; + + sys_sched_setscheduler(current->pid, SCHED_FIFO, ¶m); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + do_hardirq(desc); + cond_resched_all(); + __do_softirq(); + local_irq_enable(); +#ifdef CONFIG_SMP + /* + * Did IRQ affinities change? 
+ */ + if (!cpu_isset(smp_processor_id(), irq_affinity[irq])) { + mask = cpumask_of_cpu(any_online_cpu(irq_affinity[irq])); + set_cpus_allowed(current, mask); + } +#endif + schedule(); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int ok_to_create_irq_threads; + +static int start_irq_thread(int irq, struct irq_desc *desc) +{ + if (desc->thread || !ok_to_create_irq_threads) + return 0; + + desc->thread = kthread_create(do_irqd, desc, "IRQ %d", irq); + if (!desc->thread) { + printk(KERN_ERR "irqd: could not create IRQ thread %d!\n", irq); + return -ENOMEM; + } + + /* + * An interrupt may have come in before the thread pointer was + * stored in desc->thread; make sure the thread gets woken up in + * such a case: + */ + smp_mb(); + wake_up_process(desc->thread); + + return 0; +} + +void __init init_hardirqs(void) +{ + int i; + ok_to_create_irq_threads = 1; + + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + + if (desc->action && !(desc->status & IRQ_NODELAY)) + start_irq_thread(i, desc); + } +} + +#else + +void __init init_hardirqs(void) +{ +} + +static int start_irq_thread(int irq, struct irq_desc *desc) +{ + return 0; +} + +#endif + +void __init early_init_hardirqs(void) +{ + int i; + + for (i = 0; i < NR_IRQS; i++) + init_waitqueue_head(&irq_desc[i].wait_for_handler); +} + + --- linux/kernel/irq/handle.c.orig +++ linux/kernel/irq/handle.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -31,7 +32,7 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { .handler = &no_irq_type, - .lock = SPIN_LOCK_UNLOCKED + .lock = RAW_SPIN_LOCK_UNLOCKED } }; @@ -73,6 +74,32 @@ irqreturn_t no_action(int cpl, void *dev } /* + * Hack - used for development only. 
+ */ +int debug_direct_keyboard = 0; + +int redirect_hardirq(struct irq_desc *desc) +{ + /* + * Direct execution: + */ + if (!hardirq_preemption || (desc->status & IRQ_NODELAY) || + !desc->thread) + return 0; + +#ifdef __i386__ + if (debug_direct_keyboard && (desc - irq_desc == 1)) + return 0; +#endif + + BUG_ON(!irqs_disabled()); + if (desc->thread && desc->thread->state != TASK_RUNNING) + wake_up_process(desc->thread); + + return 1; +} + +/* * Have got an event to handle: */ fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, @@ -80,30 +107,48 @@ fastcall int handle_IRQ_event(unsigned i { int ret, retval = 0, status = 0; - if (!(action->flags & SA_INTERRUPT)) + /* + * Unconditionally enable interrupts for threaded + * IRQ handlers: + */ + if (!hardirq_count() || !(action->flags & SA_INTERRUPT)) local_irq_enable(); do { + unsigned int preempt_count = preempt_count(); + ret = action->handler(irq, action->dev_id, regs); + if (preempt_count() != preempt_count) { + print_symbol("BUG: unbalanced irq-handler preempt count in %s!\n", (unsigned long) action->handler); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; + } if (ret == IRQ_HANDLED) status |= action->flags; retval |= ret; action = action->next; } while (action); - if (status & SA_SAMPLE_RANDOM) + if (status & SA_SAMPLE_RANDOM) { + local_irq_enable(); add_interrupt_randomness(irq); + } local_irq_disable(); return retval; } +cycles_t irq_timestamp(unsigned int irq) +{ + return irq_desc[irq].timestamp; +} + /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific * handlers). 
*/ -fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) +fastcall notrace unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) { irq_desc_t *desc = irq_desc + irq; struct irqaction * action; @@ -123,6 +168,7 @@ fastcall unsigned int __do_IRQ(unsigned desc->handler->end(irq); return 1; } + desc->timestamp = get_cycles(); spin_lock(&desc->lock); desc->handler->ack(irq); @@ -155,6 +201,12 @@ fastcall unsigned int __do_IRQ(unsigned goto out; /* + * hardirq redirection to the irqd process context: + */ + if (redirect_hardirq(desc)) + goto out_no_end; + + /* * Edge triggered interrupts need to remember * pending events. * This applies to any hw interrupts that allow a second @@ -179,13 +231,13 @@ fastcall unsigned int __do_IRQ(unsigned desc->status &= ~IRQ_PENDING; } desc->status &= ~IRQ_INPROGRESS; - out: /* * The ->end() handler has to deal with interrupts which got * disabled while the handler was running. */ desc->handler->end(irq); +out_no_end: spin_unlock(&desc->lock); return 1; --- linux/kernel/irq/autoprobe.c.orig +++ linux/kernel/irq/autoprobe.c @@ -7,6 +7,7 @@ */ #include +#include #include #include @@ -26,7 +27,7 @@ static DECLARE_MUTEX(probe_sem); */ unsigned long probe_irq_on(void) { - unsigned long val, delay; + unsigned long val; irq_desc_t *desc; unsigned int i; @@ -44,9 +45,10 @@ unsigned long probe_irq_on(void) spin_unlock_irq(&desc->lock); } - /* Wait for longstanding interrupts to trigger. 
*/ - for (delay = jiffies + HZ/50; time_after(delay, jiffies); ) - /* about 20ms delay */ barrier(); + /* + * Wait for longstanding interrupts to trigger, 20 msec delay: + */ + msleep(20); /* * enable any unassigned irqs @@ -66,10 +68,9 @@ unsigned long probe_irq_on(void) } /* - * Wait for spurious interrupts to trigger + * Wait for spurious interrupts to trigger, 100 msec delay: */ - for (delay = jiffies + HZ/10; time_after(delay, jiffies); ) - /* about 100ms delay */ barrier(); + msleep(100); /* * Now filter out any obviously spurious interrupts --- linux/kernel/irq/internals.h.orig +++ linux/kernel/irq/internals.h @@ -4,6 +4,8 @@ extern int noirqdebug; +void recalculate_desc_flags(struct irq_desc *desc); + #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq); extern void register_handler_proc(unsigned int irq, struct irqaction *action); --- linux/kernel/signal.c.orig +++ linux/kernel/signal.c @@ -845,11 +845,11 @@ specific_send_sig_info(int sig, struct s { int ret = 0; - if (!irqs_disabled()) - BUG(); +#ifndef CONFIG_PREEMPT_RT + BUG_ON(!irqs_disabled()); +#endif #ifdef CONFIG_SMP - if (!spin_is_locked(&t->sighand->siglock)) - BUG(); + BUG_ON(!spin_is_locked(&t->sighand->siglock)); #endif if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) @@ -1593,6 +1593,7 @@ static void ptrace_stop(int exit_code, i do_notify_parent_cldstop(current, current->parent, CLD_TRAPPED); read_unlock(&tasklist_lock); + current->flags &= ~PF_NOSCHED; schedule(); } else { /* @@ -1661,6 +1662,7 @@ finish_stop(int stop_count) read_unlock(&tasklist_lock); } + current->flags &= ~PF_NOSCHED; schedule(); /* * Now we don't run again until continued. 
@@ -1818,6 +1820,9 @@ int get_signal_to_deliver(siginfo_t *inf sigset_t *mask = &current->blocked; int signr = 0; +#ifdef CONFIG_PREEMPT_RT + might_sleep(); +#endif relock: spin_lock_irq(&current->sighand->siglock); for (;;) { --- linux/kernel/workqueue.c.orig +++ linux/kernel/workqueue.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * The per-CPU workqueue (if single thread, we always use cpu 0's). @@ -93,10 +94,12 @@ static void __queue_work(struct cpu_work * * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. + * + * Especially no such guarantee on PREEMPT_RT. */ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) { - int ret = 0, cpu = get_cpu(); + int ret = 0, cpu = _smp_processor_id(); if (!test_and_set_bit(0, &work->pending)) { if (unlikely(is_single_threaded(wq))) @@ -105,7 +108,6 @@ int fastcall queue_work(struct workqueue __queue_work(wq->cpu_wq + cpu, work); ret = 1; } - put_cpu(); return ret; } @@ -365,6 +367,39 @@ static void cleanup_workqueue_thread(str kthread_stop(p); } +void set_workqueue_thread_prio(struct workqueue_struct *wq, int cpu, + int policy, int rt_priority, int nice) +{ + struct task_struct *p = wq->cpu_wq[cpu].thread; + struct sched_param param = { .sched_priority = rt_priority }; + int ret; + + set_user_nice(p, nice); + ret = sys_sched_setscheduler(p->pid, policy, &param); + if (ret) + printk("BUG: wq(%s) setscheduler() returned: %d.\n", + wq->name, ret); + +} + +void set_workqueue_prio(struct workqueue_struct *wq, int policy, + int rt_priority, int nice) +{ + int cpu; + + /* We don't need the distraction of CPUs appearing and vanishing.
*/ + lock_cpu_hotplug(); + if (is_single_threaded(wq)) + set_workqueue_thread_prio(wq, 0, policy, rt_priority, nice); + else { + for_each_online_cpu(cpu) + set_workqueue_thread_prio(wq, cpu, policy, + rt_priority, nice); + } + unlock_cpu_hotplug(); +} + + void destroy_workqueue(struct workqueue_struct *wq) { int cpu; @@ -539,6 +574,7 @@ void init_workqueues(void) hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); + set_workqueue_prio(keventd_wq, SCHED_FIFO, 1, -20); } EXPORT_SYMBOL_GPL(__create_workqueue); --- linux/kernel/module.c.orig +++ linux/kernel/module.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -96,6 +97,16 @@ static inline int strong_try_module_get( */ void __module_put_and_exit(struct module *mod, long code) { + /* + * Release the kernel lock if held: + */ + if (current->lock_depth >= 0) { + printk("BUG: module %s holds the BKL [%d] at exit time!\n", + mod->name, current->lock_depth); + dump_stack(); + while (current->lock_depth >= 0) + unlock_kernel(); + } module_put(mod); do_exit(code); } --- linux/kernel/profile.c.orig +++ linux/kernel/profile.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,7 @@ static atomic_t *prof_buffer; static unsigned long prof_len, prof_shift; static int prof_on; static cpumask_t prof_cpu_mask = CPU_MASK_ALL; +int prof_pid = -1; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); @@ -51,17 +53,25 @@ static int __init profile_setup(char * s { int par; + if (!strncmp(str, "preempt", 7)) { + prof_on = PREEMPT_PROFILING; + printk(KERN_INFO "kernel preemption profiling enabled\n"); + if (str[7] == ',') + str += 8; + } if (!strncmp(str, "schedule", 8)) { prof_on = SCHED_PROFILING; printk(KERN_INFO "kernel schedule profiling enabled\n"); - if (str[7] == ',') - str += 8; + if (str[8] == ',') + str += 9; } if 
(get_option(&str,&par)) { prof_shift = par; - prof_on = CPU_PROFILING; - printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", - prof_shift); + if (!prof_on) { + prof_on = CPU_PROFILING; + printk(KERN_INFO "kernel CPU profiling enabled\n"); + } + printk(KERN_INFO "kernel profiling shift: %ld\n", prof_shift); } return 1; } @@ -273,7 +283,7 @@ static void profile_discard_flip_buffers up(&profile_flip_mutex); } -void profile_hit(int type, void *__pc) +void notrace profile_hit(int type, void *__pc) { unsigned long primary, secondary, flags, pc = (unsigned long)__pc; int i, j, cpu; @@ -381,12 +391,36 @@ void profile_hit(int type, void *__pc) } #endif /* !CONFIG_SMP */ -void profile_tick(int type, struct pt_regs *regs) +#ifdef CONFIG_PREEMPT +static void preemption_enabled(void) +{ +} +#endif + +static void preemption_disabled(void) +{ +} + +void notrace profile_tick(int type, struct pt_regs *regs) { if (type == CPU_PROFILING && timer_hook) timer_hook(regs); - if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) - profile_hit(type, (void *)profile_pc(regs)); + if (!user_mode(regs) && (prof_pid == -1 || prof_pid == current->pid) && + cpu_isset(smp_processor_id(), prof_cpu_mask)) { + if (prof_on == PREEMPT_PROFILING && type == CPU_PROFILING) { +#ifdef CONFIG_PREEMPT + int count = preempt_count() - HARDIRQ_OFFSET; + + if (!count) + profile_hit(PREEMPT_PROFILING, + (void *)preemption_enabled); + else +#endif + profile_hit(PREEMPT_PROFILING, + (void *)preemption_disabled); + } else + profile_hit(type, (void *)profile_pc(regs)); + } } #ifdef CONFIG_PROC_FS --- linux/kernel/Makefile.orig +++ linux/kernel/Makefile @@ -9,6 +9,11 @@ obj-y = sched.o fork.o exec_domain.o rcupdate.o intermodule.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o +obj-$(CONFIG_PREEMPT_RT) += rt.o + +obj-$(CONFIG_DEBUG_PREEMPT) += latency.o +obj-$(CONFIG_LATENCY_TIMING) += latency.o + obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 
obj-$(CONFIG_SMP) += cpu.o spinlock.o --- linux/kernel/spinlock.c.orig +++ linux/kernel/spinlock.c @@ -17,151 +17,149 @@ * Generic declaration of the raw read_trylock() function, * architectures are supposed to optimize this: */ -int __lockfunc generic_raw_read_trylock(rwlock_t *lock) +int __lockfunc generic_raw_read_trylock(raw_rwlock_t *lock) { - _raw_read_lock(lock); + __raw_read_lock(lock); return 1; } EXPORT_SYMBOL(generic_raw_read_trylock); -int __lockfunc _spin_trylock(spinlock_t *lock) +int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) { preempt_disable(); - if (_raw_spin_trylock(lock)) + if (__raw_spin_trylock(lock)) return 1; preempt_enable(); return 0; } -EXPORT_SYMBOL(_spin_trylock); +EXPORT_SYMBOL(_raw_spin_trylock); -int __lockfunc _read_trylock(rwlock_t *lock) +int __lockfunc _raw_read_trylock(raw_rwlock_t *lock) { preempt_disable(); - if (_raw_read_trylock(lock)) + if (__raw_read_trylock(lock)) return 1; preempt_enable(); return 0; } -EXPORT_SYMBOL(_read_trylock); +EXPORT_SYMBOL(_raw_read_trylock); -int __lockfunc _write_trylock(rwlock_t *lock) +int __lockfunc _raw_write_trylock(raw_rwlock_t *lock) { preempt_disable(); - if (_raw_write_trylock(lock)) + if (__raw_write_trylock(lock)) return 1; preempt_enable(); return 0; } -EXPORT_SYMBOL(_write_trylock); +EXPORT_SYMBOL(_raw_write_trylock); #ifndef CONFIG_PREEMPT -void __lockfunc _read_lock(rwlock_t *lock) +void __lockfunc _raw_read_lock(raw_rwlock_t *lock) { preempt_disable(); - _raw_read_lock(lock); + __raw_read_lock(lock); } -EXPORT_SYMBOL(_read_lock); +EXPORT_SYMBOL(_raw_read_lock); -unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) +unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) { unsigned long flags; local_irq_save(flags); preempt_disable(); - _raw_spin_lock_flags(lock, flags); + __raw_spin_lock_flags(lock, flags); return flags; } -EXPORT_SYMBOL(_spin_lock_irqsave); +EXPORT_SYMBOL(_raw_spin_lock_irqsave); -void __lockfunc _spin_lock_irq(spinlock_t *lock) 
+void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) { local_irq_disable(); preempt_disable(); - _raw_spin_lock(lock); + __raw_spin_lock(lock); } -EXPORT_SYMBOL(_spin_lock_irq); +EXPORT_SYMBOL(_raw_spin_lock_irq); -void __lockfunc _spin_lock_bh(spinlock_t *lock) +void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) { local_bh_disable(); preempt_disable(); - _raw_spin_lock(lock); + __raw_spin_lock(lock); } -EXPORT_SYMBOL(_spin_lock_bh); +EXPORT_SYMBOL(_raw_spin_lock_bh); -unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) +unsigned long __lockfunc _raw_read_lock_irqsave(raw_rwlock_t *lock) { unsigned long flags; local_irq_save(flags); preempt_disable(); - _raw_read_lock(lock); + __raw_read_lock(lock); return flags; } -EXPORT_SYMBOL(_read_lock_irqsave); +EXPORT_SYMBOL(_raw_read_lock_irqsave); -void __lockfunc _read_lock_irq(rwlock_t *lock) +void __lockfunc _raw_read_lock_irq(raw_rwlock_t *lock) { local_irq_disable(); preempt_disable(); - _raw_read_lock(lock); + __raw_read_lock(lock); } -EXPORT_SYMBOL(_read_lock_irq); +EXPORT_SYMBOL(_raw_read_lock_irq); -void __lockfunc _read_lock_bh(rwlock_t *lock) +void __lockfunc _raw_read_lock_bh(raw_rwlock_t *lock) { local_bh_disable(); preempt_disable(); - _raw_read_lock(lock); + __raw_read_lock(lock); } -EXPORT_SYMBOL(_read_lock_bh); +EXPORT_SYMBOL(_raw_read_lock_bh); -unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) +unsigned long __lockfunc _raw_write_lock_irqsave(raw_rwlock_t *lock) { unsigned long flags; local_irq_save(flags); preempt_disable(); - _raw_write_lock(lock); + __raw_write_lock(lock); return flags; } -EXPORT_SYMBOL(_write_lock_irqsave); +EXPORT_SYMBOL(_raw_write_lock_irqsave); -void __lockfunc _write_lock_irq(rwlock_t *lock) +void __lockfunc _raw_write_lock_irq(raw_rwlock_t *lock) { local_irq_disable(); preempt_disable(); - _raw_write_lock(lock); + __raw_write_lock(lock); } -EXPORT_SYMBOL(_write_lock_irq); +EXPORT_SYMBOL(_raw_write_lock_irq); -void __lockfunc _write_lock_bh(rwlock_t 
*lock) +void __lockfunc _raw_write_lock_bh(raw_rwlock_t *lock) { local_bh_disable(); preempt_disable(); - _raw_write_lock(lock); + __raw_write_lock(lock); } -EXPORT_SYMBOL(_write_lock_bh); +EXPORT_SYMBOL(_raw_write_lock_bh); -void __lockfunc _spin_lock(spinlock_t *lock) +void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) { preempt_disable(); - _raw_spin_lock(lock); + __raw_spin_lock(lock); } +EXPORT_SYMBOL(_raw_spin_lock); -EXPORT_SYMBOL(_spin_lock); - -void __lockfunc _write_lock(rwlock_t *lock) +void __lockfunc _raw_write_lock(raw_rwlock_t *lock) { preempt_disable(); - _raw_write_lock(lock); + __raw_write_lock(lock); } - -EXPORT_SYMBOL(_write_lock); +EXPORT_SYMBOL(_raw_write_lock); #else /* CONFIG_PREEMPT: */ @@ -174,11 +172,11 @@ EXPORT_SYMBOL(_write_lock); */ #define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc _##op##_lock(locktype *lock) \ +void __lockfunc _raw_##op##_lock(locktype *lock) \ { \ preempt_disable(); \ for (;;) { \ - if (likely(_raw_##op##_trylock(lock))) \ + if (likely(__raw_##op##_trylock(lock))) \ break; \ preempt_enable(); \ if (!(lock)->break_lock) \ @@ -188,16 +186,16 @@ void __lockfunc _##op##_lock(locktype *l } \ } \ \ -EXPORT_SYMBOL(_##op##_lock); \ +EXPORT_SYMBOL(_raw_##op##_lock); \ \ -unsigned long __lockfunc _##op##_lock_irqsave(locktype *lock) \ +unsigned long __lockfunc _raw_##op##_lock_irqsave(locktype *lock) \ { \ unsigned long flags; \ \ preempt_disable(); \ for (;;) { \ local_irq_save(flags); \ - if (likely(_raw_##op##_trylock(lock))) \ + if (likely(__raw_##op##_trylock(lock))) \ break; \ local_irq_restore(flags); \ \ @@ -210,16 +208,16 @@ unsigned long __lockfunc _##op##_lock_ir return flags; \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irqsave); \ +EXPORT_SYMBOL(_raw_##op##_lock_irqsave); \ \ -void __lockfunc _##op##_lock_irq(locktype *lock) \ +void __lockfunc _raw_##op##_lock_irq(locktype *lock) \ { \ - _##op##_lock_irqsave(lock); \ + _raw_##op##_lock_irqsave(lock); \ } \ \ -EXPORT_SYMBOL(_##op##_lock_irq); \ 
+EXPORT_SYMBOL(_raw_##op##_lock_irq); \ \ -void __lockfunc _##op##_lock_bh(locktype *lock) \ +void __lockfunc _raw_##op##_lock_bh(locktype *lock) \ { \ unsigned long flags; \ \ @@ -228,12 +226,12 @@ void __lockfunc _##op##_lock_bh(locktype /* irq-disabling. We use the generic preemption-aware */ \ /* function: */ \ /**/ \ - flags = _##op##_lock_irqsave(lock); \ + flags = _raw_##op##_lock_irqsave(lock); \ local_bh_disable(); \ local_irq_restore(flags); \ } \ \ -EXPORT_SYMBOL(_##op##_lock_bh) +EXPORT_SYMBOL(_raw_##op##_lock_bh) /* * Build preemption-friendly versions of the following @@ -244,119 +242,156 @@ EXPORT_SYMBOL(_##op##_lock_bh) * _[spin|read|write]_lock_irqsave() * _[spin|read|write]_lock_bh() */ -BUILD_LOCK_OPS(spin, spinlock_t); -BUILD_LOCK_OPS(read, rwlock_t); -BUILD_LOCK_OPS(write, rwlock_t); +BUILD_LOCK_OPS(spin, raw_spinlock_t); +BUILD_LOCK_OPS(read, raw_rwlock_t); +BUILD_LOCK_OPS(write, raw_rwlock_t); #endif /* CONFIG_PREEMPT */ -void __lockfunc _spin_unlock(spinlock_t *lock) +void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { - _raw_spin_unlock(lock); + __raw_spin_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_spin_unlock); +EXPORT_SYMBOL(_raw_spin_unlock); -void __lockfunc _write_unlock(rwlock_t *lock) +void __lockfunc _raw_write_unlock(raw_rwlock_t *lock) { - _raw_write_unlock(lock); + __raw_write_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_write_unlock); +EXPORT_SYMBOL(_raw_write_unlock); -void __lockfunc _read_unlock(rwlock_t *lock) +void __lockfunc _raw_read_unlock(raw_rwlock_t *lock) { - _raw_read_unlock(lock); + __raw_read_unlock(lock); preempt_enable(); } -EXPORT_SYMBOL(_read_unlock); +EXPORT_SYMBOL(_raw_read_unlock); -void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { - _raw_spin_unlock(lock); + __raw_spin_unlock(lock); + preempt_enable_no_resched(); local_irq_restore(flags); - preempt_enable(); + 
preempt_check_resched(); } -EXPORT_SYMBOL(_spin_unlock_irqrestore); +EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); -void __lockfunc _spin_unlock_irq(spinlock_t *lock) +void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) { - _raw_spin_unlock(lock); + __raw_spin_unlock(lock); + preempt_enable_no_resched(); local_irq_enable(); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_spin_unlock_irq); +EXPORT_SYMBOL(_raw_spin_unlock_irq); -void __lockfunc _spin_unlock_bh(spinlock_t *lock) +void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { - _raw_spin_unlock(lock); - preempt_enable(); + __raw_spin_unlock(lock); + preempt_enable_no_resched(); local_bh_enable(); } -EXPORT_SYMBOL(_spin_unlock_bh); +EXPORT_SYMBOL(_raw_spin_unlock_bh); -void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +void __lockfunc _raw_read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) { - _raw_read_unlock(lock); + __raw_read_unlock(lock); + preempt_enable_no_resched(); local_irq_restore(flags); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_read_unlock_irqrestore); +EXPORT_SYMBOL(_raw_read_unlock_irqrestore); -void __lockfunc _read_unlock_irq(rwlock_t *lock) +void __lockfunc _raw_read_unlock_irq(raw_rwlock_t *lock) { - _raw_read_unlock(lock); + __raw_read_unlock(lock); + preempt_enable_no_resched(); local_irq_enable(); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_read_unlock_irq); +EXPORT_SYMBOL(_raw_read_unlock_irq); -void __lockfunc _read_unlock_bh(rwlock_t *lock) +void __lockfunc _raw_read_unlock_bh(raw_rwlock_t *lock) { - _raw_read_unlock(lock); - preempt_enable(); + __raw_read_unlock(lock); + preempt_enable_no_resched(); local_bh_enable(); } -EXPORT_SYMBOL(_read_unlock_bh); +EXPORT_SYMBOL(_raw_read_unlock_bh); -void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +void __lockfunc _raw_write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags) { - 
_raw_write_unlock(lock); + __raw_write_unlock(lock); + preempt_enable_no_resched(); local_irq_restore(flags); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_write_unlock_irqrestore); +EXPORT_SYMBOL(_raw_write_unlock_irqrestore); -void __lockfunc _write_unlock_irq(rwlock_t *lock) +void __lockfunc _raw_write_unlock_irq(raw_rwlock_t *lock) { - _raw_write_unlock(lock); + __raw_write_unlock(lock); + preempt_enable_no_resched(); local_irq_enable(); - preempt_enable(); + preempt_check_resched(); } -EXPORT_SYMBOL(_write_unlock_irq); +EXPORT_SYMBOL(_raw_write_unlock_irq); -void __lockfunc _write_unlock_bh(rwlock_t *lock) +void __lockfunc _raw_write_unlock_bh(raw_rwlock_t *lock) { - _raw_write_unlock(lock); - preempt_enable(); + __raw_write_unlock(lock); + preempt_enable_no_resched(); local_bh_enable(); } -EXPORT_SYMBOL(_write_unlock_bh); +EXPORT_SYMBOL(_raw_write_unlock_bh); -int __lockfunc _spin_trylock_bh(spinlock_t *lock) +int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) { local_bh_disable(); preempt_disable(); - if (_raw_spin_trylock(lock)) + if (__raw_spin_trylock(lock)) return 1; - preempt_enable(); + preempt_enable_no_resched(); local_bh_enable(); return 0; } -EXPORT_SYMBOL(_spin_trylock_bh); +EXPORT_SYMBOL(_raw_spin_trylock_bh); + +int __lockfunc _raw_spin_trylock_irq(raw_spinlock_t *lock) +{ + local_irq_disable(); + preempt_disable(); + if (__raw_spin_trylock(lock)) + return 1; + + preempt_enable_no_resched(); + local_irq_enable(); + preempt_check_resched(); + + return 0; +} +EXPORT_SYMBOL(_raw_spin_trylock_irq); + +int __lockfunc _raw_spin_trylock_irqsave(raw_spinlock_t *lock, + unsigned long *flags) +{ + local_irq_save(*flags); + preempt_disable(); + if (__raw_spin_trylock(lock)) + return 1; + + preempt_enable_no_resched(); + local_irq_restore(*flags); + preempt_check_resched(); + + return 0; +} +EXPORT_SYMBOL(_raw_spin_trylock_irqsave); -int in_lock_functions(unsigned long addr) +int notrace in_lock_functions(unsigned long addr) { /* 
Linker adds these: start and end of __lockfunc functions */ extern char __lock_text_start[], __lock_text_end[]; --- linux/kernel/timer.c.orig +++ linux/kernel/timer.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -68,6 +69,7 @@ struct tvec_t_base_s { spinlock_t lock; unsigned long timer_jiffies; struct timer_list *running_timer; + wait_queue_head_t wait_for_running_timer; tvec_root_t tv1; tvec_t tv2; tvec_t tv3; @@ -159,14 +161,15 @@ int __mod_timer(struct timer_list *timer { tvec_base_t *old_base, *new_base; unsigned long flags; - int ret = 0; + int ret = 0, cpu; BUG_ON(!timer->function); check_timer(timer); spin_lock_irqsave(&timer->lock, flags); - new_base = &__get_cpu_var(tvec_bases); + cpu = _smp_processor_id(); + new_base = &per_cpu(tvec_bases, cpu); repeat: old_base = timer->base; @@ -354,10 +357,8 @@ del_again: for_each_online_cpu(i) { base = &per_cpu(tvec_bases, i); if (base->running_timer == timer) { - while (base->running_timer == timer) { - cpu_relax(); - preempt_check_resched(); - } + wait_event(base->wait_for_running_timer, + base->running_timer != timer); break; } } @@ -441,7 +442,23 @@ static inline void __run_timers(tvec_bas struct list_head work_list = LIST_HEAD_INIT(work_list); struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; - + + if (softirq_need_resched()) { + /* running_timer might be stale: */ + set_running_timer(base, NULL); +// if (waitqueue_active(&base->wait_running_timer)) + wake_up(&base->wait_for_running_timer); + spin_unlock_irq(&base->lock); + cond_resched_all(); + cpu_relax(); + spin_lock_irq(&base->lock); + /* + * We can simply continue after preemption, nobody + * else can touch timer_jiffies so 'index' is still + * valid. 
Any new jiffy will be taken care of in + * subsequent loops: + */ + } /* * Cascade timers: */ @@ -470,16 +487,20 @@ repeat: u32 preempt_count = preempt_count(); fn(data); if (preempt_count != preempt_count()) { - printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); - BUG(); + print_symbol("BUG: unbalanced timer-handler preempt count in %s!\n", (unsigned long) fn); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; } } + cond_resched_all(); spin_lock_irq(&base->lock); goto repeat; } } set_running_timer(base, NULL); spin_unlock_irq(&base->lock); +// if (waitqueue_active(&base->wait_running_timer)) + wake_up(&base->wait_for_running_timer); } #ifdef CONFIG_NO_IDLE_HZ @@ -833,7 +854,14 @@ void update_process_times(int user_tick) */ static unsigned long count_active_tasks(void) { +#ifdef CONFIG_PREEMPT_RT + /* + * -1 for the timer IRQ thread: + */ + return (nr_running() - 1 + nr_uninterruptible()) * FIXED_1; +#else return (nr_running() + nr_uninterruptible()) * FIXED_1; +#endif } /* @@ -873,23 +901,12 @@ unsigned long wall_jiffies = INITIAL_JIF * playing with xtime and avenrun. */ #ifndef ARCH_HAVE_XTIME_LOCK -seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; +DECLARE_SEQLOCK(xtime_lock); EXPORT_SYMBOL(xtime_lock); #endif /* - * This function runs timers and the timer-tq in bottom half context. - */ -static void run_timer_softirq(struct softirq_action *h) -{ - tvec_base_t *base = &__get_cpu_var(tvec_bases); - - if (time_after_eq(jiffies, base->timer_jiffies)) - __run_timers(base); -} - -/* * Called by the local, per-CPU timer interrupt on SMP. */ void run_local_timers(void) @@ -898,22 +915,48 @@ void run_local_timers(void) } /* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! 
+ * Time of day handling: */ static inline void update_times(void) { - unsigned long ticks; + unsigned long ticks = 0; + /* + * First test outside the lock for performance reasons: + */ + if (jiffies != wall_jiffies) { + unsigned long flags; - ticks = jiffies - wall_jiffies; - if (ticks) { - wall_jiffies += ticks; - update_wall_time(ticks); + write_seqlock_irqsave(&xtime_lock, flags); + while (jiffies != wall_jiffies) { + wall_jiffies++; + ticks++; + update_wall_time(1); + /* + * Unlock unconditionally, to make sure + * we dont keep irqs off for a long time! + */ + write_sequnlock_irqrestore(&xtime_lock, flags); + cond_resched_softirq(); + write_seqlock_irqsave(&xtime_lock, flags); + } + calc_load(ticks); + write_sequnlock_irqrestore(&xtime_lock, flags); } - calc_load(ticks); } /* + * This function runs timers and the timer-tq in bottom half context. + */ +static void run_timer_softirq(struct softirq_action *h) +{ + tvec_base_t *base = &__get_cpu_var(tvec_bases); + + update_times(); + if (time_after_eq(jiffies, base->timer_jiffies)) + __run_timers(base); +} + +/* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. * jiffies is defined in the linker script... @@ -922,7 +965,6 @@ static inline void update_times(void) void do_timer(struct pt_regs *regs) { jiffies_64++; - update_times(); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1286,6 +1328,8 @@ static void __devinit init_timers_cpu(in base = &per_cpu(tvec_bases, cpu); spin_lock_init(&base->lock); + init_waitqueue_head(&base->wait_for_running_timer); + for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); INIT_LIST_HEAD(base->tv4.vec + j); --- linux/kernel/sys.c.orig +++ linux/kernel/sys.c @@ -164,7 +164,7 @@ EXPORT_SYMBOL(notifier_chain_unregister) * of the last notifier function called. 
*/ -int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) +int notrace notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) { int ret=NOTIFY_DONE; struct notifier_block *nb = *n; --- linux/kernel/latency.c.orig +++ linux/kernel/latency.c @@ -0,0 +1,1772 @@ +/* + * kernel/latency.c + * + * Copyright (C) 2004 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __i386__ +static inline cycles_t cycles(void) +{ + unsigned long long ret; + + rdtscll(ret); + + return ret; +} +#else +# define cycles() get_cycles() +#endif + +#ifdef CONFIG_WAKEUP_TIMING +struct sch_struct { + raw_spinlock_t trace_lock; + struct task_struct *task; + int cpu; + struct cpu_trace *tr; +} ____cacheline_aligned_in_smp; + +static __cacheline_aligned_in_smp struct sch_struct sch = + { trace_lock: RAW_SPIN_LOCK_UNLOCKED }; + +int wakeup_timing = 1; +#endif + +#ifdef CONFIG_LATENCY_TIMING + +/* + * Maximum preemption latency measured. Initialize to maximum, + * we clear it after bootup. + */ +static cycles_t preempt_max_latency = (cycles_t)ULONG_MAX; +static cycles_t preempt_thresh; + +/* + * Should this new latency be reported/recorded? + */ +static int report_latency(cycles_t delta) +{ + if (preempt_thresh) { + if (delta < preempt_thresh) + return 0; + } else { + if (delta <= preempt_max_latency) + return 0; + } + return 1; +} + +/* + * Track maximum latencies and save the trace: + */ +static __cacheline_aligned_in_smp DECLARE_MUTEX(max_mutex); +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. 
Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesnt + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp int max_sequence; + +enum trace_type +{ + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_SPECIAL, + TRACE_SPECIAL_PID, + TRACE_CMDLINE, + TRACE_SYSCALL, + TRACE_SYSRET, + + __TRACE_LAST_TYPE +}; + +enum trace_flag_type +{ + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_HARDIRQ = 0x04, + TRACE_FLAG_SOFTIRQ = 0x08, +}; + + +#ifdef CONFIG_LATENCY_TRACE + +#define MAX_TRACE (unsigned long)(4096-1) + +#define CMDLINE_BYTES 16 + +/* + * 32 bytes on 32-bit platforms: + */ +struct trace_entry { + char type; + char cpu; + char flags; + char preempt_count; // assumes PREEMPT_MASK is 8 bits or less + int pid; + cycles_t timestamp; + union { + struct { + unsigned long eip; + unsigned long parent_eip; + } fn; + struct { + unsigned long eip; + unsigned long v1, v2, v3; + } special; + struct { + unsigned char str[CMDLINE_BYTES]; + } cmdline; + struct { + unsigned int nr; + unsigned long p1, p2, p3; + } syscall; + struct { + unsigned int ret; + } sysret; + struct { + int __pad3[4]; + } pad; + } u; +} __attribute__((packed)); + +#endif + +struct cpu_trace { + atomic_t disabled; + unsigned long trace_idx; + cycles_t preempt_timestamp; + unsigned long critical_start, critical_end; + int critical_sequence; + int early_warning; + +#ifdef CONFIG_LATENCY_TRACE + struct trace_entry trace[MAX_TRACE]; + char comm[CMDLINE_BYTES]; + pid_t pid; + unsigned long uid; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + unsigned long saved_latency; +#endif + +} ____cacheline_aligned_in_smp; + +static struct cpu_trace cpu_traces[NR_CPUS] ____cacheline_aligned_in_smp; + +static unsigned long notrace cycles_to_usecs(cycles_t delta) +{ +#ifdef CONFIG_X86 + do_div(delta, cpu_khz/1000+1); +#elif defined(CONFIG_PPC) + delta = mulhwu(tb_to_us, 
delta); +#else + #error Implement cycles_to_usecs. +#endif + + return (unsigned long) delta; +} + +static cycles_t notrace usecs_to_cycles(unsigned long delta) +{ + return (cycles_t) delta * (cycles_t) (cpu_khz/1000+1); +} + +#ifdef CONFIG_LATENCY_TRACE + +int trace_enabled = 1; +int mcount_enabled = 1; +int trace_freerunning = 0; +int trace_print_at_crash = 0; +int trace_verbose = 0; +int trace_all_cpus = 0; + +/* + * user-triggered via gettimeofday(0,1)/gettimeofday(0,0) + */ +int trace_user_triggered = 0; + +struct saved_trace_struct { + int cpu; + cycles_t first_timestamp, last_timestamp; + struct cpu_trace traces[NR_CPUS]; +} ____cacheline_aligned_in_smp; + +/* + * The current worst-case trace: + */ +static struct saved_trace_struct max_tr; + +/* + * /proc/latency_trace atomicity: + */ +static DECLARE_MUTEX(out_mutex); + +static struct saved_trace_struct out_tr; + + +static inline void notrace +____trace(int cpu, enum trace_type type, struct cpu_trace *tr, + unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, unsigned long v3) +{ + struct trace_entry *entry; + +#ifdef CONFIG_DEBUG_STACKOVERFLOW + /* Debugging check for stack overflow: is there less than 1KB free? */ + { + long esp; + + __asm__ __volatile__("andl %%esp,%0" : + "=r" (esp) : "0" (THREAD_SIZE - 1)); + if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { + printk("BUG: mcount: stack overflow: %ld [%08lx...%08lx...%08lx]\n", + esp - sizeof(struct thread_info), (long)&esp, (long)current_thread_info(), (long)current_thread_info() + THREAD_SIZE); + dump_stack(); + } + } +#endif + + if (likely(tr->critical_start) || unlikely(trace_user_triggered || trace_all_cpus)) + if (tr->trace_idx < MAX_TRACE) { + u32 pc = preempt_count(); + + entry = tr->trace + tr->trace_idx; + entry->type = type; +#ifdef CONFIG_SMP + entry->cpu = cpu; +#endif + entry->flags = (irqs_disabled() ? TRACE_FLAG_IRQS_OFF : 0) | + ((pc & HARDIRQ_MASK) ? 
TRACE_FLAG_HARDIRQ : 0) | + ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + (_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); + entry->preempt_count = pc & 0xff; + entry->pid = current->pid; + entry->timestamp = cycles(); + + switch (type) { + case TRACE_FN: + entry->u.fn.eip = eip; + entry->u.fn.parent_eip = parent_eip; + break; + case TRACE_SPECIAL: + case TRACE_SPECIAL_PID: + entry->u.special.eip = eip; + entry->u.special.v1 = v1; + entry->u.special.v2 = v2; + entry->u.special.v3 = v3; + break; + case TRACE_SYSCALL: + entry->u.syscall.nr = eip; + entry->u.syscall.p1 = v1; + entry->u.syscall.p2 = v2; + entry->u.syscall.p3 = v3; + break; + case TRACE_SYSRET: + entry->u.sysret.ret = eip; + break; + case TRACE_CMDLINE: + memcpy(entry->u.cmdline.str, current->comm, CMDLINE_BYTES); + break; + default: + break; + } + } + tr->trace_idx++; + if (unlikely(trace_freerunning && (tr->trace_idx >= MAX_TRACE))) + tr->trace_idx = 0; +} + +static inline void notrace +___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, + unsigned long v3) +{ + int cpu = _smp_processor_id(); + struct cpu_trace *tr; + + if (unlikely(trace_enabled <= 0)) + return; + + /* + * Trace on the CPU where the current highest-prio task + * is waiting to become runnable: + */ +#ifdef CONFIG_WAKEUP_TIMING + if (wakeup_timing && !trace_all_cpus) { + if (!sch.tr || cpu != sch.cpu) + return; + tr = sch.tr; + } else + tr = cpu_traces + cpu; +#else + tr = cpu_traces + cpu; +#endif + if (likely(!atomic_read(&tr->disabled))) { + atomic_inc(&tr->disabled); + ____trace(cpu, type, tr, eip, parent_eip, v1, v2, v3); + atomic_dec(&tr->disabled); + } +} + +/* + * Special, ad-hoc tracepoints: + */ +void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3) +{ + ___trace(TRACE_SPECIAL, CALLER_ADDR0, 0, v1, v2, v3); +} + +EXPORT_SYMBOL(trace_special); + +void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2) +{ + 
___trace(TRACE_SPECIAL_PID, CALLER_ADDR0, 0, pid, v1, v2); +} + +EXPORT_SYMBOL(trace_special_pid); + +/* + * Non-inlined function: + */ +void notrace __trace(unsigned long eip, unsigned long parent_eip) +{ + ___trace(TRACE_FN, eip, parent_eip, 0, 0, 0); +} + +extern void mcount(void); + +EXPORT_SYMBOL(mcount); + +void notrace __mcount(void) +{ + ___trace(TRACE_FN, CALLER_ADDR1, CALLER_ADDR2, 0, 0, 0); +} + +void notrace +sys_call(int nr, unsigned long p1, unsigned long p2, unsigned long p3) +{ + ___trace(TRACE_SYSCALL, nr, 0, p1, p2, p3); +} + +void notrace sys_ret(int ret) +{ + ___trace(TRACE_SYSRET, ret, 0, 0, 0, 0); +} + +static void notrace print_name(struct seq_file *m, unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + /* + * Special trace values: + */ + if (((long)eip < 10000L) && ((long)eip > -10000L)) { + seq_printf(m, "(%ld)", eip); + return; + } + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + seq_puts(m, sym_name); + else + seq_printf(m, "<%08lx>", eip); +} + +static void notrace printk_name(unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + printk("%s+%#lx/%#lx", sym_name, offset, size); + else + printk("<%08lx>", eip); +} + + +static void notrace print_name_offset(struct seq_file *m, unsigned long eip) +{ + char namebuf[KSYM_NAME_LEN+1]; + unsigned long size, offset; + const char *sym_name; + char *modname; + + sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf); + if (sym_name) + seq_printf(m, "%s+%#lx/%#lx <%08lx>", + sym_name, offset, size, eip); + else + seq_printf(m, "<%08lx>", eip); +} + +static unsigned int out_sequence = -1; +static int pid_to_cmdline_array[PID_MAX_DEFAULT+1]; + +void notrace trace_cmdline(void) +{ + ___trace(TRACE_CMDLINE, 0, 0, 0, 
0, 0); +} +/* Rebuild pid_to_cmdline_array from the TRACE_CMDLINE entries of the output trace; does nothing when the trace's critical_sequence is unchanged since the last build. */ +static void construct_pid_to_cmdline(void) +{ + struct cpu_trace *tr = out_tr.traces; + unsigned int i, j, entries, pid; + +// printk("cs: %d, ls: %d, ms: %d\n", +// tr->critical_sequence, last_sequence, max_sequence); + if (tr->critical_sequence == out_sequence) + return; + out_sequence = tr->critical_sequence; + + memset(pid_to_cmdline_array, -1, sizeof(int) * (PID_MAX_DEFAULT + 1)); + + entries = min(tr->trace_idx, MAX_TRACE-1); +// printk("entries: %d\n", entries); + + for (i = 0; i < entries; i++) { + struct trace_entry *entry = tr->trace + i; + + if (entry->type != TRACE_CMDLINE) + continue; + pid = entry->pid; + if (pid < PID_MAX_DEFAULT) { + pid_to_cmdline_array[pid] = i; +// printk("pid %d -> idx %d [%16s]\n", +// pid, i, tr->trace[pid_to_cmdline_array[pid]].u.cmdline.str); + /* + * Replace space with underline - makes it easier + * to process for tools: + */ + for (j = 0; j < CMDLINE_BYTES; j++) + if (entry->u.cmdline.str[j] == ' ') + entry->u.cmdline.str[j] = '_'; + } + } +} +/* Return the command line recorded for pid in the output trace, or "" when none is known; pids above PID_MAX_DEFAULT are clamped to it. */ +char *pid_to_cmdline(unsigned long pid) +{ + struct cpu_trace *tr = out_tr.traces; + char *cmdline = ""; + int idx; + + pid = min(pid, (unsigned long)PID_MAX_DEFAULT); + if (pid_to_cmdline_array[pid] != -1) { + idx = pid_to_cmdline_array[pid]; + if (tr->trace[idx].type == TRACE_CMDLINE) + cmdline = tr->trace[idx].u.cmdline.str; + } + return cmdline; +} +/* Per-CPU read positions used while merging the per-CPU maximum traces. */ +struct block_idx { + int idx[NR_CPUS]; +}; + +/* + * return the CPU whose next unread trace entry has the smallest + * timestamp (position still in the valid idx range), or -1 if none: + */ +static int min_idx(struct block_idx *bidx) +{ + cycles_t min_stamp = (cycles_t) -1; + struct trace_entry *entry; + int cpu, min_cpu = -1, idx; + + for_each_online_cpu(cpu) { + idx = bidx->idx[cpu]; + entry = max_tr.traces[cpu].trace + bidx->idx[cpu]; + if (idx > max_tr.traces[cpu].trace_idx) /* NOTE(review): '>' still admits idx == trace_idx - confirm that slot is meant to be read */ + continue; + if (entry->timestamp < min_stamp) { + min_cpu = cpu; + min_stamp = entry->timestamp; + } + } + + return min_cpu; +} + +/* + * This code is called to 
construct an output trace from + * the maximum trace. Having separate traces serves both + * atomicity (a new max might be saved while we are busy + * accessing /proc/latency_trace) and it is also used to + * delay the (expensive) sorting of the output trace by + * timestamps, in the trace_all_cpus case. + */ +static void update_out_trace(void) +{ + int cpu, sum, entries; + struct cpu_trace *tmp_max, *tmp_out; + struct trace_entry *out_entry, *entry; + struct block_idx bidx = { { 0, } }; /* every CPU's trace is read from index 0 */ + cycles_t stamp, first_stamp = 0, last_stamp = (cycles_t)-1; + + /* + * Nasty trick. We might overflow the first array but + * there are NR_CPUS of them so we use it as a 'big' + * trace buffer. + */ + tmp_out = out_tr.traces + 0; + *tmp_out = max_tr.traces[max_tr.cpu]; /* struct copy of the trace saved by the CPU that recorded the maximum */ + out_tr.cpu = max_tr.cpu; + out_entry = tmp_out->trace + 0; + + if (!trace_all_cpus) { /* single-CPU case: the copy above already is the output trace */ + entries = min(tmp_out->trace_idx, MAX_TRACE-1); + if (!entries) + return; + out_tr.first_timestamp = tmp_out->trace[0].timestamp; + out_tr.last_timestamp = tmp_out->trace[entries-1].timestamp; + return; + } + /* + * Find the range of timestamps that are fully traced in + * all CPU traces. (since CPU traces can cover a variable + * range of time, we have to find the best range.) + */ + for_each_online_cpu(cpu) { + tmp_max = max_tr.traces + cpu; + stamp = tmp_max->trace[0].timestamp; +// printk("cpu%d stamp0: %016Lx [trace_idx: %ld]\n", +// cpu, stamp, tmp_max->trace_idx); + if (stamp > first_stamp) + first_stamp = stamp; /* keep the latest first-entry timestamp seen over the online CPUs */ + } +// printk("first_stamp: %016Lx\n", first_stamp); + /* + * Save the timestamp range: + */ + + tmp_max = max_tr.traces + max_tr.cpu; + entries = min(tmp_max->trace_idx, MAX_TRACE-1); + /* + * No saved trace yet? 
+ */ + if (!entries) { + out_tr.traces[0].trace_idx = 0; + return; + } + + last_stamp = tmp_max->trace[entries-1].timestamp; +// printk(" last_stamp: %016Lx [max cpu: %d]\n", +// last_stamp, max_tr.cpu); + + WARN_ON(last_stamp < first_stamp); + + out_tr.first_timestamp = first_stamp; + out_tr.last_timestamp = last_stamp; + + + /* + * Fetch trace entries one by one, in increasing timestamp + * order. Start at first_stamp, stop at last_stamp: + */ + sum = 0; + for (;;) { + cpu = min_idx(&bidx); +// printk("cpu: %d\n", cpu); + if (cpu == -1) + break; + entry = max_tr.traces[cpu].trace + bidx.idx[cpu]; +// printk("entry [%d][%d], stamp: %016Lx", +// cpu, bidx.idx[cpu], entry->timestamp); + if (entry->timestamp > last_stamp) { +// printk(" ... skipped\n"); + break; + } +// printk(" ... copied.\n"); + + bidx.idx[cpu]++; + if (entry->timestamp < first_stamp) + continue; + *out_entry = *entry; + out_entry++; + sum++; + } +// printk("sum: %d\n\n", sum); + + WARN_ON(sum > MAX_TRACE*NR_CPUS); + tmp_out->trace_idx = sum; +} +/* seq_file ->start: takes out_mutex (released only in l_stop()); at position 0 it rebuilds the output trace and prints the header; returns the entry at *pos, or NULL when there is nothing (more) to show. */ +static void * notrace l_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + unsigned long entries; + struct cpu_trace *tr; + + down(&out_mutex); + /* + * if the file is being read newly, update the output trace: + */ + if (!n) { + // TODO: use the sequence counter here to optimize + down(&max_mutex); + update_out_trace(); + up(&max_mutex); + if (!out_tr.traces[0].trace_idx) { + /* no up(&out_mutex) here: seq_file calls ->stop (l_stop) even when ->start returns NULL, a second up() would unbalance the semaphore */ + return NULL; + } + construct_pid_to_cmdline(); + } + tr = out_tr.traces; + entries = min(tr->trace_idx, MAX_TRACE); + + if (!n) { + seq_printf(m, "preemption latency trace v1.1.4 on %s\n", UTS_RELEASE); + seq_puts(m, "--------------------------------------------------------------------\n"); + seq_printf(m, " latency: %lu µs, #%lu/%lu, CPU#%d | (M:%s VP:%d, KP:%d, SP:%d HP:%d #P:%d)\n", + cycles_to_usecs(tr->saved_latency), + entries, tr->trace_idx, out_tr.cpu, +#if defined(CONFIG_PREEMPT_NONE) + "server", +#elif defined(CONFIG_PREEMPT_VOLUNTARY) + 
"desktop", +#elif defined(CONFIG_PREEMPT_DESKTOP) + "preempt", +#else + "rt", +#endif + voluntary_preemption, kernel_preemption, + softirq_preemption, hardirq_preemption, + num_online_cpus()); + seq_puts(m, " -----------------\n"); + seq_printf(m, " | task: %.16s-%d (uid:%ld nice:%ld policy:%ld rt_prio:%ld)\n", + tr->comm, tr->pid, tr->uid, tr->nice, + tr->policy, tr->rt_priority); + seq_puts(m, " -----------------\n"); + if (trace_user_triggered) { + seq_puts(m, " => started at: "); + print_name_offset(m, tr->critical_start); + seq_puts(m, "\n => ended at: "); + print_name_offset(m, tr->critical_end); + seq_puts(m, "\n"); + } + seq_puts(m, "\n"); + + seq_puts(m, " _------=> CPU# \n"); + seq_puts(m, " / _-----=> irqs-off \n"); + seq_puts(m, " | / _----=> need-resched \n"); + seq_puts(m, " || / _---=> hardirq/softirq \n"); + seq_puts(m, " ||| / _--=> preempt-depth \n"); + seq_puts(m, " |||| / \n"); + seq_puts(m, " ||||| delay \n"); + seq_puts(m, " cmd pid ||||| time | caller \n"); + seq_puts(m, " \\ / ||||| \\ | / \n"); + + } + if (n >= entries) + return NULL; + + return tr->trace + n; +} +/* seq_file ->next: advance *pos; a trailing vim modeline is emitted exactly once, when the end of the trace is reached. */ +static void * notrace l_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct cpu_trace *tr = out_tr.traces; + unsigned long entries = min(tr->trace_idx, MAX_TRACE); + + if (++*pos >= entries) { + if (*pos == entries) + seq_puts(m, "\n\nvim:ft=help\n"); + return NULL; + } + return tr->trace + *pos; +} +/* seq_file ->stop: release out_mutex taken in l_start(). */ +static void notrace l_stop(struct seq_file *m, void *p) +{ + up(&out_mutex); +} +/* Print the absolute time plus a delay marker: '!' when the gap to the next entry exceeds 100 usecs, '+' when it exceeds 1, blank otherwise. */ +static void print_timestamp(struct seq_file *m, unsigned long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4ldµs", abs_usecs); + if (rel_usecs > 100) + seq_puts(m, "!: "); + else if (rel_usecs > 1) + seq_puts(m, "+: "); + else + seq_puts(m, " : "); +} +/* Same thresholds as print_timestamp(), but the marker is a single character. */ +static void +print_timestamp_short(struct seq_file *m, unsigned long abs_usecs, + unsigned long rel_usecs) +{ + seq_printf(m, " %4ldµs", abs_usecs); + if (rel_usecs > 100) + seq_putc(m, '!'); + else if (rel_usecs > 1) + 
seq_putc(m, '+'); + else + seq_putc(m, ' '); +} +/* Print the prefix common to all entry types: cmdline-pid, CPU number, irqs-off and need-resched flags, hardirq/softirq state and preempt depth. */ +static void +print_generic(struct seq_file *m, struct trace_entry *entry) +{ + int hardirq, softirq; + + seq_printf(m, "%8.8s-%-5d ", pid_to_cmdline(entry->pid), entry->pid); + seq_printf(m, "%d", entry->cpu); + seq_printf(m, "%c%c", + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'n' : '.'); + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + if (hardirq && softirq) + seq_putc(m, 'H'); + else { + if (hardirq) + seq_putc(m, 'h'); + else { + if (softirq) + seq_putc(m, 's'); + else + seq_putc(m, '.'); + } + } + + if (entry->preempt_count) + seq_printf(m, "%x", entry->preempt_count); + else + seq_puts(m, "."); +} + +/* Show one TRACE_FN entry (function and its parent); trace_verbose selects the long raw format. Times are relative to entry0 (abs) and to next_entry (rel). */ +static int notrace l_show_fn(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + if (trace_verbose) { + seq_printf(m, "%16s %5d %d %d %08x %08lx [%016Lu] %ld.%03ldms (+%ld.%03ldms): ", + pid_to_cmdline(entry->pid), + entry->pid, entry->cpu, entry->flags, + entry->preempt_count, trace_idx, + entry->timestamp, abs_usecs/1000, + abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); + print_name_offset(m, entry->u.fn.eip); + seq_puts(m, " ("); + print_name_offset(m, entry->u.fn.parent_eip); + seq_puts(m, ")\n"); + } else { + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + print_name(m, entry->u.fn.eip); + seq_puts(m, " ("); + print_name(m, entry->u.fn.parent_eip); + seq_puts(m, ")\n"); + } + return 0; +} +/* Show one TRACE_SPECIAL entry: the caller followed by the three recorded values in hex. */ +static int notrace l_show_special(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + 
abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + print_name_offset(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + seq_printf(m, " (%lx %lx %lx)\n", + entry->u.special.v1, entry->u.special.v2, entry->u.special.v3); + + return 0; +} +/* Show one TRACE_SPECIAL_PID entry: v1 is read as a pid and resolved to a command name, v2/v3 are printed in hex. */ +static int notrace +l_show_special_pid(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + unsigned int pid; + + pid = entry->u.special.v1; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp(m, abs_usecs, rel_usecs); + if (trace_verbose) + print_name_offset(m, entry->u.special.eip); + else + print_name(m, entry->u.special.eip); + seq_printf(m, " <%.8s-%d> (%lx %lx): ", + pid_to_cmdline(pid), pid, + entry->u.special.v2, entry->u.special.v3); + + seq_puts(m, "\n"); + + return 0; +} + +/* TRACE_CMDLINE entries produce no output: the function returns before the formatting code that follows. */ +static int notrace l_show_cmdline(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + return 0; /* NOTE(review): early return leaves the rest of this function unreachable */ + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + seq_printf(m, + "[ => %16s ] %ld.%03ldms (+%ld.%03ldms)\n", + entry->u.cmdline.str, + abs_usecs/1000, abs_usecs % 1000, + rel_usecs/1000, rel_usecs % 1000); + + return 0; +} + +extern unsigned long sys_call_table[NR_syscalls]; +/* Show one TRACE_SYSCALL entry: the handler symbol looked up through sys_call_table, then the three recorded arguments. */ +static int notrace l_show_syscall(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long 
abs_usecs, rel_usecs; + unsigned int nr; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp_short(m, abs_usecs, rel_usecs); + + seq_puts(m, "> "); + nr = entry->u.syscall.nr; + if (nr < NR_syscalls) /* only index sys_call_table with an in-range number */ + print_name(m, sys_call_table[entry->u.syscall.nr]); + else + seq_puts(m, ""); /* NOTE(review): prints nothing for an out-of-range number - the literal may have lost a placeholder; confirm against upstream */ + + seq_printf(m, " (%08lx %08lx %08lx)\n", + entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3); + + return 0; +} +/* Show one TRACE_SYSRET entry: the recorded syscall return value. */ +static int notrace l_show_sysret(struct seq_file *m, unsigned long trace_idx, + struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + + abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + print_generic(m, entry); + print_timestamp_short(m, abs_usecs, rel_usecs); + + seq_printf(m, "< (%d)\n", entry->u.sysret.ret); + + return 0; +} + +/* seq_file ->show: skip entries outside [first_timestamp, last_timestamp] of the output trace, then dispatch on the entry type. */ +static int notrace l_show(struct seq_file *m, void *p) +{ + struct cpu_trace *tr = out_tr.traces; + struct trace_entry *entry, *entry0, *next_entry; + unsigned long trace_idx; + + entry = p; + if (entry->timestamp < out_tr.first_timestamp) + return 0; + if (entry->timestamp > out_tr.last_timestamp) + return 0; + + entry0 = tr->trace; + trace_idx = entry - entry0; + + if (trace_idx + 1 < tr->trace_idx) + next_entry = entry + 1; + else + next_entry = entry; /* last entry: the relative delay is computed against itself */ + + switch (entry->type) { + case TRACE_FN: + l_show_fn(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SPECIAL: + l_show_special(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SPECIAL_PID: + l_show_special_pid(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_CMDLINE: + l_show_cmdline(m, trace_idx, entry, entry0, next_entry); + break; + case TRACE_SYSCALL: + l_show_syscall(m, trace_idx, entry, entry0, next_entry); + break; + case 
TRACE_SYSRET: + l_show_sysret(m, trace_idx, entry, entry0, next_entry); + break; + default: + seq_printf(m, "unknown trace type %d\n", entry->type); + } + return 0; +} +/* seq_file iterator over the output trace. */ +struct seq_operations latency_trace_op = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show +}; +/* Copy a live trace into a save slot; a free-running (ring) buffer is unrolled starting at its current write index. */ +static void copy_trace(struct cpu_trace *save, struct cpu_trace *tr) +{ + /* free-running needs reordering */ + if (trace_freerunning) { + int i, idx, idx0 = tr->trace_idx; + + for (i = 0; i < MAX_TRACE; i++) { + idx = (idx0 + i) % MAX_TRACE; + save->trace[i] = tr->trace[idx]; + } + save->trace_idx = MAX_TRACE-1; + } else { + save->trace_idx = tr->trace_idx; + + memcpy(save->trace, tr->trace, + min(save->trace_idx + 1, MAX_TRACE) * + sizeof(struct trace_entry)); + } +} +/* Save the new maximum into max_tr: latency, critical-section bounds, identity of the current task and the trace buffer(s). With trace_all_cpus (in wakeup or user-triggered mode) every online CPU's buffer is disabled, copied and re-enabled. */ +static void update_max_tr(struct cpu_trace *tr) +{ + struct cpu_trace *save; + int this_cpu = smp_processor_id(), cpu, all_cpus = 0; + + WARN_ON(!preempt_count() && !irqs_disabled()); + + max_tr.cpu = this_cpu; + save = max_tr.traces + this_cpu; + + if ((wakeup_timing || trace_user_triggered) && trace_all_cpus) { + all_cpus = 1; + for_each_online_cpu(cpu) + atomic_inc(&cpu_traces[cpu].disabled); + } +// printk("this_cpu: %d, trace_idx: %ld.\n", this_cpu, tr->trace_idx); +// for_each_online_cpu(cpu) +// printk(".. 
cpu%d: %ld.\n", cpu, cpu_traces[cpu].trace_idx); + + save->saved_latency = preempt_max_latency; + save->preempt_timestamp = tr->preempt_timestamp; + save->critical_start = tr->critical_start; + save->critical_end = tr->critical_end; + save->critical_sequence = tr->critical_sequence; + + memcpy(save->comm, current->comm, CMDLINE_BYTES); + save->pid = current->pid; + save->uid = current->uid; + save->nice = current->static_prio - 20 - MAX_RT_PRIO; + save->policy = current->policy; + save->rt_priority = current->rt_priority; + + if (all_cpus) { + for_each_online_cpu(cpu) { + copy_trace(max_tr.traces + cpu, cpu_traces + cpu); + atomic_dec(&cpu_traces[cpu].disabled); + } + } else + copy_trace(save, tr); +} + +#else /* !LATENCY_TRACE */ + +static inline void notrace +____trace(int cpu, enum trace_type type, struct cpu_trace *tr, + unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, unsigned long v3) +{ +} + +static inline void notrace +___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip, + unsigned long v1, unsigned long v2, + unsigned long v3) +{ +} + +static inline void notrace __trace(unsigned long eip, unsigned long parent_eip) +{ +} + +static inline void update_max_tr(struct cpu_trace *tr) +{ +} + +#endif +/* Boot parameter handler for preempt_thresh=: a positive number of usecs sets the reporting threshold; anything else leaves it unchanged. */ +static int setup_preempt_thresh(char *s) +{ + int thresh = 0; /* get_option() stores nothing when the option string is empty, so start from a defined value */ + + get_option(&s, &thresh); + if (thresh > 0) { + preempt_thresh = usecs_to_cycles(thresh); + printk("Preemption threshold = %u µs\n", thresh); + } + return 1; +} +__setup("preempt_thresh=", setup_preempt_thresh); + +#ifdef CONFIG_CRITICAL_TIMING +/* Evaluate the critical section that just ended (skipped entirely in user-triggered mode): if report_latency() accepts the elapsed cycles, save and print a new maximum; afterwards the measurement state of tr is restarted. */ +static void notrace +check_critical_timing(struct cpu_trace *tr, unsigned long parent_eip) +{ + unsigned long latency, t0, t1; + cycles_t T1, T0, delta; + + if (trace_user_triggered) + return; + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = tr->preempt_timestamp; + T1 = cycles(); + delta = T1-T0; + + if (!report_latency(delta)) + goto out; + 
___trace(TRACE_FN, CALLER_ADDR0, parent_eip, 0, 0, 0); + /* + * Update the timestamp, because the trace entry above + * might change it (it can only get larger so the latency + * is fair to be reported): + */ + T1 = cycles(); + delta = T1-T0; + + if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex)) /* stale measurement, or the max trace is busy: do not report */ + goto out; + + preempt_max_latency = delta; + t0 = cycles_to_usecs(T0); + t1 = cycles_to_usecs(T1); + latency = cycles_to_usecs(delta); + + tr->critical_end = parent_eip; + + update_max_tr(tr); + + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu µs critical section " + "violates %lu µs threshold.\n" + " => started at timestamp %lu: ", + current->comm, current->pid, + _smp_processor_id(), + latency, cycles_to_usecs(preempt_thresh), t0); + else + printk("(%16s-%-5d|#%d): new %lu µs maximum-latency " + "critical section.\n => started at timestamp %lu: ", + current->comm, current->pid, + _smp_processor_id(), + latency, t0); + + print_symbol("<%s>\n", tr->critical_start); + printk(" => ended at timestamp %lu: ", t1); + print_symbol("<%s>\n", tr->critical_end); + dump_stack(); + t1 = cycles_to_usecs(cycles()); + printk(" => dump-end timestamp %lu\n\n", t1); + + max_sequence++; /* measurements tagged with the old sequence number fail the check above */ + + up(&max_mutex); +out: + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = cycles(); + tr->early_warning = 0; + tr->trace_idx = 0; + trace_cmdline(); + __trace(CALLER_ADDR0, parent_eip); +} +/* If a critical section is being timed on this CPU and the preempt count is non-zero, evaluate it now and restart timing from the caller's address. */ +void notrace touch_critical_timing(void) +{ + struct cpu_trace *tr = cpu_traces + _smp_processor_id(); + + if (!tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + if (preempt_count() > 0 && tr->critical_start) { + atomic_inc(&tr->disabled); + check_critical_timing(tr, CALLER_ADDR0); + tr->critical_start = CALLER_ADDR0; + tr->critical_sequence = max_sequence; + atomic_dec(&tr->disabled); + } +} +EXPORT_SYMBOL(touch_critical_timing); +/* Abandon the measurement in progress on this CPU by clearing critical_start. */ +void notrace stop_critical_timing(void) +{ + struct cpu_trace *tr = cpu_traces + 
_smp_processor_id(); + + tr->critical_start = 0; +} +EXPORT_SYMBOL(stop_critical_timing); +/* Begin timing a critical section on this CPU, unless one is already being timed, the buffer is disabled, or user-triggered/wakeup timing is active. */ +static inline void notrace +__start_critical_timing(unsigned long eip, unsigned long parent_eip) +{ + int cpu = _smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + + if (tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + atomic_inc(&tr->disabled); + + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = cycles(); + tr->critical_start = eip; + tr->trace_idx = 0; + trace_cmdline(); + ____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0); + + atomic_dec(&tr->disabled); +} +/* End the critical section being timed on this CPU: log the exit point, let check_critical_timing() evaluate it and clear critical_start. */ +static inline void notrace +__stop_critical_timing(unsigned long eip, unsigned long parent_eip) +{ + int cpu = _smp_processor_id(); + struct cpu_trace *tr = cpu_traces + cpu; + + if (!tr->critical_start || atomic_read(&tr->disabled) || + trace_user_triggered || wakeup_timing) + return; + + atomic_inc(&tr->disabled); + ____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0); + check_critical_timing(tr, eip); + tr->critical_start = 0; + atomic_dec(&tr->disabled); +} + +#endif + +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING +/* Start critical-section timing when the saved flags show irqs disabled and the preempt count is zero. */ +void notrace trace_irqs_off(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!preempt_count() && irqs_disabled_flags(flags)) + __start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +EXPORT_SYMBOL(trace_irqs_off); +/* Stop critical-section timing under the same condition (flags still show irqs disabled, preempt count zero). */ +void notrace trace_irqs_on(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (!preempt_count() && irqs_disabled_flags(flags)) + __stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +EXPORT_SYMBOL(trace_irqs_on); + +#endif + +#endif /* LATENCY_TIMING */ + +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING) +/* Out-of-line preempt count increment with optional debug checks, per-task call-site recording and critical-section timing. */ +void notrace add_preempt_count(int val) +{ + unsigned long eip = CALLER_ADDR0; + unsigned long parent_eip = CALLER_ADDR1; + +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? 
+ */ + BUG_ON(((int)preempt_count() < 0)); + /* + * Spinlock count overflowing soon? + */ + BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); +#endif + + preempt_count() += val; +#ifdef CONFIG_PREEMPT_TRACE + if (val <= 10) { + unsigned int idx = preempt_count() & PREEMPT_MASK; + if (idx < MAX_PREEMPT_TRACE) { /* remember the call site for this nesting level */ + current->preempt_trace_eip[idx] = eip; + current->preempt_trace_parent_eip[idx] = parent_eip; + } + } +#endif +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) +#endif + if (preempt_count() == val) /* the count was zero before this increment */ + __start_critical_timing(eip, parent_eip); + } +#endif +} +EXPORT_SYMBOL(add_preempt_count); +/* Out-of-line preempt count decrement; critical-section timing is stopped when this decrement is about to bring the count to zero. */ +void notrace sub_preempt_count(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + BUG_ON(unlikely(val > preempt_count())); + + /* + * Is the spinlock portion underflowing? + */ + BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); +#endif + +#ifdef CONFIG_CRITICAL_PREEMPT_TIMING + { +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + unsigned long flags; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) +#endif + if (preempt_count() == val) /* checked before the subtraction below */ + __stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); + } +#endif + preempt_count() -= val; +} + +EXPORT_SYMBOL(sub_preempt_count); + +#endif + +/* + * Wakeup latency timing/tracing. We get upcalls from the scheduler + * when a task is being woken up and we time/trace it until it gets + * to a CPU - or an even-higher-prio task supersedes it. 
(in that + * case we throw away the currently traced task - we don't try to + * handle nesting, that simplifies things significantly) + */ +#ifdef CONFIG_WAKEUP_TIMING +/* Evaluate the wakeup latency measured in tr (skipped in user-triggered mode): if report_latency() accepts it and the measurement is current, save it as the new maximum and print it. */ +static void notrace +check_wakeup_timing(struct cpu_trace *tr, unsigned long parent_eip) +{ + unsigned long latency; + unsigned long t0, t1; + cycles_t T0, T1, delta; + + if (trace_user_triggered) + return; + + atomic_inc(&tr->disabled); + if (atomic_read(&tr->disabled) != 1) /* the buffer was already disabled by someone else */ + goto out; + + T0 = tr->preempt_timestamp; + T1 = cycles(); + delta = T1-T0; + + if (!report_latency(delta)) + goto out; + + ____trace(smp_processor_id(), TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0); + T1 = cycles(); + delta = T1-T0; + + if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex)) + goto out; + + preempt_max_latency = delta; + t0 = cycles_to_usecs(T0); + t1 = cycles_to_usecs(T1); + latency = cycles_to_usecs(delta); + + tr->critical_end = parent_eip; + + update_max_tr(tr); + + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu µs wakeup latency " + "violates %lu µs threshold.\n", + current->comm, current->pid, + _smp_processor_id(), latency, + cycles_to_usecs(preempt_thresh)); + else + printk("(%16s-%-5d|#%d): new %lu µs maximum-latency " + "wakeup.\n", current->comm, current->pid, + _smp_processor_id(), latency); + + max_sequence++; + + up(&max_mutex); +out: + atomic_dec(&tr->disabled); +} + +/* + * Start wakeup latency tracing - called with the runqueue held + * and interrupts disabled: + */ +void __trace_start_sched_wakeup(struct task_struct *p) +{ + struct cpu_trace *tr; + int cpu; + + if (trace_user_triggered || !wakeup_timing) + return; + + spin_lock(&sch.trace_lock); + if (sch.task && (sch.task->prio >= p->prio)) /* NOTE(review): keeps the old task when its prio value is >= the new one's - check the direction against the comment below and the kernel's prio ordering */ + goto out_unlock; + /* + * New highest-prio task just woke up - start tracing: + */ + sch.task = p; + sch.cpu = task_cpu(p); + /* + * We keep using this CPU's trace buffer even if the task + * gets migrated to another CPU. 
Tracing only happens on + * the CPU that 'owns' the highest-prio task so it's + * fundamentally single-threaded. + */ + sch.tr = tr = cpu_traces + sch.cpu; + if (trace_all_cpus) + for_each_online_cpu(cpu) + cpu_traces[cpu].trace_idx = 0; + else + tr->trace_idx = 0; + + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = cycles(); + tr->critical_start = CALLER_ADDR0; + trace_cmdline(); + mcount(); +out_unlock: + spin_unlock(&sch.trace_lock); +} +/* Scheduler upcall (presumably for the task just switched in - TODO confirm against the caller): if p is the traced task its wakeup latency is evaluated; otherwise the traced task is dropped when its prio value is >= p's. */ +void trace_stop_sched_switched(struct task_struct *p) +{ + struct cpu_trace *tr; + unsigned long flags; + + trace_cmdline(); + if (trace_user_triggered || !wakeup_timing) + return; + + trace_special_pid(p->pid, p->prio, 0); + + spin_lock_irqsave(&sch.trace_lock, flags); + if (p == sch.task) { + sch.task = NULL; + tr = sch.tr; + sch.tr = NULL; + WARN_ON(!tr); /* NOTE(review): tr is still passed on and dereferenced below when NULL */ + /* + * Somewhat racy but safer: + */ + spin_unlock(&sch.trace_lock); + check_wakeup_timing(tr, CALLER_ADDR0); + local_irq_restore(flags); + } else { + if (sch.task) + trace_special_pid(sch.task->pid, sch.task->prio, p->prio); + if (sch.task && (sch.task->prio >= p->prio)) + sch.task = NULL; + spin_unlock_irqrestore(&sch.trace_lock, flags); + } +} +/* Record that the traced task moves to new_cpu, so that ___trace() keeps matching sch.cpu. */ +void trace_change_sched_cpu(struct task_struct *p, int new_cpu) +{ + unsigned long flags; + + if (!wakeup_timing) + return; + + trace_special(task_cpu(p), task_cpu(p), new_cpu); + spin_lock_irqsave(&sch.trace_lock, flags); + if (p == sch.task && task_cpu(p) != new_cpu) { + sch.cpu = new_cpu; + trace_special(task_cpu(p), new_cpu, 0); + } + spin_unlock_irqrestore(&sch.trace_lock, flags); +} + +#endif + +#ifdef CONFIG_LATENCY_TRACE +/* Start a user-triggered trace on the current CPU. Returns -EINVAL unless trace_user_triggered is set (and print-at-crash mode is off), -EAGAIN when max_mutex is busy, 0 on success. */ +long user_trace_start(void) +{ + struct cpu_trace *tr; + unsigned long flags; + int cpu; + + if (!trace_user_triggered || trace_print_at_crash) + return -EINVAL; + + if (down_trylock(&max_mutex)) + return -EAGAIN; + + preempt_disable(); + tr = cpu_traces + smp_processor_id(); + + if (wakeup_timing) { + spin_lock_irqsave(&sch.trace_lock, flags); + sch.task = current; + sch.cpu = 
smp_processor_id(); + sch.tr = tr; + spin_unlock_irqrestore(&sch.trace_lock, flags); + } + + if (trace_all_cpus) + for_each_online_cpu(cpu) + cpu_traces[cpu].trace_idx = 0; + else + tr->trace_idx = 0; + tr->critical_sequence = max_sequence; + tr->preempt_timestamp = cycles(); + trace_cmdline(); + mcount(); + preempt_enable(); + + up(&max_mutex); + + return 0; +} +/* Stop a user-triggered trace and, if report_latency() accepts the elapsed time, save it as the new maximum. With wakeup_timing only the task that started the trace may stop it (-EINVAL otherwise). */ +long user_trace_stop(void) +{ + unsigned long latency, flags; + struct cpu_trace *tr; + cycles_t delta; + + if (!trace_user_triggered || trace_print_at_crash) + return -EINVAL; + + preempt_disable(); + mcount(); + + if (wakeup_timing) { + spin_lock_irqsave(&sch.trace_lock, flags); + if (current != sch.task) { + spin_unlock_irqrestore(&sch.trace_lock, flags); + preempt_enable(); + return -EINVAL; + } + sch.task = NULL; + tr = sch.tr; + sch.tr = NULL; + spin_unlock_irqrestore(&sch.trace_lock, flags); + } else + tr = cpu_traces + smp_processor_id(); + + atomic_inc(&tr->disabled); + if (tr->preempt_timestamp) { /* zero means no trace was started (it is reset at 'out' below) */ + delta = cycles() - tr->preempt_timestamp; + if (!report_latency(delta)) + goto out; + if (tr->critical_sequence != max_sequence || + down_trylock(&max_mutex)) + goto out; + + preempt_max_latency = delta; + update_max_tr(tr); + + latency = cycles_to_usecs(delta); + + if (preempt_thresh) + printk("(%16s-%-5d|#%d): %lu µs user-latency " + "violates %lu µs threshold.\n", + current->comm, current->pid, + _smp_processor_id(), latency, + cycles_to_usecs(preempt_thresh)); + else + printk("(%16s-%-5d|#%d): new %lu µs user-latency.\n", + current->comm, current->pid, + _smp_processor_id(), latency); + + max_sequence++; + up(&max_mutex); +out: + tr->preempt_timestamp = 0; + } + atomic_dec(&tr->disabled); + preempt_enable(); + + return 0; +} + +EXPORT_SYMBOL(user_trace_stop); +/* In print-at-crash mode, freeze tracing by setting trace_enabled to -1 (the state print_last_trace() requires). */ +void stop_trace(void) +{ + if (trace_print_at_crash) + trace_enabled = -1; +} +/* printk() one entry as a function trace line: pid, flags/preempt count, time relative to entry0, function and parent. */ +static void print_entry(struct trace_entry *entry, struct trace_entry *entry0, + struct trace_entry *next_entry) +{ + unsigned long abs_usecs, rel_usecs; + 
+ abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp); + rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp); + + printk("%-5d %d/%d %ld.%03ldms: ", + entry->pid, entry->flags, entry->preempt_count, + abs_usecs/1000, abs_usecs % 1000); + + printk_name(entry->u.fn.eip); + printk(" <= ("); + printk_name(entry->u.fn.parent_eip); + printk(")\n"); +} +/* Dump this CPU's TRACE_FN entries via printk, walking the buffer as a ring from the current index. Only acts when trace_enabled is -1, i.e. after stop_trace() froze the trace. */ +void print_last_trace(void) +{ + unsigned int idx0, idx, i; + struct cpu_trace *tr; + struct trace_entry *entry0, *entry, *next_entry; + + if (trace_enabled != -1) + return; + + preempt_disable(); + tr = cpu_traces + smp_processor_id(); + + printk("Last %ld trace entries:\n", MAX_TRACE); + idx0 = tr->trace_idx; + printk("curr idx: %d\n", idx0); + if (idx0 >= MAX_TRACE) /* clamp to the last valid slot */ + idx0 = MAX_TRACE-1; + idx = idx0; + entry0 = tr->trace + idx0; + + for (i = 0; i < MAX_TRACE; i++) { + entry = tr->trace + idx; + idx++; + if (idx == MAX_TRACE) /* wrap around */ + idx = 0; + next_entry = tr->trace + idx; + if (entry->type == TRACE_FN) + print_entry(entry, entry0, next_entry); + } + trace_print_at_crash = 1; + preempt_enable(); +} + +#ifdef CONFIG_SMP +/* + * On SMP, try to 'peek' on other CPU's traces and record them + * in this CPU's trace. This way we get a rough idea about what's + * going on there, without the overhead of global tracing. + * + * (no need to make this PER_CPU, we bounce it around anyway.) 
+ */ +unsigned long nmi_eips[NR_CPUS]; /* parent_eip last published by each CPU's nmi_trace() */ +unsigned long nmi_flags[NR_CPUS]; /* flags last published by each CPU's nmi_trace() */ +/* Trace this NMI, publish this CPU's parent_eip/flags, and log what every other online CPU published last. */ +void notrace nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags) +{ + int cpu, this_cpu = smp_processor_id(); + + __trace(eip, parent_eip); + + nmi_eips[this_cpu] = parent_eip; + nmi_flags[this_cpu] = flags; + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (cpu_online(cpu) && cpu != this_cpu) { + __trace(eip, nmi_eips[cpu]); + __trace(eip, nmi_flags[cpu]); + } +} +#else +/* + * On UP, NMI tracing is quite simple: + */ +void notrace nmi_trace(unsigned long eip, unsigned long parent_eip, + unsigned long flags) +{ + __trace(eip, parent_eip); +} +#endif + +#endif + +#ifdef CONFIG_PREEMPT_TRACE +/* Print the call sites stored in the task's preempt_trace_eip/preempt_trace_parent_eip arrays, one per preempt nesting level (capped at MAX_PREEMPT_TRACE-1). */ +static void print_preempt_trace(struct task_struct *task) +{ + unsigned int count = task->thread_info->preempt_count; + unsigned int i, lim = count & PREEMPT_MASK; + if (lim >= MAX_PREEMPT_TRACE) + lim = MAX_PREEMPT_TRACE-1; + printk("---------------------------\n"); + printk("| preempt count: %08x ]\n", count); + printk("| %d-level deep critical section nesting:\n", lim); + printk("----------------------------------------\n"); + for (i = 1; i <= lim; i++) { + printk(".. [<%08lx>] .... ", task->preempt_trace_eip[i]); + print_symbol("%s\n", task->preempt_trace_eip[i]); + printk(".....[<%08lx>] .. 
( <= ", + task->preempt_trace_parent_eip[i]); + print_symbol("%s)\n", task->preempt_trace_parent_eip[i]); + } + printk("\n"); +} + +#endif +/* Debug helper: print the task's preempt nesting trace and/or the last latency trace, depending on the configuration. */ +#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_LATENCY_TRACE) +void print_traces(struct task_struct *task) +{ + preempt_disable(); +#ifdef CONFIG_PREEMPT_TRACE + print_preempt_trace(task); +#endif +#ifdef CONFIG_LATENCY_TRACE + print_last_trace(); +#endif + preempt_enable(); +} +#endif + +#ifdef CONFIG_LATENCY_TIMING +/* proc read handler: print the cycles_t that data points to, converted to usecs. */ +static int preempt_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + cycles_t *max = data; + + return sprintf(page, "%ld\n", cycles_to_usecs(*max)); +} +/* proc write handler: parse an unsigned decimal number of usecs (ended by NUL, newline or end of input) and store it, converted to cycles, in the cycles_t that data points to. Returns the number of bytes consumed, -EFAULT on a faulting read, -EINVAL on a non-digit. */ +static int preempt_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned int c, done = 0, val, sum = 0; + cycles_t *max = data; + + while (count) { + if (get_user(c, buffer)) + return -EFAULT; + val = c - '0'; + buffer++; + done++; + count--; + if (c == 0 || c == '\n') + break; + if (val > 9) + return -EINVAL; + sum *= 10; + sum += val; + } + *max = usecs_to_cycles(sum); + return done; +} +/* Create the proc entries sys/kernel/preempt_max_latency and sys/kernel/preempt_thresh and hook them to the handlers above; returns -ENOMEM if an entry cannot be created. */ +static __init int latency_init(void) +{ + struct proc_dir_entry *entry; + + entry = create_proc_entry("sys/kernel/preempt_max_latency", 0600, NULL); + if (!entry) return -ENOMEM; /* create_proc_entry() can fail; takes the place of the former blank line so the hunk length is unchanged */ + entry->nlink = 1; + entry->data = &preempt_max_latency; + entry->read_proc = preempt_read_proc; + entry->write_proc = preempt_write_proc; + + entry = create_proc_entry("sys/kernel/preempt_thresh", 0600, NULL); + if (!entry) return -ENOMEM; /* same check for the second entry */ + entry->nlink = 1; + entry->data = &preempt_thresh; + entry->read_proc = preempt_read_proc; + entry->write_proc = preempt_write_proc; + + return 0; +} +__initcall(latency_init); + +#endif + --- linux/kernel/softirq.c.orig +++ linux/kernel/softirq.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include /* @@ -71,7 +73,7 @@ static inline void wakeup_softirqd(void) */ #define MAX_SOFTIRQ_RESTART 10 -asmlinkage void __do_softirq(void) +asmlinkage void ___do_softirq(void) { struct softirq_action *h; __u32 
pending; @@ -80,7 +82,6 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); - local_bh_disable(); cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ @@ -92,8 +93,17 @@ restart: do { if (pending & 1) { - h->action(h); + { + u32 preempt_count = preempt_count(); + h->action(h); + if (preempt_count != preempt_count()) { + print_symbol("softirq preempt bug: exited %s with wrong preemption count!\n", (unsigned long) h->action); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; + } + } rcu_bh_qsctr_inc(cpu); + cond_resched_all(); } h++; pending >>= 1; @@ -107,10 +117,51 @@ restart: if (pending) wakeup_softirqd(); +} + +asmlinkage void __do_softirq(void) +{ + unsigned long p_flags; + +#ifdef CONFIG_PREEMPT_SOFTIRQS + /* + * 'preempt harder'. Push all softirq processing off to ksoftirqd. + */ + if (softirq_preemption) { + if (local_softirq_pending()) + wakeup_softirqd(); + return; + } +#endif + /* + * 'immediate' softirq execution: + */ + local_bh_disable(); + p_flags = current->flags & PF_HARDIRQ; + current->flags &= ~PF_HARDIRQ; + ___do_softirq(); __local_bh_enable(); + + current->flags |= p_flags; } +/* + * 'delayed' softirq execution. Does not disable bhs and thus + * makes most of the softirq handlers preemptable - as long as + * they are not executed 'directly'. 
+ */ +asmlinkage void _do_softirq(void) +{ + local_irq_disable(); + if (!softirq_preemption) + __do_softirq(); + else + ___do_softirq(); + local_irq_enable(); +} + + #ifndef __ARCH_HAS_DO_SOFTIRQ asmlinkage void do_softirq(void) @@ -135,6 +186,8 @@ EXPORT_SYMBOL(do_softirq); #endif +#ifndef CONFIG_PREEMPT_RT + void local_bh_enable(void) { WARN_ON(irqs_disabled()); @@ -152,6 +205,8 @@ void local_bh_enable(void) } EXPORT_SYMBOL(local_bh_enable); +#endif + #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED # define invoke_softirq() __do_softirq() #else @@ -349,8 +404,14 @@ void __init softirq_init(void) static int ksoftirqd(void * __bind_cpu) { - set_user_nice(current, 19); - current->flags |= PF_NOFREEZE; + struct sched_param param = { .sched_priority = MAX_RT_PRIO/4-1 }; + + printk("ksoftirqd started up.\n"); + + printk("softirq RT prio: %d.\n", param.sched_priority); +// sys_sched_setscheduler(current->pid, SCHED_FIFO, &param); + set_user_nice(current, -10); + current->flags |= PF_NOFREEZE | PF_SOFTIRQ; set_current_state(TASK_INTERRUPTIBLE); @@ -367,8 +428,8 @@ static int ksoftirqd(void * __bind_cpu) preempt_disable(); if (cpu_is_offline((long)__bind_cpu)) goto wait_to_die; - do_softirq(); preempt_enable(); + _do_softirq(); cond_resched(); } @@ -419,7 +480,7 @@ void tasklet_kill_immediate(struct taskl BUG(); } -static void takeover_tasklets(unsigned int cpu) +void takeover_tasklets(unsigned int cpu) { struct tasklet_struct **i; @@ -490,3 +551,33 @@ __init int spawn_ksoftirqd(void) register_cpu_notifier(&cpu_nfb); return 0; } + +#ifdef CONFIG_PREEMPT_SOFTIRQS + +int softirq_preemption = 1; + +EXPORT_SYMBOL(softirq_preemption); + +/* + * Real-Time Preemption depends on softirq threading: + */ +#ifndef CONFIG_PREEMPT_RT + +static int __init softirq_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + softirq_preemption = 0; + else + get_option(&str, &softirq_preemption); + if (!softirq_preemption) + printk("turning off softirq preemption!\n"); + + return 1; +} +
+__setup("softirq-preempt=", softirq_preempt_setup); + +#endif + +#endif + --- linux/kernel/futex.c.orig +++ linux/kernel/futex.c @@ -539,8 +539,13 @@ static int futex_wait(unsigned long uadd * !list_empty() is safe here without any lock. * q.lock_ptr != 0 is not safe, because of ordering against wakeup. */ - if (likely(!list_empty(&q.list))) + if (likely(!list_empty(&q.list))) { + unsigned long nosched_flag = current->flags & PF_NOSCHED; + + current->flags &= ~PF_NOSCHED; time = schedule_timeout(time); + current->flags |= nosched_flag; + } __set_current_state(TASK_RUNNING); /* --- linux/kernel/sysctl.c.orig +++ linux/kernel/sysctl.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -274,6 +275,130 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = KERN_PANIC, + .procname = "prof_pid", + .data = &prof_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_PREEMPT + { + .ctl_name = KERN_PANIC, + .procname = "kernel_preemption", + .data = &kernel_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_PREEMPT_VOLUNTARY + { + .ctl_name = KERN_PANIC, + .procname = "voluntary_preemption", + .data = &voluntary_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_PREEMPT_SOFTIRQS) && !defined(CONFIG_PREEMPT_RT) + { + .ctl_name = KERN_PANIC, + .procname = "softirq_preemption", + .data = &softirq_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#if defined(CONFIG_PREEMPT_HARDIRQS) && !defined(CONFIG_PREEMPT_RT) + { + .ctl_name = KERN_PANIC, + .procname = "hardirq_preemption", + .data = &hardirq_preemption, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_WAKEUP_TIMING + { + .ctl_name = KERN_PANIC, + .procname = "wakeup_timing", + .data = 
&wakeup_timing, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_LATENCY_TRACE + { + .ctl_name = KERN_PANIC, + .procname = "trace_enabled", + .data = &trace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "mcount_enabled", + .data = &mcount_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_user_triggered", + .data = &trace_user_triggered, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_freerunning", + .data = &trace_freerunning, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_print_at_crash", + .data = &trace_print_at_crash, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_verbose", + .data = &trace_verbose, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = KERN_PANIC, + .procname = "trace_all_cpus", + .data = &trace_all_cpus, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = KERN_PANIC, + .procname = "debug_direct_keyboard", + .data = &debug_direct_keyboard, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = KERN_CORE_USES_PID, .procname = "core_uses_pid", .data = &core_uses_pid, --- linux/ipc/msg.c.orig +++ linux/ipc/msg.c @@ -62,9 +62,14 @@ static atomic_t msg_hdrs = ATOMIC_INIT(0 static struct ipc_ids msg_ids; -#define msg_lock(id) ((struct msg_queue*)ipc_lock(&msg_ids,id)) -#define msg_unlock(msq) ipc_unlock(&(msq)->q_perm) -#define msg_rmid(id) ((struct msg_queue*)ipc_rmid(&msg_ids,id)) +#define msg_lock(id) ((struct 
msg_queue*)ipc_lock(&msg_ids,id)) +#define msg_lock_writer(id) ((struct msg_queue*)ipc_lock_writer(&msg_ids,id)) +#define msg_lock_ptr(msq) ipc_lock_by_ptr(&msg_ids, &(msq)->q_perm) +#define msg_lock_ptr_writer(msq) \ + ipc_lock_by_ptr_writer(&msg_ids, &(msq)->q_perm) +#define msg_unlock(msq) ipc_unlock(&msg_ids, &(msq)->q_perm) +#define msg_unlock_writer(msq) ipc_unlock_writer(&msg_ids, &(msq)->q_perm) +#define msg_rmid(id) ((struct msg_queue*)ipc_rmid(&msg_ids,id)) #define msg_checkid(msq, msgid) \ ipc_checkid(&msg_ids,&msq->q_perm,msgid) #define msg_buildid(id, seq) \ @@ -105,7 +110,7 @@ static int newque (key_t key, int msgflg return retval; } - id = ipc_addid(&msg_ids, &msq->q_perm, msg_ctlmni); + id = ipc_addid_writer(&msg_ids, &msq->q_perm, msg_ctlmni); if(id == -1) { security_msg_queue_free(msq); ipc_rcu_putref(msq); @@ -120,7 +125,7 @@ static int newque (key_t key, int msgflg INIT_LIST_HEAD(&msq->q_messages); INIT_LIST_HEAD(&msq->q_receivers); INIT_LIST_HEAD(&msq->q_senders); - msg_unlock(msq); + msg_unlock_writer(msq); return msg_buildid(id,msq->q_perm.seq); } @@ -185,7 +190,7 @@ static void freeque (struct msg_queue *m expunge_all(msq,-EIDRM); ss_wakeup(&msq->q_senders,1); msq = msg_rmid(id); - msg_unlock(msq); + msg_unlock_writer(msq); tmp = msq->q_messages.next; while(tmp != &msq->q_messages) { @@ -215,7 +220,7 @@ asmlinkage long sys_msgget (key_t key, i } else if (msgflg & IPC_CREAT && msgflg & IPC_EXCL) { ret = -EEXIST; } else { - msq = msg_lock(id); + msq = msg_lock_writer(id); if(msq==NULL) BUG(); if (ipcperms(&msq->q_perm, msgflg)) @@ -226,7 +231,7 @@ asmlinkage long sys_msgget (key_t key, i if (!ret) ret = qid; } - msg_unlock(msq); + msg_unlock_writer(msq); } up(&msg_ids.sem); return ret; @@ -433,7 +438,7 @@ asmlinkage long sys_msgctl (int msqid, i } down(&msg_ids.sem); - msq = msg_lock(msqid); + msq = msg_lock_writer(msqid); err=-EINVAL; if (msq == NULL) goto out_up; @@ -474,7 +479,7 @@ asmlinkage long sys_msgctl (int msqid, i * due to a larger 
queue size. */ ss_wakeup(&msq->q_senders,0); - msg_unlock(msq); + msg_unlock_writer(msq); break; } case IPC_RMID: @@ -486,7 +491,7 @@ out_up: up(&msg_ids.sem); return err; out_unlock_up: - msg_unlock(msq); + msg_unlock_writer(msq); goto out_up; out_unlock: msg_unlock(msq); @@ -602,7 +607,7 @@ asmlinkage long sys_msgsnd (int msqid, s msg_unlock(msq); schedule(); - ipc_lock_by_ptr(&msq->q_perm); + msg_lock_ptr(msq); ipc_rcu_putref(msq); if (msq->q_perm.deleted) { err = -EIDRM; @@ -749,7 +754,7 @@ asmlinkage long sys_msgrcv (int msqid, s * rcu_read_lock() prevents preemption between reading r_msg * and the spin_lock() inside ipc_lock_by_ptr(). */ - rcu_read_lock(); + rcu_read_lock_sem(&msg_ids.sem); /* Lockless receive, part 2: * Wait until pipelined_send or expunge_all are outside of @@ -767,15 +772,17 @@ asmlinkage long sys_msgrcv (int msqid, s * locking. */ if(msg != ERR_PTR(-EAGAIN)) { - rcu_read_unlock(); + rcu_read_unlock_sem(&msg_ids.sem); break; } /* Lockless receive, part 3: * Acquire the queue spinlock. + * + * in the PREEMPT_RT case keep the semaphore held: */ - ipc_lock_by_ptr(&msq->q_perm); - rcu_read_unlock(); + msg_lock_ptr_writer(msq); + rcu_read_unlock_nort(); /* Lockless receive, part 4: * Repeat test after acquiring the spinlock. 
@@ -816,7 +823,7 @@ static int sysvipc_msg_read_proc(char *b for(i = 0; i <= msg_ids.max_id; i++) { struct msg_queue * msq; - msq = msg_lock(i); + msq = msg_lock_writer(i); if(msq != NULL) { len += sprintf(buffer + len, "%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n", msq->q_perm.key, @@ -833,7 +840,7 @@ static int sysvipc_msg_read_proc(char *b msq->q_stime, msq->q_rtime, msq->q_ctime); - msg_unlock(msq); + msg_unlock_writer(msq); pos += len; if(pos < offset) { --- linux/ipc/sem.c.orig +++ linux/ipc/sem.c @@ -76,9 +76,12 @@ #include "util.h" -#define sem_lock(id) ((struct sem_array*)ipc_lock(&sem_ids,id)) -#define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) -#define sem_rmid(id) ((struct sem_array*)ipc_rmid(&sem_ids,id)) +#define sem_lock(id) ((struct sem_array*)ipc_lock(&sem_ids,id)) +#define sem_lock_writer(id) ((struct sem_array*)ipc_lock_writer(&sem_ids,id)) +#define sem_lock_ptr(sma) ipc_lock_by_ptr(&sem_ids,&(sma)->sem_perm) +#define sem_unlock(sma) ipc_unlock(&sem_ids, &(sma)->sem_perm) +#define sem_unlock_writer(sma) ipc_unlock_writer(&sem_ids, &(sma)->sem_perm) +#define sem_rmid(id) ((struct sem_array*)ipc_rmid(&sem_ids,id)) #define sem_checkid(sma, semid) \ ipc_checkid(&sem_ids,&sma->sem_perm,semid) #define sem_buildid(id, seq) \ @@ -184,7 +187,7 @@ static int newary (key_t key, int nsems, return retval; } - id = ipc_addid(&sem_ids, &sma->sem_perm, sc_semmni); + id = ipc_addid_writer(&sem_ids, &sma->sem_perm, sc_semmni); if(id == -1) { security_sem_free(sma); ipc_rcu_putref(sma); @@ -198,7 +201,7 @@ static int newary (key_t key, int nsems, /* sma->undo = NULL; */ sma->sem_nsems = nsems; sma->sem_ctime = get_seconds(); - sem_unlock(sma); + sem_unlock_writer(sma); return sem_buildid(id, sma->sem_perm.seq); } @@ -210,35 +213,44 @@ asmlinkage long sys_semget (key_t key, i if (nsems < 0 || nsems > sc_semmsl) return -EINVAL; - down(&sem_ids.sem); if (key == IPC_PRIVATE) { + down(&sem_ids.sem); err = newary(key, nsems, semflg); - } else 
if ((id = ipc_findkey(&sem_ids, key)) == -1) { /* key not used */ + up(&sem_ids.sem); + return err; + } + + down(&sem_ids.sem); + if ((id = ipc_findkey(&sem_ids, key)) == -1) { /* key not used */ if (!(semflg & IPC_CREAT)) err = -ENOENT; else err = newary(key, nsems, semflg); - } else if (semflg & IPC_CREAT && semflg & IPC_EXCL) { - err = -EEXIST; - } else { - sma = sem_lock(id); - if(sma==NULL) - BUG(); - if (nsems > sma->sem_nsems) - err = -EINVAL; - else if (ipcperms(&sma->sem_perm, semflg)) - err = -EACCES; - else { - int semid = sem_buildid(id, sma->sem_perm.seq); - err = security_sem_associate(sma, semflg); - if (!err) - err = semid; - } - sem_unlock(sma); + up(&sem_ids.sem); + return err; } + if (semflg & IPC_CREAT && semflg & IPC_EXCL) { + err = -EEXIST; + up(&sem_ids.sem); + return err; + } + sma = sem_lock_writer(id); + BUG_ON(!sma); + if (nsems > sma->sem_nsems) + err = -EINVAL; + else if (ipcperms(&sma->sem_perm, semflg)) + err = -EACCES; + else { + int semid = sem_buildid(id, sma->sem_perm.seq); + err = security_sem_associate(sma, semflg); + if (!err) + err = semid; + } + sem_unlock_writer(sma); up(&sem_ids.sem); + return err; } @@ -464,7 +476,7 @@ static void freeary (struct sem_array *s /* Remove the semaphore set from the ID array*/ sma = sem_rmid(id); - sem_unlock(sma); + sem_unlock_writer(sma); used_sems -= sma->sem_nsems; size = sizeof (*sma) + sma->sem_nsems * sizeof (struct sem); @@ -615,13 +627,13 @@ static int semctl_main(int semid, int se sem_io = ipc_alloc(sizeof(ushort)*nsems); if(sem_io == NULL) { - ipc_lock_by_ptr(&sma->sem_perm); + sem_lock_ptr(sma); ipc_rcu_putref(sma); sem_unlock(sma); return -ENOMEM; } - ipc_lock_by_ptr(&sma->sem_perm); + sem_lock_ptr(sma); ipc_rcu_putref(sma); if (sma->sem_perm.deleted) { sem_unlock(sma); @@ -649,7 +661,7 @@ static int semctl_main(int semid, int se if(nsems > SEMMSL_FAST) { sem_io = ipc_alloc(sizeof(ushort)*nsems); if(sem_io == NULL) { - ipc_lock_by_ptr(&sma->sem_perm); + sem_lock_ptr(sma); 
ipc_rcu_putref(sma); sem_unlock(sma); return -ENOMEM; @@ -657,7 +669,7 @@ static int semctl_main(int semid, int se } if (copy_from_user (sem_io, arg.array, nsems*sizeof(ushort))) { - ipc_lock_by_ptr(&sma->sem_perm); + sem_lock_ptr(sma); ipc_rcu_putref(sma); sem_unlock(sma); err = -EFAULT; @@ -666,14 +678,14 @@ static int semctl_main(int semid, int se for (i = 0; i < nsems; i++) { if (sem_io[i] > SEMVMX) { - ipc_lock_by_ptr(&sma->sem_perm); + sem_lock_ptr(sma); ipc_rcu_putref(sma); sem_unlock(sma); err = -ERANGE; goto out_free; } } - ipc_lock_by_ptr(&sma->sem_perm); + sem_lock_ptr(sma); ipc_rcu_putref(sma); if (sma->sem_perm.deleted) { sem_unlock(sma); @@ -804,7 +816,7 @@ static int semctl_down(int semid, int se if(copy_semid_from_user (&setbuf, arg.buf, version)) return -EFAULT; } - sma = sem_lock(semid); + sma = sem_lock_writer(semid); if(sma==NULL) return -EINVAL; @@ -835,18 +847,18 @@ static int semctl_down(int semid, int se ipcp->mode = (ipcp->mode & ~S_IRWXUGO) | (setbuf.mode & S_IRWXUGO); sma->sem_ctime = get_seconds(); - sem_unlock(sma); + sem_unlock_writer(sma); err = 0; break; default: - sem_unlock(sma); + sem_unlock_writer(sma); err = -EINVAL; break; } return err; out_unlock: - sem_unlock(sma); + sem_unlock_writer(sma); return err; } @@ -1004,7 +1016,7 @@ static struct sem_undo *find_undo(int se new = (struct sem_undo *) kmalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); if (!new) { - ipc_lock_by_ptr(&sma->sem_perm); + sem_lock_ptr(sma); ipc_rcu_putref(sma); sem_unlock(sma); return ERR_PTR(-ENOMEM); @@ -1018,12 +1030,12 @@ static struct sem_undo *find_undo(int se if (un) { unlock_semundo(); kfree(new); - ipc_lock_by_ptr(&sma->sem_perm); + sem_lock_ptr(sma); ipc_rcu_putref(sma); sem_unlock(sma); goto out; } - ipc_lock_by_ptr(&sma->sem_perm); + sem_lock_ptr(sma); ipc_rcu_putref(sma); if (sma->sem_perm.deleted) { sem_unlock(sma); @@ -1343,7 +1355,7 @@ static int sysvipc_sem_read_proc(char *b for(i = 0; i <= sem_ids.max_id; i++) { struct 
sem_array *sma; - sma = sem_lock(i); + sma = sem_lock_writer(i); if(sma) { len += sprintf(buffer + len, "%10d %10d %4o %10lu %5u %5u %5u %5u %10lu %10lu\n", sma->sem_perm.key, @@ -1356,7 +1368,7 @@ static int sysvipc_sem_read_proc(char *b sma->sem_perm.cgid, sma->sem_otime, sma->sem_ctime); - sem_unlock(sma); + sem_unlock_writer(sma); pos += len; if(pos < offset) { --- linux/ipc/util.h.orig +++ linux/ipc/util.h @@ -34,7 +34,7 @@ void __init ipc_init_ids(struct ipc_ids* /* must be called with ids->sem acquired.*/ int ipc_findkey(struct ipc_ids* ids, key_t key); -int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size); +int ipc_addid_writer(struct ipc_ids* ids, struct kern_ipc_perm* new, int size); /* must be called with both locks acquired. */ struct kern_ipc_perm* ipc_rmid(struct ipc_ids* ids, int id); @@ -59,8 +59,11 @@ void ipc_rcu_putref(void *ptr); struct kern_ipc_perm* ipc_get(struct ipc_ids* ids, int id); struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id); -void ipc_lock_by_ptr(struct kern_ipc_perm *ipcp); -void ipc_unlock(struct kern_ipc_perm* perm); +struct kern_ipc_perm* ipc_lock_writer(struct ipc_ids* ids, int id); +void ipc_lock_by_ptr(struct ipc_ids* ids, struct kern_ipc_perm *ipcp); +void ipc_lock_by_ptr_writer(struct ipc_ids* ids, struct kern_ipc_perm *ipcp); +void ipc_unlock(struct ipc_ids* ids, struct kern_ipc_perm* perm); +void ipc_unlock_writer(struct ipc_ids* ids, struct kern_ipc_perm* perm); int ipc_buildid(struct ipc_ids* ids, int id, int seq); int ipc_checkid(struct ipc_ids* ids, struct kern_ipc_perm* ipcp, int uid); --- linux/ipc/util.c.orig +++ linux/ipc/util.c @@ -166,7 +166,7 @@ static int grow_ary(struct ipc_ids* ids, * Called with ipc_ids.sem held. 
*/ -int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) +int ipc_addid_writer(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) { int id; @@ -195,7 +195,11 @@ found: spin_lock_init(&new->lock); new->deleted = 0; - rcu_read_lock(); + /* + * We cannot use rcu_read_lock_sem(&ids->sem) here because + * we are holding it already - so it must not be taken again. + */ + rcu_read_lock_nort(); spin_lock(&new->lock); ids->entries->p[id] = new; return id; @@ -507,15 +511,45 @@ struct kern_ipc_perm* ipc_lock(struct ip int lid = id % SEQ_MULTIPLIER; struct ipc_id_ary* entries; - rcu_read_lock(); + rcu_read_lock_sem(&ids->sem); + entries = rcu_dereference(ids->entries); + if(lid >= entries->size) { + rcu_read_unlock_sem(&ids->sem); + return NULL; + } + out = entries->p[lid]; + if(out == NULL) { + rcu_read_unlock_sem(&ids->sem); + return NULL; + } + spin_lock(&out->lock); + + /* ipc_rmid() may have already freed the ID while ipc_lock + * was spinning: here verify that the structure is still valid + */ + if (out->deleted) { + spin_unlock(&out->lock); + rcu_read_unlock_sem(&ids->sem); + return NULL; + } + return out; +} + +struct kern_ipc_perm* ipc_lock_writer(struct ipc_ids* ids, int id) +{ + struct kern_ipc_perm* out; + int lid = id % SEQ_MULTIPLIER; + struct ipc_id_ary* entries; + + rcu_read_lock_nort(); entries = rcu_dereference(ids->entries); if(lid >= entries->size) { - rcu_read_unlock(); + rcu_read_unlock_nort(); return NULL; } out = entries->p[lid]; if(out == NULL) { - rcu_read_unlock(); + rcu_read_unlock_nort(); return NULL; } spin_lock(&out->lock); @@ -525,24 +559,45 @@ struct kern_ipc_perm* ipc_lock(struct ip */ if (out->deleted) { spin_unlock(&out->lock); - rcu_read_unlock(); + rcu_read_unlock_nort(); return NULL; } return out; } -void ipc_lock_by_ptr(struct kern_ipc_perm *perm) +void ipc_lock_by_ptr(struct ipc_ids* ids, struct kern_ipc_perm *perm) { - rcu_read_lock(); + rcu_read_lock_sem(&ids->sem); spin_lock(&perm->lock); } -void 
ipc_unlock(struct kern_ipc_perm* perm) +void ipc_lock_by_ptr_writer(struct ipc_ids* ids, struct kern_ipc_perm *perm) +{ + rcu_read_lock_nort(); + spin_lock(&perm->lock); +} + +void ipc_unlock(struct ipc_ids* ids, struct kern_ipc_perm* perm) { spin_unlock(&perm->lock); + rcu_read_unlock_sem(&ids->sem); +} + +/* + * In the PREEMPT_RT case it is important to distinguish between + * unlocks done while holding ids.sem for array growing purposes. This + * function does not drop the semaphore. rcu_read_lock/unlock can nest + * just fine in the PREEMPT_RT case. + */ +void ipc_unlock_writer(struct ipc_ids* ids, struct kern_ipc_perm* perm) +{ + spin_unlock(&perm->lock); +#ifndef CONFIG_PREEMPT_RT rcu_read_unlock(); +#endif } + int ipc_buildid(struct ipc_ids* ids, int id, int seq) { return SEQ_MULTIPLIER*seq + id; --- linux/ipc/shm.c.orig +++ linux/ipc/shm.c @@ -38,9 +38,11 @@ static struct vm_operations_struct shm_v static struct ipc_ids shm_ids; -#define shm_lock(id) ((struct shmid_kernel*)ipc_lock(&shm_ids,id)) -#define shm_unlock(shp) ipc_unlock(&(shp)->shm_perm) -#define shm_get(id) ((struct shmid_kernel*)ipc_get(&shm_ids,id)) +#define shm_lock(id) ((struct shmid_kernel*)ipc_lock(&shm_ids,id)) +#define shm_lock_writer(id) ((struct shmid_kernel*)ipc_lock_writer(&shm_ids,id)) +#define shm_unlock(shp) ipc_unlock(&shm_ids,&(shp)->shm_perm) +#define shm_unlock_writer(shp) ipc_unlock_writer(&shm_ids,&(shp)->shm_perm) +#define shm_get(id) ((struct shmid_kernel*)ipc_get(&shm_ids,id)) #define shm_buildid(id, seq) \ ipc_buildid(&shm_ids, id, seq) @@ -77,9 +79,9 @@ static inline struct shmid_kernel *shm_r return (struct shmid_kernel *)ipc_rmid(&shm_ids,id); } -static inline int shm_addid(struct shmid_kernel *shp) +static inline int shm_addid_writer(struct shmid_kernel *shp) { - return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni); + return ipc_addid_writer(&shm_ids, &shp->shm_perm, shm_ctlmni); } @@ -109,11 +111,11 @@ static void shm_open (struct vm_area_str * It has to be called 
with shp and shm_ids.sem locked, * but returns with shp unlocked and freed. */ -static void shm_destroy (struct shmid_kernel *shp) +static void shm_destroy_writer(struct shmid_kernel *shp) { shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; shm_rmid (shp->id); - shm_unlock(shp); + shm_unlock_writer(shp); if (!is_file_hugepages(shp->shm_file)) shmem_lock(shp->shm_file, 0, shp->mlock_user); else @@ -138,16 +140,16 @@ static void shm_close (struct vm_area_st down (&shm_ids.sem); /* remove from the list of attaches of the shm segment */ - if(!(shp = shm_lock(id))) + if(!(shp = shm_lock_writer(id))) BUG(); shp->shm_lprid = current->tgid; shp->shm_dtim = get_seconds(); shp->shm_nattch--; if(shp->shm_nattch == 0 && shp->shm_flags & SHM_DEST) - shm_destroy (shp); + shm_destroy_writer(shp); else - shm_unlock(shp); + shm_unlock_writer(shp); up (&shm_ids.sem); } @@ -216,7 +218,7 @@ static int newseg (key_t key, int shmflg goto no_file; error = -ENOSPC; - id = shm_addid(shp); + id = shm_addid_writer(shp); if(id == -1) goto no_id; @@ -234,7 +236,7 @@ static int newseg (key_t key, int shmflg else file->f_op = &shm_file_operations; shm_tot += numpages; - shm_unlock(shp); + shm_unlock_writer(shp); return shp->id; no_id: @@ -261,7 +263,7 @@ asmlinkage long sys_shmget (key_t key, s } else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) { err = -EEXIST; } else { - shp = shm_lock(id); + shp = shm_lock_writer(id); if(shp==NULL) BUG(); if (shp->shm_segsz < size) @@ -274,7 +276,7 @@ asmlinkage long sys_shmget (key_t key, s if (!err) err = shmid; } - shm_unlock(shp); + shm_unlock_writer(shp); } up(&shm_ids.sem); @@ -564,7 +566,7 @@ asmlinkage long sys_shmctl (int shmid, i * the name away when the usage hits zero. 
*/ down(&shm_ids.sem); - shp = shm_lock(shmid); + shp = shm_lock_writer(shmid); err = -EINVAL; if (shp == NULL) goto out_up; @@ -587,9 +589,9 @@ asmlinkage long sys_shmctl (int shmid, i shp->shm_flags |= SHM_DEST; /* Do not find it any more */ shp->shm_perm.key = IPC_PRIVATE; - shm_unlock(shp); + shm_unlock_writer(shp); } else - shm_destroy (shp); + shm_destroy_writer(shp); up(&shm_ids.sem); goto out; } @@ -601,7 +603,7 @@ asmlinkage long sys_shmctl (int shmid, i goto out; } down(&shm_ids.sem); - shp = shm_lock(shmid); + shp = shm_lock_writer(shmid); err=-EINVAL; if(shp==NULL) goto out_up; @@ -634,7 +636,7 @@ asmlinkage long sys_shmctl (int shmid, i err = 0; out_unlock_up: - shm_unlock(shp); + shm_unlock_writer(shp); out_up: up(&shm_ids.sem); goto out; @@ -750,14 +752,14 @@ invalid: up_write(&current->mm->mmap_sem); down (&shm_ids.sem); - if(!(shp = shm_lock(shmid))) + if(!(shp = shm_lock_writer(shmid))) BUG(); shp->shm_nattch--; if(shp->shm_nattch == 0 && shp->shm_flags & SHM_DEST) - shm_destroy (shp); + shm_destroy_writer(shp); else - shm_unlock(shp); + shm_unlock_writer(shp); up (&shm_ids.sem); *raddr = (unsigned long) user_addr; @@ -864,7 +866,7 @@ static int sysvipc_shm_read_proc(char *b for(i = 0; i <= shm_ids.max_id; i++) { struct shmid_kernel* shp; - shp = shm_lock(i); + shp = shm_lock_writer(i); if(shp!=NULL) { #define SMALL_STRING "%10d %10d %4o %10u %5u %5u %5d %5u %5u %5u %5u %10lu %10lu %10lu\n" #define BIG_STRING "%10d %10d %4o %21u %5u %5u %5d %5u %5u %5u %5u %10lu %10lu %10lu\n" @@ -889,7 +891,7 @@ static int sysvipc_shm_read_proc(char *b shp->shm_atim, shp->shm_dtim, shp->shm_ctim); - shm_unlock(shp); + shm_unlock_writer(shp); pos += len; if(pos < offset) { --- linux/init/main.c.orig +++ linux/init/main.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -373,6 +374,8 @@ static void __init smp_init(void) static void noinline rest_init(void) __releases(kernel_lock) { + system_state = SYSTEM_BOOTING_SCHEDULER_OK; +
kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND); numa_default_policy(); /* @@ -453,6 +456,7 @@ asmlinkage void __init start_kernel(void preempt_disable(); build_all_zonelists(); page_alloc_init(); + early_init_hardirqs(); trap_init(); printk("Kernel command line: %s\n", saved_command_line); parse_early_param(); @@ -599,12 +603,14 @@ static void __init do_basic_setup(void) static void do_pre_smp_initcalls(void) { extern int spawn_ksoftirqd(void); + extern int spawn_desched_task(void); #ifdef CONFIG_SMP extern int migration_init(void); migration_init(); #endif spawn_ksoftirqd(); + spawn_desched_task(); } static void run_init_process(char *init_filename) @@ -647,6 +653,8 @@ static int init(void * unused) /* Sets up cpus_possible() */ smp_prepare_cpus(max_cpus); + init_hardirqs(); + do_pre_smp_initcalls(); fixup_cpu_present_map(); --- linux/arch/x86_64/mm/fault.c.orig +++ linux/arch/x86_64/mm/fault.c @@ -39,6 +39,7 @@ void bust_spinlocks(int yes) { int loglevel_save = console_loglevel; if (yes) { + stop_trace(); oops_in_progress = 1; } else { #ifdef CONFIG_VT --- linux/arch/x86_64/Kconfig.orig +++ linux/arch/x86_64/Kconfig @@ -34,13 +34,6 @@ config ISA config SBUS bool -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_CALIBRATE_DELAY bool default y @@ -233,33 +226,6 @@ config SMP If you don't know what to do here, say N. -config PREEMPT - bool "Preemptible Kernel" - ---help--- - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. On contrary it may also break your drivers and add - priority inheritance problems to your system. Don't select it if - you rely on a stable system or have slightly obscure hardware. - It's also not very well tested on x86-64 currently. 
- You have been warned. - - Say Y here if you are feeling brave and building a kernel for a - desktop, embedded or real-time system. Say N if you are unsure. - -config PREEMPT_BKL - bool "Preempt The Big Kernel Lock" - depends on PREEMPT || SMP - default y - help - This option reduces the latency of the kernel by making the - big kernel lock preemptible. - - Say Y here if you are building a kernel for a desktop system. - Say N if you are unsure. - config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" depends on SMP @@ -270,6 +236,16 @@ config SCHED_SMT cost of slightly increased overhead in some places. If unsure say N here. +source "lib/Kconfig.RT" + +config RWSEM_GENERIC_SPINLOCK + bool + depends on !PREEMPT_RT + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + config K8_NUMA bool "K8 NUMA support" select NUMA --- linux/arch/x86_64/kernel/x8664_ksyms.c.orig +++ linux/arch/x86_64/kernel/x8664_ksyms.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -34,8 +35,6 @@ #include #include -extern spinlock_t rtc_lock; - #ifdef CONFIG_SMP extern void __write_lock_failed(rwlock_t *rw); extern void __read_lock_failed(rwlock_t *rw); @@ -63,10 +62,12 @@ EXPORT_SYMBOL(pm_idle); EXPORT_SYMBOL(pm_power_off); EXPORT_SYMBOL(get_cmos_time); +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK EXPORT_SYMBOL(__down_failed); EXPORT_SYMBOL(__down_failed_interruptible); EXPORT_SYMBOL(__down_failed_trylock); EXPORT_SYMBOL(__up_wakeup); +#endif /* Networking helper routines. 
*/ EXPORT_SYMBOL(csum_partial_copy_nocheck); EXPORT_SYMBOL(ip_compute_csum); --- linux/arch/x86_64/kernel/io_apic.c.orig +++ linux/arch/x86_64/kernel/io_apic.c @@ -42,7 +42,7 @@ int sis_apic_bug; /* not actually supported, dummy for compile */ -static spinlock_t ioapic_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_RAW_SPINLOCK(ioapic_lock); /* * # of IRQ routing registers @@ -112,6 +112,9 @@ static void add_pin_to_irq(unsigned int reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ reg ACTION; \ io_apic_modify(entry->apic, reg); \ + /* Force POST flush by reading: */ \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + \ if (!entry->next) \ break; \ entry = irq_2_pin + entry->next; \ @@ -124,10 +127,8 @@ static void add_pin_to_irq(unsigned int static void name##_IO_APIC_irq (unsigned int irq) \ __DO_ACTION(R, ACTION, FINAL) -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ +DO_ACTION( __mask, 0, |= 0x00010000, ) /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ static void mask_IO_APIC_irq (unsigned int irq) { @@ -1059,7 +1060,6 @@ void print_all_local_APICs (void) void __apicdebuginit print_PIC(void) { - extern spinlock_t i8259A_lock; unsigned int v; unsigned long flags; @@ -1342,11 +1342,48 @@ static unsigned int startup_level_ioapic return 0; /* don't check for pending */ } +/* + * In the preemptible case mask the IRQ first then handle it and ack it. + * + * (In the non-preemptible case we keep the IRQ unacked in the local APIC + * and dont need to do the masking, because the code executes atomically.) 
+ */ +#ifdef CONFIG_PREEMPT_HARDIRQS + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ + mask_IO_APIC_irq(irq); + ack_APIC_irq(); +} + +static void end_level_ioapic_irq(unsigned int irq) +{ + if (!(irq_desc[irq].status & IRQ_INPROGRESS)) + unmask_IO_APIC_irq(irq); +} + +static void enable_level_ioapic_irq(unsigned int irq) +{ + unmask_IO_APIC_irq(irq); +} + +#else /* !CONFIG_PREEMPT_HARDIRQS */ + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ +} + static void end_level_ioapic_irq (unsigned int irq) { ack_APIC_irq(); } +static void enable_level_ioapic_irq(unsigned int irq) +{ + unmask_IO_APIC_irq(irq); +} +#endif /* !CONFIG_PREEMPT_HARDIRQS */ + static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { unsigned long flags; @@ -1386,6 +1423,13 @@ static unsigned int startup_level_ioapic return startup_level_ioapic_irq (irq); } +static void mask_and_ack_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_and_ack_level_ioapic_irq(irq); +} + static void end_level_ioapic_vector (unsigned int vector) { int irq = vector_to_irq(vector); @@ -1393,6 +1437,11 @@ static void end_level_ioapic_vector (uns end_level_ioapic_irq(irq); } +static void enable_level_ioapic_vector(unsigned int vector) +{ + enable_level_ioapic_irq(vector_to_irq(vector)); +} + static void mask_IO_APIC_vector (unsigned int vector) { int irq = vector_to_irq(vector); --- linux/arch/x86_64/kernel/time.c.orig +++ linux/arch/x86_64/kernel/time.c @@ -49,8 +49,8 @@ static void cpufreq_delayed_get(void); extern int using_apic_timer; -spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; -spinlock_t i8253_lock = SPIN_LOCK_UNLOCKED; +DEFINE_RAW_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); static int nohpet __initdata = 0; static int notsc __initdata = 0; @@ -863,7 +863,7 @@ int __init time_setup(char *str) } static struct irqaction irq0 = { - timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL + timer_interrupt, 
SA_INTERRUPT | SA_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL }; extern void __init config_acpi_tables(void); --- linux/arch/x86_64/kernel/nmi.c.orig +++ linux/arch/x86_64/kernel/nmi.c @@ -43,7 +43,7 @@ * This is maintained separately from nmi_active because the NMI * watchdog may also be driven from the I/O APIC timer. */ -static spinlock_t lapic_nmi_owner_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_RAW_SPINLOCK(lapic_nmi_owner_lock); static unsigned int lapic_nmi_owner; #define LAPIC_NMI_WATCHDOG (1<<0) #define LAPIC_NMI_RESERVED (1<<1) @@ -376,12 +376,41 @@ void touch_nmi_watchdog (void) alert_counter[i] = 0; } +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) +{ + int i; + + if (nmi_watchdog == NMI_NONE) + return; + if (system_state != SYSTEM_RUNNING) { + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); +} + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason) { int sum, cpu; cpu = safe_smp_processor_id(); sum = read_pda(apic_timer_irqs); + if (nmi_show_regs[cpu]) { + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + show_regs(regs); + spin_unlock(&nmi_print_lock); + } if (last_irq_sums[cpu] == sum) { /* * Ayiee, looks like this CPU is stuck ... 
@@ -389,6 +418,12 @@ void nmi_watchdog_tick (struct pt_regs * */ alert_counter[cpu]++; if (alert_counter[cpu] == 5*nmi_hz) { + int i; + + for (i = 0; i < NR_CPUS; i++) + nmi_show_regs[i] = 1; + } + if (alert_counter[cpu] == 5*nmi_hz) { if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { alert_counter[cpu] = 0; --- linux/arch/x86_64/kernel/entry.S.orig +++ linux/arch/x86_64/kernel/entry.S @@ -929,3 +929,40 @@ ENTRY(machine_check) ENTRY(call_debug) zeroentry do_call_debug +#ifdef CONFIG_LATENCY_TRACE + +ENTRY(mcount) + cmpq $0, trace_enabled + jz out + + push %rbp + mov %rsp,%rbp + + push %r9 + push %r8 + push %rdi + push %rsi + push %rdx + push %rcx + push %rax + + mov 0x0(%rbp),%rax + mov 0x8(%rbp),%rdi + mov 0x8(%rax),%rsi + + call __trace + + pop %rax + pop %rcx + pop %rdx + pop %rsi + pop %rdi + pop %r8 + pop %r9 + + leaveq +out: + ret + +#endif + --- linux/arch/x86_64/kernel/process.c.orig +++ linux/arch/x86_64/kernel/process.c @@ -90,7 +90,8 @@ void default_idle(void) safe_halt(); else local_irq_enable(); - } + } else + local_irq_enable(); } /* @@ -163,9 +164,10 @@ void cpu_idle (void) idle = pm_idle; if (!idle) idle = default_idle; + stop_critical_timing(); idle(); } - schedule(); + __schedule(); } } @@ -272,7 +274,7 @@ void __show_regs(struct pt_regs * regs) void show_regs(struct pt_regs *regs) { __show_regs(regs); - show_trace(®s->rsp); + show_trace(current, ®s->rsp); } /* @@ -283,13 +285,14 @@ void exit_thread(void) struct task_struct *me = current; struct thread_struct *t = &me->thread; if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss; kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; /* * Careful, clear this in the TSS too: */ + tss = &per_cpu(init_tss, get_cpu()); memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; put_cpu(); --- linux/arch/x86_64/kernel/Makefile.orig +++ linux/arch/x86_64/kernel/Makefile @@ -4,11 +4,12 @@ extra-y := head.o head64.o 
init_task.o vmlinux.lds EXTRA_AFLAGS := -traditional -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ +obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o +obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += semaphore.o obj-$(CONFIG_X86_MCE) += mce.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ --- linux/arch/x86_64/kernel/traps.c.orig +++ linux/arch/x86_64/kernel/traps.c @@ -143,7 +143,7 @@ unsigned long *in_exception_stack(int cp * Check and process them in order. */ -void show_trace(unsigned long *stack) +void show_trace(struct task_struct *task, unsigned long *stack) { unsigned long addr; unsigned long *irqstack, *irqstack_end, *estack_end; @@ -212,6 +212,7 @@ void show_trace(unsigned long *stack) } } printk("\n"); + print_traces(task); } void show_stack(struct task_struct *tsk, unsigned long * rsp) @@ -247,7 +248,7 @@ void show_stack(struct task_struct *tsk, printk("\n "); printk("%016lx ", *stack++); } - show_trace((unsigned long *)rsp); + show_trace(tsk, (unsigned long *)rsp); } /* @@ -256,7 +257,7 @@ void show_stack(struct task_struct *tsk, void dump_stack(void) { unsigned long dummy; - show_trace(&dummy); + show_trace(current, &dummy); } EXPORT_SYMBOL(dump_stack); @@ -327,7 +328,7 @@ void out_of_line_bug(void) BUG(); } -static spinlock_t die_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_RAW_SPINLOCK(die_lock); static int die_owner = -1; void oops_begin(void) --- linux/arch/x86_64/kernel/i8259.c.orig +++ linux/arch/x86_64/kernel/i8259.c @@ -131,7 +131,7 @@ void (*interrupt[NR_IRQS])(void) = { * moves to arch independent land */ -spinlock_t i8259A_lock = SPIN_LOCK_UNLOCKED; +DEFINE_RAW_SPINLOCK(i8259A_lock); static void end_8259A_irq (unsigned int irq) { @@ -455,7 +455,7 @@ device_initcall(i8259A_init_sysfs); * IRQ2 is cascade interrupt to second interrupt 
controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; void __init init_ISA_irqs (void) { --- linux/arch/x86_64/kernel/smp.c.orig +++ linux/arch/x86_64/kernel/smp.c @@ -40,7 +40,7 @@ static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; -static spinlock_t tlbstate_lock = SPIN_LOCK_UNLOCKED; +DEFINE_RAW_SPINLOCK(tlbstate_lock); #define FLUSH_ALL 0xffffffff /* @@ -281,7 +281,7 @@ void smp_send_nmi_allbutself(void) * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static spinlock_t call_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); --- linux/arch/x86_64/kernel/vsyscall.c.orig +++ linux/arch/x86_64/kernel/vsyscall.c @@ -53,7 +53,7 @@ #define force_inline __attribute__((always_inline)) inline int __sysctl_vsyscall __section_sysctl_vsyscall = 1; -seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +raw_seqlock_t __xtime_lock __section_xtime_lock = RAW_SEQLOCK_UNLOCKED; #include --- linux/arch/x86_64/lib/dec_and_lock.c.orig +++ linux/arch/x86_64/lib/dec_and_lock.c @@ -10,7 +10,7 @@ #include #include -int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +int _atomic_dec_and_lock(atomic_t *atomic, raw_spinlock_t *lock) { int counter; int newcount; --- linux/arch/x86_64/lib/thunk.S.orig +++ linux/arch/x86_64/lib/thunk.S @@ -43,11 +43,13 @@ thunk rwsem_downgrade_thunk,rwsem_downgrade_wake #endif thunk do_softirq_thunk,do_softirq - + +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK thunk __down_failed,__down thunk_retrax __down_failed_interruptible,__down_interruptible thunk_retrax __down_failed_trylock,__down_trylock thunk __up_wakeup,__up +#endif /* SAVE_ARGS below is used only for the .cfi directives it contains. 
*/ CFI_STARTPROC --- linux/arch/i386/mm/highmem.c.orig +++ linux/arch/i386/mm/highmem.c @@ -17,6 +17,27 @@ void kunmap(struct page *page) kunmap_high(page); } +void kunmap_virt(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return; + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + kunmap(page); +} + +struct page *kmap_to_page(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return virt_to_page(ptr); + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + return page; +} + + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -25,7 +46,7 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -char *kmap_atomic(struct page *page, enum km_type type) +char *__kmap_atomic(struct page *page, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -47,7 +68,7 @@ char *kmap_atomic(struct page *page, enu return (char *)vaddr; } -void kunmap_atomic(char *kaddr, enum km_type type) +void __kunmap_atomic(char *kaddr, enum km_type type) { #ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long)kaddr & PAGE_MASK; @@ -77,7 +98,7 @@ void kunmap_atomic(char *kaddr, enum km_ /* This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. 
*/ -char *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +char *__kmap_atomic_pfn(unsigned long pfn, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -92,7 +113,7 @@ char *kmap_atomic_pfn(unsigned long pfn, return (char *)vaddr; } -struct page *kmap_atomic_to_page(char *ptr) +struct page *__kmap_atomic_to_page(char *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; --- linux/arch/i386/mm/fault.c.orig +++ linux/arch/i386/mm/fault.c @@ -38,6 +38,8 @@ void bust_spinlocks(int yes) int loglevel_save = console_loglevel; if (yes) { + stop_trace(); + zap_rt_locks(); oops_in_progress = 1; return; } @@ -213,7 +215,7 @@ fastcall void do_invalid_op(struct pt_re * bit 1 == 0 means read, 1 means write * bit 2 == 0 means kernel, 1 means user-mode */ -fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code) +fastcall notrace void do_page_fault(struct pt_regs *regs, unsigned long error_code) { struct task_struct *tsk; struct mm_struct *mm; @@ -225,6 +227,7 @@ fastcall void do_page_fault(struct pt_re /* get the address */ __asm__("movl %%cr2,%0":"=r" (address)); + trace_special(regs->eip, error_code, address); if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) @@ -454,9 +457,9 @@ no_context: } #endif if (address < PAGE_SIZE) - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); + printk(KERN_ALERT "BUG: Unable to handle kernel NULL pointer dereference"); else - printk(KERN_ALERT "Unable to handle kernel paging request"); + printk(KERN_ALERT "BUG: Unable to handle kernel paging request"); printk(" at virtual address %08lx\n",address); printk(KERN_ALERT " printing eip:\n"); printk("%08lx\n", regs->eip); --- linux/arch/i386/mm/pgtable.c.orig +++ linux/arch/i386/mm/pgtable.c @@ -169,7 +169,7 @@ void pmd_ctor(void *pmd, kmem_cache_t *c * recommendations and having no core impact whatsoever. 
* -- wli */ -spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED; +DEFINE_RAW_SPINLOCK(pgd_lock); struct page *pgd_list; static inline void pgd_list_add(pgd_t *pgd) --- linux/arch/i386/boot/compressed/misc.c.orig +++ linux/arch/i386/boot/compressed/misc.c @@ -16,6 +16,12 @@ #include #include +#ifdef CONFIG_MCOUNT +void notrace mcount(void) +{ +} +#endif + /* * gzip declarations */ @@ -113,7 +119,7 @@ static long free_mem_end_ptr; #define INPLACE_MOVE_ROUTINE 0x1000 #define LOW_BUFFER_START 0x2000 #define LOW_BUFFER_MAX 0x90000 -#define HEAP_SIZE 0x3000 +#define HEAP_SIZE 0x4000 static unsigned int low_buffer_end, low_buffer_size; static int high_loaded =0; static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; --- linux/arch/i386/Kconfig.orig +++ linux/arch/i386/Kconfig @@ -368,16 +368,6 @@ config X86_L1_CACHE_SHIFT default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 default "6" if MK7 || MK8 || MPENTIUMM -config RWSEM_GENERIC_SPINLOCK - bool - depends on M386 - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - depends on !M386 - default y - config GENERIC_CALIBRATE_DELAY bool default y @@ -434,7 +424,7 @@ config X86_USE_PPRO_CHECKSUM config X86_USE_3DNOW bool - depends on MCYRIXIII || MK7 + depends on (MCYRIXIII || MK7) && !PREEMPT_RT default y config X86_OOSTORE @@ -510,28 +500,22 @@ config SCHED_SMT cost of slightly increased overhead in some places. If unsure say N here. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. +source "lib/Kconfig.RT" - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. 
Say N if you are unsure. +config RWSEM_GENERIC_SPINLOCK + bool + depends on M386 && !PREEMPT_RT + default y -config PREEMPT_BKL - bool "Preempt The Big Kernel Lock" - depends on PREEMPT || SMP +config ASM_SEMAPHORES + bool + depends on !PREEMPT_RT default y - help - This option reduces the latency of the kernel by making the - big kernel lock preemptible. - Say Y here if you are building a kernel for a desktop system. - Say N if you are unsure. +config RWSEM_XCHGADD_ALGORITHM + bool + depends on !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT + default y config X86_UP_APIC bool "Local APIC support on uniprocessors" if !SMP @@ -883,7 +867,7 @@ config BOOT_IOREMAP config REGPARM bool "Use register arguments (EXPERIMENTAL)" - depends on EXPERIMENTAL + depends on EXPERIMENTAL && !MCOUNT default n help Compile the kernel with -mregparm=3. This uses an different ABI --- linux/arch/i386/kernel/io_apic.c.orig +++ linux/arch/i386/kernel/io_apic.c @@ -37,6 +37,7 @@ #include #include #include +#include #include @@ -45,7 +46,7 @@ int (*ioapic_renumber_irq)(int ioapic, int irq); atomic_t irq_mis_count; -static spinlock_t ioapic_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_RAW_SPINLOCK(ioapic_lock); /* * Is the SiS APIC rmw bug present ? 
@@ -127,6 +128,37 @@ static void __init replace_pin_at_irq(un } } +/* + * Cache the register used by the irq-redirection hotpath: + */ +static unsigned int io_apic_cache[MAX_IO_APICS][NR_IRQS] + ____cacheline_aligned_in_smp; + +static void update_io_apic_cache(unsigned int irq) +{ + struct irq_pin_list *entry = irq_2_pin + irq; + unsigned int pin; + + for (;;) { + pin = entry->pin; + if (pin == -1) + break; + io_apic_cache[entry->apic][irq] = + io_apic_read(entry->apic, 0x10 + pin*2); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +#define IOAPIC_CACHE +/* + * Some systems need a POST flush or else level-triggered interrupts + * generate lots of spurious interrupts due to the POST-ed write not + * reaching the IOAPIC before the IRQ is ACK-ed in the local APIC. + */ +#define IOAPIC_POSTFLUSH + static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) { struct irq_pin_list *entry = irq_2_pin + irq; @@ -136,10 +168,29 @@ static void __modify_IO_APIC_irq (unsign pin = entry->pin; if (pin == -1) break; +#ifdef IOAPIC_CACHE + reg = io_apic_cache[entry->apic][irq]; + if (unlikely(!reg)) { + reg = io_apic_read(entry->apic, 0x10 + pin*2); + io_apic_cache[entry->apic][irq] = reg; + printk("hm: ioapic cache empty for irq %d (e:%08lx/d:%08lx) %08x\n", irq, enable, disable, reg); + + } + reg &= ~disable; + reg |= enable; + io_apic_write(entry->apic, 0x10 + pin*2, reg); +#else reg = io_apic_read(entry->apic, 0x10 + pin*2); reg &= ~disable; reg |= enable; io_apic_modify(entry->apic, 0x10 + pin*2, reg); +#endif +#ifdef IOAPIC_POSTFLUSH + /* + * Force POST flush by reading: + */ + reg = *(IO_APIC_BASE(entry->apic)+4); +#endif if (!entry->next) break; entry = irq_2_pin + entry->next; @@ -158,18 +209,6 @@ static void __unmask_IO_APIC_irq (unsign __modify_IO_APIC_irq(irq, 0, 0x00010000); } -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00010000, 
0x00008000); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); -} - static void mask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -1249,6 +1288,7 @@ void __init setup_IO_APIC_irqs(void) io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); spin_unlock_irqrestore(&ioapic_lock, flags); + update_io_apic_cache(irq); } } @@ -1564,7 +1604,6 @@ void print_all_local_APICs (void) void /*__init*/ print_PIC(void) { - extern spinlock_t i8259A_lock; unsigned int v; unsigned long flags; @@ -1855,6 +1894,37 @@ static unsigned int startup_level_ioapic return 0; /* don't check for pending */ } +/* + * Level-triggered interrupt handling is different for RT kernels. + * + * In the RT case mask the IRQ first, then ack it, redirect it, + * and the IRQ thread then will handle it (sometime later) and will + * unmask it. + */ +#if defined(CONFIG_PREEMPT_HARDIRQS) + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ + move_irq(irq); + mask_IO_APIC_irq(irq); + ack_APIC_irq(); +} + +static void end_level_ioapic_irq(unsigned int irq) +{ +#ifndef CONFIG_PCI_MSI + if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)) && + irq_desc[irq].action) +#endif + unmask_IO_APIC_irq(irq); +} + +#else /* !CONFIG_PREEMPT_HARDIRQS */ + +static void mask_and_ack_level_ioapic_irq(unsigned int irq) +{ +} + static void end_level_ioapic_irq (unsigned int irq) { unsigned long v; @@ -1889,12 +1959,21 @@ static void end_level_ioapic_irq (unsign if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + /* mask = 1, trigger = 0 */ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + /* mask = 0, trigger = 1 */ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); spin_unlock(&ioapic_lock); } } +#endif /* 
!CONFIG_PREEMPT_HARDIRQS */ + +static void enable_level_ioapic_irq(unsigned int irq) +{ + unmask_IO_APIC_irq(irq); +} + #ifdef CONFIG_PCI_MSI static unsigned int startup_edge_ioapic_vector(unsigned int vector) { @@ -1917,11 +1996,27 @@ static unsigned int startup_level_ioapic return startup_level_ioapic_irq (irq); } +static void mask_and_ack_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_and_ack_level_ioapic_irq(irq); +} + static void end_level_ioapic_vector (unsigned int vector) { int irq = vector_to_irq(vector); - end_level_ioapic_irq(irq); +#if defined(CONFIG_PREEMPT_HARDIRQS) + if (!(irq_desc[vector].status & (IRQ_DISABLED | IRQ_INPROGRESS)) && + irq_desc[vector].action) +#endif + end_level_ioapic_irq(irq); +} + +static void enable_level_ioapic_vector(unsigned int vector) +{ + enable_level_ioapic_irq(vector_to_irq(vector)); } static void mask_IO_APIC_vector (unsigned int vector) --- linux/arch/i386/kernel/apic.c.orig +++ linux/arch/i386/kernel/apic.c @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -874,7 +875,6 @@ fake_ioapic_page: */ static unsigned int __init get_8254_timer_count(void) { - extern spinlock_t i8253_lock; unsigned long flags; unsigned int count; @@ -1137,6 +1137,7 @@ inline void smp_local_timer_interrupt(st int cpu = smp_processor_id(); profile_tick(CPU_PROFILING, regs); + if (--per_cpu(prof_counter, cpu) <= 0) { /* * The multiplier may have changed since the last time we got @@ -1182,7 +1183,7 @@ inline void smp_local_timer_interrupt(st * interrupt as well. Thus we cannot inline the local irq ... ] */ -fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) +fastcall notrace void smp_apic_timer_interrupt(struct pt_regs *regs) { int cpu = smp_processor_id(); @@ -1191,6 +1192,8 @@ fastcall void smp_apic_timer_interrupt(s */ irq_stat[cpu].apic_timer_irqs++; + trace_special(regs->eip, 0, 0); + /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. 
--- linux/arch/i386/kernel/i386_ksyms.c.orig +++ linux/arch/i386/kernel/i386_ksyms.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -34,7 +35,6 @@ #include extern void dump_thread(struct pt_regs *, struct user *); -extern spinlock_t rtc_lock; /* This is definitely a GPL-only symbol */ EXPORT_SYMBOL_GPL(cpu_gdt_table); @@ -83,10 +83,12 @@ EXPORT_SYMBOL(get_cmos_time); EXPORT_SYMBOL(cpu_khz); EXPORT_SYMBOL(apm_info); +#ifdef CONFIG_ASM_SEMAPHORES EXPORT_SYMBOL(__down_failed); EXPORT_SYMBOL(__down_failed_interruptible); EXPORT_SYMBOL(__down_failed_trylock); EXPORT_SYMBOL(__up_wakeup); +#endif /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); /* Delay loops */ @@ -138,8 +140,10 @@ EXPORT_SYMBOL(cpu_sibling_map); EXPORT_SYMBOL(cpu_data); EXPORT_SYMBOL(cpu_online_map); EXPORT_SYMBOL(cpu_callout_map); +#ifdef CONFIG_ASM_SEMAPHORES EXPORT_SYMBOL(__write_lock_failed); EXPORT_SYMBOL(__read_lock_failed); +#endif /* Global SMP stuff */ EXPORT_SYMBOL(smp_call_function); @@ -174,17 +178,19 @@ EXPORT_SYMBOL(memcmp); EXPORT_SYMBOL(register_die_notifier); #ifdef CONFIG_HAVE_DEC_LOCK -EXPORT_SYMBOL(_atomic_dec_and_lock); +EXPORT_SYMBOL(_atomic_dec_and_raw_spin_lock); #endif EXPORT_SYMBOL(__PAGE_KERNEL); #ifdef CONFIG_HIGHMEM EXPORT_SYMBOL(kmap); +EXPORT_SYMBOL(kmap_to_page); EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_virt); +EXPORT_SYMBOL(__kmap_atomic_to_page); #endif #if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) --- linux/arch/i386/kernel/semaphore.c.orig +++ linux/arch/i386/kernel/semaphore.c @@ -16,6 +16,7 @@ #include #include #include +#include #include /* @@ -263,35 +264,10 @@ asm( "ret" ); -/* - * rw spinlock fallbacks - */ -#if defined(CONFIG_SMP) -asm( -".section .sched.text\n" -".align 4\n" -".globl __write_lock_failed\n" 
-"__write_lock_failed:\n\t" - LOCK "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" -"1: rep; nop\n\t" - "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jne 1b\n\t" - LOCK "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jnz __write_lock_failed\n\t" - "ret" -); +int fastcall sem_is_locked(struct semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + +EXPORT_SYMBOL(sem_is_locked); -asm( -".section .sched.text\n" -".align 4\n" -".globl __read_lock_failed\n" -"__read_lock_failed:\n\t" - LOCK "incl (%eax)\n" -"1: rep; nop\n\t" - "cmpl $1,(%eax)\n\t" - "js 1b\n\t" - LOCK "decl (%eax)\n\t" - "js __read_lock_failed\n\t" - "ret" -); -#endif --- linux/arch/i386/kernel/timers/timer_hpet.c.orig +++ linux/arch/i386/kernel/timers/timer_hpet.c @@ -24,7 +24,7 @@ static unsigned long hpet_last; /* hpet static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; +static DECLARE_RAW_SEQLOCK(monotonic_lock); /* convert from cycles(64bits) => nanoseconds (64bits) * basic equation: --- linux/arch/i386/kernel/timers/timer_tsc.c.orig +++ linux/arch/i386/kernel/timers/timer_tsc.c @@ -24,6 +24,7 @@ #include "mach_timer.h" #include +#include #ifdef CONFIG_HPET_TIMER static unsigned long hpet_usec_quotient; @@ -35,8 +36,6 @@ static inline void cpufreq_delayed_get(v int tsc_disable __initdata = 0; -extern spinlock_t i8253_lock; - static int use_tsc; /* Number of usecs that the last interrupt was delayed */ static int delay_at_last_interrupt; @@ -44,7 +43,7 @@ static int delay_at_last_interrupt; static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; +static DECLARE_SEQLOCK(monotonic_lock); /* convert from cycles(64bits) => 
nanoseconds (64bits) * basic equation: @@ -171,9 +170,9 @@ static void delay_tsc(unsigned long loop static void mark_offset_tsc_hpet(void) { unsigned long long this_offset, last_offset; - unsigned long offset, temp, hpet_current; + unsigned long offset, temp, hpet_current, flags; - write_seqlock(&monotonic_lock); + write_seqlock_irqsave(&monotonic_lock, flags); last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; /* * It is important that these two operations happen almost at @@ -201,7 +200,7 @@ static void mark_offset_tsc_hpet(void) /* update the monotonic base value */ this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); + write_sequnlock_irqrestore(&monotonic_lock, flags); /* calculate delay_at_last_interrupt */ /* @@ -322,7 +321,7 @@ static inline void cpufreq_delayed_get(v static void mark_offset_tsc(void) { - unsigned long lost,delay; + unsigned long lost,delay, flags, flags2; unsigned long delta = last_tsc_low; int count; int countmp; @@ -330,7 +329,7 @@ static void mark_offset_tsc(void) unsigned long long this_offset, last_offset; static int lost_count = 0; - write_seqlock(&monotonic_lock); + write_seqlock_irqsave(&monotonic_lock, flags); last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; /* * It is important that these two operations happen almost at @@ -348,24 +347,26 @@ static void mark_offset_tsc(void) rdtsc(last_tsc_low, last_tsc_high); - spin_lock(&i8253_lock); - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ + spin_lock_irqsave(&i8253_lock, flags2); + outb(0x00, PIT_MODE); /* latch the count ASAP */ + count = inb(PIT_CH0); /* read the latched count */ count |= inb(PIT_CH0) << 8; +#undef VIA686A_WORKAROUND /* * VIA686a test code... 
reset the latch if count > max + 1 * from timer_pit.c - cjb */ +#ifdef VIA686A_WORKAROUND if (count > LATCH) { outb_p(0x34, PIT_MODE); outb_p(LATCH & 0xff, PIT_CH0); outb(LATCH >> 8, PIT_CH0); count = LATCH - 1; } +#endif - spin_unlock(&i8253_lock); + spin_unlock_irqrestore(&i8253_lock, flags2); if (pit_latch_buggy) { /* get center value of last 3 time lutch */ @@ -418,7 +419,7 @@ static void mark_offset_tsc(void) /* update the monotonic base value */ this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); + write_sequnlock_irqrestore(&monotonic_lock, flags); /* calculate delay_at_last_interrupt */ count = ((LATCH-1) - count) * TICK_SIZE; --- linux/arch/i386/kernel/timers/timer_pm.c.orig +++ linux/arch/i386/kernel/timers/timer_pm.c @@ -41,7 +41,7 @@ static u32 offset_tick; static u32 offset_delay; static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; +static DECLARE_RAW_SEQLOCK(monotonic_lock); #define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ --- linux/arch/i386/kernel/timers/timer_cyclone.c.orig +++ linux/arch/i386/kernel/timers/timer_cyclone.c @@ -17,9 +17,9 @@ #include #include #include -#include "io_ports.h" +#include -extern spinlock_t i8253_lock; +#include "io_ports.h" /* Number of usecs that the last interrupt was delayed */ static int delay_at_last_interrupt; @@ -36,7 +36,7 @@ static u32* volatile cyclone_timer; /* C static u32 last_cyclone_low; static u32 last_cyclone_high; static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; +static DECLARE_RAW_SEQLOCK(monotonic_lock); /* helper macro to atomically read both cyclone counter registers */ #define read_cyclone_counter(low,high) \ --- linux/arch/i386/kernel/timers/timer_pit.c.orig +++ linux/arch/i386/kernel/timers/timer_pit.c @@ -15,9 +15,8 @@ #include #include #include +#include -extern spinlock_t i8259A_lock; 
-extern spinlock_t i8253_lock; #include "do_timer.h" #include "io_ports.h" @@ -166,7 +165,6 @@ struct init_timer_opts __initdata timer_ void setup_pit_timer(void) { - extern spinlock_t i8253_lock; unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); --- linux/arch/i386/kernel/time.c.orig +++ linux/arch/i386/kernel/time.c @@ -67,7 +67,8 @@ #include "io_ports.h" -extern spinlock_t i8259A_lock; +#include + int pit_latch_buggy; /* extern */ #include "do_timer.h" @@ -80,9 +81,11 @@ unsigned long cpu_khz; /* Detected as we extern unsigned long wall_jiffies; -spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(rtc_lock); + +#include -spinlock_t i8253_lock = SPIN_LOCK_UNLOCKED; +DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); struct timer_opts *cur_timer = &timer_none; @@ -201,7 +204,7 @@ unsigned long long monotonic_clock(void) EXPORT_SYMBOL(monotonic_clock); #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -213,6 +216,19 @@ unsigned long profile_pc(struct pt_regs EXPORT_SYMBOL(profile_pc); #endif +#ifdef CONFIG_PREEMPT_HARDIRQS + +/* + * If the timer is redirected then this is the minimal + * interrupt-context processing we have to do: + */ +void direct_timer_interrupt(struct pt_regs *regs) +{ + do_timer_interrupt_hook(regs); +} + +#endif + /* * timer_interrupt() needs to keep up the real-time clock, * as well as call the "do_timer()" routine every clocktick @@ -222,21 +238,24 @@ static inline void do_timer_interrupt(in { #ifdef CONFIG_X86_IO_APIC if (timer_ack) { + unsigned long flags; /* * Subtle, when I/O APICs are used we have to ack timer IRQ * manually to reset the IRR bit for do_slow_gettimeoffset(). * This will also deassert NMI lines for the watchdog if run * on an 82489DX-based system. 
*/ - spin_lock(&i8259A_lock); + spin_lock_irqsave(&i8259A_lock, flags); outb(0x0c, PIC_MASTER_OCW3); /* Ack the IRQ; AEOI will end it automatically. */ inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); + spin_unlock_irqrestore(&i8259A_lock, flags); } #endif +#ifndef CONFIG_PREEMPT_HARDIRQS do_timer_interrupt_hook(regs); +#endif /* * If we have an externally synchronized Linux clock, then update --- linux/arch/i386/kernel/apm.c.orig +++ linux/arch/i386/kernel/apm.c @@ -228,10 +228,10 @@ #include #include #include +#include #include "io_ports.h" -extern spinlock_t i8253_lock; extern unsigned long get_cmos_time(void); extern void machine_real_restart(unsigned char *, int); @@ -1168,8 +1168,7 @@ static void get_time_diff(void) static void reinit_timer(void) { #ifdef INIT_TIMER_AFTER_SUSPEND - unsigned long flags; - extern spinlock_t i8253_lock; + unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); /* set the clock to 100 Hz */ --- linux/arch/i386/kernel/cpu/mtrr/generic.c.orig +++ linux/arch/i386/kernel/cpu/mtrr/generic.c @@ -231,7 +231,7 @@ static unsigned long set_mtrr_state(u32 static unsigned long cr4 = 0; static u32 deftype_lo, deftype_hi; -static spinlock_t set_atomicity_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); static void prepare_set(void) { --- linux/arch/i386/kernel/signal.c.orig +++ linux/arch/i386/kernel/signal.c @@ -591,6 +591,13 @@ int fastcall do_signal(struct pt_regs *r int signr; struct k_sigaction ka; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from --- linux/arch/i386/kernel/kgdb_stub.c.orig +++ linux/arch/i386/kernel/kgdb_stub.c @@ -365,8 +365,8 @@ __asm__("fn_rtn_stub:\n\t" #ifdef CONFIG_SMP static int in_kgdb_called; -static spinlock_t waitlocks[MAX_NO_CPUS] = - {[0 ... 
MAX_NO_CPUS - 1] = SPIN_LOCK_UNLOCKED }; +static raw_spinlock_t waitlocks[MAX_NO_CPUS] = + {[0 ... MAX_NO_CPUS - 1] = RAW_SPIN_LOCK_UNLOCKED }; /* * The following array has the thread pointer of each of the "other" * cpus. We make it global so it can be seen by gdb. @@ -374,9 +374,9 @@ static spinlock_t waitlocks[MAX_NO_CPUS] volatile int in_kgdb_entry_log[MAX_NO_CPUS]; volatile struct pt_regs *in_kgdb_here_log[MAX_NO_CPUS]; /* -static spinlock_t continuelocks[MAX_NO_CPUS]; +static raw_spinlock_t continuelocks[MAX_NO_CPUS]; */ -spinlock_t kgdb_spinlock = SPIN_LOCK_UNLOCKED; +raw_spinlock_t kgdb_spinlock = RAW_SPIN_LOCK_UNLOCKED; /* waiters on our spinlock plus us */ static atomic_t spinlock_waiters = ATOMIC_INIT(1); static int spinlock_count = 0; @@ -2404,7 +2404,7 @@ int kgdb_and_then_count; void kgdb_tstamp(int line, char *source, int data0, int data1) { - static spinlock_t ts_spin = SPIN_LOCK_UNLOCKED; + static raw_spinlock_t ts_spin = RAW_SPIN_LOCK_UNLOCKED; int flags; kgdb_local_irq_save(flags); spin_lock(&ts_spin); --- linux/arch/i386/kernel/nmi.c.orig +++ linux/arch/i386/kernel/nmi.c @@ -46,7 +46,7 @@ unsigned int nmi_watchdog = NMI_NONE; #endif extern int unknown_nmi_panic; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ static unsigned int nmi_p4_cccr_val; extern void show_registers(struct pt_regs *regs); @@ -122,7 +122,7 @@ int __init check_nmi_watchdog (void) for (cpu = 0; cpu < NR_CPUS; cpu++) prev_nmi_count[cpu] = irq_stat[cpu].__nmi_count; local_irq_enable(); - mdelay((10*1000)/nmi_hz); // wait 10 ticks + mdelay((100*1000)/nmi_hz); // wait 100 ticks /* FIXME: Only boot CPU is online at this stage. Check CPUs as they come up. 
*/ @@ -141,7 +141,7 @@ int __init check_nmi_watchdog (void) /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = 1; + nmi_hz = 1000; return 0; } @@ -342,8 +342,8 @@ static void setup_k7_watchdog(void) | K7_NMI_EVENT; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); - Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000)); - wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1); + Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz*1000/nmi_hz)); + wrmsr(MSR_K7_PERFCTR0, -(cpu_khz*1000/nmi_hz), -1); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); @@ -364,8 +364,8 @@ static void setup_p6_watchdog(void) | P6_NMI_EVENT; wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); - Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000)); - wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), 0); + Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz*1000/nmi_hz)); + wrmsr(MSR_P6_PERFCTR0, -(cpu_khz*1000/nmi_hz), 0); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= P6_EVNTSEL0_ENABLE; wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); @@ -405,8 +405,8 @@ static int setup_p4_watchdog(void) wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); - Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000)); - wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1); + Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz*1000/nmi_hz)); + wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz*1000/nmi_hz), -1); apic_write(APIC_LVTPC, APIC_DM_NMI); wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); return 1; @@ -482,7 +482,29 @@ int tune_watchdog = 5*HZ; extern void die_nmi(struct pt_regs *, const char *msg); -void nmi_watchdog_tick (struct pt_regs * regs) +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) +{ + int i; + + if (nmi_watchdog == NMI_NONE) + return; + if (system_state != SYSTEM_RUNNING) 
{ + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); +} +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + +void notrace nmi_watchdog_tick (struct pt_regs * regs) { /* @@ -490,10 +512,17 @@ void nmi_watchdog_tick (struct pt_regs * * always switch the stack NMI-atomically, it's safe to use * smp_processor_id(). */ - int sum, cpu = smp_processor_id(); + int sum, cpu = _smp_processor_id(); sum = irq_stat[cpu].apic_timer_irqs; + if (nmi_show_regs[cpu]) { + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + show_regs(regs); + spin_unlock(&nmi_print_lock); + } + #ifdef CONFIG_KGDB if (!in_kgdb(regs) && last_irq_sums[cpu] == sum) { @@ -512,6 +541,13 @@ void nmi_watchdog_tick (struct pt_regs * alert_counter[cpu] = 0; } #endif + if (alert_counter[cpu] == 5*nmi_hz) { + int i; + + bust_spinlocks(1); + for (i = 0; i < NR_CPUS; i++) + nmi_show_regs[i] = 1; + } if (alert_counter[cpu] == 5*nmi_hz) die_nmi(regs, "NMI Watchdog detected LOCKUP"); } else { @@ -536,7 +572,7 @@ void nmi_watchdog_tick (struct pt_regs * * other P6 variant */ apic_write(APIC_LVTPC, APIC_DM_NMI); } - wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); + wrmsr(nmi_perfctr_msr, -(cpu_khz*1000/nmi_hz), -1); } } --- linux/arch/i386/kernel/entry.S.orig +++ linux/arch/i386/kernel/entry.S @@ -189,6 +189,8 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) + cmpl $0, kernel_preemption + jz restore_all cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_all need_resched: @@ -197,9 +199,8 @@ need_resched: jz restore_all testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? 
jz restore_all - sti - call preempt_schedule cli + call preempt_schedule_irq movl $0,TI_preempt_count(%ebp) jmp need_resched #endif @@ -233,6 +234,11 @@ sysenter_past_esp: pushl %eax SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) @@ -245,6 +251,11 @@ sysenter_past_esp: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_LATENCY_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl EIP(%esp), %edx movl OLDESP(%esp), %ecx @@ -257,6 +268,11 @@ sysenter_past_esp: ENTRY(system_call) pushl %eax # save orig_eax SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) # system call tracing in operation testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) @@ -274,6 +290,9 @@ syscall_exit: testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work restore_all: +#ifdef CONFIG_CRITICAL_TIMING + call touch_critical_timing +#endif #ifdef CONFIG_TRAP_BAD_SYSCALL_EXITS movl EFLAGS(%esp), %eax # mix EFLAGS and CS movb CS(%esp), %al @@ -287,6 +306,16 @@ restore_all: resume_kernelX: #endif +#if defined(CONFIG_CRITICAL_IRQSOFF_TIMING) || defined(CONFIG_LATENCY_TRACE) + pushl %eax +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + call trace_irqs_on +#endif +#ifdef CONFIG_LATENCY_TRACE + call sys_ret +#endif + popl %eax +#endif RESTORE_ALL # perform work that needs to be done immediately before resumption @@ -295,8 +324,9 @@ work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: - call schedule - cli # make sure we don't miss an interrupt + cli + call __schedule + # make sure we don't miss an interrupt # setting need_resched or sigpending 
# between sampling and the iret movl TI_flags(%ebp), %ecx @@ -343,6 +373,11 @@ syscall_trace_entry: syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending +#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING + pushl %eax + call trace_irqs_on + popl %eax +#endif sti # could let do_syscall_trace() call # schedule() instead movl %esp, %eax --- linux/arch/i386/kernel/process.c.orig +++ linux/arch/i386/kernel/process.c @@ -106,6 +106,7 @@ void default_idle(void) else local_irq_enable(); } else { + local_irq_enable(); cpu_relax(); } } @@ -197,9 +198,10 @@ void cpu_idle (void) play_dead(); irq_stat[cpu].idle_timestamp = jiffies; + stop_critical_timing(); idle(); } - schedule(); + __schedule(); } } @@ -360,11 +362,16 @@ void exit_thread(void) /* The process may have allocated an io port bitmap... nuke it. */ if (unlikely(NULL != t->io_bitmap_ptr)) { - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + int cpu; + struct tss_struct *tss; + void *io_bitmap_ptr = t->io_bitmap_ptr; - kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; + mb(); + kfree(io_bitmap_ptr); + + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); /* * Careful, clear this in the TSS too: */ --- linux/arch/i386/kernel/Makefile.orig +++ linux/arch/i386/kernel/Makefile @@ -4,11 +4,12 @@ extra-y := head.o init_task.o vmlinux.lds -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ +obj-y := process.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ doublefault.o quirks.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-y += cpu/ obj-y += timers/ obj-$(CONFIG_ACPI_BOOT) += acpi/ @@ -21,6 +22,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-$(CONFIG_MCOUNT) += mcount-wrapper.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o 
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o --- linux/arch/i386/kernel/traps.c.orig +++ linux/arch/i386/kernel/traps.c @@ -93,7 +93,7 @@ asmlinkage void machine_check(void); static int kstack_depth_to_print = 24; struct notifier_block *i386die_chain; -static spinlock_t die_notifier_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_RAW_SPINLOCK(die_notifier_lock); int register_die_notifier(struct notifier_block *nb) { @@ -148,22 +148,27 @@ static inline unsigned long print_contex unsigned long *stack, unsigned long ebp) { unsigned long addr; +#ifndef CONFIG_FRAME_POINTER + unsigned long prev_frame; +#endif -#ifdef CONFIG_FRAME_POINTER +#ifdef CONFIG_FRAME_POINTER while (valid_stack_ptr(tinfo, (void *)ebp)) { addr = *(unsigned long *)(ebp + 4); printk(" [<%08lx>] ", addr); print_symbol("%s", addr); - printk("\n"); + printk(" (%ld)\n", *(unsigned long *)ebp - ebp); ebp = *(unsigned long *)ebp; } #else + prev_frame = (unsigned long)stack; while (valid_stack_ptr(tinfo, stack)) { addr = *stack++; if (__kernel_text_address(addr)) { printk(" [<%08lx>]", addr); print_symbol(" %s", addr); - printk("\n"); + printk(" (%ld)\n", (unsigned long)stack - prev_frame); + prev_frame = (unsigned long)stack; } } #endif @@ -195,6 +200,7 @@ void show_trace(struct task_struct *task break; printk(" =======================\n"); } + print_traces(task); } void show_stack(struct task_struct *task, unsigned long *esp) @@ -257,8 +263,8 @@ void show_registers(struct pt_regs *regs regs->eax, regs->ebx, regs->ecx, regs->edx); printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->esi, regs->edi, regs->ebp, esp); - printk("ds: %04x es: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, ss); + printk("ds: %04x es: %04x ss: %04x preempt: %08x\n", + regs->xds & 0xffff, regs->xes & 0xffff, ss, preempt_count()); printk("Process %s (pid: %d, threadinfo=%p task=%p)", current->comm, current->pid, current_thread_info(), current); /* @@ -329,11 +335,11 @@ bug: 
void die(const char * str, struct pt_regs * regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .lock_owner = -1, .lock_owner_depth = 0 }; @@ -412,6 +418,11 @@ static void do_trap(int trapnr, int sign if (!(regs->xcs & 3)) goto kernel_trap; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + trap_signal: { struct task_struct *tsk = current; tsk->thread.error_code = error_code; @@ -596,10 +607,11 @@ static void unknown_nmi_error(unsigned c printk("Do you have a strange power saving mode enabled?\n"); } -static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_RAW_SPINLOCK(nmi_print_lock); void die_nmi (struct pt_regs *regs, const char *msg) { + deadlock_trace_off(); spin_lock(&nmi_print_lock); /* * We are in trouble anyway, lets at least try @@ -614,17 +626,19 @@ void die_nmi (struct pt_regs *regs, cons console_silent(); spin_unlock(&nmi_print_lock); bust_spinlocks(0); + nmi_exit(); do_exit(SIGSEGV); } -static void default_do_nmi(struct pt_regs * regs) +static void notrace default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; /* Only the BSP gets external NMIs from the system. 
*/ if (!smp_processor_id()) reason = get_nmi_reason(); - + +// trace_special(6, 0, 0); if (!(reason & 0xc0)) { if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) == NOTIFY_STOP) @@ -636,6 +650,7 @@ static void default_do_nmi(struct pt_reg */ if (nmi_watchdog) { nmi_watchdog_tick(regs); +// trace_special(6, 1, 0); return; } #endif @@ -655,21 +670,26 @@ static void default_do_nmi(struct pt_reg reassert_nmi(); } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +static notrace int dummy_nmi_callback(struct pt_regs * regs, int cpu) { return 0; } static nmi_callback_t nmi_callback = dummy_nmi_callback; -fastcall void do_nmi(struct pt_regs * regs, long error_code) +fastcall notrace void do_nmi(struct pt_regs * regs, long error_code) { int cpu; nmi_enter(); + nmi_trace((unsigned long)do_nmi, regs->eip, regs->eflags); cpu = smp_processor_id(); + if (__kernel_text_address(regs->eip) && + *(unsigned char *)regs->eip == 0xf4) + regs->eip++; + #ifdef CONFIG_HOTPLUG_CPU if (!cpu_online(cpu)) { nmi_exit(); --- linux/arch/i386/kernel/i8259.c.orig +++ linux/arch/i386/kernel/i8259.c @@ -39,7 +39,7 @@ * moves to arch independent land */ -spinlock_t i8259A_lock = SPIN_LOCK_UNLOCKED; +DEFINE_RAW_SPINLOCK(i8259A_lock); static void end_8259A_irq (unsigned int irq) { @@ -195,14 +195,19 @@ void mask_and_ack_8259A(unsigned int irq goto spurious_8259A_irq; cached_irq_mask |= irqmask; +#undef DO_DUMMY_IMR_READ handle_real_irq: if (irq & 8) { +#ifdef DO_DUMMY_IMR_READ inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ +#endif outb(cached_slave_mask, PIC_SLAVE_IMR); outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ } else { +#ifdef DO_DUMMY_IMR_READ inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) 
*/ +#endif outb(cached_master_mask, PIC_MASTER_IMR); outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ } @@ -371,7 +376,7 @@ static irqreturn_t math_error_irq(int cp * New motherboards sometimes make IRQ 13 be a PCI interrupt, * so allow interrupt sharing. */ -static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL }; +static struct irqaction fpu_irq = { math_error_irq, SA_NODELAY, CPU_MASK_NONE, "fpu", NULL, NULL }; void __init init_ISA_irqs (void) { --- linux/arch/i386/kernel/smp.c.orig +++ linux/arch/i386/kernel/smp.c @@ -251,7 +251,7 @@ inline void send_IPI_mask_sequence(cpuma static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; -static spinlock_t tlbstate_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_RAW_SPINLOCK(tlbstate_lock); #define FLUSH_ALL 0xffffffff /* @@ -396,7 +396,7 @@ static void flush_tlb_others(cpumask_t c while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ - mb(); + cpu_relax(); flush_mm = NULL; flush_va = 0; @@ -495,6 +495,16 @@ void smp_send_reschedule(int cpu) send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + void crash_dump_send_ipi(void) { send_IPI_allbutself(CRASH_DUMP_VECTOR); @@ -504,7 +514,7 @@ void crash_dump_send_ipi(void) * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static spinlock_t call_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); @@ -607,8 +617,9 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. 
*/ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +fastcall notrace void smp_reschedule_interrupt(struct pt_regs *regs) { + trace_special(regs->eip, 0, 0); ack_APIC_irq(); } --- linux/arch/i386/kernel/irq.c.orig +++ linux/arch/i386/kernel/irq.c @@ -48,7 +48,7 @@ static union irq_ctx *softirq_ctx[NR_CPU * SMP cross-CPU interrupts have their own specific * handlers). */ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +fastcall notrace unsigned int do_IRQ(struct pt_regs *regs) { /* high bits used in ret_from_ code */ int irq = regs->orig_eax & 0xff; @@ -58,6 +58,7 @@ fastcall unsigned int do_IRQ(struct pt_r #endif irq_enter(); + trace_special(regs->eip, irq, 0); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { @@ -66,12 +67,14 @@ fastcall unsigned int do_IRQ(struct pt_r __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", + printk("BUG: do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); dump_stack(); } } #endif + if (unlikely(!irq)) + direct_timer_interrupt(regs); #ifdef CONFIG_4KSTACKS @@ -234,6 +237,7 @@ int show_interrupts(struct seq_file *p, for (action=action->next; action; action = action->next) seq_printf(p, ", %s", action->name); + seq_printf(p, " %d/%d", irq_desc[i].irqs_unhandled, irq_desc[i].irq_count); seq_putc(p, '\n'); skip: --- linux/arch/i386/kernel/mcount-wrapper.S.orig +++ linux/arch/i386/kernel/mcount-wrapper.S @@ -0,0 +1,27 @@ +/* + * linux/arch/i386/mcount-wrapper.S + * + * Copyright (C) 2004 Ingo Molnar + */ + +.globl mcount +mcount: + + cmpl $0, mcount_enabled + jz out + + push %ebp + mov %esp, %ebp + pushl %eax + pushl %ecx + pushl %edx + + call __mcount + + popl %edx + popl %ecx + popl %eax + popl %ebp +out: + ret + --- linux/arch/i386/mach-voyager/voyager_basic.c.orig +++ linux/arch/i386/mach-voyager/voyager_basic.c @@ 
-30,6 +30,7 @@ #include #include #include +#include /* * Power off function, if any @@ -184,7 +185,6 @@ voyager_timer_interrupt(struct pt_regs * * and swiftly introduce it to something sharp and * pointy. */ __u16 val; - extern spinlock_t i8253_lock; spin_lock(&i8253_lock); --- linux/arch/i386/mach-voyager/setup.c.orig +++ linux/arch/i386/mach-voyager/setup.c @@ -17,7 +17,7 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; void __init intr_init_hook(void) { @@ -40,7 +40,7 @@ void __init trap_init_hook(void) { } -static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL}; +static struct irqaction irq0 = { timer_interrupt, SA_INTERRUPT | SA_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL}; void __init time_init_hook(void) { --- linux/arch/i386/lib/dec_and_lock.c.orig +++ linux/arch/i386/lib/dec_and_lock.c @@ -10,7 +10,7 @@ #include #include -int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) +int _atomic_dec_and_raw_spin_lock(atomic_t *atomic, raw_spinlock_t *lock) { int counter; int newcount; @@ -32,9 +32,9 @@ repeat: return 0; slow_path: - spin_lock(lock); + _raw_spin_lock(lock); if (atomic_dec_and_test(atomic)) return 1; - spin_unlock(lock); + _raw_spin_unlock(lock); return 0; } --- linux/arch/i386/lib/bitops.c.orig +++ linux/arch/i386/lib/bitops.c @@ -68,3 +68,37 @@ int find_next_zero_bit(const unsigned lo return (offset + set + res); } EXPORT_SYMBOL(find_next_zero_bit); + + +/* + * rw spinlock fallbacks + */ +#if defined(CONFIG_SMP) +asm( +".section .sched.text\n" +".align 4\n" +".globl __write_lock_failed\n" +"__write_lock_failed:\n\t" + LOCK "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" +"1: rep; nop\n\t" + "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" + "jne 1b\n\t" + LOCK "subl $" 
RW_LOCK_BIAS_STR ",(%eax)\n\t" + "jnz __write_lock_failed\n\t" + "ret" +); + +asm( +".section .sched.text\n" +".align 4\n" +".globl __read_lock_failed\n" +"__read_lock_failed:\n\t" + LOCK "incl (%eax)\n" +"1: rep; nop\n\t" + "cmpl $1,(%eax)\n\t" + "js 1b\n\t" + LOCK "decl (%eax)\n\t" + "js __read_lock_failed\n\t" + "ret" +); +#endif --- linux/arch/i386/lib/kgdb_serial.c.orig +++ linux/arch/i386/lib/kgdb_serial.c @@ -104,9 +104,9 @@ read_data_bfr(struct async_struct *info) * but we will just depend on the uart status to help keep that straight. */ -static spinlock_t uart_interrupt_lock = SPIN_LOCK_UNLOCKED; +static raw_spinlock_t uart_interrupt_lock = RAW_SPIN_LOCK_UNLOCKED; #ifdef CONFIG_SMP -extern spinlock_t kgdb_spinlock; +extern raw_spinlock_t kgdb_spinlock; #endif static int @@ -343,7 +343,7 @@ program_uart(struct async_struct *info) */ int kgdb_in_isr = 0; int kgdb_in_lsr = 0; -extern spinlock_t kgdb_spinlock; +extern raw_spinlock_t kgdb_spinlock; /* Caller takes needed protections */ @@ -381,7 +381,7 @@ tty_getDebugChar(void) } /* tty_getDebugChar */ static int count = 3; -static spinlock_t one_at_atime = SPIN_LOCK_UNLOCKED; +static raw_spinlock_t one_at_atime = RAW_SPIN_LOCK_UNLOCKED; static int __init kgdb_enable_ints(void) @@ -435,7 +435,7 @@ kgdb_enable_ints_now(void) #endif ints_disabled = request_irq(gdb_async_info->state->irq, gdb_interrupt, - IRQ_T(gdb_async_info), + IRQ_T(gdb_async_info) | SA_NODELAY, "KGDB-stub", NULL); intprintk(("KGDB: request_irq returned %d\n", ints_disabled)); } --- linux/arch/i386/mach-visws/visws_apic.c.orig +++ linux/arch/i386/mach-visws/visws_apic.c @@ -261,11 +261,13 @@ out_unlock: static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = SA_NODELAY, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = SA_NODELAY, }; --- linux/arch/i386/mach-visws/setup.c.orig +++ linux/arch/i386/mach-visws/setup.c @@ -112,7 +112,7 @@ void __init 
pre_setup_arch_hook() static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = SA_INTERRUPT, + .flags = SA_INTERRUPT | SA_NODELAY, .name = "timer", }; --- linux/arch/i386/mach-default/setup.c.orig +++ linux/arch/i386/mach-default/setup.c @@ -27,7 +27,7 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; /** * intr_init_hook - post gate setup interrupt initialisation --- linux/drivers/net/tg3.c.orig +++ linux/drivers/net/tg3.c @@ -3070,7 +3070,7 @@ static int tg3_start_xmit(struct sk_buff * So we really do need to disable interrupts when taking * tx_lock here. */ - local_irq_save(flags); + local_irq_save_nort(flags); if (!spin_trylock(&tp->tx_lock)) { local_irq_restore(flags); return NETDEV_TX_LOCKED; @@ -3230,7 +3230,8 @@ static int tg3_start_xmit(struct sk_buff out_unlock: mmiowb(); - spin_unlock_irqrestore(&tp->tx_lock, flags); + spin_unlock(&tp->tx_lock); + local_irq_restore(flags); dev->trans_start = jiffies; --- linux/drivers/net/tulip/tulip_core.c.orig +++ linux/drivers/net/tulip/tulip_core.c @@ -1781,6 +1781,7 @@ static void __devexit tulip_remove_one ( pci_iounmap(pdev, tp->base_addr); free_netdev (dev); pci_release_regions (pdev); + pci_disable_device (pdev); pci_set_drvdata (pdev, NULL); /* pci_power_off (pdev, -1); */ --- linux/drivers/net/e1000/e1000_main.c.orig +++ linux/drivers/net/e1000/e1000_main.c @@ -1802,10 +1802,10 @@ e1000_xmit_frame(struct sk_buff *skb, st if(adapter->pcix_82544) count += nr_frags; - local_irq_save(flags); + local_irq_save_nort(flags); if (!spin_trylock(&adapter->tx_lock)) { /* Collision - tell upper layer to requeue */ - local_irq_restore(flags); + local_irq_restore_nort(flags); return NETDEV_TX_LOCKED; } --- linux/drivers/net/3c59x.c.orig +++ linux/drivers/net/3c59x.c @@ -954,9 
+954,13 @@ static void poll_vortex(struct net_devic struct vortex_private *vp = netdev_priv(dev); unsigned long flags; local_save_flags(flags); +#ifndef CONFIG_PREEMPT_RT local_irq_disable(); +#endif (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev,NULL); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } #endif @@ -1985,12 +1989,16 @@ static void vortex_tx_timeout(struct net * Block interrupts because vortex_interrupt does a bare spin_lock() */ unsigned long flags; +#ifndef CONFIG_PREEMPT_RT local_irq_save(flags); +#endif if (vp->full_bus_master_tx) boomerang_interrupt(dev->irq, dev, NULL); else vortex_interrupt(dev->irq, dev, NULL); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } } --- linux/drivers/net/netconsole.c.orig +++ linux/drivers/net/netconsole.c @@ -74,10 +74,19 @@ static void write_msg(struct console *co return; local_irq_save(flags); +#ifdef CONFIG_PREEMPT_RT + /* + * A bit hairy. Netconsole uses mutexes (indirectly) and + * thus must have interrupts enabled: + */ + local_irq_enable(); +#endif for(left = len; left; ) { frag = min(left, MAX_PRINT_CHUNK); + WARN_ON_RT(irqs_disabled()); netpoll_send_udp(&np, msg, frag); + WARN_ON_RT(irqs_disabled()); msg += frag; left -= frag; } --- linux/drivers/base/driver.c.orig +++ linux/drivers/base/driver.c @@ -79,14 +79,13 @@ void put_driver(struct device_driver * d * since most of the things we have to do deal with the bus * structures. * - * The one interesting aspect is that we initialize @drv->unload_sem - * to a locked state here. It will be unlocked when the driver - * reference count reaches 0. + * We init the completion struct here. When the reference + * count reaches zero, complete() is called from driver_release().
*/ int driver_register(struct device_driver * drv) { INIT_LIST_HEAD(&drv->devices); - init_MUTEX_LOCKED(&drv->unload_sem); + init_completion(&drv->unload_done); return bus_add_driver(drv); } @@ -97,18 +96,16 @@ int driver_register(struct device_driver * * Again, we pass off most of the work to the bus-level call. * - * Though, once that is done, we attempt to take @drv->unload_sem. - * This will block until the driver refcount reaches 0, and it is - * released. Only modular drivers will call this function, and we + * Though, once that is done, we wait until the driver refcount + * reaches 0, and complete() is called in driver_release(). + * Only modular drivers will call this function, and we * have to guarantee that it won't complete, letting the driver * unload until all references are gone. */ - void driver_unregister(struct device_driver * drv) { bus_remove_driver(drv); - down(&drv->unload_sem); - up(&drv->unload_sem); + wait_for_completion(&drv->unload_done); } /** --- linux/drivers/base/bus.c.orig +++ linux/drivers/base/bus.c @@ -65,7 +65,7 @@ static struct sysfs_ops driver_sysfs_ops static void driver_release(struct kobject * kobj) { struct device_driver * drv = to_driver(kobj); - up(&drv->unload_sem); + complete(&drv->unload_done); } static struct kobj_type ktype_driver = { --- linux/drivers/char/Kconfig.orig +++ linux/drivers/char/Kconfig @@ -730,6 +730,22 @@ config RTC To compile this driver as a module, choose M here: the module will be called rtc. +config RTC_HISTOGRAM + tristate "Real Time Clock Histogram Support" + default y + depends on RTC + ---help--- + If you say Y here then the kernel will track the delivery and + wakeup latency of /dev/rtc using tasks and will report a + histogram to the kernel log when the application closes /dev/rtc.
+ +config BLOCKER + tristate "Priority Inheritance Debugging (Blocker) Device Support" + default y + ---help--- + If you say Y here then a device will be created that the userspace + pi_test suite uses to test and measure kernel locking primitives. + config SGI_DS1286 tristate "SGI DS1286 RTC support" depends on SGI_IP22 --- linux/drivers/char/vt.c.orig +++ linux/drivers/char/vt.c @@ -2174,6 +2174,13 @@ void vt_console_print(struct console *co if (vcmode != KD_TEXT) goto quit; + /* + * Skip kernel message from within a critical section going + * to a preemptible console (such as fbcon). + */ + if (in_atomic_rt() && sw->con_preemptible) + goto quit; + /* undraw cursor first */ if (IS_FG) hide_cursor(currcons); @@ -2817,8 +2824,8 @@ void do_blank_screen(int entering_gfx) return; if (vesa_off_interval) { - blank_state = blank_vesa_wait, - mod_timer(&console_timer, jiffies + vesa_off_interval); + blank_state = blank_vesa_wait; +// mod_timer(&console_timer, jiffies + vesa_off_interval); } if (vesa_blank_mode) @@ -2848,7 +2855,10 @@ void do_unblank_screen(int leaving_gfx) return; /* but leave console_blanked != 0 */ if (blankinterval) { - mod_timer(&console_timer, jiffies + blankinterval); +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); +#endif +// mod_timer(&console_timer, jiffies + blankinterval); blank_state = blank_normal_wait; } @@ -2891,16 +2901,16 @@ void poke_blanked_console(void) /* This isn't perfectly race free, but a race here would be mostly harmless, * at worse, we'll do a spurrious blank and it's unlikely */ - del_timer(&console_timer); - blank_timer_expired = 0; +// del_timer(&console_timer); +// blank_timer_expired = 0; if (ignore_poke || !vt_cons[fg_console] || vt_cons[fg_console]->vc_mode == KD_GRAPHICS) return; if (console_blanked) unblank_screen(); else if (blankinterval) { - mod_timer(&console_timer, jiffies + blankinterval); - blank_state = blank_normal_wait; +// mod_timer(&console_timer, jiffies + blankinterval); +// blank_state = 
blank_normal_wait; } } --- linux/drivers/char/blocker.c.orig +++ linux/drivers/char/blocker.c @@ -0,0 +1,123 @@ +/* + * priority inheritance testing device + */ + +#include +#include + +#define BLOCKER_MINOR 221 + +#define BLOCK_IOCTL 4245 +#define BLOCK_SET_DEPTH 4246 + +#define MAX_LOCK_DEPTH 10 + +u64 notrace get_cpu_tick(void) +{ + u64 tsc; +#ifdef ARCHARM + tsc = *oscr; +#else + __asm__ __volatile__("rdtsc" : "=A" (tsc)); +#endif + return tsc; +} + +void notrace loop(int loops) +{ + int i; + + for (i = 0; i < loops; i++) + get_cpu_tick(); +} + +static spinlock_t blocker_lock[MAX_LOCK_DEPTH]; + +static unsigned int lock_depth = 1; + +void do_the_lock_and_loop(unsigned int args) +{ + int i, max; + + if (rt_task(current)) + max = lock_depth; + else if (lock_depth > 1) + max = (current->pid % lock_depth) + 1; + else + max = 1; + + /* Always lock from the top down */ + for (i = max-1; i >= 0; i--) + spin_lock(&blocker_lock[i]); + loop(args); + for (i = 0; i < max; i++) + spin_unlock(&blocker_lock[i]); +} + +static int blocker_open(struct inode *in, struct file *file) +{ + printk(KERN_INFO "blocker_open called\n"); + + return 0; +} + +static int blocker_ioctl(struct inode *in, struct file *file, + unsigned int cmd, unsigned long args) +{ + switch(cmd) { + case BLOCK_IOCTL: + do_the_lock_and_loop(args); + return 0; + case BLOCK_SET_DEPTH: + if (args >= MAX_LOCK_DEPTH) + return -EINVAL; + lock_depth = args; + return 0; + default: + return -EINVAL; + } +} + +/* + * blocker_ioctl() has the legacy (inode, file, cmd, arg) prototype, so it + * has to be installed as .ioctl; .unlocked_ioctl takes (file, cmd, arg). + */ +static struct file_operations blocker_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .ioctl = blocker_ioctl, + .open = blocker_open, +}; + +static struct miscdevice blocker_dev = +{ + BLOCKER_MINOR, + "blocker", + &blocker_fops +}; + +static int __init blocker_init(void) +{ + int i; + + /* Set up every lock before the device can be opened from userspace. */ + for (i = 0; i < MAX_LOCK_DEPTH; i++) + spin_lock_init(&blocker_lock[i]); + + if (misc_register(&blocker_dev)) + return -ENODEV; + + return 0; +} + +void __exit blocker_exit(void) +{ + printk(KERN_INFO 
"blocker device uninstalled\n"); + misc_deregister(&blocker_dev); +} + +module_init(blocker_init); +module_exit(blocker_exit); + +MODULE_LICENSE("GPL"); + --- linux/drivers/char/ipmi/ipmi_watchdog.c.orig +++ linux/drivers/char/ipmi/ipmi_watchdog.c @@ -372,7 +372,7 @@ static void panic_halt_ipmi_set_timeout( when both messages are free. */ static atomic_t heartbeat_tofree = ATOMIC_INIT(0); static DECLARE_MUTEX(heartbeat_lock); -static DECLARE_MUTEX_LOCKED(heartbeat_wait_lock); +static DECLARE_MUTEX_NOCHECK(heartbeat_wait_lock); static void heartbeat_free_smi(struct ipmi_smi_msg *msg) { if (atomic_dec_and_test(&heartbeat_tofree)) @@ -931,6 +931,8 @@ static int __init ipmi_wdog_init(void) printk(KERN_INFO PFX "driver version " IPMI_WATCHDOG_VERSION "\n"); + down(&heartbeat_wait_lock); // initialize as locked + if (strcmp(action, "reset") == 0) { action_val = WDOG_TIMEOUT_RESET; } else if (strcmp(action, "none") == 0) { --- linux/drivers/char/tty_io.c.orig +++ linux/drivers/char/tty_io.c @@ -226,6 +226,7 @@ static int check_tty_count(struct tty_st printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) " "!= #fd's(%d) in %s\n", tty->name, tty->count, count, routine); + dump_stack(); return count; } #endif @@ -829,8 +830,8 @@ void do_tty_hangup(void *data) p->signal->tty = NULL; if (!p->signal->leader) continue; - send_group_sig_info(SIGHUP, SEND_SIG_PRIV, p); - send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p); + group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p); + group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p); if (tty->pgrp > 0) p->signal->tty_old_pgrp = tty->pgrp; } while_each_task_pid(tty->session, PIDTYPE_SID, p); --- linux/drivers/char/sysrq.c.orig +++ linux/drivers/char/sysrq.c @@ -175,6 +175,38 @@ static struct sysrq_key_op sysrq_showreg .enable_mask = SYSRQ_ENABLE_DUMP, }; +#ifdef CONFIG_RT_DEADLOCK_DETECT + +static void sysrq_handle_showlocks(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) +{ + show_all_locks(); +} + +static struct sysrq_key_op 
sysrq_showlocks_op = { + .handler = sysrq_handle_showlocks, + .help_msg = "show-all-locks(D)", + .action_msg = "Show Locks Held", +}; + +#endif + +#if defined(__i386__) + +static void sysrq_handle_showallregs(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) +{ + nmi_show_all_regs(); +} + +static struct sysrq_key_op sysrq_showallregs_op = { + .handler = sysrq_handle_showallregs, + .help_msg = "showalLcpupc", + .action_msg = "Show Regs On All CPUs", +}; + +#endif + static void sysrq_handle_showstate(int key, struct pt_regs *pt_regs, struct tty_struct *tty) @@ -290,7 +322,11 @@ static struct sysrq_key_op *sysrq_key_ta and will never arrive */ /* b */ &sysrq_reboot_op, /* c */ NULL, +#ifdef CONFIG_RT_DEADLOCK_DETECT +/* d */ &sysrq_showlocks_op, +#else /* d */ NULL, +#endif /* e */ &sysrq_term_op, /* f */ &sysrq_moom_op, /* g */ GDB_OP, @@ -302,7 +338,11 @@ static struct sysrq_key_op *sysrq_key_ta #else /* k */ NULL, #endif +#if defined(__i386__) +/* l */ &sysrq_showallregs_op, +#else /* l */ NULL, +#endif /* m */ &sysrq_showmem_op, /* n */ &sysrq_unrt_op, /* o */ NULL, /* This will often be registered --- linux/drivers/char/rtc.c.orig +++ linux/drivers/char/rtc.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include @@ -86,6 +87,28 @@ #include #endif +#ifdef CONFIG_RTC_HISTOGRAM + +static cycles_t last_interrupt_time; + +#include + +#define CPU_MHZ (cpu_khz / 1000) + +#define HISTSIZE 10000 +static int histogram[HISTSIZE]; + +static int rtc_state; + +enum rtc_states { + S_STARTUP, /* First round - let the application start */ + S_IDLE, /* Waiting for an interrupt */ + S_WAITING_FOR_READ, /* Signal delivered. 
waiting for rtc_read() */ + S_READ_MISSED, /* Signal delivered, read() deadline missed */ +}; + +#endif + #ifdef __sparc__ #include #include @@ -204,7 +227,147 @@ static inline unsigned char rtc_is_updat return uip; } +#ifndef RTC_IRQ +# undef CONFIG_RTC_HISTOGRAM +#endif + +static inline void rtc_open_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i; + + last_interrupt_time = 0; + rtc_state = S_STARTUP; + rtc_irq_data = 0; + + for (i = 0; i < HISTSIZE; i++) + histogram[i] = 0; +#endif +} + +static inline void rtc_wake_event(void) +{ +#ifndef CONFIG_RTC_HISTOGRAM + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); +#else + if (!(rtc_status & RTC_IS_OPEN)) + return; + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + break; + /* Waiting for an interrupt */ + case S_IDLE: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + last_interrupt_time = get_cycles(); + rtc_state = S_WAITING_FOR_READ; + break; + + /* Signal has been delivered. waiting for rtc_read() */ + case S_WAITING_FOR_READ: + /* + * Well foo. The usermode application didn't + * schedule and read in time. + */ + rtc_state = S_READ_MISSED; + printk("`%s'[%d] is being piggy. need_resched=%d, cpu=%d\n", + current->comm, current->pid, + need_resched(), smp_processor_id()); + printk("Read missed before next interrupt\n"); + break; + /* Signal has been delivered, read() deadline was missed */ + case S_READ_MISSED: + /* + * Not much we can do here. We're waiting for the usermode + * application to read the rtc + */ + break; + } +#endif +} + +static inline void rtc_read_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + cycles_t now = get_cycles(); + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + rtc_state = S_IDLE; + break; + + /* Waiting for an interrupt */ + case S_IDLE: + printk("bug in rtc_read(): called in state S_IDLE!\n"); + break; + case S_WAITING_FOR_READ: /* + * Signal has been delivered. 
+ * waiting for rtc_read() + */ + /* + * Well done + */ + case S_READ_MISSED: /* + * Signal has been delivered, read() + * deadline was missed + */ + /* + * So, you finally got here. + */ + if (!last_interrupt_time) + printk("bug in rtc_read(): last_interrupt_time = 0\n"); + rtc_state = S_IDLE; + { + cycles_t latency = now - last_interrupt_time; + unsigned long delta; /* Microseconds */ + + delta = latency; + delta /= CPU_MHZ; + + if (delta > 1000 * 1000) { + printk("rtc: eek\n"); + } else { + unsigned long slot = delta; + if (slot >= HISTSIZE) + slot = HISTSIZE - 1; + histogram[slot]++; + if (delta > 2000) + printk("wow! That was a " + "%ld millisec bump\n", + delta / 1000); + } + } + rtc_state = S_IDLE; + break; + } +#endif +} + +static inline void rtc_close_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i = 0; + unsigned long total = 0; + + for (i = 0; i < HISTSIZE; i++) + total += histogram[i]; + if (!total) + return; + + printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n", + current->comm, current->pid, total); + for (i = 0; i < HISTSIZE; i++) { + if (histogram[i]) + printk("%d %d\n", i, histogram[i]); + } +#endif +} + #ifdef RTC_IRQ + /* * A very tiny interrupt handler. 
It runs with SA_INTERRUPT set, * but there is possibility of conflicting with the set_rtc_mmss() @@ -248,9 +411,9 @@ irqreturn_t rtc_interrupt(int irq, void if (rtc_callback) rtc_callback->func(rtc_callback->private_data); spin_unlock(&rtc_task_lock); - wake_up_interruptible(&rtc_wait); - kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + rtc_wake_event(); + wake_up_interruptible(&rtc_wait); return IRQ_HANDLED; } @@ -354,6 +517,8 @@ static ssize_t rtc_read(struct file *fil schedule(); } while (1); + rtc_read_event(); + if (count < sizeof(unsigned long)) retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); else @@ -583,6 +748,11 @@ static int rtc_do_ioctl(unsigned int cmd save_freq_select = CMOS_READ(RTC_FREQ_SELECT); CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + /* + * Make CMOS date writes nonpreemptible even on PREEMPT_RT. + * There's a limit to everything! =B-) + */ + preempt_disable(); #ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); #endif @@ -592,6 +762,7 @@ static int rtc_do_ioctl(unsigned int cmd CMOS_WRITE(hrs, RTC_HOURS); CMOS_WRITE(min, RTC_MINUTES); CMOS_WRITE(sec, RTC_SECONDS); + preempt_enable(); CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); @@ -689,6 +860,7 @@ static int rtc_open(struct inode *inode, if(rtc_status & RTC_IS_OPEN) goto out_busy; + rtc_open_event(); rtc_status |= RTC_IS_OPEN; rtc_irq_data = 0; @@ -744,6 +916,7 @@ no_irq: rtc_irq_data = 0; rtc_status &= ~RTC_IS_OPEN; spin_unlock_irq (&rtc_lock); + rtc_close_event(); return 0; } @@ -886,7 +1059,6 @@ static int __init rtc_init(void) { #if defined(__alpha__) || defined(__mips__) unsigned int year, ctrl; - unsigned long uip_watchdog; char *guess = NULL; #endif #ifdef __sparc__ @@ -989,12 +1161,8 @@ no_irq: /* Each operating system on an Alpha uses its own epoch. Let's try to guess which one we are using now. 
*/ - uip_watchdog = jiffies; if (rtc_is_updating() != 0) - while (jiffies - uip_watchdog < 2*HZ/100) { - barrier(); - cpu_relax(); - } + msleep(20); spin_lock_irq(&rtc_lock); year = CMOS_READ(RTC_YEAR); @@ -1211,7 +1379,6 @@ static int rtc_read_proc(char *page, cha void rtc_get_rtc_time(struct rtc_time *rtc_tm) { - unsigned long uip_watchdog = jiffies; unsigned char ctrl; #ifdef CONFIG_MACH_DECSTATION unsigned int real_year; @@ -1219,19 +1386,15 @@ void rtc_get_rtc_time(struct rtc_time *r /* * read RTC once any update in progress is done. The update - * can take just over 2ms. We wait 10 to 20ms. There is no need to + * can take just over 2ms. We wait 20ms. There is no need * to poll-wait (up to 1s - eeccch) for the falling edge of RTC_UIP. * If you need to know *exactly* when a second has started, enable * periodic update complete interrupts, (via ioctl) and then * immediately read /dev/rtc which will block until you get the IRQ. * Once the read clears, read the RTC time (again via ioctl). Easy. */ - if (rtc_is_updating() != 0) - while (jiffies - uip_watchdog < 2*HZ/100) { - barrier(); - cpu_relax(); - } + msleep(20); /* * Only the values that we read from the RTC are set. We leave
We leave --- linux/drivers/char/Makefile.orig +++ linux/drivers/char/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_R3964) += n_r3964.o obj-$(CONFIG_APPLICOM) += applicom.o obj-$(CONFIG_SONYPI) += sonypi.o obj-$(CONFIG_RTC) += rtc.o +obj-$(CONFIG_BLOCKER) += blocker.o obj-$(CONFIG_HPET) += hpet.o obj-$(CONFIG_GEN_RTC) += genrtc.o obj-$(CONFIG_EFI_RTC) += efirtc.o --- linux/drivers/char/random.c.orig +++ linux/drivers/char/random.c @@ -822,8 +822,11 @@ static void add_timer_randomness(struct preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if ( random_state->entropy_count > trickle_thresh && - (__get_cpu_var(trickle_count)++ & 0xfff)) - goto out; + (__get_cpu_var(trickle_count)++ & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); /* * Use get_cycles() if implemented, otherwise fall back to @@ -872,8 +875,6 @@ static void add_timer_randomness(struct entropy = int_ln_12bits(delta); } batch_entropy_store(num, time, entropy); -out: - preempt_enable(); } void add_keyboard_randomness(unsigned char scancode) --- linux/drivers/input/joystick/analog.c.orig +++ linux/drivers/input/joystick/analog.c @@ -141,12 +141,14 @@ struct analog_port { */ #ifdef __i386__ + +#include + #define GET_TIME(x) do { if (cpu_has_tsc) rdtscl(x); else x = get_time_pit(); } while (0) #define DELTA(x,y) (cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? 
CLOCK_TICK_RATE / HZ : 0))) #define TIME_NAME (cpu_has_tsc?"TSC":"PIT") static unsigned int get_time_pit(void) { - extern spinlock_t i8253_lock; unsigned long flags; unsigned int count; --- linux/drivers/input/gameport/gameport.c.orig +++ linux/drivers/input/gameport/gameport.c @@ -37,12 +37,13 @@ static LIST_HEAD(gameport_dev_list); #ifdef __i386__ +#include + #define DELTA(x,y) ((y)-(x)+((y)<(x)?1193182/HZ:0)) #define GET_TIME(x) do { x = get_time_pit(); } while (0) static unsigned int get_time_pit(void) { - extern spinlock_t i8253_lock; unsigned long flags; unsigned int count; --- linux/drivers/scsi/scsi_error.c.orig +++ linux/drivers/scsi/scsi_error.c @@ -477,10 +477,12 @@ static void scsi_eh_done(struct scsi_cmn static int scsi_send_eh_cmnd(struct scsi_cmnd *scmd, int timeout) { struct Scsi_Host *host = scmd->device->host; - DECLARE_MUTEX_LOCKED(sem); + DECLARE_MUTEX_NOCHECK(sem); unsigned long flags; int rtn = SUCCESS; + down(&sem); + /* * we will use a queued command if possible, otherwise we will * emulate the queuing and calling of completion function ourselves. 
@@ -1624,8 +1626,9 @@ int scsi_error_handler(void *data) { struct Scsi_Host *shost = (struct Scsi_Host *) data; int rtn; - DECLARE_MUTEX_LOCKED(sem); + DECLARE_MUTEX_NOCHECK(sem); + down(&sem); /* * Flush resources */ --- linux/drivers/scsi/aha152x.c.orig +++ linux/drivers/scsi/aha152x.c @@ -1160,11 +1160,13 @@ static void timer_expired(unsigned long static int aha152x_device_reset(Scsi_Cmnd * SCpnt) { struct Scsi_Host *shpnt = SCpnt->device->host; - DECLARE_MUTEX_LOCKED(sem); + DECLARE_MUTEX_NOCHECK(sem); struct timer_list timer; int ret, issued, disconnected; unsigned long flags; + down(&sem); + #if defined(AHA152X_DEBUG) if(HOSTDATA(shpnt)->debug & debug_eh) { printk(INFO_LEAD "aha152x_device_reset(%p)", CMDINFO(SCpnt), SCpnt); --- linux/drivers/scsi/qla2xxx/qla_os.c.orig +++ linux/drivers/scsi/qla2xxx/qla_os.c @@ -3190,7 +3190,7 @@ qla2x00_free_sp_pool( scsi_qla_host_t *h static int qla2x00_do_dpc(void *data) { - DECLARE_MUTEX_LOCKED(sem); + DECLARE_MUTEX_NOCHECK(sem); scsi_qla_host_t *ha; fc_port_t *fcport; os_lun_t *q; @@ -3204,6 +3204,8 @@ qla2x00_do_dpc(void *data) int t; os_tgt_t *tq; + down(&sem); + ha = (scsi_qla_host_t *)data; lock_kernel(); --- linux/drivers/ieee1394/raw1394.c.orig +++ linux/drivers/ieee1394/raw1394.c @@ -2529,7 +2529,7 @@ static int raw1394_open(struct inode *in fi->state = opened; INIT_LIST_HEAD(&fi->req_pending); INIT_LIST_HEAD(&fi->req_complete); - sema_init(&fi->complete_sem, 0); + sema_init_nocheck(&fi->complete_sem, 0); spin_lock_init(&fi->reqlists_lock); init_waitqueue_head(&fi->poll_wait_complete); INIT_LIST_HEAD(&fi->addr_list); --- linux/drivers/ieee1394/ieee1394_core.c.orig +++ linux/drivers/ieee1394/ieee1394_core.c @@ -1003,7 +1003,7 @@ void abort_timedouts(unsigned long __opa static int khpsbpkt_pid = -1, khpsbpkt_kill; static DECLARE_COMPLETION(khpsbpkt_complete); struct sk_buff_head hpsbpkt_queue; -static DECLARE_MUTEX_LOCKED(khpsbpkt_sig); +static DECLARE_MUTEX_NOCHECK(khpsbpkt_sig); static void 
queue_packet_complete(struct hpsb_packet *packet) @@ -1059,6 +1059,8 @@ static int __init ieee1394_init(void) { int i, ret; + down(&khpsbpkt_sig); // initialize as locked + skb_queue_head_init(&hpsbpkt_queue); /* non-fatal error */ --- linux/drivers/ieee1394/ieee1394_types.h.orig +++ linux/drivers/ieee1394/ieee1394_types.h @@ -28,7 +28,7 @@ do { \ spin_lock_init(&(_tp)->lock); \ (_tp)->next = 0; \ (_tp)->allocations = 0; \ - sema_init(&(_tp)->count, 63); \ + sema_init_nocheck(&(_tp)->count, 63); \ } while (0) --- linux/drivers/ieee1394/nodemgr.c.orig +++ linux/drivers/ieee1394/nodemgr.c @@ -1664,7 +1664,7 @@ static void nodemgr_add_host(struct hpsb hi->host = host; init_completion(&hi->exited); - sema_init(&hi->reset_sem, 0); + sema_init_nocheck(&hi->reset_sem, 0); sprintf(hi->daemon_name, "knodemgrd_%d", host->id); --- linux/drivers/video/console/vgacon.c.orig +++ linux/drivers/video/console/vgacon.c @@ -53,7 +53,7 @@ #include