7c91767a6b
Patch from Dmitry Mishin <dim@openvz.org>: Replace add_timer() by mod_timer() in dst_run_gc in order to avoid BUG message. CPU1 CPU2 dst_run_gc() entered dst_run_gc() entered spin_lock(&dst_lock) ..... del_timer(&dst_gc_timer) fail to get lock .... mod_timer() <--- puts timer back to the list add_timer(&dst_gc_timer) <--- BUG because timer is in list already. Found during OpenVZ internal testing. At first we thought that it is OpenVZ specific as we added dst_run_gc(0) call in dst_dev_event(), but as Alexey pointed to me it is possible to trigger this condition in mainstream kernel. F.e. timer has fired on CPU2, but the handler was preeempted by an irq before dst_lock is tried. Meanwhile, someone on CPU1 adds an entry to gc list and starts the timer. If CPU2 was preempted long enough, this timer can expire simultaneously with resuming timer handler on CPU1, arriving exactly to the situation described. Signed-off-by: Dmitry Mishin <dim@openvz.org> Signed-off-by: Kirill Korotaev <dev@openvz.org> Signed-off-by: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> Signed-off-by: David S. Miller <davem@davemloft.net>
283 lines
6.2 KiB
C
283 lines
6.2 KiB
C
/*
|
|
* net/core/dst.c Protocol independent destination cache.
|
|
*
|
|
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
|
*
|
|
*/
|
|
|
|
#include <linux/bitops.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/init.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/module.h>
|
|
#include <linux/netdevice.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/skbuff.h>
|
|
#include <linux/string.h>
|
|
#include <linux/types.h>
|
|
|
|
#include <net/dst.h>
|
|
|
|
/* Locking strategy:
|
|
* 1) Garbage collection state of dead destination cache
|
|
* entries is protected by dst_lock.
|
|
* 2) GC is run only from BH context, and is the only remover
|
|
* of entries.
|
|
* 3) Entries are added to the garbage list from both BH
|
|
* and non-BH context, so local BH disabling is needed.
|
|
* 4) All operations modify state, so a spinlock is used.
|
|
*/
|
|
static struct dst_entry *dst_garbage_list;
|
|
#if RT_CACHE_DEBUG >= 2
|
|
static atomic_t dst_total = ATOMIC_INIT(0);
|
|
#endif
|
|
static DEFINE_SPINLOCK(dst_lock);
|
|
|
|
static unsigned long dst_gc_timer_expires;
|
|
static unsigned long dst_gc_timer_inc = DST_GC_MAX;
|
|
static void dst_run_gc(unsigned long);
|
|
static void ___dst_free(struct dst_entry * dst);
|
|
|
|
static DEFINE_TIMER(dst_gc_timer, dst_run_gc, DST_GC_MIN, 0);
|
|
|
|
static void dst_run_gc(unsigned long dummy)
|
|
{
|
|
int delayed = 0;
|
|
int work_performed;
|
|
struct dst_entry * dst, **dstp;
|
|
|
|
if (!spin_trylock(&dst_lock)) {
|
|
mod_timer(&dst_gc_timer, jiffies + HZ/10);
|
|
return;
|
|
}
|
|
|
|
del_timer(&dst_gc_timer);
|
|
dstp = &dst_garbage_list;
|
|
work_performed = 0;
|
|
while ((dst = *dstp) != NULL) {
|
|
if (atomic_read(&dst->__refcnt)) {
|
|
dstp = &dst->next;
|
|
delayed++;
|
|
continue;
|
|
}
|
|
*dstp = dst->next;
|
|
work_performed = 1;
|
|
|
|
dst = dst_destroy(dst);
|
|
if (dst) {
|
|
/* NOHASH and still referenced. Unless it is already
|
|
* on gc list, invalidate it and add to gc list.
|
|
*
|
|
* Note: this is temporary. Actually, NOHASH dst's
|
|
* must be obsoleted when parent is obsoleted.
|
|
* But we do not have state "obsoleted, but
|
|
* referenced by parent", so it is right.
|
|
*/
|
|
if (dst->obsolete > 1)
|
|
continue;
|
|
|
|
___dst_free(dst);
|
|
dst->next = *dstp;
|
|
*dstp = dst;
|
|
dstp = &dst->next;
|
|
}
|
|
}
|
|
if (!dst_garbage_list) {
|
|
dst_gc_timer_inc = DST_GC_MAX;
|
|
goto out;
|
|
}
|
|
if (!work_performed) {
|
|
if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
|
|
dst_gc_timer_expires = DST_GC_MAX;
|
|
dst_gc_timer_inc += DST_GC_INC;
|
|
} else {
|
|
dst_gc_timer_inc = DST_GC_INC;
|
|
dst_gc_timer_expires = DST_GC_MIN;
|
|
}
|
|
#if RT_CACHE_DEBUG >= 2
|
|
printk("dst_total: %d/%d %ld\n",
|
|
atomic_read(&dst_total), delayed, dst_gc_timer_expires);
|
|
#endif
|
|
mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);
|
|
|
|
out:
|
|
spin_unlock(&dst_lock);
|
|
}
|
|
|
|
static int dst_discard_in(struct sk_buff *skb)
|
|
{
|
|
kfree_skb(skb);
|
|
return 0;
|
|
}
|
|
|
|
static int dst_discard_out(struct sk_buff *skb)
|
|
{
|
|
kfree_skb(skb);
|
|
return 0;
|
|
}
|
|
|
|
void * dst_alloc(struct dst_ops * ops)
|
|
{
|
|
struct dst_entry * dst;
|
|
|
|
if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {
|
|
if (ops->gc())
|
|
return NULL;
|
|
}
|
|
dst = kmem_cache_alloc(ops->kmem_cachep, SLAB_ATOMIC);
|
|
if (!dst)
|
|
return NULL;
|
|
memset(dst, 0, ops->entry_size);
|
|
atomic_set(&dst->__refcnt, 0);
|
|
dst->ops = ops;
|
|
dst->lastuse = jiffies;
|
|
dst->path = dst;
|
|
dst->input = dst_discard_in;
|
|
dst->output = dst_discard_out;
|
|
#if RT_CACHE_DEBUG >= 2
|
|
atomic_inc(&dst_total);
|
|
#endif
|
|
atomic_inc(&ops->entries);
|
|
return dst;
|
|
}
|
|
|
|
static void ___dst_free(struct dst_entry * dst)
|
|
{
|
|
/* The first case (dev==NULL) is required, when
|
|
protocol module is unloaded.
|
|
*/
|
|
if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
|
|
dst->input = dst_discard_in;
|
|
dst->output = dst_discard_out;
|
|
}
|
|
dst->obsolete = 2;
|
|
}
|
|
|
|
void __dst_free(struct dst_entry * dst)
|
|
{
|
|
spin_lock_bh(&dst_lock);
|
|
___dst_free(dst);
|
|
dst->next = dst_garbage_list;
|
|
dst_garbage_list = dst;
|
|
if (dst_gc_timer_inc > DST_GC_INC) {
|
|
dst_gc_timer_inc = DST_GC_INC;
|
|
dst_gc_timer_expires = DST_GC_MIN;
|
|
mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);
|
|
}
|
|
spin_unlock_bh(&dst_lock);
|
|
}
|
|
|
|
struct dst_entry *dst_destroy(struct dst_entry * dst)
|
|
{
|
|
struct dst_entry *child;
|
|
struct neighbour *neigh;
|
|
struct hh_cache *hh;
|
|
|
|
smp_rmb();
|
|
|
|
again:
|
|
neigh = dst->neighbour;
|
|
hh = dst->hh;
|
|
child = dst->child;
|
|
|
|
dst->hh = NULL;
|
|
if (hh && atomic_dec_and_test(&hh->hh_refcnt))
|
|
kfree(hh);
|
|
|
|
if (neigh) {
|
|
dst->neighbour = NULL;
|
|
neigh_release(neigh);
|
|
}
|
|
|
|
atomic_dec(&dst->ops->entries);
|
|
|
|
if (dst->ops->destroy)
|
|
dst->ops->destroy(dst);
|
|
if (dst->dev)
|
|
dev_put(dst->dev);
|
|
#if RT_CACHE_DEBUG >= 2
|
|
atomic_dec(&dst_total);
|
|
#endif
|
|
kmem_cache_free(dst->ops->kmem_cachep, dst);
|
|
|
|
dst = child;
|
|
if (dst) {
|
|
int nohash = dst->flags & DST_NOHASH;
|
|
|
|
if (atomic_dec_and_test(&dst->__refcnt)) {
|
|
/* We were real parent of this dst, so kill child. */
|
|
if (nohash)
|
|
goto again;
|
|
} else {
|
|
/* Child is still referenced, return it for freeing. */
|
|
if (nohash)
|
|
return dst;
|
|
/* Child is still in his hash table */
|
|
}
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
/* Dirty hack. We did it in 2.2 (in __dst_free),
|
|
* we have _very_ good reasons not to repeat
|
|
* this mistake in 2.3, but we have no choice
|
|
* now. _It_ _is_ _explicit_ _deliberate_
|
|
* _race_ _condition_.
|
|
*
|
|
* Commented and originally written by Alexey.
|
|
*/
|
|
static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
|
|
int unregister)
|
|
{
|
|
if (dst->ops->ifdown)
|
|
dst->ops->ifdown(dst, dev, unregister);
|
|
|
|
if (dev != dst->dev)
|
|
return;
|
|
|
|
if (!unregister) {
|
|
dst->input = dst_discard_in;
|
|
dst->output = dst_discard_out;
|
|
} else {
|
|
dst->dev = &loopback_dev;
|
|
dev_hold(&loopback_dev);
|
|
dev_put(dev);
|
|
if (dst->neighbour && dst->neighbour->dev == dev) {
|
|
dst->neighbour->dev = &loopback_dev;
|
|
dev_put(dev);
|
|
dev_hold(&loopback_dev);
|
|
}
|
|
}
|
|
}
|
|
|
|
static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
|
|
{
|
|
struct net_device *dev = ptr;
|
|
struct dst_entry *dst;
|
|
|
|
switch (event) {
|
|
case NETDEV_UNREGISTER:
|
|
case NETDEV_DOWN:
|
|
spin_lock_bh(&dst_lock);
|
|
for (dst = dst_garbage_list; dst; dst = dst->next) {
|
|
dst_ifdown(dst, dev, event != NETDEV_DOWN);
|
|
}
|
|
spin_unlock_bh(&dst_lock);
|
|
break;
|
|
}
|
|
return NOTIFY_DONE;
|
|
}
|
|
|
|
static struct notifier_block dst_dev_notifier = {
|
|
.notifier_call = dst_dev_event,
|
|
};
|
|
|
|
void __init dst_init(void)
|
|
{
|
|
register_netdevice_notifier(&dst_dev_notifier);
|
|
}
|
|
|
|
EXPORT_SYMBOL(__dst_free);
|
|
EXPORT_SYMBOL(dst_alloc);
|
|
EXPORT_SYMBOL(dst_destroy);
|