Commit graph

10847 commits

Author SHA1 Message Date
Ulrich Drepper
de11defebf reintroduce accept4
Introduce a new accept4() system call.  The addition of this system call
matches analogous changes in 2.6.27 (dup3(), evenfd2(), signalfd4(),
inotify_init1(), epoll_create1(), pipe2()) which added new system calls
that differed from analogous traditional system calls in adding a flags
argument that can be used to access additional functionality.

The accept4() system call is exactly the same as accept(), except that
it adds a flags bit-mask argument.  Two flags are initially implemented.
(Most of the new system calls in 2.6.27 also had both of these flags.)

SOCK_CLOEXEC causes the close-on-exec (FD_CLOEXEC) flag to be enabled
for the new file descriptor returned by accept4().  This is a useful
security feature to avoid leaking information in a multithreaded
program where one thread is doing an accept() at the same time as
another thread is doing a fork() plus exec().  More details here:
http://udrepper.livejournal.com/20407.html "Secure File Descriptor Handling",
Ulrich Drepper).

The other flag is SOCK_NONBLOCK, which causes the O_NONBLOCK flag
to be enabled on the new open file description created by accept4().
(This flag is merely a convenience, saving the use of additional calls
fcntl(F_GETFL) and fcntl (F_SETFL) to achieve the same result.

Here's a test program.  Works on x86-32.  Should work on x86-64, but
I (mtk) don't have a system to hand to test with.

It tests accept4() with each of the four possible combinations of
SOCK_CLOEXEC and SOCK_NONBLOCK set/clear in 'flags', and verifies
that the appropriate flags are set on the file descriptor/open file
description returned by accept4().

I tested Ulrich's patch in this thread by applying against 2.6.28-rc2,
and it passes according to my test program.

/* test_accept4.c

  Copyright (C) 2008, Linux Foundation, written by Michael Kerrisk
       <mtk.manpages@gmail.com>

  Licensed under the GNU GPLv2 or later.
*/
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

#define PORT_NUM 33333

#define die(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0)

/**********************************************************************/

/* The following is what we need until glibc gets a wrapper for
  accept4() */

/* Flags for socket(), socketpair(), accept4() */
#ifndef SOCK_CLOEXEC
#define SOCK_CLOEXEC    O_CLOEXEC
#endif
#ifndef SOCK_NONBLOCK
#define SOCK_NONBLOCK   O_NONBLOCK
#endif

#ifdef __x86_64__
#define SYS_accept4 288
#elif __i386__
#define USE_SOCKETCALL 1
#define SYS_ACCEPT4 18
#else
#error "Sorry -- don't know the syscall # on this architecture"
#endif

static int
accept4(int fd, struct sockaddr *sockaddr, socklen_t *addrlen, int flags)
{
   printf("Calling accept4(): flags = %x", flags);
   if (flags != 0) {
       printf(" (");
       if (flags & SOCK_CLOEXEC)
           printf("SOCK_CLOEXEC");
       if ((flags & SOCK_CLOEXEC) && (flags & SOCK_NONBLOCK))
           printf(" ");
       if (flags & SOCK_NONBLOCK)
           printf("SOCK_NONBLOCK");
       printf(")");
   }
   printf("\n");

#if USE_SOCKETCALL
   long args[6];

   args[0] = fd;
   args[1] = (long) sockaddr;
   args[2] = (long) addrlen;
   args[3] = flags;

   return syscall(SYS_socketcall, SYS_ACCEPT4, args);
#else
   return syscall(SYS_accept4, fd, sockaddr, addrlen, flags);
#endif
}

/**********************************************************************/

static int
do_test(int lfd, struct sockaddr_in *conn_addr,
       int closeonexec_flag, int nonblock_flag)
{
   int connfd, acceptfd;
   int fdf, flf, fdf_pass, flf_pass;
   struct sockaddr_in claddr;
   socklen_t addrlen;

   printf("=======================================\n");

   connfd = socket(AF_INET, SOCK_STREAM, 0);
   if (connfd == -1)
       die("socket");
   if (connect(connfd, (struct sockaddr *) conn_addr,
               sizeof(struct sockaddr_in)) == -1)
       die("connect");

   addrlen = sizeof(struct sockaddr_in);
   acceptfd = accept4(lfd, (struct sockaddr *) &claddr, &addrlen,
                      closeonexec_flag | nonblock_flag);
   if (acceptfd == -1) {
       perror("accept4()");
       close(connfd);
       return 0;
   }

   fdf = fcntl(acceptfd, F_GETFD);
   if (fdf == -1)
       die("fcntl:F_GETFD");
   fdf_pass = ((fdf & FD_CLOEXEC) != 0) ==
              ((closeonexec_flag & SOCK_CLOEXEC) != 0);
   printf("Close-on-exec flag is %sset (%s); ",
           (fdf & FD_CLOEXEC) ? "" : "not ",
           fdf_pass ? "OK" : "failed");

   flf = fcntl(acceptfd, F_GETFL);
   if (flf == -1)
       die("fcntl:F_GETFD");
   flf_pass = ((flf & O_NONBLOCK) != 0) ==
              ((nonblock_flag & SOCK_NONBLOCK) !=0);
   printf("nonblock flag is %sset (%s)\n",
           (flf & O_NONBLOCK) ? "" : "not ",
           flf_pass ? "OK" : "failed");

   close(acceptfd);
   close(connfd);

   printf("Test result: %s\n", (fdf_pass && flf_pass) ? "PASS" : "FAIL");
   return fdf_pass && flf_pass;
}

static int
create_listening_socket(int port_num)
{
   struct sockaddr_in svaddr;
   int lfd;
   int optval;

   memset(&svaddr, 0, sizeof(struct sockaddr_in));
   svaddr.sin_family = AF_INET;
   svaddr.sin_addr.s_addr = htonl(INADDR_ANY);
   svaddr.sin_port = htons(port_num);

   lfd = socket(AF_INET, SOCK_STREAM, 0);
   if (lfd == -1)
       die("socket");

   optval = 1;
   if (setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &optval,
                  sizeof(optval)) == -1)
       die("setsockopt");

   if (bind(lfd, (struct sockaddr *) &svaddr,
            sizeof(struct sockaddr_in)) == -1)
       die("bind");

   if (listen(lfd, 5) == -1)
       die("listen");

   return lfd;
}

int
main(int argc, char *argv[])
{
   struct sockaddr_in conn_addr;
   int lfd;
   int port_num;
   int passed;

   passed = 1;

   port_num = (argc > 1) ? atoi(argv[1]) : PORT_NUM;

   memset(&conn_addr, 0, sizeof(struct sockaddr_in));
   conn_addr.sin_family = AF_INET;
   conn_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
   conn_addr.sin_port = htons(port_num);

   lfd = create_listening_socket(port_num);

   if (!do_test(lfd, &conn_addr, 0, 0))
       passed = 0;
   if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, 0))
       passed = 0;
   if (!do_test(lfd, &conn_addr, 0, SOCK_NONBLOCK))
       passed = 0;
   if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, SOCK_NONBLOCK))
       passed = 0;

   close(lfd);

   exit(passed ? EXIT_SUCCESS : EXIT_FAILURE);
}

[mtk.manpages@gmail.com: rewrote changelog, updated test program]
Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: <linux-api@vger.kernel.org>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-11-19 18:49:57 -08:00
Eric Dumazet
6b41e7dd90 net: af_unix should use KERN_INFO instead of KERN_DEBUG
As spotted by Joe Perches, we should use KERN_INFO in unix_sock_destructor()

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-19 15:48:09 -08:00
Joe Perches
07f0757a68 include/net net/ - csum_partial - remove unnecessary casts
The first argument to csum_partial is const void *
casts to char/u8 * are not necessary

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-19 15:44:53 -08:00
Eric Dumazet
a7a0d6a87b net: inet_diag_handler structs can be const
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-19 15:43:27 -08:00
David S. Miller
b47300168e net: Do not fire linkwatch events until the device is registered.
Several device drivers try to do things like netif_carrier_off()
before register_netdev() is invoked.  This is bogus, but too many
drivers do this to fix them all up in one go.

Reported-by: Folkert van Heusden <folkert@vanheusden.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-19 15:33:54 -08:00
Eric Dumazet
14e943db13 net: make /proc/net/protocols namespace aware
Converting /proc/net/protocols to be namespace aware is quite easy
and permits us to use sock_prot_inuse_get().

This provides seperate counters for each protocol. For example
we can really count TCPv6 sockets and TCPv4 sockets, while previously,
we had the same value, and this value was not namespace aware.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-19 15:14:01 -08:00
Eric Dumazet
3680453c8b net: af_packet should update its inuse counter
This patch is a preparation to namespace conversion of /proc/net/protocols

In order to have relevant information for PACKET protocols, we should use
sock_prot_inuse_add() to update a (percpu and pernamespace) counter of
inuse sockets.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-19 14:25:35 -08:00
Alexey Dobriyan
566521d637 phonet: fix compilation with gcc-3.4
CC [M]  net/phonet/af_phonet.o
net/phonet/af_phonet.c: In function `pn_socket_create':
net/phonet/af_phonet.c:38: sorry, unimplemented: inlining failed in call to 'phonet_proto_put': function body not available
net/phonet/af_phonet.c:99: sorry, unimplemented: called from here
make[3]: *** [net/phonet/af_phonet.o] Error 1

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-19 14:17:41 -08:00
Robert Olsson
bfdbc0acad pktgen: fix multiple queue warning
As number of TX queues in unrelated to number of CPU's we remove this test
and just make sure nxtq never gets exceeded.

Signed-off-by: Robert Olsson <robert.olsson@its.uu.se>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-19 14:09:47 -08:00
Benjamin Thery
c3e388964b net: fix ip_mr_init() error path
Similarly to IPv6 ip6_mr_init() (fixed last week), the order of cleanup
operations in the error/exit section of ip_mr_init() is completely 
inversed. It should be the other way around.
Also a del_timer() is missing in the error path.

I should have guessed last week that this same error existed in ipmr.c
too, as ip6mr.c is largely inspired by ipmr.c.

Signed-off-by: Benjamin Thery <benjamin.thery@bull.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-19 14:07:41 -08:00
David S. Miller
198d6ba4d7 Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
Conflicts:

	drivers/isdn/i4l/isdn_net.c
	fs/cifs/connect.c
2008-11-18 23:38:23 -08:00
Johannes Berg
8e3bad65a5 mac80211: remove ieee80211_notify_mac
Before ieee80211_notify_mac() was added, it was presented with the
use case of using it to tell mac80211 that the association may
have been lost because the firmware crashed/reset.

Since then, it has also been used by iwlwifi to (slightly) speed
up re-association after resume, a workaround around the fact that
mac80211 has no suspend/resume handling yet. It is also not used
by any other drivers, so clearly it cannot be necessary for "good
enough" suspend/resume.

Unfortunately, the callback suffers from a severe problem: It only
works for station mode. If suspend/resume happens while in IBSS or
any other mode (but station), then the callback is pointless.

Recently, it has created a number of locking issues, first because
it required rtnl locking rather than RCU due to calling sleeping
functions within the critical section, and now because it's called
by iwlwifi from the mac80211 workqueue that may not use the rtnl
because it is flushed under rtnl.
(cf. http://bugzilla.kernel.org/show_bug.cgi?id=12046)

I think, therefore, that we should take a step back, remove it
entirely for now and add the small feature it provided properly.
For suspend and resume we will need to introduce new hooks, and for
the case where the firmware was reset the driver will probably
simply just pretend it has done a suspend/resume cycle to get
mac80211 to reprogram the hardware completely, not just try to
connect to the current AP again in station mode. When doing so, we
will need to take into account locking issues and possibly defer
to schedule_work from within mac80211 for the resume operation,
while the suspend operation must be done directly.

Proper suspend/resume should also not necessarily try to reconnect
to the current AP, the time spent in suspend may have been short
enough to not be disconnected from the AP, mac80211 will detect
that the AP went out of range quickly if it did, and if the
association is lost then the AP will disassoc as soon as a data
frame is sent. We might also take into account WWOL then, and
have mac80211 program the hardware into such a mode where it is
available and requested.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-18 17:26:26 -05:00
Eric Dumazet
9a57f7fabd net: sctp should update its inuse counter
This patch is a preparation to namespace conversion of /proc/net/protocols

In order to have relevant information for SCTP protocols, we should use
sock_prot_inuse_add() to update a (percpu and pernamespace) counter of
inuse sockets.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-17 02:41:00 -08:00
Eric Dumazet
a8076d8db9 net: af_unix should update its inuse counter
This patch is a preparation to namespace conversion of /proc/net/protocols

In order to have relevant information for UNIX protocol, we should use
sock_prot_inuse_add() to update a (percpu and pernamespace) counter of
inuse sockets.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-17 02:38:49 -08:00
Eric Dumazet
248969ae31 net: af_unix can make unix_nr_socks visbile in /proc
Currently, /proc/net/protocols displays socket counts only for TCP/TCPv6
protocols

We can provide unix_nr_socks for free here, this counter being
already maintained in af_unix

Before patch :

# grep UNIX /proc/net/protocols
UNIX       428     -1      -1   NI       0   yes  kernel

After patch :

# grep UNIX /proc/net/protocols
UNIX       428     98      -1   NI       0   yes  kernel

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-17 00:00:30 -08:00
Johannes Berg
5f9021cfdc rtnetlink: propagate error from dev_change_flags in do_setlink()
Unlike ifconfig, iproute doesn't report an error when setting
an interface up fails:

(example: put wireless network mac80211 interface into repeater mode
with iwconfig but do not set a peer MAC address, it should fail with
-ENOLINK)

without patch:
# ip link set wlan0 up ; echo $?
0
# 

with patch:
# ip link set wlan0 up ; echo $?
RTNETLINK answers: Link has been severed
2
# 

Propagate the return value from dev_change_flags() to fix this.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Tested-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-16 23:20:31 -08:00
Alexey Dobriyan
4d24b52ac5 ematch: simpler tcf_em_unregister()
Simply delete ops from list and let list debugging do the job.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-16 23:01:49 -08:00
Eric Dumazet
6eba6a372b net: Cleanup of af_unix
This is a pure cleanup of net/unix/af_unix.c to meet current code
style standards

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-16 22:58:44 -08:00
Gerrit Renker
1910299636 dccp: Tidy up setsockopt calls
This splits the setsockopt calls into two groups, depending on whether an
integer argument (val) is required and whether routines being called do
their own locking.

Some options (such as setting the CCID) use u8 rather than int, so that for
these the test with regard to integer-sizeof can not be used.

The second switch-case statement now only has those statements which need
locking and which make use of `val'.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Reviewed-by: Eugene Teo <eugeneteo@kernel.sg>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-16 22:56:55 -08:00
Gerrit Renker
dd9c0e363c dccp: Deprecate Ack Ratio sysctl
This patch deprecates the Ack Ratio sysctl, since
 * Ack Ratio is entirely ignored by CCID-3 and CCID-4,
 * Ack Ratio currently doesn't work in CCID-2 (i.e. is always set to 1);
 * even if it would work in CCID-2, there is no point for a user to change it:
   - Ack Ratio is constrained by cwnd (RFC 4341, 6.1.2),
   - if Ack Ratio > cwnd, the system resorts to spurious RTO timeouts
     (since waiting for Acks which will never arrive in this window),
   - cwnd is not a user-configurable value.

The only reasonable place for Ack Ratio is to print it for debugging. It is
planned to do this later on, as part of e.g. dccp_probe.

With this patch Ack Ratio is now under full control of feature negotiation:
 * Ack Ratio is resolved as a dependency of the selected CCID;
 * if the chosen CCID supports it (i.e. CCID == CCID-2), Ack Ratio is set to
   the default of 2, following RFC 4340, 11.3 - "New connections start with Ack
   Ratio 2 for both endpoints";
 * what happens then is part of another patch set, since it concerns the
   dynamic update of Ack Ratio while the connection is in full flight.

Thanks to Tomasz Grobelny for discussion leading up to this patch.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-16 22:55:08 -08:00
Gerrit Renker
2945055984 dccp: Feature negotiation for minimum-checksum-coverage
This provides feature negotiation for server minimum checksum coverage
which so far has been missing.

Since sender/receiver coverage values range only from 0...15, their
type has also been reduced in size from u16 to u4.

Feature-negotiation options are now generated for both sender and receiver
coverage, i.e. when the peer has `forgotten' to enable partial coverage
then feature negotiation will automatically enable (negotiate) the partial
coverage value for this connection.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-16 22:53:48 -08:00
Gerrit Renker
49aebc66d6 dccp: Deprecate old setsockopt framework
The previous setsockopt interface, which passed socket options via struct
dccp_so_feat, is complicated/difficult to use. Continuing to support it leads to
ugly code since the old approach did not distinguish between NN and SP values.

This patch removes the old setsockopt interface and replaces it with two new
functions to register NN/SP values for feature negotiation. 
These are essentially wrappers around the internal __feat_register functions,
with checking added to avoid

 * wrong usage (type);
 * changing values while the connection is in progress.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-16 22:51:23 -08:00
Gerrit Renker
0c1168398e dccp: Mechanism to resolve CCID dependencies
This adds a hook to resolve features whose value depends on the choice of
CCID. It is done at the server since it can only be done after the CCID
values have been negotiated; i.e. the client will add its CCID preference
list on the Change options sent in the Request, which will be reconciled
with the local preference list of the server.

The concept is documented on
http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/feature_negotiation/\
				implementation_notes.html#ccid_dependencies

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-16 22:49:52 -08:00
Alexey Dobriyan
908cd2dabb net: use %pF for /proc/net/ptype
Technically, patch changes format for modules, but I think nobody cares.

	-86dd          :ipv6:ipv6_rcv+0x0
	+86dd          ipv6_rcv+0x0/0x400 [ipv6]

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-16 19:50:35 -08:00
Rémi Denis-Courmont
ebfe92ca65 Phonet: refuse to send bigger than MTU packets
Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-16 19:48:49 -08:00
Eric Dumazet
3ab5aee7fe net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls
RCU was added to UDP lookups, using a fast infrastructure :
- sockets kmem_cache use SLAB_DESTROY_BY_RCU and dont pay the
  price of call_rcu() at freeing time.
- hlist_nulls permits to use few memory barriers.

This patch uses same infrastructure for TCP/DCCP established
and timewait sockets.

Thanks to SLAB_DESTROY_BY_RCU, no slowdown for applications
using short lived TCP connections. A followup patch, converting
rwlocks to spinlocks will even speedup this case.

__inet_lookup_established() is pretty fast now we dont have to
dirty a contended cache line (read_lock/read_unlock)

Only established and timewait hashtable are converted to RCU
(bind table and listen table are still using traditional locking)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-16 19:40:17 -08:00
Eric Dumazet
88ab1932ea udp: Use hlist_nulls in UDP RCU code
This is a straightforward patch, using hlist_nulls infrastructure.

RCUification already done on UDP two weeks ago.

Using hlist_nulls permits us to avoid some memory barriers, both
at lookup time and delete time.

Patch is large because it adds new macros to include/net/sock.h.
These macros will be used by TCP & DCCP in next patch.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-16 19:39:21 -08:00
Balazs Scheidler
e8b2dfe9b4 TPROXY: implemented IP_RECVORIGDSTADDR socket option
In case UDP traffic is redirected to a local UDP socket,
the originally addressed destination address/port
cannot be recovered with the in-kernel tproxy.

This patch adds an IP_RECVORIGDSTADDR sockopt that enables
a IP_ORIGDSTADDR ancillary message in recvmsg(). This
ancillary message contains the original destination address/port
of the packet being received.

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-16 19:32:39 -08:00
Ben Greear
8164f1b797 ipv4: Fix ARP behavior with many mac-vlans
Ben Greear wrote:
> I have 500 mac-vlans on a system talking to 500 other
> mac-vlans.  My problem is that the arp-table gets extremely
> huge because every time an arp-request comes in on all mac-vlans,
> a stale arp entry is added for each mac-vlan.  I have filtering
> turned on, but that doesn't help because the neigh_event_ns call
> below will cause a stale neighbor entry to be created regardless
> of whether a replay will be sent or not.
> Maybe the neigh_event code should be below the checks for dont_send,
> and only create check neigh_event_ns if we are !dont_send?

The attached patch makes it work much better for me.  The patch
will cause the code to NOT create a stale neighbor entry if we
are not going to respond to the ARP request.  The old code
*would* create a stale entry even if we are not going to respond.

Signed-off-by: Ben Greear <greearb@candelatech.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-16 19:19:38 -08:00
Pavel Emelyanov
5421ae0153 scm: fix scm_fp_list->list initialization made in wrong place
This is the next page of the scm recursion story (the commit 
f8d570a4 net: Fix recursive descent in __scm_destroy()).

In function scm_fp_dup(), the INIT_LIST_HEAD(&fpl->list) of newly
created fpl is done *before* the subsequent memcpy from the old 
structure and thus the freshly initialized list is overwritten.

But that's OK, since this initialization is not required at all,
since the fpl->list is list_add-ed at the destruction time in any
case (and is unused in other code), so I propose to drop both
initializations, rather than moving it after the memcpy.

Please, correct me if I miss something significant.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-14 14:51:45 -08:00
Eric Dumazet
ef711cf1d1 net: speedup dst_release()
During tbench/oprofile sessions, I found that dst_release() was in third position.

CPU: Core 2, speed 2999.68 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (Unhalted core cycles) count 100000
samples  %        symbol name
483726    9.0185  __copy_user_zeroing_intel
191466    3.5697  __copy_user_intel
185475    3.4580  dst_release
175114    3.2648  ip_queue_xmit
153447    2.8608  tcp_sendmsg
108775    2.0280  tcp_recvmsg
102659    1.9140  sysenter_past_esp
101450    1.8914  tcp_current_mss
95067     1.7724  __copy_from_user_ll
86531     1.6133  tcp_transmit_skb

Of course, all CPUS fight on the dst_entry associated with 127.0.0.1 

Instead of first checking the refcount value, then decrement it,
we use atomic_dec_return() to help CPU to make the right memory transaction
(ie getting the cache line in exclusive mode)

dst_release() is now at the fifth position, and tbench a litle bit faster ;)

CPU: Core 2, speed 3000.1 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (Unhalted core cycles) count 100000
samples  %        symbol name
647107    8.8072  __copy_user_zeroing_intel
258840    3.5229  ip_queue_xmit
258302    3.5155  __copy_user_intel
209629    2.8531  tcp_sendmsg
165632    2.2543  dst_release
149232    2.0311  tcp_current_mss
147821    2.0119  tcp_recvmsg
137893    1.8767  sysenter_past_esp
127473    1.7349  __copy_from_user_ll
121308    1.6510  ip_finish_output
118510    1.6129  tcp_transmit_skb
109295    1.4875  tcp_v4_rcv

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-14 00:53:54 -08:00
Ingo Molnar
e8f6fbf62d lockdep: include/linux/lockdep.h - fix warning in net/bluetooth/af_bluetooth.c
fix this warning:

  net/bluetooth/af_bluetooth.c:60: warning: ‘bt_key_strings’ defined but not used
  net/bluetooth/af_bluetooth.c:71: warning: ‘bt_slock_key_strings’ defined but not used

this is a lockdep macro problem in the !LOCKDEP case.

We cannot convert it to an inline because the macro works on multiple types,
but we can mark the parameter used.

[ also clean up a misaligned tab in sock_lock_init_class_and_name() ]

[ also remove #ifdefs from around af_family_clock_key strings - which
  were certainly added to get rid of the ugly build warnings. ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-13 23:19:10 -08:00
Jarek Poplawski
f30ab418a1 pkt_sched: Remove qdisc->ops->requeue() etc.
After implementing qdisc->ops->peek() and changing sch_netem into
classless qdisc there are no more qdisc->ops->requeue() users. This
patch removes this method with its wrappers (qdisc_requeue()), and
also unused qdisc->requeue structure. There are a few minor fixes of
warnings (htb_enqueue()) and comments btw.

The idea to kill ->requeue() and a similar patch were first developed
by David S. Miller.

Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-13 22:56:30 -08:00
Wang Chen
524ad0a791 netdevice: safe convert to netdev_priv() #part-4
We have some reasons to kill netdev->priv:
1. netdev->priv is equal to netdev_priv().
2. netdev_priv() wraps the calculation of netdev->priv's offset, obviously
   netdev_priv() is more flexible than netdev->priv.
But we cann't kill netdev->priv, because so many drivers reference to it
directly.

This patch is a safe convert for netdev->priv to netdev_priv(netdev).
Since all of the netdev->priv is only for read.
But it is too big to be sent in one mail.
I split it to 4 parts and make every part smaller than 100,000 bytes,
which is max size allowed by vger.

Signed-off-by: Wang Chen <wangchen@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-12 23:39:10 -08:00
Randy Dunlap
1fa989e80a 9p: restrict RDMA usage
Make 9p's RDMA option depend on INET since it uses Infiniband rdma_*
functions and that code depends on INET.  Otherwise 9p can try to
use symbols which don't exist.

ERROR: "rdma_destroy_id" [net/9p/9pnet_rdma.ko] undefined!
ERROR: "rdma_connect" [net/9p/9pnet_rdma.ko] undefined!
ERROR: "rdma_create_id" [net/9p/9pnet_rdma.ko] undefined!
ERROR: "rdma_create_qp" [net/9p/9pnet_rdma.ko] undefined!
ERROR: "rdma_resolve_route" [net/9p/9pnet_rdma.ko] undefined!
ERROR: "rdma_disconnect" [net/9p/9pnet_rdma.ko] undefined!
ERROR: "rdma_resolve_addr" [net/9p/9pnet_rdma.ko] undefined!

I used an if/endif block so that the menu items would remain
presented together.

Also correct an article adjective.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-12 23:33:57 -08:00
Arnaud Ebalard
7a12122c7a net: Remove unused parameter of xfrm_gen_index()
In commit 2518c7c2b3 ("[XFRM]: Hash
policies when non-prefixed."), the last use of xfrm_gen_policy() first
argument was removed, but the argument was left behind in the
prototype.

Signed-off-by: Arnaud Ebalard <arno@natisbad.org>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-12 23:28:15 -08:00
Alexey Dobriyan
9c0188acf6 net: shy netns_ok check
Failure to pass netns_ok check is SILENT, except some MIB counter is
incremented somewhere.

And adding "netns_ok = 1" (after long head-scratching session) is
usually the last step in making some protocol netns-ready...

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-12 23:23:51 -08:00
Brian Haley
6e093d9dff ipv6: routing header fixes
This patch fixes two bugs:

1. setsockopt() of anything but a Type 2 routing header should return
EINVAL instead of EPERM.  Noticed by Shan Wei
(shanwei@cn.fujitsu.com).

2. setsockopt()/sendmsg() of a Type 2 routing header with invalid
length or segments should return EINVAL.  These values are statically
fixed in RFC 3775, unlike the variable Type 0 was.

Signed-off-by: Brian Haley <brian.haley@hp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-12 22:59:21 -08:00
David S. Miller
ddd535c713 Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6 2008-11-12 14:37:29 -08:00
Johannes Berg
db7fb86b0c mac80211: fix notify_mac function
The ieee80211_notify_mac() function uses ieee80211_sta_req_auth() which
in turn calls ieee80211_set_disassoc() which calls a few functions that
need to be able to sleep, so ieee80211_notify_mac() cannot use RCU
locking for the interface list and must use rtnl locking instead.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-12 16:49:53 -05:00
Patrick Ohly
d35aac10eb net: put_cmsg_compat + SO_TIMESTAMP[NS]: use same name for value as caller
In __sock_recv_timestamp() the additional SCM_TIMESTAMP[NS] is used. This
has the same value as SO_TIMESTAMP[NS], so this is a purely cosmetic change.

Signed-off-by: Patrick Ohly <patrick.ohly@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-12 01:54:56 -08:00
Doug Leith
8f65b5354b tcp_htcp: last_cong bug fix
This patch fixes a minor bug in tcp_htcp.c which has been
highlighted by Lachlan Andrew and Lawrence Stewart.  Currently, the
time since the last congestion event, which is stored in variable
last_cong, is reset whenever there is a state change into
TCP_CA_Open.  This includes transitions of the type
TCP_CA_Open->TCP_CA_Disorder->TCP_CA_Open which are not associated
with backoff of cwnd.  The patch changes last_cong to be updated
only on transitions into TCP_CA_Open that occur after experiencing
the congestion-related states TCP_CA_Loss, TCP_CA_Recovery,
TCP_CA_CWR.

Signed-off-by: Doug Leith <doug.leith@nuim.ie>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-12 01:41:09 -08:00
Eric Dumazet
e42ea986e4 net: Cleanup of neighbour code
Using read_pnet() and write_pnet() in neighbour code ease the reading
of code.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-12 00:54:54 -08:00
Eric Dumazet
7a9546ee35 net: ib_net pointer should depends on CONFIG_NET_NS
We can shrink size of "struct inet_bind_bucket" by 50%, using
read_pnet() and write_pnet()

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-12 00:54:20 -08:00
Gerrit Renker
9eca0a47de dccp: Resolve dependencies of features on choice of CCID
This provides a missing link in the code chain, as several features implicitly
depend and/or rely on the choice of CCID. Most notably, this is the Send Ack Vector
feature, but also Ack Ratio and Send Loss Event Rate (also taken care of).

For Send Ack Vector, the situation is as follows:
 * since CCID2 mandates the use of Ack Vectors, there is no point in allowing 
   endpoints which use CCID2 to disable Ack Vector features such a connection;

 * a peer with a TX CCID of CCID2 will always expect Ack Vectors, and a peer
   with a RX CCID of CCID2 must always send Ack Vectors (RFC 4341, sec. 4);

 * for all other CCIDs, the use of (Send) Ack Vector is optional and thus
   negotiable. However, this implies that the code negotiating the use of Ack
   Vectors also supports it (i.e. is able to supply and to either parse or
   ignore received Ack Vectors). Since this is not the case (CCID-3 has no Ack
   Vector support), the use of Ack Vectors is here disabled, with a comment
   in the source code.

An analogous consideration arises for the Send Loss Event Rate feature,
since the CCID-3 implementation does not support the loss interval options
of RFC 4342. To make such use explicit, corresponding feature-negotiation
options are inserted which signal the use of the loss event rate option,
as it is used by the CCID3 code.

Lastly, the values of the Ack Ratio feature are matched to the choice of CCID.

The patch implements this as a function which is called after the user has
made all other registrations for changing default values of features.

The table is variable-length, the reserved (and hence for feature-negotiation
invalid, confirmed by considering section 19.4 of RFC 4340) feature number `0'
is used to mark the end of the table.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-12 00:48:44 -08:00
Gerrit Renker
d90ebcbfa7 dccp: Query supported CCIDs
This provides a data structure to record which CCIDs are locally supported
and three accessor functions:
 - a test function for internal use which is used to validate CCID requests
   made by the user;
 - a copy function so that the list can be used for feature-negotiation;   
 - documented getsockopt() support so that the user can query capabilities.

The data structure is a table which is filled in at compile-time with the
list of available CCIDs (which in turn depends on the Kconfig choices).

Using the copy function for cloning the list of supported CCIDs is useful for
feature negotiation, since the negotiation is now with the full list of available
CCIDs (e.g. {2, 3}) instead of the default value {2}. This means negotiation 
will not fail if the peer requests to use CCID3 instead of CCID2. 

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-12 00:47:26 -08:00
Gerrit Renker
e8ef967a54 dccp: Registration routines for changing feature values
Two registration routines, for SP and NN features, are provided by this patch,
replacing a previous routine which was used for both feature types.

These are internal-only routines and therefore start with `__feat_register'.

It further exports the known limits of Sequence Window and Ack Ratio as symbolic
constants.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-12 00:43:40 -08:00
Gerrit Renker
f74e91b6cc dccp: Limit feature negotiation to connection setup phase
This patch limits feature (capability) negotation to the connection setup phase:

 1. Although it is theoretically possible to perform feature negotiation at any
    time (and RFC 4340 supports this), in practice this is prohibitively complex,
    as it requires to put traffic on hold for each new negotiation.
 2. As a byproduct of restricting feature negotiation to connection setup, the
    feature-negotiation retransmit timer is no longer required. This part is now
    mapped onto the protocol-level retransmission.
    Details indicating why timers are no longer needed can be found on
    http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/feature_negotiation/\
	                                      implementation_notes.html

This patch disables anytime negotiation, subsequent patches work out full
feature negotiation support for connection setup.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-12 00:42:58 -08:00
Alexey Dobriyan
6bb3ce25d0 net: remove struct dst_entry::entry_size
Unused after kmem_cache_zalloc() conversion.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-11 17:25:22 -08:00
Alexey Dobriyan
9b739ba5e6 net: remove struct neigh_table::pde
->pde isn't actually needed, since name is stashed in ->id.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-11 16:47:44 -08:00
David S. Miller
7e452baf6b Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
Conflicts:

	drivers/message/fusion/mptlan.c
	drivers/net/sfc/ethtool.c
	net/mac80211/debugfs_sta.c
2008-11-11 15:43:02 -08:00
Linus Torvalds
0a4cf2c878 Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6:
  dsa: fix master interface allmulti/promisc handling
  dsa: fix skb->pkt_type when mac address of slave interface differs
  net: fix setting of skb->tail in skb_recycle_check()
  net: fix /proc/net/snmp as memory corruptor
  mac80211: fix a buffer overrun in station debug code
  netfilter: payload_len is be16, add size of struct rather than size of pointer
  ipv6: fix ip6_mr_init error path
  [4/4] dca: fixup initialization dependency
  [3/4] I/OAT: fix async_tx.callback checking
  [2/4] I/OAT: fix dma_pin_iovec_pages() error handling
  [1/4] I/OAT: fix channel resources free for not allocated channels
  ssb: Fix DMA-API compilation for non-PCI systems
  SSB: hide empty sub menu
  vlan: Fix typos in proc output string
  [netdrvr] usb/hso: Cleanup rfkill error handling
  sfc: Correct address of gPXE boot configuration in EEPROM
  el3_common_init() should be __devinit, not __init
  hso: rfkill type should be WWAN
  mlx4_en: Start port error flow bug fix
  af_key: mark policy as dead before destroying
2008-11-11 09:20:29 -08:00
Lennert Buytenhek
df02c6ff2e dsa: fix master interface allmulti/promisc handling
Before commit b6c40d68ff ("net: only
invoke dev->change_rx_flags when device is UP"), the dsa driver could
sort-of get away with only fiddling with the master interface's
allmulti/promisc counts in ->change_rx_flags() and not touching them
in ->open() or ->stop().  After this commit (note that it was merged
almost simultaneously with the dsa patches, which is why this wasn't
caught initially), the breakage that was already there became more
apparent.

Since it makes no sense to keep the master interface's allmulti or
promisc count pinned for a slave interface that is down, copy the vlan
driver's sync logic (which does exactly what we want) over to dsa to
fix this.

Bug report from Dirk Teurlings <dirk@upexia.nl> and Peter van Valderen
<linux@ddcrew.com>.

Signed-off-by: Lennert Buytenhek <buytenh@marvell.com>
Tested-by: Dirk Teurlings <dirk@upexia.nl>
Tested-by: Peter van Valderen <linux@ddcrew.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-10 21:53:12 -08:00
Lennert Buytenhek
14ee6742b1 dsa: fix skb->pkt_type when mac address of slave interface differs
When a dsa slave interface has a mac address that differs from that
of the master interface, eth_type_trans() won't explicitly set
skb->pkt_type back to PACKET_HOST -- we need to do this ourselves
before calling eth_type_trans().

Signed-off-by: Lennert Buytenhek <buytenh@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-10 21:52:42 -08:00
Lennert Buytenhek
5cd33db212 net: fix setting of skb->tail in skb_recycle_check()
Since skb_reset_tail_pointer() reads skb->data, we need to set
skb->data before calling skb_reset_tail_pointer().  This was causing
spurious skb_over_panic()s from skb_put() being called on a recycled
skb that had its skb->tail set to beyond where it should have been.

Bug report from Peter van Valderen <linux@ddcrew.com>.

Signed-off-by: Lennert Buytenhek <buytenh@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-10 21:45:05 -08:00
Eric Dumazet
b971e7ac83 net: fix /proc/net/snmp as memory corruptor
icmpmsg_put() can happily corrupt kernel memory, using a static
table and forgetting to reset an array index in a loop.

Remove the static array since its not safe without proper locking.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-10 21:43:08 -08:00
Jianjun Kong
013cd39753 mac80211: fix a buffer overrun in station debug code
net/mac80211/debugfs_sta.c
The trailing zero was written to state[4], it's out of bounds.

Signed-off-by: Jianjun Kong <jianjun@zeuux.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-10 21:37:39 -08:00
Jesse Brandeburg
eb37b41cc2 pktgen: add full reset functionality
While testing pktgen, I found that sometimes my configurations from
previous runs would be left over, particularly when going from a test
with 8 threads down to a test with 4 threads.

This adds new functionality to pktgen where you can call
pgset "reset"

and it will be just like you just insmod'ed pktgen again.

Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Robert Olsson <robert.olsson@its.uu.se>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-10 16:48:03 -08:00
Harvey Harrison
b7b45f47d6 netfilter: payload_len is be16, add size of struct rather than size of pointer
payload_len is a be16 value, not cpu_endian, also the size of a ponter
to a struct ipv6hdr was being added, not the size of the struct itself.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-10 16:46:06 -08:00
Benjamin Thery
87b30a6530 ipv6: fix ip6_mr_init error path
The order of cleanup operations in the error/exit section of ip6_mr_init()
is completely inversed. It should be the other way around.
Also a del_timer() is missing in the error path.

Signed-off-by: Benjamin Thery <benjamin.thery@bull.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-10 16:34:11 -08:00
Rémi Denis-Courmont
9b1582d451 Phonet: use net_device built-in stats for GPRS
Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-10 16:21:05 -08:00
Kay Sievers
fb28ad3590 net: struct device - replace bus_id with dev_name(), dev_set_name()
Acked-by: Marcel Holtmann <marcel@holtmann.org>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-10 13:55:14 -08:00
Ferenc Wagner
309f796f30 vlan: Fix typos in proc output string
Signed-off-by: Ferenc Wagner <wferi@niif.hu>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-10 13:37:40 -08:00
David S. Miller
2377989754 Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next-2.6 2008-11-10 13:24:44 -08:00
Luis R. Rodriguez
5166ccd220 cfg80211: Add kdoc for struct regulatory_request
As regulatory_request gets bigger there will be more questions
of what things means, so clarify documenation for it and
keep track of the special alpha2 codes we use internally
and on the userspace regulatory agents.

Signed-off-by: Luis R. Rodriguez <lrodriguez@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-10 15:17:41 -05:00
Luis R. Rodriguez
9c96477d10 cfg80211: Add regulatory domain intersection capability
There are certain scenerios where we require intersecting
two regulatory domains. This adds intersection support.
When we enable 802.11d support we will use this to intersect
the regulatory domain from the AP's country IE and what our
regulatory agent believes is correct for a country.

This patch enables intersection for now in the case where
the last regdomain was set by a country IE which was parsed
and the user then wants to set the regulatory domain. Since
we don't support country IE parsing yet this code path will not
be hit, however this allows us to pave the way for 11d support.

Intersection code has been tested in userspace with CRDA.

Signed-off-by: Luis R. Rodriguez <lrodriguez@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-10 15:17:41 -05:00
Luis R. Rodriguez
d71aaf6053 cfg80211: a reg rule is invalid if freq diff is 0
A regulatory rule is invalid when the frequency difference
between the end of the frequency range and the start is 0.

Signed-off-by: Luis R. Rodriguez <lrodriguez@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-10 15:17:41 -05:00
Jouni Malinen
fbf1892739 mac80211: Allow AP mode to be enabled
With the addition of basic rate set and TX queue parameter
configuration and confirmation that power save buffering is
working again, mac80211 is now in state that allows AP mode to be
used without major problems. Consequently, it is time to allow this
mode to be enabled without having to patch the kernel.

AP mode requires hostapd for management frame processing and as such,
configuring this mode is only allowed through cfg80211 (not with
iwconfig and WEXT).

Signed-off-by: Jouni Malinen <jouni.malinen@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-10 15:17:40 -05:00
Tomas Winkler
d61272cbb3 mac80211: fix basic rates setting from association response
In previous code all the rates were marked as basic.

Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-10 15:17:40 -05:00
Jouni Malinen
318884875b nl80211: Add TX queue parameter configuration
Add a new attribute, NL80211_ATTR_WIPHY_TXQ_PARAMS, that can be used with
NL80211_CMD_SET_WIPHY for userspace (e.g., hostapd) to set TX queue
parameters (txop, cwmin, cwmax, aifs).

Signed-off-by: Jouni Malinen <jouni.malinen@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-10 15:17:40 -05:00
Jouni Malinen
90c97a040d nl80211: Add basic rate configuration for AP mode
Add a new attribute, NL80211_ATTR_BSS_BASIC_RATES, that can be used with
NL80211_CMD_SET_BSS for userspace (e.g., hostapd) to set which rates are
in the basic rate set.

Signed-off-by: Jouni Malinen <jouni.malinen@atheros.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-10 15:17:39 -05:00
Johannes Berg
bd81525272 wireless: implement basic rate helper function
This adds a helper function that, given a bitmap of basic
rates and a bitrate returns the response rate for this rate.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-10 15:17:35 -05:00
Holger Schurig
2a941ecb51 wireless: fix two bad print_ssid conversions
This patch fixes two current compilation problems. They showed up
with CONFIG_IEEE80211_DEBUG defined.

Signed-off-by: Holger Schurig <hs4233@mail.mn-solutions.de>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-10 15:17:33 -05:00
Sujith
8469cdef1f mac80211: Add a new event in ieee80211_ampdu_mlme_action
Send a notification to the driver on succesful
reception of an ADDBA response, add IEEE80211_AMPDU_TX_RESUME
for this purpose.

Signed-off-by: Sujith <Sujith.Manoharan@atheros.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-10 15:17:32 -05:00
Johannes Berg
41bb73eeac mac80211: remove SSID driver code
Remove the SSID from the driver API since now there is no
driver that requires knowing the SSID and I think it's
unlikely that any hardware design that does require the
SSID will play well with mac80211.

This also removes support for setting the SSID in master
mode which will require a patch to hostapd to not try.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-10 15:11:56 -05:00
Johannes Berg
2df78167ad wireless: fix a few sparse warnings
Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-10 15:10:17 -05:00
Johannes Berg
1239cd58d2 wireless: move mesh config length constant
This is a constant from the 802.11 specification.

Signed-off-by: Johannes Berg <johannes@sipsolutions.net>
Cc: Javier Cardona <javier@cozybit.com>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-10 15:10:16 -05:00
Zhu Yi
97c8b013da mac80211: print reason code for deauth/dissoc frames
The patch prints reason code for deauth/dissoc frames to give users
more ideas what's happened for the disconnection.

Signed-off-by: Zhu Yi <yi.zhu@intel.com>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-10 15:10:16 -05:00
Miklos Szeredi
6209344f5a net: unix: fix inflight counting bug in garbage collector
Previously I assumed that the receive queues of candidates don't
change during the GC.  This is only half true, nothing can be received
from the queues (see comment in unix_gc()), but buffers could be added
through the other half of the socket pair, which may still have file
descriptors referring to it.

This can result in inc_inflight_move_tail() erronously increasing the
"inflight" counter for a unix socket for which dec_inflight() wasn't
previously called.  This in turn can trigger the "BUG_ON(total_refs <
inflight_refs)" in a later garbage collection run.

Fix this by only manipulating the "inflight" counter for sockets which
are candidates themselves.  Duplicating the file references in
unix_attach_fds() is also needed to prevent a socket becoming a
candidate for GC while the skb that contains it is not yet queued.

Reported-by: Andrea Bittau <a.bittau@cs.ucl.ac.uk>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-11-09 11:17:33 -08:00
Harvey Harrison
f574179b63 tipc: trivial endian annotation in debug statement
Use htonl rather than ntohl on a u32.
net/tipc/name_table.c:557:2: warning: cast to restricted __be32

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-07 23:37:50 -08:00
Thomas Graf
f400923735 pkt_sched: Control group classifier
The classifier should cover the most common use case and will work
without any special configuration.

The principle of the classifier is to directly access the
task_struct via get_current(). In order for this to work,
classification requests from softirqs must be ignored. This is
not a problem because the vast majority of packets in softirq
context are not assigned to a task anyway. For this to work, a
mechanism is needed to trace softirq context. 

This repost goes back to the method of relying on the number of
nested bh disable calls for the sake of not adding too much
complexity and the option to come up with something more reliable
if actually needed.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-07 22:56:00 -08:00
Eric W. Biederman
505d4f73dd net: Guaranetee the proper ordering of the loopback device. v2
I was recently hunting a bug that occurred in network namespace
cleanup.  In looking at the code it became apparrent that we have
and will continue to have cases where if we have anything going
on in a network namespace there will be assumptions that the
loopback device is present.   Things like sending igmp unsubscribe
messages when we bring down network devices invokes the routing
code which assumes that at least the loopback driver is present.

Therefore to avoid magic initcall ordering hackery that is hard
to follow and hard to get right insert a call to register the
loopback device directly from net_dev_init().    This guarantes
that the loopback device is the first device registered and
the last network device to go away.

But do it carefully so we register the loopback device after
we clear dev_boot_phase.

Signed-off-by: Eric W. Biederman <ebiederm@maxwell.aristanetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-07 22:54:20 -08:00
Eric W. Biederman
5d6d480908 net: fib_rules ordering fixes.
We need to setup the network namespace state before we register
the notifier.  Otherwise if a network device is already registered
we get a nasty NULL pointer dereference.

Signed-off-by: Eric W. Biederman <ebiederm@maxwell.aristanetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-07 22:52:34 -08:00
David S. Miller
3d8160b149 Revert "net: Guaranetee the proper ordering of the loopback device."
This reverts commit ae33bc40c0.
2008-11-07 22:52:14 -08:00
David S. Miller
167c6274c3 Merge branch 'davem-next' of master.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6 2008-11-07 01:37:16 -08:00
Harvey Harrison
5c7f033358 phonet: sparse annotations of protocol, remove forward declaration
net/phonet/af_phonet.c:38:36: error: marked inline, but without a definition
net/phonet/pep-gprs.c:63:10: warning: incorrect type in return expression (different base types)
net/phonet/pep-gprs.c:63:10:    expected int
net/phonet/pep-gprs.c:63:10:    got restricted __be16 [usertype] <noident>
net/phonet/pep-gprs.c:65:10: warning: incorrect type in return expression (different base types)
net/phonet/pep-gprs.c:65:10:    expected int
net/phonet/pep-gprs.c:65:10:    got restricted __be16 [usertype] <noident>
net/phonet/pep-gprs.c:124:16: warning: incorrect type in assignment (different base types)
net/phonet/pep-gprs.c:124:16:    expected restricted __be16 [usertype] protocol
net/phonet/pep-gprs.c:124:16:    got unsigned short [unsigned] [usertype] protocol

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-06 23:10:50 -08:00
Harvey Harrison
ca62059b7e ipvs: oldlen, newlen should be be16, not be32
Noticed by sparse:
net/netfilter/ipvs/ip_vs_proto_tcp.c:195:6: warning: incorrect type in argument 5 (different base types)
net/netfilter/ipvs/ip_vs_proto_tcp.c:195:6:    expected restricted __be16 [usertype] oldlen
net/netfilter/ipvs/ip_vs_proto_tcp.c:195:6:    got restricted __be32 [usertype] <noident>
net/netfilter/ipvs/ip_vs_proto_tcp.c:196:6: warning: incorrect type in argument 6 (different base types)
net/netfilter/ipvs/ip_vs_proto_tcp.c:196:6:    expected restricted __be16 [usertype] newlen
net/netfilter/ipvs/ip_vs_proto_tcp.c:196:6:    got restricted __be32 [usertype] <noident>
net/netfilter/ipvs/ip_vs_proto_tcp.c:270:6: warning: incorrect type in argument 5 (different base types)
net/netfilter/ipvs/ip_vs_proto_tcp.c:270:6:    expected restricted __be16 [usertype] oldlen
net/netfilter/ipvs/ip_vs_proto_tcp.c:270:6:    got restricted __be32 [usertype] <noident>
net/netfilter/ipvs/ip_vs_proto_tcp.c:271:6: warning: incorrect type in argument 6 (different base types)
net/netfilter/ipvs/ip_vs_proto_tcp.c:271:6:    expected restricted __be16 [usertype] newlen
net/netfilter/ipvs/ip_vs_proto_tcp.c:271:6:    got restricted __be32 [usertype] <noident>
net/netfilter/ipvs/ip_vs_proto_udp.c:206:6: warning: incorrect type in argument 5 (different base types)
net/netfilter/ipvs/ip_vs_proto_udp.c:206:6:    expected restricted __be16 [usertype] oldlen
net/netfilter/ipvs/ip_vs_proto_udp.c:206:6:    got restricted __be32 [usertype] <noident>
net/netfilter/ipvs/ip_vs_proto_udp.c:207:6: warning: incorrect type in argument 6 (different base types)
net/netfilter/ipvs/ip_vs_proto_udp.c:207:6:    expected restricted __be16 [usertype] newlen
net/netfilter/ipvs/ip_vs_proto_udp.c:207:6:    got restricted __be32 [usertype] <noident>
net/netfilter/ipvs/ip_vs_proto_udp.c:282:6: warning: incorrect type in argument 5 (different base types)
net/netfilter/ipvs/ip_vs_proto_udp.c:282:6:    expected restricted __be16 [usertype] oldlen
net/netfilter/ipvs/ip_vs_proto_udp.c:282:6:    got restricted __be32 [usertype] <noident>
net/netfilter/ipvs/ip_vs_proto_udp.c:283:6: warning: incorrect type in argument 6 (different base types)
net/netfilter/ipvs/ip_vs_proto_udp.c:283:6:    expected restricted __be16 [usertype] newlen
net/netfilter/ipvs/ip_vs_proto_udp.c:283:6:    got restricted __be32 [usertype] <noident>

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-06 23:09:56 -08:00
Alexey Dobriyan
70e90679ff af_key: mark policy as dead before destroying
xfrm_policy_destroy() will oops if not dead policy is passed to it.
On error path in pfkey_compile_policy() exactly this happens.

Oopsable for CAP_NET_ADMIN owners.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-06 23:08:37 -08:00
Alexey Dobriyan
76acfdb9b7 net: mark flow_cache_cpu_prepare() as __init
It's called from __init code only. And__devinit in generic networking code
is pretty strange :^)

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-06 23:06:44 -08:00
David S. Miller
9eeda9abd1 Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
Conflicts:

	drivers/net/wireless/ath5k/base.c
	net/8021q/vlan_core.c
2008-11-06 22:43:03 -08:00
Linus Torvalds
4bab0ea1d4 Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6:
  net: Fix recursive descent in __scm_destroy().
  iwl3945: fix deadlock on suspend
  iwl3945: do not send scan command if channel count zero
  iwl3945: clear scanning bits upon failure
  ath5k: correct handling of rx status fields
  zd1211rw: Add 2 device IDs
  Fix logic error in rfkill_check_duplicity
  iwlagn: avoid sleep in softirq context
  iwlwifi: clear scanning bits upon failure
  Revert "ath5k: honor FIF_BCN_PRBRESP_PROMISC in STA mode"
  tcp: Fix recvmsg MSG_PEEK influence of blocking behavior.
  netfilter: netns ct: walk netns list under RTNL
  ipv6: fix run pending DAD when interface becomes ready
  net/9p: fix printk format warnings
  net: fix packet socket delivery in rx irq handler
  xfrm: Have af-specific init_tempsel() initialize family field of temporary selector
2008-11-06 16:44:23 -08:00
David S. Miller
ca409d6e08 Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6 2008-11-06 15:52:00 -08:00
Linus Torvalds
39d4e58d36 Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ericvh/v9fs:
  net/9p: fix printk format warnings
  unsigned fid->fid cannot be negative
  9p: rdma: remove duplicated #include
  p9: Fix leak of waitqueue in request allocation path
  9p: Remove unneeded free of fcall for Flush
  9p: Make all client spin locks IRQ safe
  9p: rdma: Set trans prior to requesting async connection ops
2008-11-06 15:45:57 -08:00
David S. Miller
3b53fbf431 net: Fix recursive descent in __scm_destroy().
__scm_destroy() walks the list of file descriptors in the scm_fp_list
pointed to by the scm_cookie argument.

Those, in turn, can close sockets and invoke __scm_destroy() again.

There is nothing which limits how deeply this can occur.

The idea for how to fix this is from Linus.  Basically, we do all of
the fput()s at the top level by collecting all of the scm_fp_list
objects hit by an fput().  Inside of the initial __scm_destroy() we
keep running the list until it is empty.

Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-06 15:45:32 -08:00
David Miller
f8d570a474 net: Fix recursive descent in __scm_destroy().
__scm_destroy() walks the list of file descriptors in the scm_fp_list
pointed to by the scm_cookie argument.

Those, in turn, can close sockets and invoke __scm_destroy() again.

There is nothing which limits how deeply this can occur.

The idea for how to fix this is from Linus.  Basically, we do all of
the fput()s at the top level by collecting all of the scm_fp_list
objects hit by an fput().  Inside of the initial __scm_destroy() we
keep running the list until it is empty.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-11-06 13:51:50 -08:00
Jonathan McDowell
4a9d916717 Fix logic error in rfkill_check_duplicity
> I'll have a prod at why the [hso] rfkill stuff isn't working next

Ok, I believe this is due to the addition of rfkill_check_duplicity in
rfkill and the fact that test_bit actually returns a negative value
rather than the postive one expected (which is of course equally true).
So when the second WLAN device (the hso device, with the EEE PC WLAN
being the first) comes along rfkill_check_duplicity returns a negative
value and so rfkill_register returns an error. Patch below fixes this
for me.

Signed-Off-By: Jonathan McDowell <noodles@earth.li>
Acked-by: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Signed-off-by: John W. Linville <linville@tuxdriver.com>
2008-11-06 16:37:09 -05:00
Brian Haley
305d552acc bonding: send IPv6 neighbor advertisement on failover
This patch adds better IPv6 failover support for bonding devices,
especially when in active-backup mode and there are only IPv6 addresses
configured, as reported by Alex Sidorenko.

- Creates a new file, net/drivers/bonding/bond_ipv6.c, for the
   IPv6-specific routines.  Both regular bonds and VLANs over bonds
   are supported.

- Adds a new tunable, num_unsol_na, to limit the number of unsolicited
   IPv6 Neighbor Advertisements that are sent on a failover event.
   Default is 1.

- Creates two new IPv6 neighbor discovery functions:

   ndisc_build_skb()
   ndisc_send_skb()

   These were required to support VLANs since we have to be able to
   add the VLAN id to the skb since ndisc_send_na() and friends
   shouldn't be asked to do this.  These two routines are basically
   __ndisc_send() split into two pieces, in a slightly different order.

- Updates Documentation/networking/bonding.txt and bumps the rev of bond
   support to 3.4.0.

On failover, this new code will generate one packet:

- An unsolicited IPv6 Neighbor Advertisement, which helps the switch
   learn that the address has moved to the new slave.

Testing has shown that sending just the NA results in pretty good
behavior when in active-back mode, I saw no lost ping packets for example.

Signed-off-by: Brian Haley <brian.haley@hp.com>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
2008-11-06 00:49:37 -05:00
Eric W. Biederman
0a36b345ab net: Don't leak packets when a netns is going down
I have been tracking for a while a case where when the
network namespace exits the cleanup gets stck in an
endless precessess of:

unregister_netdevice: waiting for lo to become free. Usage count = 3
unregister_netdevice: waiting for lo to become free. Usage count = 3
unregister_netdevice: waiting for lo to become free. Usage count = 3
unregister_netdevice: waiting for lo to become free. Usage count = 3
unregister_netdevice: waiting for lo to become free. Usage count = 3
unregister_netdevice: waiting for lo to become free. Usage count = 3
unregister_netdevice: waiting for lo to become free. Usage count = 3

It turns out that if you listen on a multicast address an unsubscribe
packet is sent when the network device goes down.   If you shutdown
the network namespace without carefully cleaning up this can trigger
the unsubscribe packet to be sent over the loopback interface while
the network namespace is going down.

All of which is fine except when we drop the packet and forget to
free it leaking the skb and the dst entry attached to.  As it
turns out the dst entry hold a reference to the idev which holds
the dev and keeps everything from being cleaned up.  Yuck!

By fixing my earlier thinko and add the needed kfree_skb and everything
cleans up beautifully. 

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-05 16:00:24 -08:00
Eric W. Biederman
ae33bc40c0 net: Guaranetee the proper ordering of the loopback device.
I was recently hunting a bug that occurred in network namespace
cleanup.  In looking at the code it became apparrent that we have
and will continue to have cases where if we have anything going
on in a network namespace there will be assumptions that the
loopback device is present.   Things like sending igmp unsubscribe
messages when we bring down network devices invokes the routing
code which assumes that at least the loopback driver is present.

Therefore to avoid magic initcall ordering hackery that is hard
to follow and hard to get right insert a call to register the
loopback device directly from net_dev_init().    This guarantes
that the loopback device is the first device registered and
the last network device to go away.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-05 16:00:02 -08:00
Eric W. Biederman
d0c082cea6 netns: Delete virtual interfaces during namespace cleanup
When physical devices are inside of network namespace and that
network namespace terminates we can not make them go away.  We
have to keep them and moving them to the initial network namespace
is the best we can do.

For virtual devices left in a network namespace that is exiting
we have no need to preserve them and we now have the infrastructure
that allows us to delete them.  So delete virtual devices when we
exit a network namespace.  Keeping the necessary user space clean up
after a network namespace exits much more tractable.

Acked-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Acked-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-11-05 15:59:38 -08:00