Merge branch 'for-linus' of git://linux-nfs.org/~bfields/linux
* 'for-linus' of git://linux-nfs.org/~bfields/linux: (100 commits)
SUNRPC: RPC program information is stored in unsigned integers
SUNRPC: Move exported symbol definitions after function declaration part 2
NLM: tear down RPC clients in nlm_shutdown_hosts
SUNRPC: spin svc_rqst initialization to its own function
nfsd: more careful input validation in nfsctl write methods
lockd: minor log message fix
knfsd: don't bother mapping putrootfh enoent to eperm
rdma: makefile
rdma: ONCRPC RDMA protocol marshalling
rdma: SVCRDMA sendto
rdma: SVCRDMA recvfrom
rdma: SVCRDMA Core Transport Services
rdma: SVCRDMA Transport Module
rdma: SVCRDMA Header File
svc: Add svc_xprt_names service to replace svc_sock_names
knfsd: Support adding transports by writing portlist file
svc: Add svc API that queries for a transport instance
svc: Add /proc/sys/sunrpc/transport files
svc: Add transport hdr size for defer/revisit
svc: Move the xprt independent code to the svc_xprt.c file
...
diff --git a/MAINTAINERS b/MAINTAINERS
index 91082e6..6cae137 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2247,7 +2247,7 @@
M: bfields@fieldses.org
P: Neil Brown
M: neilb@suse.de
-L: nfs@lists.sourceforge.net
+L: linux-nfs@vger.kernel.org
W: http://nfs.sourceforge.net/
S: Supported
diff --git a/fs/Kconfig b/fs/Kconfig
index 219ec06..987b5d7 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1674,6 +1674,8 @@
select CRYPTO_MD5 if NFSD_V4
select CRYPTO if NFSD_V4
select FS_POSIX_ACL if NFSD_V4
+ select PROC_FS if NFSD_V4
+ select PROC_FS if SUNRPC_GSS
help
If you want your Linux box to act as an NFS *server*, so that other
computers on your local network which support NFS can access certain
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 572601e9..ca6b16f 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -34,10 +34,10 @@
static void nlm_gc_hosts(void);
static struct nsm_handle * __nsm_find(const struct sockaddr_in *,
- const char *, int, int);
+ const char *, unsigned int, int);
static struct nsm_handle * nsm_find(const struct sockaddr_in *sin,
const char *hostname,
- int hostname_len);
+ unsigned int hostname_len);
/*
* Common host lookup routine for server & client
@@ -45,7 +45,8 @@
static struct nlm_host *
nlm_lookup_host(int server, const struct sockaddr_in *sin,
int proto, int version, const char *hostname,
- int hostname_len, const struct sockaddr_in *ssin)
+ unsigned int hostname_len,
+ const struct sockaddr_in *ssin)
{
struct hlist_head *chain;
struct hlist_node *pos;
@@ -176,7 +177,7 @@
*/
struct nlm_host *
nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
- const char *hostname, int hostname_len)
+ const char *hostname, unsigned int hostname_len)
{
struct sockaddr_in ssin = {0};
@@ -189,7 +190,7 @@
*/
struct nlm_host *
nlmsvc_lookup_host(struct svc_rqst *rqstp,
- const char *hostname, int hostname_len)
+ const char *hostname, unsigned int hostname_len)
{
struct sockaddr_in ssin = {0};
@@ -307,7 +308,8 @@
* Release all resources held by that peer.
*/
void nlm_host_rebooted(const struct sockaddr_in *sin,
- const char *hostname, int hostname_len,
+ const char *hostname,
+ unsigned int hostname_len,
u32 new_state)
{
struct hlist_head *chain;
@@ -377,8 +379,13 @@
/* First, make all hosts eligible for gc */
dprintk("lockd: nuking all hosts...\n");
for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
- hlist_for_each_entry(host, pos, chain, h_hash)
+ hlist_for_each_entry(host, pos, chain, h_hash) {
host->h_expires = jiffies - 1;
+ if (host->h_rpcclnt) {
+ rpc_shutdown_client(host->h_rpcclnt);
+ host->h_rpcclnt = NULL;
+ }
+ }
}
/* Then, perform a garbage collection pass */
@@ -449,7 +456,7 @@
static struct nsm_handle *
__nsm_find(const struct sockaddr_in *sin,
- const char *hostname, int hostname_len,
+ const char *hostname, unsigned int hostname_len,
int create)
{
struct nsm_handle *nsm = NULL;
@@ -503,7 +510,8 @@
}
static struct nsm_handle *
-nsm_find(const struct sockaddr_in *sin, const char *hostname, int hostname_len)
+nsm_find(const struct sockaddr_in *sin, const char *hostname,
+ unsigned int hostname_len)
{
return __nsm_find(sin, hostname, hostname_len, 1);
}
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 82e2192..0822646 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -219,19 +219,6 @@
module_put_and_exit(0);
}
-
-static int find_socket(struct svc_serv *serv, int proto)
-{
- struct svc_sock *svsk;
- int found = 0;
- list_for_each_entry(svsk, &serv->sv_permsocks, sk_list)
- if (svsk->sk_sk->sk_protocol == proto) {
- found = 1;
- break;
- }
- return found;
-}
-
/*
* Make any sockets that are needed but not present.
* If nlm_udpport or nlm_tcpport were set as module
@@ -240,17 +227,25 @@
static int make_socks(struct svc_serv *serv, int proto)
{
static int warned;
+ struct svc_xprt *xprt;
int err = 0;
- if (proto == IPPROTO_UDP || nlm_udpport)
- if (!find_socket(serv, IPPROTO_UDP))
- err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport,
- SVC_SOCK_DEFAULTS);
- if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport))
- if (!find_socket(serv, IPPROTO_TCP))
- err = svc_makesock(serv, IPPROTO_TCP, nlm_tcpport,
- SVC_SOCK_DEFAULTS);
-
+ if (proto == IPPROTO_UDP || nlm_udpport) {
+ xprt = svc_find_xprt(serv, "udp", 0, 0);
+ if (!xprt)
+ err = svc_create_xprt(serv, "udp", nlm_udpport,
+ SVC_SOCK_DEFAULTS);
+ else
+ svc_xprt_put(xprt);
+ }
+ if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) {
+ xprt = svc_find_xprt(serv, "tcp", 0, 0);
+ if (!xprt)
+ err = svc_create_xprt(serv, "tcp", nlm_tcpport,
+ SVC_SOCK_DEFAULTS);
+ else
+ svc_xprt_put(xprt);
+ }
if (err >= 0) {
warned = 0;
err = 0;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index bf27b6c..385437e 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -84,6 +84,7 @@
{
struct nlm_host *host;
struct nlm_file *file;
+ int rc = rpc_success;
dprintk("lockd: TEST4 called\n");
resp->cookie = argp->cookie;
@@ -91,7 +92,7 @@
/* Don't accept test requests during grace period */
if (nlmsvc_grace_period) {
resp->status = nlm_lck_denied_grace_period;
- return rpc_success;
+ return rc;
}
/* Obtain client and file */
@@ -101,12 +102,13 @@
/* Now check for conflicting locks */
resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie);
if (resp->status == nlm_drop_reply)
- return rpc_drop_reply;
+ rc = rpc_drop_reply;
+ else
+ dprintk("lockd: TEST4 status %d\n", ntohl(resp->status));
- dprintk("lockd: TEST4 status %d\n", ntohl(resp->status));
nlm_release_host(host);
nlm_release_file(file);
- return rpc_success;
+ return rc;
}
static __be32
@@ -115,6 +117,7 @@
{
struct nlm_host *host;
struct nlm_file *file;
+ int rc = rpc_success;
dprintk("lockd: LOCK called\n");
@@ -123,7 +126,7 @@
/* Don't accept new lock requests during grace period */
if (nlmsvc_grace_period && !argp->reclaim) {
resp->status = nlm_lck_denied_grace_period;
- return rpc_success;
+ return rc;
}
/* Obtain client and file */
@@ -146,12 +149,13 @@
resp->status = nlmsvc_lock(rqstp, file, &argp->lock,
argp->block, &argp->cookie);
if (resp->status == nlm_drop_reply)
- return rpc_drop_reply;
+ rc = rpc_drop_reply;
+ else
+ dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
- dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
nlm_release_host(host);
nlm_release_file(file);
- return rpc_success;
+ return rc;
}
static __be32
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d120ec3..2f4d8fa 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -501,25 +501,29 @@
block, block->b_flags, block->b_fl);
if (block->b_flags & B_TIMED_OUT) {
nlmsvc_unlink_block(block);
- return nlm_lck_denied;
+ ret = nlm_lck_denied;
+ goto out;
}
if (block->b_flags & B_GOT_CALLBACK) {
+ nlmsvc_unlink_block(block);
if (block->b_fl != NULL
&& block->b_fl->fl_type != F_UNLCK) {
lock->fl = *block->b_fl;
goto conf_lock;
- }
- else {
- nlmsvc_unlink_block(block);
- return nlm_granted;
+ } else {
+ ret = nlm_granted;
+ goto out;
}
}
- return nlm_drop_reply;
+ ret = nlm_drop_reply;
+ goto out;
}
error = vfs_test_lock(file->f_file, &lock->fl);
- if (error == -EINPROGRESS)
- return nlmsvc_defer_lock_rqst(rqstp, block);
+ if (error == -EINPROGRESS) {
+ ret = nlmsvc_defer_lock_rqst(rqstp, block);
+ goto out;
+ }
if (error) {
ret = nlm_lck_denied_nolocks;
goto out;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 9cd5c8b..88379cc 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -113,6 +113,7 @@
{
struct nlm_host *host;
struct nlm_file *file;
+ int rc = rpc_success;
dprintk("lockd: TEST called\n");
resp->cookie = argp->cookie;
@@ -120,7 +121,7 @@
/* Don't accept test requests during grace period */
if (nlmsvc_grace_period) {
resp->status = nlm_lck_denied_grace_period;
- return rpc_success;
+ return rc;
}
/* Obtain client and file */
@@ -130,13 +131,14 @@
/* Now check for conflicting locks */
resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie));
if (resp->status == nlm_drop_reply)
- return rpc_drop_reply;
+ rc = rpc_drop_reply;
+ else
+ dprintk("lockd: TEST status %d vers %d\n",
+ ntohl(resp->status), rqstp->rq_vers);
- dprintk("lockd: TEST status %d vers %d\n",
- ntohl(resp->status), rqstp->rq_vers);
nlm_release_host(host);
nlm_release_file(file);
- return rpc_success;
+ return rc;
}
static __be32
@@ -145,6 +147,7 @@
{
struct nlm_host *host;
struct nlm_file *file;
+ int rc = rpc_success;
dprintk("lockd: LOCK called\n");
@@ -153,7 +156,7 @@
/* Don't accept new lock requests during grace period */
if (nlmsvc_grace_period && !argp->reclaim) {
resp->status = nlm_lck_denied_grace_period;
- return rpc_success;
+ return rc;
}
/* Obtain client and file */
@@ -176,12 +179,13 @@
resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock,
argp->block, &argp->cookie));
if (resp->status == nlm_drop_reply)
- return rpc_drop_reply;
+ rc = rpc_drop_reply;
+ else
+ dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
- dprintk("lockd: LOCK status %d\n", ntohl(resp->status));
nlm_release_host(host);
nlm_release_file(file);
- return rpc_success;
+ return rc;
}
static __be32
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 84ebba3..dbbefbc 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -87,7 +87,7 @@
unsigned int hash;
__be32 nfserr;
- nlm_debug_print_fh("nlm_file_lookup", f);
+ nlm_debug_print_fh("nlm_lookup_file", f);
hash = file_hash(f);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 9b6bbf1..bd185a5 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -119,8 +119,8 @@
if (!serv)
goto out_err;
- ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport,
- SVC_SOCK_ANONYMOUS);
+ ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport,
+ SVC_SOCK_ANONYMOUS);
if (ret <= 0)
goto out_destroy;
nfs_callback_tcpport = ret;
diff --git a/include/linux/nfsd/auth.h b/fs/nfsd/auth.h
similarity index 87%
rename from include/linux/nfsd/auth.h
rename to fs/nfsd/auth.h
index 0fb9f72..78b3c0e 100644
--- a/include/linux/nfsd/auth.h
+++ b/fs/nfsd/auth.h
@@ -1,6 +1,4 @@
/*
- * include/linux/nfsd/auth.h
- *
* nfsd-specific authentication stuff.
* uid/gid mapping not yet implemented.
*
@@ -10,8 +8,6 @@
#ifndef LINUX_NFSD_AUTH_H
#define LINUX_NFSD_AUTH_H
-#ifdef __KERNEL__
-
#define nfsd_luid(rq, uid) ((u32)(uid))
#define nfsd_lgid(rq, gid) ((u32)(gid))
#define nfsd_ruid(rq, uid) ((u32)(uid))
@@ -23,5 +19,4 @@
*/
int nfsd_setuser(struct svc_rqst *, struct svc_export *);
-#endif /* __KERNEL__ */
#endif /* LINUX_NFSD_AUTH_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 66d0aeb..79b4bf8 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1357,8 +1357,6 @@
mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
- if (PTR_ERR(exp) == -ENOENT)
- return nfserr_perm;
if (IS_ERR(exp))
return nfserrno(PTR_ERR(exp));
rv = fh_compose(fhp, exp, exp->ex_dentry, NULL);
@@ -1637,13 +1635,19 @@
/*
* Initialize the exports module.
*/
-void
+int
nfsd_export_init(void)
{
+ int rv;
dprintk("nfsd: initializing export module.\n");
- cache_register(&svc_export_cache);
- cache_register(&svc_expkey_cache);
+ rv = cache_register(&svc_export_cache);
+ if (rv)
+ return rv;
+ rv = cache_register(&svc_expkey_cache);
+ if (rv)
+ cache_unregister(&svc_export_cache);
+ return rv;
}
@@ -1670,10 +1674,8 @@
exp_writelock();
- if (cache_unregister(&svc_expkey_cache))
- printk(KERN_ERR "nfsd: failed to unregister expkey cache\n");
- if (cache_unregister(&svc_export_cache))
- printk(KERN_ERR "nfsd: failed to unregister export cache\n");
+ cache_unregister(&svc_expkey_cache);
+ cache_unregister(&svc_export_cache);
svcauth_unix_purge();
exp_writeunlock();
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 0e5fa11..1c3b765 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -221,12 +221,17 @@
struct nfsd3_getaclres *resp)
{
struct dentry *dentry = resp->fh.fh_dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode;
struct kvec *head = rqstp->rq_res.head;
unsigned int base;
int n;
int w;
+ /*
+ * Since this is version 2, the check for nfserr in
+ * nfsd_dispatch actually ensures the following cannot happen.
+ * However, it seems fragile to depend on that.
+ */
if (dentry == NULL || dentry->d_inode == NULL)
return 0;
inode = dentry->d_inode;
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f917fd2..d7647f7 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -21,6 +21,7 @@
#include <linux/sunrpc/svc.h>
#include <linux/nfsd/nfsd.h>
#include <linux/nfsd/xdr3.h>
+#include "auth.h"
#define NFSDDBG_FACILITY NFSDDBG_XDR
@@ -88,10 +89,10 @@
* no slashes or null bytes.
*/
static __be32 *
-decode_filename(__be32 *p, char **namp, int *lenp)
+decode_filename(__be32 *p, char **namp, unsigned int *lenp)
{
char *name;
- int i;
+ unsigned int i;
if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) {
for (i = 0, name = *namp; i < *lenp; i++, name++) {
@@ -452,8 +453,7 @@
nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_symlinkargs *args)
{
- unsigned int len;
- int avail;
+ unsigned int len, avail;
char *old, *new;
struct kvec *vec;
@@ -486,7 +486,8 @@
/* now copy next page if there is one */
if (len && !avail && rqstp->rq_arg.page_len) {
avail = rqstp->rq_arg.page_len;
- if (avail > PAGE_SIZE) avail = PAGE_SIZE;
+ if (avail > PAGE_SIZE)
+ avail = PAGE_SIZE;
old = page_address(rqstp->rq_arg.pages[0]);
}
while (len && avail && *old) {
@@ -816,11 +817,11 @@
encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
struct svc_fh *fhp)
{
- p = encode_post_op_attr(cd->rqstp, p, fhp);
- *p++ = xdr_one; /* yes, a file handle follows */
- p = encode_fh(p, fhp);
- fh_put(fhp);
- return p;
+ p = encode_post_op_attr(cd->rqstp, p, fhp);
+ *p++ = xdr_one; /* yes, a file handle follows */
+ p = encode_fh(p, fhp);
+ fh_put(fhp);
+ return p;
}
static int
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 9d536a8..aae2b29 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -350,30 +350,6 @@
static int do_probe_callback(void *data)
{
struct nfs4_client *clp = data;
- struct nfs4_callback *cb = &clp->cl_callback;
- struct rpc_message msg = {
- .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
- .rpc_argp = clp,
- };
- int status;
-
- status = rpc_call_sync(cb->cb_client, &msg, RPC_TASK_SOFT);
-
- if (status) {
- rpc_shutdown_client(cb->cb_client);
- cb->cb_client = NULL;
- } else
- atomic_set(&cb->cb_set, 1);
- put_nfs4_client(clp);
- return 0;
-}
-
-/*
- * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
- */
-void
-nfsd4_probe_callback(struct nfs4_client *clp)
-{
struct sockaddr_in addr;
struct nfs4_callback *cb = &clp->cl_callback;
struct rpc_timeout timeparms = {
@@ -390,13 +366,15 @@
.timeout = &timeparms,
.program = program,
.version = nfs_cb_version[1]->number,
- .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
+ .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
.flags = (RPC_CLNT_CREATE_NOPING),
};
- struct task_struct *t;
-
- if (atomic_read(&cb->cb_set))
- return;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
+ .rpc_argp = clp,
+ };
+ struct rpc_clnt *client;
+ int status;
/* Initialize address */
memset(&addr, 0, sizeof(addr));
@@ -416,29 +394,50 @@
program->stats->program = program;
/* Create RPC client */
- cb->cb_client = rpc_create(&args);
- if (IS_ERR(cb->cb_client)) {
+ client = rpc_create(&args);
+ if (IS_ERR(client)) {
dprintk("NFSD: couldn't create callback client\n");
+ status = PTR_ERR(client);
goto out_err;
}
+ status = rpc_call_sync(client, &msg, RPC_TASK_SOFT);
+
+ if (status)
+ goto out_release_client;
+
+ cb->cb_client = client;
+ atomic_set(&cb->cb_set, 1);
+ put_nfs4_client(clp);
+ return 0;
+out_release_client:
+ rpc_shutdown_client(client);
+out_err:
+ put_nfs4_client(clp);
+ dprintk("NFSD: warning: no callback path to client %.*s\n",
+ (int)clp->cl_name.len, clp->cl_name.data);
+ return status;
+}
+
+/*
+ * Set up the callback client and put a NFSPROC4_CB_NULL on the wire...
+ */
+void
+nfsd4_probe_callback(struct nfs4_client *clp)
+{
+ struct task_struct *t;
+
+ BUG_ON(atomic_read(&clp->cl_callback.cb_set));
+
/* the task holds a reference to the nfs4_client struct */
atomic_inc(&clp->cl_count);
t = kthread_run(do_probe_callback, clp, "nfs4_cb_probe");
if (IS_ERR(t))
- goto out_release_clp;
+ atomic_dec(&clp->cl_count);
return;
-
-out_release_clp:
- atomic_dec(&clp->cl_count);
- rpc_shutdown_client(cb->cb_client);
-out_err:
- cb->cb_client = NULL;
- dprintk("NFSD: warning: no callback path to client %.*s\n",
- (int)clp->cl_name.len, clp->cl_name.data);
}
/*
@@ -458,9 +457,6 @@
int retries = 1;
int status = 0;
- if ((!atomic_read(&clp->cl_callback.cb_set)) || !clnt)
- return;
-
cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */
cbr->cbr_dp = dp;
@@ -469,6 +465,7 @@
switch (status) {
case -EIO:
/* Network partition? */
+ atomic_set(&clp->cl_callback.cb_set, 0);
case -EBADHANDLE:
case -NFS4ERR_BAD_STATEID:
/* Race: client probably got cb_recall
@@ -481,11 +478,10 @@
status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT);
}
out_put_cred:
- if (status == -EIO)
- atomic_set(&clp->cl_callback.cb_set, 0);
- /* Success or failure, now we're either waiting for lease expiration
- * or deleg_return. */
- dprintk("NFSD: nfs4_cb_recall: dp %p dl_flock %p dl_count %d\n",dp, dp->dl_flock, atomic_read(&dp->dl_count));
+ /*
+ * Success or failure, now we're either waiting for lease expiration
+ * or deleg_return.
+ */
put_nfs4_client(clp);
nfs4_put_delegation(dp);
return;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 4c0c683..996bd88 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -255,13 +255,10 @@
goto out;
if (len == 0)
set_bit(CACHE_NEGATIVE, &ent.h.flags);
- else {
- if (error >= IDMAP_NAMESZ) {
- error = -EINVAL;
- goto out;
- }
+ else if (len >= IDMAP_NAMESZ)
+ goto out;
+ else
memcpy(ent.name, buf1, sizeof(ent.name));
- }
error = -ENOMEM;
res = idtoname_update(&ent, res);
if (res == NULL)
@@ -467,20 +464,25 @@
* Exported API
*/
-void
+int
nfsd_idmap_init(void)
{
- cache_register(&idtoname_cache);
- cache_register(&nametoid_cache);
+ int rv;
+
+ rv = cache_register(&idtoname_cache);
+ if (rv)
+ return rv;
+ rv = cache_register(&nametoid_cache);
+ if (rv)
+ cache_unregister(&idtoname_cache);
+ return rv;
}
void
nfsd_idmap_shutdown(void)
{
- if (cache_unregister(&idtoname_cache))
- printk(KERN_ERR "nfsd: failed to unregister idtoname cache\n");
- if (cache_unregister(&nametoid_cache))
- printk(KERN_ERR "nfsd: failed to unregister nametoid cache\n");
+ cache_unregister(&idtoname_cache);
+ cache_unregister(&nametoid_cache);
}
/*
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 18ead17..c593db0 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -750,7 +750,7 @@
cstate->current_fh.fh_export,
cstate->current_fh.fh_dentry, buf,
&count, verify->ve_bmval,
- rqstp);
+ rqstp, 0);
/* this means that nfsd4_encode_fattr() ran out of space */
if (status == nfserr_resource && count == 0)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 31673cd..f6744bc 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -61,7 +61,6 @@
static time_t user_lease_time = 90;
static time_t boot_time;
static int in_grace = 1;
-static u32 current_clientid = 1;
static u32 current_ownerid = 1;
static u32 current_fileid = 1;
static u32 current_delegid = 1;
@@ -340,21 +339,20 @@
* This type of memory management is somewhat inefficient, but we use it
* anyway since SETCLIENTID is not a common operation.
*/
-static inline struct nfs4_client *
-alloc_client(struct xdr_netobj name)
+static struct nfs4_client *alloc_client(struct xdr_netobj name)
{
struct nfs4_client *clp;
- if ((clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL))!= NULL) {
- if ((clp->cl_name.data = kmalloc(name.len, GFP_KERNEL)) != NULL) {
- memcpy(clp->cl_name.data, name.data, name.len);
- clp->cl_name.len = name.len;
- }
- else {
- kfree(clp);
- clp = NULL;
- }
+ clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
+ if (clp == NULL)
+ return NULL;
+ clp->cl_name.data = kmalloc(name.len, GFP_KERNEL);
+ if (clp->cl_name.data == NULL) {
+ kfree(clp);
+ return NULL;
}
+ memcpy(clp->cl_name.data, name.data, name.len);
+ clp->cl_name.len = name.len;
return clp;
}
@@ -363,8 +361,11 @@
{
struct rpc_clnt *clnt = clp->cl_callback.cb_client;
- /* shutdown rpc client, ending any outstanding recall rpcs */
if (clnt) {
+ /*
+ * Callback threads take a reference on the client, so there
+ * should be no outstanding callbacks at this point.
+ */
clp->cl_callback.cb_client = NULL;
rpc_shutdown_client(clnt);
}
@@ -422,12 +423,13 @@
put_nfs4_client(clp);
}
-static struct nfs4_client *
-create_client(struct xdr_netobj name, char *recdir) {
+static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
+{
struct nfs4_client *clp;
- if (!(clp = alloc_client(name)))
- goto out;
+ clp = alloc_client(name);
+ if (clp == NULL)
+ return NULL;
memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
atomic_set(&clp->cl_count, 1);
atomic_set(&clp->cl_callback.cb_set, 0);
@@ -436,32 +438,30 @@
INIT_LIST_HEAD(&clp->cl_openowners);
INIT_LIST_HEAD(&clp->cl_delegations);
INIT_LIST_HEAD(&clp->cl_lru);
-out:
return clp;
}
-static void
-copy_verf(struct nfs4_client *target, nfs4_verifier *source) {
- memcpy(target->cl_verifier.data, source->data, sizeof(target->cl_verifier.data));
+static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
+{
+ memcpy(target->cl_verifier.data, source->data,
+ sizeof(target->cl_verifier.data));
}
-static void
-copy_clid(struct nfs4_client *target, struct nfs4_client *source) {
+static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
+{
target->cl_clientid.cl_boot = source->cl_clientid.cl_boot;
target->cl_clientid.cl_id = source->cl_clientid.cl_id;
}
-static void
-copy_cred(struct svc_cred *target, struct svc_cred *source) {
-
+static void copy_cred(struct svc_cred *target, struct svc_cred *source)
+{
target->cr_uid = source->cr_uid;
target->cr_gid = source->cr_gid;
target->cr_group_info = source->cr_group_info;
get_group_info(target->cr_group_info);
}
-static inline int
-same_name(const char *n1, const char *n2)
+static int same_name(const char *n1, const char *n2)
{
return 0 == memcmp(n1, n2, HEXDIR_LEN);
}
@@ -485,26 +485,26 @@
return cr1->cr_uid == cr2->cr_uid;
}
-static void
-gen_clid(struct nfs4_client *clp) {
+static void gen_clid(struct nfs4_client *clp)
+{
+ static u32 current_clientid = 1;
+
clp->cl_clientid.cl_boot = boot_time;
clp->cl_clientid.cl_id = current_clientid++;
}
-static void
-gen_confirm(struct nfs4_client *clp) {
- struct timespec tv;
- u32 * p;
+static void gen_confirm(struct nfs4_client *clp)
+{
+ static u32 i;
+ u32 *p;
- tv = CURRENT_TIME;
p = (u32 *)clp->cl_confirm.data;
- *p++ = tv.tv_sec;
- *p++ = tv.tv_nsec;
+ *p++ = get_seconds();
+ *p++ = i++;
}
-static int
-check_name(struct xdr_netobj name) {
-
+static int check_name(struct xdr_netobj name)
+{
if (name.len == 0)
return 0;
if (name.len > NFS4_OPAQUE_LIMIT) {
@@ -683,39 +683,6 @@
return;
}
-/*
- * RFC 3010 has a complex implmentation description of processing a
- * SETCLIENTID request consisting of 5 bullets, labeled as
- * CASE0 - CASE4 below.
- *
- * NOTES:
- * callback information will be processed in a future patch
- *
- * an unconfirmed record is added when:
- * NORMAL (part of CASE 4): there is no confirmed nor unconfirmed record.
- * CASE 1: confirmed record found with matching name, principal,
- * verifier, and clientid.
- * CASE 2: confirmed record found with matching name, principal,
- * and there is no unconfirmed record with matching
- * name and principal
- *
- * an unconfirmed record is replaced when:
- * CASE 3: confirmed record found with matching name, principal,
- * and an unconfirmed record is found with matching
- * name, principal, and with clientid and
- * confirm that does not match the confirmed record.
- * CASE 4: there is no confirmed record with matching name and
- * principal. there is an unconfirmed record with
- * matching name, principal.
- *
- * an unconfirmed record is deleted when:
- * CASE 1: an unconfirmed record that matches input name, verifier,
- * and confirmed clientid.
- * CASE 4: any unconfirmed records with matching name and principal
- * that exist after an unconfirmed record has been replaced
- * as described above.
- *
- */
__be32
nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_setclientid *setclid)
@@ -748,11 +715,7 @@
nfs4_lock_state();
conf = find_confirmed_client_by_str(dname, strhashval);
if (conf) {
- /*
- * CASE 0:
- * clname match, confirmed, different principal
- * or different ip_address
- */
+ /* RFC 3530 14.2.33 CASE 0: */
status = nfserr_clid_inuse;
if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)
|| conf->cl_addr != sin->sin_addr.s_addr) {
@@ -761,12 +724,17 @@
goto out;
}
}
+ /*
+ * section 14.2.33 of RFC 3530 (under the heading "IMPLEMENTATION")
+ * has a description of SETCLIENTID request processing consisting
+ * of 5 bullet points, labeled as CASE0 - CASE4 below.
+ */
unconf = find_unconfirmed_client_by_str(dname, strhashval);
status = nfserr_resource;
if (!conf) {
- /*
- * CASE 4:
- * placed first, because it is the normal case.
+ /*
+ * RFC 3530 14.2.33 CASE 4:
+ * placed first, because it is the normal case
*/
if (unconf)
expire_client(unconf);
@@ -776,17 +744,8 @@
gen_clid(new);
} else if (same_verf(&conf->cl_verifier, &clverifier)) {
/*
- * CASE 1:
- * cl_name match, confirmed, principal match
- * verifier match: probable callback update
- *
- * remove any unconfirmed nfs4_client with
- * matching cl_name, cl_verifier, and cl_clientid
- *
- * create and insert an unconfirmed nfs4_client with same
- * cl_name, cl_verifier, and cl_clientid as existing
- * nfs4_client, but with the new callback info and a
- * new cl_confirm
+ * RFC 3530 14.2.33 CASE 1:
+ * probable callback update
*/
if (unconf) {
/* Note this is removing unconfirmed {*x***},
@@ -802,43 +761,25 @@
copy_clid(new, conf);
} else if (!unconf) {
/*
- * CASE 2:
- * clname match, confirmed, principal match
- * verfier does not match
- * no unconfirmed. create a new unconfirmed nfs4_client
- * using input clverifier, clname, and callback info
- * and generate a new cl_clientid and cl_confirm.
+ * RFC 3530 14.2.33 CASE 2:
+ * probable client reboot; state will be removed if
+ * confirmed.
*/
new = create_client(clname, dname);
if (new == NULL)
goto out;
gen_clid(new);
- } else if (!same_verf(&conf->cl_confirm, &unconf->cl_confirm)) {
- /*
- * CASE3:
- * confirmed found (name, principal match)
- * confirmed verifier does not match input clverifier
- *
- * unconfirmed found (name match)
- * confirmed->cl_confirm != unconfirmed->cl_confirm
- *
- * remove unconfirmed.
- *
- * create an unconfirmed nfs4_client
- * with same cl_name as existing confirmed nfs4_client,
- * but with new callback info, new cl_clientid,
- * new cl_verifier and a new cl_confirm
+ } else {
+ /*
+ * RFC 3530 14.2.33 CASE 3:
+ * probable client reboot; state will be removed if
+ * confirmed.
*/
expire_client(unconf);
new = create_client(clname, dname);
if (new == NULL)
goto out;
gen_clid(new);
- } else {
- /* No cases hit !!! */
- status = nfserr_inval;
- goto out;
-
}
copy_verf(new, &clverifier);
new->cl_addr = sin->sin_addr.s_addr;
@@ -857,11 +798,9 @@
/*
- * RFC 3010 has a complex implmentation description of processing a
- * SETCLIENTID_CONFIRM request consisting of 4 bullets describing
- * processing on a DRC miss, labeled as CASE1 - CASE4 below.
- *
- * NOTE: callback information will be processed here in a future patch
+ * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has
+ * a description of SETCLIENTID_CONFIRM request processing consisting of 4
+ * bullets, labeled as CASE1 - CASE4 below.
*/
__be32
nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
@@ -892,16 +831,16 @@
if (unconf && unconf->cl_addr != sin->sin_addr.s_addr)
goto out;
- if ((conf && unconf) &&
- (same_verf(&unconf->cl_confirm, &confirm)) &&
- (same_verf(&conf->cl_verifier, &unconf->cl_verifier)) &&
- (same_name(conf->cl_recdir,unconf->cl_recdir)) &&
- (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) {
- /* CASE 1:
- * unconf record that matches input clientid and input confirm.
- * conf record that matches input clientid.
- * conf and unconf records match names, verifiers
- */
+ /*
+ * section 14.2.34 of RFC 3530 has a description of
+ * SETCLIENTID_CONFIRM request processing consisting
+ * of 4 bullet points, labeled as CASE1 - CASE4 below.
+ */
+ if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) {
+ /*
+ * RFC 3530 14.2.34 CASE 1:
+ * callback update
+ */
if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
status = nfserr_clid_inuse;
else {
@@ -914,15 +853,11 @@
status = nfs_ok;
}
- } else if ((conf && !unconf) ||
- ((conf && unconf) &&
- (!same_verf(&conf->cl_verifier, &unconf->cl_verifier) ||
- !same_name(conf->cl_recdir, unconf->cl_recdir)))) {
- /* CASE 2:
- * conf record that matches input clientid.
- * if unconf record matches input clientid, then
- * unconf->cl_name or unconf->cl_verifier don't match the
- * conf record.
+ } else if (conf && !unconf) {
+ /*
+ * RFC 3530 14.2.34 CASE 2:
+ * probable retransmitted request; play it safe and
+ * do nothing.
*/
if (!same_creds(&conf->cl_cred, &rqstp->rq_cred))
status = nfserr_clid_inuse;
@@ -930,10 +865,9 @@
status = nfs_ok;
} else if (!conf && unconf
&& same_verf(&unconf->cl_confirm, &confirm)) {
- /* CASE 3:
- * conf record not found.
- * unconf record found.
- * unconf->cl_confirm matches input confirm
+ /*
+ * RFC 3530 14.2.34 CASE 3:
+ * Normal case; new or rebooted client:
*/
if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) {
status = nfserr_clid_inuse;
@@ -948,16 +882,15 @@
}
move_to_confirmed(unconf);
conf = unconf;
+ nfsd4_probe_callback(conf);
status = nfs_ok;
}
} else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
&& (!unconf || (unconf && !same_verf(&unconf->cl_confirm,
&confirm)))) {
- /* CASE 4:
- * conf record not found, or if conf, conf->cl_confirm does not
- * match input confirm.
- * unconf record not found, or if unconf, unconf->cl_confirm
- * does not match input confirm.
+ /*
+ * RFC 3530 14.2.34 CASE 4:
+ * Client probably hasn't noticed that we rebooted yet.
*/
status = nfserr_stale_clientid;
} else {
@@ -965,8 +898,6 @@
status = nfserr_clid_inuse;
}
out:
- if (!status)
- nfsd4_probe_callback(conf);
nfs4_unlock_state();
return status;
}
@@ -1226,14 +1157,19 @@
return NULL;
}
-static int access_valid(u32 x)
+static inline int access_valid(u32 x)
{
- return (x > 0 && x < 4);
+ if (x < NFS4_SHARE_ACCESS_READ)
+ return 0;
+ if (x > NFS4_SHARE_ACCESS_BOTH)
+ return 0;
+ return 1;
}
-static int deny_valid(u32 x)
+static inline int deny_valid(u32 x)
{
- return (x >= 0 && x < 5);
+ /* Note: unlike access bits, deny bits may be zero. */
+ return x <= NFS4_SHARE_DENY_BOTH;
}
static void
@@ -2162,8 +2098,10 @@
goto check_replay;
}
+ *stpp = stp;
+ *sopp = sop = stp->st_stateowner;
+
if (lock) {
- struct nfs4_stateowner *sop = stp->st_stateowner;
clientid_t *lockclid = &lock->v.new.clientid;
struct nfs4_client *clp = sop->so_client;
int lkflg = 0;
@@ -2193,9 +2131,6 @@
return nfserr_bad_stateid;
}
- *stpp = stp;
- *sopp = sop = stp->st_stateowner;
-
/*
* We now validate the seqid and stateid generation numbers.
* For the moment, we ignore the possibility of
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5733394..b0592e7 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -148,12 +148,12 @@
} \
} while (0)
-static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
+static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
{
/* We want more bytes than seem to be available.
* Maybe we need a new page, maybe we have just run out
*/
- int avail = (char*)argp->end - (char*)argp->p;
+ unsigned int avail = (char *)argp->end - (char *)argp->p;
__be32 *p;
if (avail + argp->pagelen < nbytes)
return NULL;
@@ -169,6 +169,11 @@
return NULL;
}
+ /*
+ * The following memcpy is safe because read_buf is always
+ * called with nbytes > avail, and the two cases above both
+ * guarantee p points to at least nbytes bytes.
+ */
memcpy(p, argp->p, avail);
/* step to next page */
argp->p = page_address(argp->pagelist[0]);
@@ -1448,7 +1453,7 @@
__be32
nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval,
- struct svc_rqst *rqstp)
+ struct svc_rqst *rqstp, int ignore_crossmnt)
{
u32 bmval0 = bmval[0];
u32 bmval1 = bmval[1];
@@ -1828,7 +1833,12 @@
if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
if ((buflen -= 8) < 0)
goto out_resource;
- if (exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) {
+ /*
+ * Get parent's attributes if not ignoring crossmount
+ * and this is the root of a cross-mounted filesystem.
+ */
+ if (ignore_crossmnt == 0 &&
+ exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) {
err = vfs_getattr(exp->ex_mnt->mnt_parent,
exp->ex_mnt->mnt_mountpoint, &stat);
if (err)
@@ -1864,13 +1874,25 @@
struct svc_export *exp = cd->rd_fhp->fh_export;
struct dentry *dentry;
__be32 nfserr;
+ int ignore_crossmnt = 0;
dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
if (IS_ERR(dentry))
return nfserrno(PTR_ERR(dentry));
exp_get(exp);
- if (d_mountpoint(dentry)) {
+ /*
+ * In the case of a mountpoint, the client may be asking for
+ * attributes that are only properties of the underlying filesystem
+ * as opposed to the cross-mounted file system. In such a case,
+ * we will not follow the cross mount and will fill the attributes
+ * directly from the mountpoint dentry.
+ */
+ if (d_mountpoint(dentry) &&
+ (cd->rd_bmval[0] & ~FATTR4_WORD0_RDATTR_ERROR) == 0 &&
+ (cd->rd_bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID) == 0)
+ ignore_crossmnt = 1;
+ else if (d_mountpoint(dentry)) {
int err;
/*
@@ -1889,7 +1911,7 @@
}
nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
- cd->rd_rqstp);
+ cd->rd_rqstp, ignore_crossmnt);
out_put:
dput(dentry);
exp_put(exp);
@@ -2043,7 +2065,7 @@
buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2);
nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry,
resp->p, &buflen, getattr->ga_bmval,
- resp->rqstp);
+ resp->rqstp, 0);
if (!nfserr)
resp->p += buflen;
return nfserr;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 578f2c9..5bfc2ac 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -44,17 +44,17 @@
*/
static DEFINE_SPINLOCK(cache_lock);
-void
-nfsd_cache_init(void)
+int nfsd_reply_cache_init(void)
{
struct svc_cacherep *rp;
int i;
INIT_LIST_HEAD(&lru_head);
i = CACHESIZE;
- while(i) {
+ while (i) {
rp = kmalloc(sizeof(*rp), GFP_KERNEL);
- if (!rp) break;
+ if (!rp)
+ goto out_nomem;
list_add(&rp->c_lru, &lru_head);
rp->c_state = RC_UNUSED;
rp->c_type = RC_NOCACHE;
@@ -62,23 +62,19 @@
i--;
}
- if (i)
- printk (KERN_ERR "nfsd: cannot allocate all %d cache entries, only got %d\n",
- CACHESIZE, CACHESIZE-i);
-
hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL);
- if (!hash_list) {
- nfsd_cache_shutdown();
- printk (KERN_ERR "nfsd: cannot allocate %Zd bytes for hash list\n",
- HASHSIZE * sizeof(struct hlist_head));
- return;
- }
+ if (!hash_list)
+ goto out_nomem;
cache_disabled = 0;
+ return 0;
+out_nomem:
+ printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
+ nfsd_reply_cache_shutdown();
+ return -ENOMEM;
}
-void
-nfsd_cache_shutdown(void)
+void nfsd_reply_cache_shutdown(void)
{
struct svc_cacherep *rp;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 77dc989..8516137 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -304,6 +304,9 @@
struct auth_domain *dom;
struct knfsd_fh fh;
+ if (size == 0)
+ return -EINVAL;
+
if (buf[size-1] != '\n')
return -EINVAL;
buf[size-1] = 0;
@@ -503,7 +506,7 @@
int len = 0;
lock_kernel();
if (nfsd_serv)
- len = svc_sock_names(buf, nfsd_serv, NULL);
+ len = svc_xprt_names(nfsd_serv, buf, 0);
unlock_kernel();
return len;
}
@@ -540,7 +543,7 @@
}
return err < 0 ? err : 0;
}
- if (buf[0] == '-') {
+ if (buf[0] == '-' && isdigit(buf[1])) {
char *toclose = kstrdup(buf+1, GFP_KERNEL);
int len = 0;
if (!toclose)
@@ -554,6 +557,53 @@
kfree(toclose);
return len;
}
+ /*
+ * Add a transport listener by writing its transport name
+ */
+ if (isalpha(buf[0])) {
+ int err;
+ char transport[16];
+ int port;
+ if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
+ err = nfsd_create_serv();
+ if (!err) {
+ err = svc_create_xprt(nfsd_serv,
+ transport, port,
+ SVC_SOCK_ANONYMOUS);
+ if (err == -ENOENT)
+ /* Give a reasonable perror msg for
+ * bad transport string */
+ err = -EPROTONOSUPPORT;
+ }
+ return err < 0 ? err : 0;
+ }
+ }
+ /*
+ * Remove a transport by writing its transport name and port number
+ */
+ if (buf[0] == '-' && isalpha(buf[1])) {
+ struct svc_xprt *xprt;
+ int err = -EINVAL;
+ char transport[16];
+ int port;
+ if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
+ if (port == 0)
+ return -EINVAL;
+ lock_kernel();
+ if (nfsd_serv) {
+ xprt = svc_find_xprt(nfsd_serv, transport,
+ AF_UNSPEC, port);
+ if (xprt) {
+ svc_close_xprt(xprt);
+ svc_xprt_put(xprt);
+ err = 0;
+ } else
+ err = -ENOTCONN;
+ }
+ unlock_kernel();
+ return err < 0 ? err : 0;
+ }
+ }
return -EINVAL;
}
@@ -616,7 +666,7 @@
char *recdir;
int len, status;
- if (size > PATH_MAX || buf[size-1] != '\n')
+ if (size == 0 || size > PATH_MAX || buf[size-1] != '\n')
return -EINVAL;
buf[size-1] = 0;
@@ -674,6 +724,38 @@
 .kill_sb = kill_litter_super,
};
+#ifdef CONFIG_PROC_FS
+/*
+ * Create the "fs/nfs" proc directory and its "fs/nfs/exports" entry,
+ * and hook exports_operations up as the entry's file operations.
+ *
+ * Returns 0 on success, or -ENOMEM if either entry cannot be created.
+ * On failure no proc entry is left behind for the caller to remove.
+ */
+static int create_proc_exports_entry(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_mkdir("fs/nfs", NULL);
+	if (!entry)
+		return -ENOMEM;
+	entry = create_proc_entry("fs/nfs/exports", 0, NULL);
+	if (!entry) {
+		/* Undo proc_mkdir() above instead of leaking the directory. */
+		remove_proc_entry("fs/nfs", NULL);
+		return -ENOMEM;
+	}
+	entry->proc_fops = &exports_operations;
+	return 0;
+}
+#else /* CONFIG_PROC_FS */
+/* Without procfs there is nothing to create, so this always succeeds. */
+static int create_proc_exports_entry(void)
+{
+	return 0;
+}
+#endif
+
static int __init init_nfsd(void)
{
int retval;
@@ -683,32 +754,43 @@
if (retval)
return retval;
nfsd_stat_init(); /* Statistics */
- nfsd_cache_init(); /* RPC reply cache */
- nfsd_export_init(); /* Exports table */
+ retval = nfsd_reply_cache_init();
+ if (retval)
+ goto out_free_stat;
+ retval = nfsd_export_init();
+ if (retval)
+ goto out_free_cache;
nfsd_lockd_init(); /* lockd->nfsd callbacks */
- nfsd_idmap_init(); /* Name to ID mapping */
- if (proc_mkdir("fs/nfs", NULL)) {
- struct proc_dir_entry *entry;
- entry = create_proc_entry("fs/nfs/exports", 0, NULL);
- if (entry)
- entry->proc_fops = &exports_operations;
- }
+ retval = nfsd_idmap_init();
+ if (retval)
+ goto out_free_lockd;
+ retval = create_proc_exports_entry();
+ if (retval)
+ goto out_free_idmap;
retval = register_filesystem(&nfsd_fs_type);
- if (retval) {
- nfsd_export_shutdown();
- nfsd_cache_shutdown();
- remove_proc_entry("fs/nfs/exports", NULL);
- remove_proc_entry("fs/nfs", NULL);
- nfsd_stat_shutdown();
- nfsd_lockd_shutdown();
- }
+ if (retval)
+ goto out_free_all;
+ return 0;
+out_free_all:
+ remove_proc_entry("fs/nfs/exports", NULL);
+ remove_proc_entry("fs/nfs", NULL);
+out_free_idmap:
+ nfsd_idmap_shutdown();
+out_free_lockd:
+ nfsd_lockd_shutdown();
+ nfsd_export_shutdown();
+out_free_cache:
+ nfsd_reply_cache_shutdown();
+out_free_stat:
+ nfsd_stat_shutdown();
+ nfsd4_free_slabs();
return retval;
}
static void __exit exit_nfsd(void)
{
nfsd_export_shutdown();
- nfsd_cache_shutdown();
+ nfsd_reply_cache_shutdown();
remove_proc_entry("fs/nfs/exports", NULL);
remove_proc_entry("fs/nfs", NULL);
nfsd_stat_shutdown();
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 468f17a..8fbd2dc 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -22,6 +22,7 @@
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/nfsd/nfsd.h>
+#include "auth.h"
#define NFSDDBG_FACILITY NFSDDBG_FH
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 1190aea..9647b0f 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -155,8 +155,8 @@
static void nfsd_last_thread(struct svc_serv *serv)
{
/* When last nfsd thread exits we need to do some clean-up */
- struct svc_sock *svsk;
- list_for_each_entry(svsk, &serv->sv_permsocks, sk_list)
+ struct svc_xprt *xprt;
+ list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list)
lockd_down();
nfsd_serv = NULL;
nfsd_racache_shutdown();
@@ -236,7 +236,7 @@
error = lockd_up(IPPROTO_UDP);
if (error >= 0) {
- error = svc_makesock(nfsd_serv, IPPROTO_UDP, port,
+ error = svc_create_xprt(nfsd_serv, "udp", port,
SVC_SOCK_DEFAULTS);
if (error < 0)
lockd_down();
@@ -247,7 +247,7 @@
#ifdef CONFIG_NFSD_TCP
error = lockd_up(IPPROTO_TCP);
if (error >= 0) {
- error = svc_makesock(nfsd_serv, IPPROTO_TCP, port,
+ error = svc_create_xprt(nfsd_serv, "tcp", port,
SVC_SOCK_DEFAULTS);
if (error < 0)
lockd_down();
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index b86e365..61ad617 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -15,6 +15,7 @@
#include <linux/nfsd/nfsd.h>
#include <linux/nfsd/xdr.h>
#include <linux/mm.h>
+#include "auth.h"
#define NFSDDBG_FACILITY NFSDDBG_XDR
@@ -62,10 +63,10 @@
* no slashes or null bytes.
*/
static __be32 *
-decode_filename(__be32 *p, char **namp, int *lenp)
+decode_filename(__be32 *p, char **namp, unsigned int *lenp)
{
char *name;
- int i;
+ unsigned int i;
if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) {
for (i = 0, name = *namp; i < *lenp; i++, name++) {
@@ -78,10 +79,10 @@
}
static __be32 *
-decode_pathname(__be32 *p, char **namp, int *lenp)
+decode_pathname(__be32 *p, char **namp, unsigned int *lenp)
{
char *name;
- int i;
+ unsigned int i;
if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) {
for (i = 0, name = *namp; i < *lenp; i++, name++) {
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d019918..cc75e4f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -132,7 +132,7 @@
__be32
nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
- const char *name, int len,
+ const char *name, unsigned int len,
struct svc_export **exp_ret, struct dentry **dentry_ret)
{
struct svc_export *exp;
@@ -226,7 +226,7 @@
*/
__be32
nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
- int len, struct svc_fh *resfh)
+ unsigned int len, struct svc_fh *resfh)
{
struct svc_export *exp;
struct dentry *dentry;
@@ -1151,6 +1151,26 @@
}
#endif /* CONFIG_NFSD_V3 */
+/*
+ * Apply the attributes left in @iap after a create; the mode is dropped
+ * because create already set it.  Returns 0 if nothing remains to set.
+ */
+__be32
+nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
+		struct iattr *iap)
+{
+	iap->ia_valid &= ~ATTR_MODE;
+	/*
+	 * Only root may set uid/gid.  Irix appears to send the gid on
+	 * create when it tries to implement setgid directories via NFS.
+	 */
+	if (current->fsuid != 0)
+		iap->ia_valid &= ~(ATTR_UID | ATTR_GID);
+	if (!iap->ia_valid)
+		return 0;
+	return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+}
+
/*
* Create a file (regular, directory, device, fifo); UNIX sockets
* not yet implemented.
@@ -1167,6 +1187,7 @@
struct dentry *dentry, *dchild = NULL;
struct inode *dirp;
__be32 err;
+ __be32 err2;
int host_err;
err = nfserr_perm;
@@ -1257,16 +1278,9 @@
}
- /* Set file attributes. Mode has already been set and
- * setting uid/gid works only for root. Irix appears to
- * send along the gid when it tries to implement setgid
- * directories via NFS.
- */
- if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
- __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
- if (err2)
- err = err2;
- }
+ err2 = nfsd_create_setattr(rqstp, resfhp, iap);
+ if (err2)
+ err = err2;
/*
* Update the file handle to get the new inode info.
*/
@@ -1295,6 +1309,7 @@
struct dentry *dentry, *dchild = NULL;
struct inode *dirp;
__be32 err;
+ __be32 err2;
int host_err;
__u32 v_mtime=0, v_atime=0;
@@ -1399,16 +1414,10 @@
iap->ia_atime.tv_nsec = 0;
}
- /* Set file attributes.
- * Irix appears to send along the gid when it tries to
- * implement setgid directories via NFS. Clear out all that cruft.
- */
set_attr:
- if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
- __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
- if (err2)
- err = err2;
- }
+ err2 = nfsd_create_setattr(rqstp, resfhp, iap);
+ if (err2)
+ err = err2;
/*
* Update the filehandle to get the new inode info.
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index e2d1ce3..4babb2a 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -173,14 +173,17 @@
/*
* Host cache
*/
-struct nlm_host * nlmclnt_lookup_host(const struct sockaddr_in *, int, int, const char *, int);
-struct nlm_host * nlmsvc_lookup_host(struct svc_rqst *, const char *, int);
+struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *, int, int,
+ const char *, unsigned int);
+struct nlm_host *nlmsvc_lookup_host(struct svc_rqst *, const char *,
+ unsigned int);
struct rpc_clnt * nlm_bind_host(struct nlm_host *);
void nlm_rebind_host(struct nlm_host *);
struct nlm_host * nlm_get_host(struct nlm_host *);
void nlm_release_host(struct nlm_host *);
void nlm_shutdown_hosts(void);
-extern void nlm_host_rebooted(const struct sockaddr_in *, const char *, int, u32);
+extern void nlm_host_rebooted(const struct sockaddr_in *, const char *,
+ unsigned int, u32);
void nsm_release(struct nsm_handle *);
diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h
index 83a1f9f..df18fa0 100644
--- a/include/linux/lockd/xdr.h
+++ b/include/linux/lockd/xdr.h
@@ -29,7 +29,7 @@
/* Lock info passed via NLM */
struct nlm_lock {
char * caller;
- int len; /* length of "caller" */
+ unsigned int len; /* length of "caller" */
struct nfs_fh fh;
struct xdr_netobj oh;
u32 svid;
@@ -78,7 +78,7 @@
*/
struct nlm_reboot {
char * mon;
- int len;
+ unsigned int len;
u32 state;
__be32 addr;
__be32 vers;
diff --git a/include/linux/nfsd/Kbuild b/include/linux/nfsd/Kbuild
index d9c5455..e726fc3 100644
--- a/include/linux/nfsd/Kbuild
+++ b/include/linux/nfsd/Kbuild
@@ -4,4 +4,3 @@
unifdef-y += syscall.h
unifdef-y += nfsfh.h
unifdef-y += debug.h
-unifdef-y += auth.h
diff --git a/include/linux/nfsd/cache.h b/include/linux/nfsd/cache.h
index 007480c..7b5d784 100644
--- a/include/linux/nfsd/cache.h
+++ b/include/linux/nfsd/cache.h
@@ -72,8 +72,8 @@
*/
#define RC_DELAY (HZ/5)
-void nfsd_cache_init(void);
-void nfsd_cache_shutdown(void);
+int nfsd_reply_cache_init(void);
+void nfsd_reply_cache_shutdown(void);
int nfsd_cache_lookup(struct svc_rqst *, int);
void nfsd_cache_update(struct svc_rqst *, int, __be32 *);
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index bcb7aba..3a16872 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -122,7 +122,7 @@
/*
* Function declarations
*/
-void nfsd_export_init(void);
+int nfsd_export_init(void);
void nfsd_export_shutdown(void);
void nfsd_export_flush(void);
void exp_readlock(void);
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 604a0d7..8caf4c4 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -20,7 +20,6 @@
#include <linux/nfsd/debug.h>
#include <linux/nfsd/nfsfh.h>
#include <linux/nfsd/export.h>
-#include <linux/nfsd/auth.h>
#include <linux/nfsd/stats.h>
/*
* nfsd version
@@ -70,9 +69,9 @@
int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
struct svc_export **expp);
__be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *,
- const char *, int, struct svc_fh *);
+ const char *, unsigned int, struct svc_fh *);
__be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
- const char *, int,
+ const char *, unsigned int,
struct svc_export **, struct dentry **);
__be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *,
struct iattr *, int, time_t);
diff --git a/include/linux/nfsd/syscall.h b/include/linux/nfsd/syscall.h
index 8bcddcc..4e43976 100644
--- a/include/linux/nfsd/syscall.h
+++ b/include/linux/nfsd/syscall.h
@@ -18,7 +18,6 @@
#include <linux/nfsd/const.h>
#include <linux/nfsd/export.h>
#include <linux/nfsd/nfsfh.h>
-#include <linux/nfsd/auth.h>
/*
* Version of the syscall interface
diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h
index 67885d5..a0132ef 100644
--- a/include/linux/nfsd/xdr.h
+++ b/include/linux/nfsd/xdr.h
@@ -23,7 +23,7 @@
struct nfsd_diropargs {
struct svc_fh fh;
char * name;
- int len;
+ unsigned int len;
};
struct nfsd_readargs {
@@ -43,17 +43,17 @@
struct nfsd_createargs {
struct svc_fh fh;
char * name;
- int len;
+ unsigned int len;
struct iattr attrs;
};
struct nfsd_renameargs {
struct svc_fh ffh;
char * fname;
- int flen;
+ unsigned int flen;
struct svc_fh tfh;
char * tname;
- int tlen;
+ unsigned int tlen;
};
struct nfsd_readlinkargs {
@@ -65,15 +65,15 @@
struct svc_fh ffh;
struct svc_fh tfh;
char * tname;
- int tlen;
+ unsigned int tlen;
};
struct nfsd_symlinkargs {
struct svc_fh ffh;
char * fname;
- int flen;
+ unsigned int flen;
char * tname;
- int tlen;
+ unsigned int tlen;
struct iattr attrs;
};
diff --git a/include/linux/nfsd/xdr3.h b/include/linux/nfsd/xdr3.h
index 89d9d60..421eddd 100644
--- a/include/linux/nfsd/xdr3.h
+++ b/include/linux/nfsd/xdr3.h
@@ -21,7 +21,7 @@
struct nfsd3_diropargs {
struct svc_fh fh;
char * name;
- int len;
+ unsigned int len;
};
struct nfsd3_accessargs {
@@ -48,7 +48,7 @@
struct nfsd3_createargs {
struct svc_fh fh;
char * name;
- int len;
+ unsigned int len;
int createmode;
struct iattr attrs;
__be32 * verf;
@@ -57,7 +57,7 @@
struct nfsd3_mknodargs {
struct svc_fh fh;
char * name;
- int len;
+ unsigned int len;
__u32 ftype;
__u32 major, minor;
struct iattr attrs;
@@ -66,10 +66,10 @@
struct nfsd3_renameargs {
struct svc_fh ffh;
char * fname;
- int flen;
+ unsigned int flen;
struct svc_fh tfh;
char * tname;
- int tlen;
+ unsigned int tlen;
};
struct nfsd3_readlinkargs {
@@ -81,15 +81,15 @@
struct svc_fh ffh;
struct svc_fh tfh;
char * tname;
- int tlen;
+ unsigned int tlen;
};
struct nfsd3_symlinkargs {
struct svc_fh ffh;
char * fname;
- int flen;
+ unsigned int flen;
char * tname;
- int tlen;
+ unsigned int tlen;
struct iattr attrs;
};
diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h
index b0ddfb4..27bd3e3 100644
--- a/include/linux/nfsd/xdr4.h
+++ b/include/linux/nfsd/xdr4.h
@@ -441,7 +441,7 @@
void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
struct dentry *dentry, __be32 *buffer, int *countp,
- u32 *bmval, struct svc_rqst *);
+ u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
struct nfsd4_compound_state *,
struct nfsd4_setclientid *setclid);
diff --git a/include/linux/nfsd_idmap.h b/include/linux/nfsd_idmap.h
index e82746f..d4a2ac1 100644
--- a/include/linux/nfsd_idmap.h
+++ b/include/linux/nfsd_idmap.h
@@ -44,11 +44,16 @@
#define IDMAP_NAMESZ 128
#ifdef CONFIG_NFSD_V4
-void nfsd_idmap_init(void);
+int nfsd_idmap_init(void);
void nfsd_idmap_shutdown(void);
#else
-static inline void nfsd_idmap_init(void) {};
-static inline void nfsd_idmap_shutdown(void) {};
+static inline int nfsd_idmap_init(void)
+{
+ return 0;
+}
+static inline void nfsd_idmap_shutdown(void)
+{
+}
#endif
int nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index bd7a6b0..03547d6 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -169,8 +169,8 @@
extern void cache_flush(void);
extern void cache_purge(struct cache_detail *detail);
#define NEVER (0x7FFFFFFF)
-extern void cache_register(struct cache_detail *cd);
-extern int cache_unregister(struct cache_detail *cd);
+extern int cache_register(struct cache_detail *cd);
+extern void cache_unregister(struct cache_detail *cd);
extern void qword_add(char **bpp, int *lp, char *str);
extern void qword_addhex(char **bpp, int *lp, char *buf, int blen);
diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h
index 3912cf1..10709cb 100644
--- a/include/linux/sunrpc/debug.h
+++ b/include/linux/sunrpc/debug.h
@@ -20,7 +20,7 @@
#define RPCDBG_BIND 0x0020
#define RPCDBG_SCHED 0x0040
#define RPCDBG_TRANS 0x0080
-#define RPCDBG_SVCSOCK 0x0100
+#define RPCDBG_SVCXPRT 0x0100
#define RPCDBG_SVCDSP 0x0200
#define RPCDBG_MISC 0x0400
#define RPCDBG_CACHE 0x0800
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 8531a70..64c7710 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -204,7 +204,7 @@
struct svc_rqst {
struct list_head rq_list; /* idle list */
struct list_head rq_all; /* all threads list */
- struct svc_sock * rq_sock; /* socket */
+ struct svc_xprt * rq_xprt; /* transport ptr */
struct sockaddr_storage rq_addr; /* peer address */
size_t rq_addrlen;
@@ -214,9 +214,10 @@
struct auth_ops * rq_authop; /* authentication flavour */
u32 rq_flavor; /* pseudoflavor */
struct svc_cred rq_cred; /* auth info */
- struct sk_buff * rq_skbuff; /* fast recv inet buffer */
+ void * rq_xprt_ctxt; /* transport specific context ptr */
struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */
+ size_t rq_xprt_hlen; /* xprt header len */
struct xdr_buf rq_arg;
struct xdr_buf rq_res;
struct page * rq_pages[RPCSVC_MAXPAGES];
@@ -317,11 +318,12 @@
struct svc_deferred_req {
u32 prot; /* protocol (UDP or TCP) */
- struct svc_sock *svsk;
+ struct svc_xprt *xprt;
struct sockaddr_storage addr; /* where reply must go */
size_t addrlen;
union svc_addr_u daddr; /* where reply must come from */
struct cache_deferred_req handle;
+ size_t xprt_hlen;
int argslen;
__be32 args[0];
};
@@ -382,6 +384,8 @@
*/
struct svc_serv * svc_create(struct svc_program *, unsigned int,
void (*shutdown)(struct svc_serv*));
+struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
+ struct svc_pool *pool);
int svc_create_thread(svc_thread_fn, struct svc_serv *);
void svc_exit_thread(struct svc_rqst *);
struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int,
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
new file mode 100644
index 0000000..c11bbcc
--- /dev/null
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#ifndef SVC_RDMA_H
+#define SVC_RDMA_H
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#define SVCRDMA_DEBUG
+
+/* RPC/RDMA parameters and stats */
+extern unsigned int svcrdma_ord;
+extern unsigned int svcrdma_max_requests;
+extern unsigned int svcrdma_max_req_size;
+
+extern atomic_t rdma_stat_recv;
+extern atomic_t rdma_stat_read;
+extern atomic_t rdma_stat_write;
+extern atomic_t rdma_stat_sq_starve;
+extern atomic_t rdma_stat_rq_starve;
+extern atomic_t rdma_stat_rq_poll;
+extern atomic_t rdma_stat_rq_prod;
+extern atomic_t rdma_stat_sq_poll;
+extern atomic_t rdma_stat_sq_prod;
+
+#define RPCRDMA_VERSION 1
+
+/*
+ * Contexts are built when an RDMA request is created and are a
+ * record of the resources that can be recovered when the request
+ * completes.
+ */
+struct svc_rdma_op_ctxt {
+ struct svc_rdma_op_ctxt *next;
+ struct xdr_buf arg;
+ struct list_head dto_q;
+ enum ib_wr_opcode wr_op;
+ enum ib_wc_status wc_status;
+ u32 byte_len;
+ struct svcxprt_rdma *xprt;
+ unsigned long flags;
+ enum dma_data_direction direction;
+ int count;
+ struct ib_sge sge[RPCSVC_MAXPAGES];
+ struct page *pages[RPCSVC_MAXPAGES];
+};
+
+#define RDMACTXT_F_READ_DONE 1
+#define RDMACTXT_F_LAST_CTXT 2
+
+struct svcxprt_rdma {
+ struct svc_xprt sc_xprt; /* SVC transport structure */
+ struct rdma_cm_id *sc_cm_id; /* RDMA connection id */
+ struct list_head sc_accept_q; /* Conn. waiting accept */
+ int sc_ord; /* RDMA read limit */
+ wait_queue_head_t sc_read_wait;
+ int sc_max_sge;
+
+ int sc_sq_depth; /* Depth of SQ */
+ atomic_t sc_sq_count; /* Number of SQ WR on queue */
+
+ int sc_max_requests; /* Depth of RQ */
+ int sc_max_req_size; /* Size of each RQ WR buf */
+
+ struct ib_pd *sc_pd;
+
+ struct svc_rdma_op_ctxt *sc_ctxt_head;
+ int sc_ctxt_cnt;
+ int sc_ctxt_bump;
+ int sc_ctxt_max;
+ spinlock_t sc_ctxt_lock;
+ struct list_head sc_rq_dto_q;
+ spinlock_t sc_rq_dto_lock;
+ struct ib_qp *sc_qp;
+ struct ib_cq *sc_rq_cq;
+ struct ib_cq *sc_sq_cq;
+ struct ib_mr *sc_phys_mr; /* MR for server memory */
+
+ spinlock_t sc_lock; /* transport lock */
+
+ wait_queue_head_t sc_send_wait; /* SQ exhaustion waitlist */
+ unsigned long sc_flags;
+ struct list_head sc_dto_q; /* DTO tasklet I/O pending Q */
+ struct list_head sc_read_complete_q;
+ spinlock_t sc_read_complete_lock;
+};
+/* sc_flags */
+#define RDMAXPRT_RQ_PENDING 1
+#define RDMAXPRT_SQ_PENDING 2
+#define RDMAXPRT_CONN_PENDING 3
+
+#define RPCRDMA_LISTEN_BACKLOG 10
+/* The default ORD value is based on two outstanding full-size writes with a
+ * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. */
+#define RPCRDMA_ORD (64/4)
+#define RPCRDMA_SQ_DEPTH_MULT 8
+#define RPCRDMA_MAX_THREADS 16
+#define RPCRDMA_MAX_REQUESTS 16
+#define RPCRDMA_MAX_REQ_SIZE 4096
+
+/* svc_rdma_marshal.c */
+extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *,
+ int *, int *);
+extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *);
+extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *);
+extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
+ struct rpcrdma_msg *,
+ enum rpcrdma_errcode, u32 *);
+extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int);
+extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int);
+extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int,
+ u32, u64, u32);
+extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *,
+ struct rpcrdma_msg *,
+ struct rpcrdma_msg *,
+ enum rpcrdma_proc);
+extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *);
+
+/* svc_rdma_recvfrom.c */
+extern int svc_rdma_recvfrom(struct svc_rqst *);
+
+/* svc_rdma_sendto.c */
+extern int svc_rdma_sendto(struct svc_rqst *);
+
+/* svc_rdma_transport.c */
+extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *);
+extern int svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *,
+ enum rpcrdma_errcode);
+struct page *svc_rdma_get_page(void);
+extern int svc_rdma_post_recv(struct svcxprt_rdma *);
+extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
+extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
+extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
+extern void svc_sq_reap(struct svcxprt_rdma *);
+extern void svc_rq_reap(struct svcxprt_rdma *);
+extern struct svc_xprt_class svc_rdma_class;
+extern void svc_rdma_prep_reply_hdr(struct svc_rqst *);
+
+/* svc_rdma.c */
+extern int svc_rdma_init(void);
+extern void svc_rdma_cleanup(void);
+
+/*
+ * Returns the address of the first read chunk, or NULL if the chunk at
+ * rm_chunks[0] has a zero rc_discrim (i.e. no read chunk is present)
+ */
+static inline struct rpcrdma_read_chunk *
+svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp)
+{
+ struct rpcrdma_read_chunk *ch =
+ (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+
+ if (ch->rc_discrim == 0)
+ return NULL;
+
+ return ch;
+}
+
+/*
+ * Returns the address of the write array at rm_chunks[1], or NULL if a read
+ * list is present (rm_chunks[0] != 0) or rm_chunks[1] is zero
+ */
+static inline struct rpcrdma_write_array *
+svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp)
+{
+ if (rmsgp->rm_body.rm_chunks[0] != 0
+ || rmsgp->rm_body.rm_chunks[1] == 0)
+ return NULL;
+
+ return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
+}
+
+/*
+ * Returns the address of the reply array, or NULL if rm_chunks[0] or
+ * rm_chunks[1] is non-zero or the reply array's wc_discrim is zero
+ */
+static inline struct rpcrdma_write_array *
+svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp)
+{
+ struct rpcrdma_read_chunk *rch;
+ struct rpcrdma_write_array *wr_ary;
+ struct rpcrdma_write_array *rp_ary;
+
+ /* XXX: Need to fix when reply list may occur with read-list and/or
+ * write list */
+ if (rmsgp->rm_body.rm_chunks[0] != 0 ||
+ rmsgp->rm_body.rm_chunks[1] != 0)
+ return NULL;
+
+ rch = svc_rdma_get_read_chunk(rmsgp);
+ if (rch) {
+ while (rch->rc_discrim)
+ rch++;
+
+ /* The reply list follows an empty write array located
+ * at 'rc_position' here. The reply array is at rc_target.
+ */
+ rp_ary = (struct rpcrdma_write_array *)&rch->rc_target;
+
+ goto found_it;
+ }
+
+ wr_ary = svc_rdma_get_write_array(rmsgp);
+ if (wr_ary) {
+ rp_ary = (struct rpcrdma_write_array *)
+ &wr_ary->
+ wc_array[wr_ary->wc_nchunks].wc_target.rs_length;
+
+ goto found_it;
+ }
+
+ /* No read list, no write list */
+ rp_ary = (struct rpcrdma_write_array *)
+ &rmsgp->rm_body.rm_chunks[2];
+
+ found_it:
+ if (rp_ary->wc_discrim == 0)
+ return NULL;
+
+ return rp_ary;
+}
+#endif
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
new file mode 100644
index 0000000..6fd7b01
--- /dev/null
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -0,0 +1,159 @@
+/*
+ * linux/include/linux/sunrpc/svc_xprt.h
+ *
+ * RPC server transport I/O
+ */
+
+#ifndef SUNRPC_SVC_XPRT_H
+#define SUNRPC_SVC_XPRT_H
+
+#include <linux/sunrpc/svc.h>
+#include <linux/module.h>
+
+struct svc_xprt_ops {
+ struct svc_xprt *(*xpo_create)(struct svc_serv *,
+ struct sockaddr *, int,
+ int);
+ struct svc_xprt *(*xpo_accept)(struct svc_xprt *);
+ int (*xpo_has_wspace)(struct svc_xprt *);
+ int (*xpo_recvfrom)(struct svc_rqst *);
+ void (*xpo_prep_reply_hdr)(struct svc_rqst *);
+ int (*xpo_sendto)(struct svc_rqst *);
+ void (*xpo_release_rqst)(struct svc_rqst *);
+ void (*xpo_detach)(struct svc_xprt *);
+ void (*xpo_free)(struct svc_xprt *);
+};
+
+struct svc_xprt_class {
+ const char *xcl_name;
+ struct module *xcl_owner;
+ struct svc_xprt_ops *xcl_ops;
+ struct list_head xcl_list;
+ u32 xcl_max_payload;
+};
+
+struct svc_xprt {
+ struct svc_xprt_class *xpt_class;
+ struct svc_xprt_ops *xpt_ops;
+ struct kref xpt_ref;
+ struct list_head xpt_list;
+ struct list_head xpt_ready;
+ unsigned long xpt_flags;
+#define XPT_BUSY 0 /* enqueued/receiving */
+#define XPT_CONN 1 /* conn pending */
+#define XPT_CLOSE 2 /* dead or dying */
+#define XPT_DATA 3 /* data pending */
+#define XPT_TEMP 4 /* connected transport */
+#define XPT_DEAD 6 /* transport closed */
+#define XPT_CHNGBUF 7 /* need to change snd/rcv buf sizes */
+#define XPT_DEFERRED 8 /* deferred request pending */
+#define XPT_OLD 9 /* used for xprt aging mark+sweep */
+#define XPT_DETACHED 10 /* detached from tempsocks list */
+#define XPT_LISTENER 11 /* listening endpoint */
+#define XPT_CACHE_AUTH 12 /* cache auth info */
+
+ struct svc_pool *xpt_pool; /* current pool iff queued */
+ struct svc_serv *xpt_server; /* service for transport */
+ atomic_t xpt_reserved; /* space on outq that is rsvd */
+ struct mutex xpt_mutex; /* to serialize sending data */
+ spinlock_t xpt_lock; /* protects xpt_deferred
+ * and xpt_auth_cache */
+ void *xpt_auth_cache;/* auth cache */
+ struct list_head xpt_deferred; /* deferred requests that need
+ * to be revisited */
+ struct sockaddr_storage xpt_local; /* local address */
+ size_t xpt_locallen; /* length of address */
+ struct sockaddr_storage xpt_remote; /* remote peer's address */
+ size_t xpt_remotelen; /* length of address */
+};
+
+int svc_reg_xprt_class(struct svc_xprt_class *);
+void svc_unreg_xprt_class(struct svc_xprt_class *);
+void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *,
+ struct svc_serv *);
+int svc_create_xprt(struct svc_serv *, char *, unsigned short, int);
+void svc_xprt_enqueue(struct svc_xprt *xprt);
+void svc_xprt_received(struct svc_xprt *);
+void svc_xprt_put(struct svc_xprt *xprt);
+void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt);
+void svc_close_xprt(struct svc_xprt *xprt);
+void svc_delete_xprt(struct svc_xprt *xprt);
+int svc_port_is_privileged(struct sockaddr *sin);
+int svc_print_xprts(char *buf, int maxlen);
+struct svc_xprt *svc_find_xprt(struct svc_serv *, char *, int, int);
+int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen);
+
+static inline void svc_xprt_get(struct svc_xprt *xprt)
+{
+ kref_get(&xprt->xpt_ref);
+}
+static inline void svc_xprt_set_local(struct svc_xprt *xprt,
+ struct sockaddr *sa, int salen)
+{
+ memcpy(&xprt->xpt_local, sa, salen);
+ xprt->xpt_locallen = salen;
+}
+static inline void svc_xprt_set_remote(struct svc_xprt *xprt,
+ struct sockaddr *sa, int salen)
+{
+ memcpy(&xprt->xpt_remote, sa, salen);
+ xprt->xpt_remotelen = salen;
+}
+static inline unsigned short svc_addr_port(struct sockaddr *sa)
+{
+ unsigned short ret = 0;
+ switch (sa->sa_family) {
+ case AF_INET:
+ ret = ntohs(((struct sockaddr_in *)sa)->sin_port);
+ break;
+ case AF_INET6:
+ ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port);
+ break;
+ }
+ return ret;
+}
+
+static inline size_t svc_addr_len(struct sockaddr *sa)
+{
+ switch (sa->sa_family) {
+ case AF_INET:
+ return sizeof(struct sockaddr_in);
+ case AF_INET6:
+ return sizeof(struct sockaddr_in6);
+ }
+ return -EAFNOSUPPORT; /* NOTE(review): wraps to a huge size_t; confirm callers cope */
+}
+
+static inline unsigned short svc_xprt_local_port(struct svc_xprt *xprt)
+{
+ return svc_addr_port((struct sockaddr *)&xprt->xpt_local);
+}
+
+static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt)
+{
+ return svc_addr_port((struct sockaddr *)&xprt->xpt_remote);
+}
+
+static inline char *__svc_print_addr(struct sockaddr *addr,
+ char *buf, size_t len)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ snprintf(buf, len, "%u.%u.%u.%u, port=%u",
+ NIPQUAD(((struct sockaddr_in *) addr)->sin_addr),
+ ntohs(((struct sockaddr_in *) addr)->sin_port));
+ break;
+
+ case AF_INET6:
+ snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u",
+ NIP6(((struct sockaddr_in6 *) addr)->sin6_addr),
+ ntohs(((struct sockaddr_in6 *) addr)->sin6_port));
+ break;
+
+ default:
+ snprintf(buf, len, "unknown address type: %d", addr->sa_family);
+ break;
+ }
+ return buf;
+}
+#endif /* SUNRPC_SVC_XPRT_H */
diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
index a53e0fa..206f092 100644
--- a/include/linux/sunrpc/svcsock.h
+++ b/include/linux/sunrpc/svcsock.h
@@ -10,42 +10,16 @@
#define SUNRPC_SVCSOCK_H
#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svc_xprt.h>
/*
* RPC server socket.
*/
struct svc_sock {
- struct list_head sk_ready; /* list of ready sockets */
- struct list_head sk_list; /* list of all sockets */
+ struct svc_xprt sk_xprt;
struct socket * sk_sock; /* berkeley socket layer */
struct sock * sk_sk; /* INET layer */
- struct svc_pool * sk_pool; /* current pool iff queued */
- struct svc_serv * sk_server; /* service for this socket */
- atomic_t sk_inuse; /* use count */
- unsigned long sk_flags;
-#define SK_BUSY 0 /* enqueued/receiving */
-#define SK_CONN 1 /* conn pending */
-#define SK_CLOSE 2 /* dead or dying */
-#define SK_DATA 3 /* data pending */
-#define SK_TEMP 4 /* temp (TCP) socket */
-#define SK_DEAD 6 /* socket closed */
-#define SK_CHNGBUF 7 /* need to change snd/rcv buffer sizes */
-#define SK_DEFERRED 8 /* request on sk_deferred */
-#define SK_OLD 9 /* used for temp socket aging mark+sweep */
-#define SK_DETACHED 10 /* detached from tempsocks list */
-
- atomic_t sk_reserved; /* space on outq that is reserved */
-
- spinlock_t sk_lock; /* protects sk_deferred and
- * sk_info_authunix */
- struct list_head sk_deferred; /* deferred requests that need to
- * be revisted */
- struct mutex sk_mutex; /* to serialize sending data */
-
- int (*sk_recvfrom)(struct svc_rqst *rqstp);
- int (*sk_sendto)(struct svc_rqst *rqstp);
-
/* We keep the old state_change and data_ready CB's here */
void (*sk_ostate)(struct sock *);
void (*sk_odata)(struct sock *, int bytes);
@@ -54,21 +28,12 @@
/* private TCP part */
int sk_reclen; /* length of record */
int sk_tcplen; /* current read length */
- time_t sk_lastrecv; /* time of last received request */
-
- /* cache of various info for TCP sockets */
- void *sk_info_authunix;
-
- struct sockaddr_storage sk_local; /* local address */
- struct sockaddr_storage sk_remote; /* remote peer's address */
- int sk_remotelen; /* length of address */
};
/*
* Function prototypes.
*/
-int svc_makesock(struct svc_serv *, int, unsigned short, int flags);
-void svc_force_close_socket(struct svc_sock *);
+void svc_close_all(struct list_head *);
int svc_recv(struct svc_rqst *, long);
int svc_send(struct svc_rqst *);
void svc_drop(struct svc_rqst *);
@@ -78,6 +43,8 @@
int fd,
char *name_return,
int *proto);
+void svc_init_xprt_sock(void);
+void svc_cleanup_xprt_sock(void);
/*
* svc_makesock socket characteristics
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 0751c94..e4057d7 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -112,7 +112,8 @@
__be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int len);
__be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int len);
__be32 *xdr_encode_string(__be32 *p, const char *s);
-__be32 *xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen);
+__be32 *xdr_decode_string_inplace(__be32 *p, char **sp, unsigned int *lenp,
+ unsigned int maxlen);
__be32 *xdr_encode_netobj(__be32 *p, const struct xdr_netobj *);
__be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *);
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 5c69a72..92e1dbe 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -11,6 +11,7 @@
auth.o auth_null.o auth_unix.o \
svc.o svcsock.o svcauth.o svcauth_unix.o \
rpcb_clnt.o timer.o xdr.o \
- sunrpc_syms.o cache.o rpc_pipe.o
+ sunrpc_syms.o cache.o rpc_pipe.o \
+ svc_xprt.o
sunrpc-$(CONFIG_PROC_FS) += stats.o
sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 73940df..481f984 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -224,38 +224,34 @@
/* major/minor */
len = qword_get(&mesg, buf, mlen);
+ if (len <= 0)
+ goto out;
+ rsii.major_status = simple_strtoul(buf, &ep, 10);
+ if (*ep)
+ goto out;
+ len = qword_get(&mesg, buf, mlen);
+ if (len <= 0)
+ goto out;
+ rsii.minor_status = simple_strtoul(buf, &ep, 10);
+ if (*ep)
+ goto out;
+
+ /* out_handle */
+ len = qword_get(&mesg, buf, mlen);
if (len < 0)
goto out;
- if (len == 0) {
+ status = -ENOMEM;
+ if (dup_to_netobj(&rsii.out_handle, buf, len))
goto out;
- } else {
- rsii.major_status = simple_strtoul(buf, &ep, 10);
- if (*ep)
- goto out;
- len = qword_get(&mesg, buf, mlen);
- if (len <= 0)
- goto out;
- rsii.minor_status = simple_strtoul(buf, &ep, 10);
- if (*ep)
- goto out;
- /* out_handle */
- len = qword_get(&mesg, buf, mlen);
- if (len < 0)
- goto out;
- status = -ENOMEM;
- if (dup_to_netobj(&rsii.out_handle, buf, len))
- goto out;
-
- /* out_token */
- len = qword_get(&mesg, buf, mlen);
- status = -EINVAL;
- if (len < 0)
- goto out;
- status = -ENOMEM;
- if (dup_to_netobj(&rsii.out_token, buf, len))
- goto out;
- }
+ /* out_token */
+ len = qword_get(&mesg, buf, mlen);
+ status = -EINVAL;
+ if (len < 0)
+ goto out;
+ status = -ENOMEM;
+ if (dup_to_netobj(&rsii.out_token, buf, len))
+ goto out;
rsii.h.expiry_time = expiry;
rsip = rsi_update(&rsii, rsip);
status = 0;
@@ -975,6 +971,7 @@
struct kvec *resv = &rqstp->rq_res.head[0];
struct xdr_netobj tmpobj;
struct rsi *rsip, rsikey;
+ int ret;
/* Read the verifier; should be NULL: */
*authp = rpc_autherr_badverf;
@@ -1014,23 +1011,27 @@
/* No upcall result: */
return SVC_DROP;
case 0:
+ ret = SVC_DROP;
/* Got an answer to the upcall; use it: */
if (gss_write_init_verf(rqstp, rsip))
- return SVC_DROP;
+ goto out;
if (resv->iov_len + 4 > PAGE_SIZE)
- return SVC_DROP;
+ goto out;
svc_putnl(resv, RPC_SUCCESS);
if (svc_safe_putnetobj(resv, &rsip->out_handle))
- return SVC_DROP;
+ goto out;
if (resv->iov_len + 3 * 4 > PAGE_SIZE)
- return SVC_DROP;
+ goto out;
svc_putnl(resv, rsip->major_status);
svc_putnl(resv, rsip->minor_status);
svc_putnl(resv, GSS_SEQ_WIN);
if (svc_safe_putnetobj(resv, &rsip->out_token))
- return SVC_DROP;
+ goto out;
}
- return SVC_COMPLETE;
+ ret = SVC_COMPLETE;
+out:
+ cache_put(&rsip->h, &rsi_cache);
+ return ret;
}
/*
@@ -1125,6 +1126,7 @@
case RPC_GSS_PROC_DESTROY:
if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
goto auth_err;
+ rsci->h.expiry_time = get_seconds();
set_bit(CACHE_NEGATIVE, &rsci->h.flags);
if (resv->iov_len + 4 > PAGE_SIZE)
goto drop;
@@ -1386,19 +1388,26 @@
gss_svc_init(void)
{
int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss);
- if (rv == 0) {
- cache_register(&rsc_cache);
- cache_register(&rsi_cache);
- }
+ if (rv)
+ return rv;
+ rv = cache_register(&rsc_cache);
+ if (rv)
+ goto out1;
+ rv = cache_register(&rsi_cache);
+ if (rv)
+ goto out2;
+ return 0;
+out2:
+ cache_unregister(&rsc_cache);
+out1:
+ svc_auth_unregister(RPC_AUTH_GSS);
return rv;
}
void
gss_svc_shutdown(void)
{
- if (cache_unregister(&rsc_cache))
- printk(KERN_ERR "auth_rpcgss: failed to unregister rsc cache\n");
- if (cache_unregister(&rsi_cache))
- printk(KERN_ERR "auth_rpcgss: failed to unregister rsi cache\n");
+ cache_unregister(&rsc_cache);
+ cache_unregister(&rsi_cache);
svc_auth_unregister(RPC_AUTH_GSS);
}
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 73f053d..636c8e0 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -245,6 +245,7 @@
cache_put(h, detail);
return rv;
}
+EXPORT_SYMBOL(cache_check);
/*
* caches need to be periodically cleaned.
@@ -290,44 +291,78 @@
static void do_cache_clean(struct work_struct *work);
static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean);
-void cache_register(struct cache_detail *cd)
+static void remove_cache_proc_entries(struct cache_detail *cd)
{
+ if (cd->proc_ent == NULL)
+ return;
+ if (cd->flush_ent)
+ remove_proc_entry("flush", cd->proc_ent);
+ if (cd->channel_ent)
+ remove_proc_entry("channel", cd->proc_ent);
+ if (cd->content_ent)
+ remove_proc_entry("content", cd->proc_ent);
+ cd->proc_ent = NULL;
+ remove_proc_entry(cd->name, proc_net_rpc);
+}
+
+#ifdef CONFIG_PROC_FS
+static int create_cache_proc_entries(struct cache_detail *cd)
+{
+ struct proc_dir_entry *p;
+
cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc);
- if (cd->proc_ent) {
- struct proc_dir_entry *p;
- cd->proc_ent->owner = cd->owner;
- cd->channel_ent = cd->content_ent = NULL;
+ if (cd->proc_ent == NULL)
+ goto out_nomem;
+ cd->proc_ent->owner = cd->owner;
+ cd->channel_ent = cd->content_ent = NULL;
- p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR,
+ p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, cd->proc_ent);
+ cd->flush_ent = p;
+ if (p == NULL)
+ goto out_nomem;
+ p->proc_fops = &cache_flush_operations;
+ p->owner = cd->owner;
+ p->data = cd;
+
+ if (cd->cache_request || cd->cache_parse) {
+ p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR,
cd->proc_ent);
- cd->flush_ent = p;
- if (p) {
- p->proc_fops = &cache_flush_operations;
- p->owner = cd->owner;
- p->data = cd;
- }
-
- if (cd->cache_request || cd->cache_parse) {
- p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR,
- cd->proc_ent);
- cd->channel_ent = p;
- if (p) {
- p->proc_fops = &cache_file_operations;
- p->owner = cd->owner;
- p->data = cd;
- }
- }
- if (cd->cache_show) {
- p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR,
- cd->proc_ent);
- cd->content_ent = p;
- if (p) {
- p->proc_fops = &content_file_operations;
- p->owner = cd->owner;
- p->data = cd;
- }
- }
+ cd->channel_ent = p;
+ if (p == NULL)
+ goto out_nomem;
+ p->proc_fops = &cache_file_operations;
+ p->owner = cd->owner;
+ p->data = cd;
}
+ if (cd->cache_show) {
+ p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR,
+ cd->proc_ent);
+ cd->content_ent = p;
+ if (p == NULL)
+ goto out_nomem;
+ p->proc_fops = &content_file_operations;
+ p->owner = cd->owner;
+ p->data = cd;
+ }
+ return 0;
+out_nomem:
+ remove_cache_proc_entries(cd);
+ return -ENOMEM;
+}
+#else /* CONFIG_PROC_FS */
+static int create_cache_proc_entries(struct cache_detail *cd)
+{
+ return 0;
+}
+#endif
+
+int cache_register(struct cache_detail *cd)
+{
+ int ret;
+
+ ret = create_cache_proc_entries(cd);
+ if (ret)
+ return ret;
rwlock_init(&cd->hash_lock);
INIT_LIST_HEAD(&cd->queue);
spin_lock(&cache_list_lock);
@@ -341,9 +376,11 @@
/* start the cleaning process */
schedule_delayed_work(&cache_cleaner, 0);
+ return 0;
}
+EXPORT_SYMBOL(cache_register);
-int cache_unregister(struct cache_detail *cd)
+void cache_unregister(struct cache_detail *cd)
{
cache_purge(cd);
spin_lock(&cache_list_lock);
@@ -351,30 +388,23 @@
if (cd->entries || atomic_read(&cd->inuse)) {
write_unlock(&cd->hash_lock);
spin_unlock(&cache_list_lock);
- return -EBUSY;
+ goto out;
}
if (current_detail == cd)
current_detail = NULL;
list_del_init(&cd->others);
write_unlock(&cd->hash_lock);
spin_unlock(&cache_list_lock);
- if (cd->proc_ent) {
- if (cd->flush_ent)
- remove_proc_entry("flush", cd->proc_ent);
- if (cd->channel_ent)
- remove_proc_entry("channel", cd->proc_ent);
- if (cd->content_ent)
- remove_proc_entry("content", cd->proc_ent);
-
- cd->proc_ent = NULL;
- remove_proc_entry(cd->name, proc_net_rpc);
- }
+ remove_cache_proc_entries(cd);
if (list_empty(&cache_list)) {
/* module must be being unloaded so its safe to kill the worker */
cancel_delayed_work_sync(&cache_cleaner);
}
- return 0;
+ return;
+out:
+ printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name);
}
+EXPORT_SYMBOL(cache_unregister);
/* clean cache tries to find something to clean
* and cleans it.
@@ -489,6 +519,7 @@
while (cache_clean() != -1)
cond_resched();
}
+EXPORT_SYMBOL(cache_flush);
void cache_purge(struct cache_detail *detail)
{
@@ -497,7 +528,7 @@
cache_flush();
detail->flush_time = 1;
}
-
+EXPORT_SYMBOL(cache_purge);
/*
@@ -634,13 +665,13 @@
/*
* communicate with user-space
*
- * We have a magic /proc file - /proc/sunrpc/cache
- * On read, you get a full request, or block
- * On write, an update request is processed
- * Poll works if anything to read, and always allows write
+ * We have a magic /proc file - /proc/sunrpc/<cachename>/channel.
+ * On read, you get a full request, or block.
+ * On write, an update request is processed.
+ * Poll works if anything to read, and always allows write.
*
* Implemented by linked list of requests. Each open file has
- * a ->private that also exists in this list. New request are added
+ * a ->private that also exists in this list. New requests are added
* to the end and may wakeup and preceding readers.
* New readers are added to the head. If, on read, an item is found with
* CACHE_UPCALLING clear, we free it from the list.
@@ -963,6 +994,7 @@
*bpp = bp;
*lp = len;
}
+EXPORT_SYMBOL(qword_add);
void qword_addhex(char **bpp, int *lp, char *buf, int blen)
{
@@ -991,6 +1023,7 @@
*bpp = bp;
*lp = len;
}
+EXPORT_SYMBOL(qword_addhex);
static void warn_no_listener(struct cache_detail *detail)
{
@@ -1113,6 +1146,7 @@
*dest = '\0';
return len;
}
+EXPORT_SYMBOL(qword_get);
/*
@@ -1244,18 +1278,18 @@
struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data;
char tbuf[20];
unsigned long p = *ppos;
- int len;
+ size_t len;
sprintf(tbuf, "%lu\n", cd->flush_time);
len = strlen(tbuf);
if (p >= len)
return 0;
len -= p;
- if (len > count) len = count;
+ if (len > count)
+ len = count;
if (copy_to_user(buf, (void*)(tbuf+p), len))
- len = -EFAULT;
- else
- *ppos += len;
+ return -EFAULT;
+ *ppos += len;
return len;
}
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 74df2d3..5a16875 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -33,7 +33,7 @@
static int rpc_proc_show(struct seq_file *seq, void *v) {
const struct rpc_stat *statp = seq->private;
const struct rpc_program *prog = statp->program;
- int i, j;
+ unsigned int i, j;
seq_printf(seq,
"net %u %u %u %u\n",
@@ -81,7 +81,7 @@
const struct svc_program *prog = statp->program;
const struct svc_procedure *proc;
const struct svc_version *vers;
- int i, j;
+ unsigned int i, j;
seq_printf(seq,
"net %u %u %u %u\n",
@@ -106,6 +106,7 @@
seq_putc(seq, '\n');
}
}
+EXPORT_SYMBOL(svc_seq_show);
/**
* rpc_alloc_iostats - allocate an rpc_iostats structure
@@ -255,12 +256,14 @@
{
return do_register(statp->program->pg_name, statp, fops);
}
+EXPORT_SYMBOL(svc_proc_register);
void
svc_proc_unregister(const char *name)
{
remove_proc_entry(name, proc_net_rpc);
}
+EXPORT_SYMBOL(svc_proc_unregister);
void
rpc_proc_init(void)
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 1a7e309..843629f 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -22,48 +22,6 @@
#include <linux/sunrpc/rpc_pipe_fs.h>
#include <linux/sunrpc/xprtsock.h>
-/* RPC server stuff */
-EXPORT_SYMBOL(svc_create);
-EXPORT_SYMBOL(svc_create_thread);
-EXPORT_SYMBOL(svc_create_pooled);
-EXPORT_SYMBOL(svc_set_num_threads);
-EXPORT_SYMBOL(svc_exit_thread);
-EXPORT_SYMBOL(svc_destroy);
-EXPORT_SYMBOL(svc_drop);
-EXPORT_SYMBOL(svc_process);
-EXPORT_SYMBOL(svc_recv);
-EXPORT_SYMBOL(svc_wake_up);
-EXPORT_SYMBOL(svc_makesock);
-EXPORT_SYMBOL(svc_reserve);
-EXPORT_SYMBOL(svc_auth_register);
-EXPORT_SYMBOL(auth_domain_lookup);
-EXPORT_SYMBOL(svc_authenticate);
-EXPORT_SYMBOL(svc_set_client);
-
-/* RPC statistics */
-#ifdef CONFIG_PROC_FS
-EXPORT_SYMBOL(svc_proc_register);
-EXPORT_SYMBOL(svc_proc_unregister);
-EXPORT_SYMBOL(svc_seq_show);
-#endif
-
-/* caching... */
-EXPORT_SYMBOL(auth_domain_find);
-EXPORT_SYMBOL(auth_domain_put);
-EXPORT_SYMBOL(auth_unix_add_addr);
-EXPORT_SYMBOL(auth_unix_forget_old);
-EXPORT_SYMBOL(auth_unix_lookup);
-EXPORT_SYMBOL(cache_check);
-EXPORT_SYMBOL(cache_flush);
-EXPORT_SYMBOL(cache_purge);
-EXPORT_SYMBOL(cache_register);
-EXPORT_SYMBOL(cache_unregister);
-EXPORT_SYMBOL(qword_add);
-EXPORT_SYMBOL(qword_addhex);
-EXPORT_SYMBOL(qword_get);
-EXPORT_SYMBOL(svcauth_unix_purge);
-EXPORT_SYMBOL(unix_domain_find);
-
extern struct cache_detail ip_map_cache, unix_gid_cache;
static int __init
@@ -85,7 +43,8 @@
#endif
cache_register(&ip_map_cache);
cache_register(&unix_gid_cache);
- init_socket_xprt();
+ svc_init_xprt_sock(); /* svc sock transport */
+ init_socket_xprt(); /* clnt sock transport */
rpcauth_init_module();
out:
return err;
@@ -96,12 +55,11 @@
{
rpcauth_remove_module();
cleanup_socket_xprt();
+ svc_cleanup_xprt_sock();
unregister_rpc_pipefs();
rpc_destroy_mempool();
- if (cache_unregister(&ip_map_cache))
- printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n");
- if (cache_unregister(&unix_gid_cache))
- printk(KERN_ERR "sunrpc: failed to unregister unix_gid cache\n");
+ cache_unregister(&ip_map_cache);
+ cache_unregister(&unix_gid_cache);
#ifdef RPC_DEBUG
rpc_unregister_sysctl();
#endif
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 4ad5fbb..a290e15 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -364,7 +364,7 @@
void (*shutdown)(struct svc_serv *serv))
{
struct svc_serv *serv;
- int vers;
+ unsigned int vers;
unsigned int xdrsize;
unsigned int i;
@@ -433,6 +433,7 @@
{
return __svc_create(prog, bufsize, /*npools*/1, shutdown);
}
+EXPORT_SYMBOL(svc_create);
struct svc_serv *
svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
@@ -452,6 +453,7 @@
return serv;
}
+EXPORT_SYMBOL(svc_create_pooled);
/*
* Destroy an RPC service. Should be called with the BKL held
@@ -459,9 +461,6 @@
void
svc_destroy(struct svc_serv *serv)
{
- struct svc_sock *svsk;
- struct svc_sock *tmp;
-
dprintk("svc: svc_destroy(%s, %d)\n",
serv->sv_program->pg_name,
serv->sv_nrthreads);
@@ -476,14 +475,12 @@
del_timer_sync(&serv->sv_temptimer);
- list_for_each_entry_safe(svsk, tmp, &serv->sv_tempsocks, sk_list)
- svc_force_close_socket(svsk);
+ svc_close_all(&serv->sv_tempsocks);
if (serv->sv_shutdown)
serv->sv_shutdown(serv);
- list_for_each_entry_safe(svsk, tmp, &serv->sv_permsocks, sk_list)
- svc_force_close_socket(svsk);
+ svc_close_all(&serv->sv_permsocks);
BUG_ON(!list_empty(&serv->sv_permsocks));
BUG_ON(!list_empty(&serv->sv_tempsocks));
@@ -498,6 +495,7 @@
kfree(serv->sv_pools);
kfree(serv);
}
+EXPORT_SYMBOL(svc_destroy);
/*
* Allocate an RPC server's buffer space.
@@ -536,6 +534,44 @@
put_page(rqstp->rq_pages[i]);
}
+struct svc_rqst *
+svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool)
+{
+ struct svc_rqst *rqstp;
+
+ rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
+ if (!rqstp)
+ goto out_enomem;
+
+ init_waitqueue_head(&rqstp->rq_wait);
+
+ serv->sv_nrthreads++;
+ spin_lock_bh(&pool->sp_lock);
+ pool->sp_nrthreads++;
+ list_add(&rqstp->rq_all, &pool->sp_all_threads);
+ spin_unlock_bh(&pool->sp_lock);
+ rqstp->rq_server = serv;
+ rqstp->rq_pool = pool;
+
+ rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
+ if (!rqstp->rq_argp)
+ goto out_thread;
+
+ rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL);
+ if (!rqstp->rq_resp)
+ goto out_thread;
+
+ if (!svc_init_buffer(rqstp, serv->sv_max_mesg))
+ goto out_thread;
+
+ return rqstp;
+out_thread:
+ svc_exit_thread(rqstp);
+out_enomem:
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(svc_prepare_thread);
+
/*
* Create a thread in the given pool. Caller must hold BKL.
* On a NUMA or SMP machine, with a multi-pool serv, the thread
@@ -550,24 +586,11 @@
int have_oldmask = 0;
cpumask_t oldmask;
- rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
- if (!rqstp)
+ rqstp = svc_prepare_thread(serv, pool);
+ if (IS_ERR(rqstp)) {
+ error = PTR_ERR(rqstp);
goto out;
-
- init_waitqueue_head(&rqstp->rq_wait);
-
- if (!(rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL))
- || !(rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL))
- || !svc_init_buffer(rqstp, serv->sv_max_mesg))
- goto out_thread;
-
- serv->sv_nrthreads++;
- spin_lock_bh(&pool->sp_lock);
- pool->sp_nrthreads++;
- list_add(&rqstp->rq_all, &pool->sp_all_threads);
- spin_unlock_bh(&pool->sp_lock);
- rqstp->rq_server = serv;
- rqstp->rq_pool = pool;
+ }
if (serv->sv_nrpools > 1)
have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
@@ -597,6 +620,7 @@
{
return __svc_create_thread(func, serv, &serv->sv_pools[0]);
}
+EXPORT_SYMBOL(svc_create_thread);
/*
* Choose a pool in which to create a new thread, for svc_set_num_threads
@@ -700,6 +724,7 @@
return error;
}
+EXPORT_SYMBOL(svc_set_num_threads);
/*
* Called from a server thread as it's exiting. Caller must hold BKL.
@@ -726,6 +751,7 @@
if (serv)
svc_destroy(serv);
}
+EXPORT_SYMBOL(svc_exit_thread);
/*
* Register an RPC service with the local portmapper.
@@ -737,7 +763,8 @@
{
struct svc_program *progp;
unsigned long flags;
- int i, error = 0, dummy;
+ unsigned int i;
+ int error = 0, dummy;
if (!port)
clear_thread_flag(TIF_SIGPENDING);
@@ -840,9 +867,9 @@
rqstp->rq_res.tail[0].iov_len = 0;
/* Will be turned off only in gss privacy case: */
rqstp->rq_splice_ok = 1;
- /* tcp needs a space for the record length... */
- if (rqstp->rq_prot == IPPROTO_TCP)
- svc_putnl(resv, 0);
+
+ /* Setup reply header */
+ rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp);
rqstp->rq_xid = svc_getu32(argv);
svc_putu32(resv, rqstp->rq_xid);
@@ -1049,16 +1076,15 @@
svc_putnl(resv, ntohl(rpc_stat));
goto sendit;
}
+EXPORT_SYMBOL(svc_process);
/*
* Return (transport-specific) limit on the rpc payload.
*/
u32 svc_max_payload(const struct svc_rqst *rqstp)
{
- int max = RPCSVC_MAXPAYLOAD_TCP;
+ u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload;
- if (rqstp->rq_sock->sk_sock->type == SOCK_DGRAM)
- max = RPCSVC_MAXPAYLOAD_UDP;
if (rqstp->rq_server->sv_max_payload < max)
max = rqstp->rq_server->sv_max_payload;
return max;
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
new file mode 100644
index 0000000..ea377e0
--- /dev/null
+++ b/net/sunrpc/svc_xprt.c
@@ -0,0 +1,1055 @@
+/*
+ * linux/net/sunrpc/svc_xprt.c
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <linux/freezer.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/tcp_states.h>
+#include <linux/uaccess.h>
+#include <asm/ioctls.h>
+
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/svc_xprt.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt);
+static int svc_deferred_recv(struct svc_rqst *rqstp);
+static struct cache_deferred_req *svc_defer(struct cache_req *req);
+static void svc_age_temp_xprts(unsigned long closure);
+
+/* apparently the "standard" is that clients close
+ * idle connections after 5 minutes, servers after
+ * 6 minutes
+ * http://www.connectathon.org/talks96/nfstcp.pdf
+ */
+static int svc_conn_age_period = 6*60;
+
+/* List of registered transport classes */
+static DEFINE_SPINLOCK(svc_xprt_class_lock);
+static LIST_HEAD(svc_xprt_class_list);
+
+/* SMP locking strategy:
+ *
+ * svc_pool->sp_lock protects most of the fields of that pool.
+ * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
+ * when both need to be taken (rare), svc_serv->sv_lock is first.
+ * BKL protects svc_serv->sv_nrthreads.
+ * svc_xprt->xpt_lock protects the svc_xprt->xpt_deferred list
+ * and the ->xpt_auth_cache cache.
+ *
+ * The XPT_BUSY bit in xprt->xpt_flags prevents a transport being
+ * enqueued multiply. During normal transport processing this bit
+ * is set by svc_xprt_enqueue and cleared by svc_xprt_received.
+ * Providers should not manipulate this bit directly.
+ *
+ * Some flags can be set to certain values at any time
+ * providing that certain rules are followed:
+ *
+ * XPT_CONN, XPT_DATA:
+ * - Can be set or cleared at any time.
+ * - After a set, svc_xprt_enqueue must be called to enqueue
+ * the transport for processing.
+ * - After a clear, the transport must be read/accepted.
+ * If this succeeds, it must be set again.
+ * XPT_CLOSE:
+ * - Can be set at any time. It is never cleared.
+ * XPT_DEAD:
+ * - Can only be set while XPT_BUSY is held which ensures
+ * that no other thread will be using the transport or will
+ * try to set XPT_DEAD.
+ */
+
+/*
+ * Add @xcl to the list of registered transport classes.
+ *
+ * Returns 0 on success, or -EEXIST when a class with the same xcl_name
+ * is already on svc_xprt_class_list.
+ */
+int svc_reg_xprt_class(struct svc_xprt_class *xcl)
+{
+	struct svc_xprt_class *entry;
+	int ret = 0;
+
+	dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name);
+
+	INIT_LIST_HEAD(&xcl->xcl_list);
+	spin_lock(&svc_xprt_class_lock);
+	/* Refuse to register two classes under the same name */
+	list_for_each_entry(entry, &svc_xprt_class_list, xcl_list) {
+		if (!strcmp(xcl->xcl_name, entry->xcl_name)) {
+			ret = -EEXIST;
+			break;
+		}
+	}
+	if (ret == 0)
+		list_add_tail(&xcl->xcl_list, &svc_xprt_class_list);
+	spin_unlock(&svc_xprt_class_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(svc_reg_xprt_class);
+
+/*
+ * Remove @xcl from the list of registered transport classes.
+ * The removal is done under svc_xprt_class_lock.
+ */
+void svc_unreg_xprt_class(struct svc_xprt_class *xcl)
+{
+ dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name);
+ spin_lock(&svc_xprt_class_lock);
+ list_del_init(&xcl->xcl_list);
+ spin_unlock(&svc_xprt_class_lock);
+}
+EXPORT_SYMBOL_GPL(svc_unreg_xprt_class);
+
+/*
+ * Format the list of registered transport classes for printing.
+ *
+ * Writes one "<name> <max payload>\n" line per registered class into
+ * @buf, which is @maxlen bytes long, and leaves @buf NUL-terminated.
+ * Formatting stops at the first entry that would not fit.
+ *
+ * Returns the number of characters stored in @buf, not counting the
+ * terminating NUL.
+ */
+int svc_print_xprts(char *buf, int maxlen)
+{
+	struct list_head *le;
+	char tmpstr[80];
+	int len = 0;
+	buf[0] = '\0';
+
+	spin_lock(&svc_xprt_class_lock);
+	list_for_each(le, &svc_xprt_class_list) {
+		int slen;
+		struct svc_xprt_class *xcl =
+			list_entry(le, struct svc_xprt_class, xcl_list);
+
+		/* Bounded format: the class name must not overrun tmpstr */
+		slen = snprintf(tmpstr, sizeof(tmpstr), "%s %d\n",
+				xcl->xcl_name, xcl->xcl_max_payload);
+		/*
+		 * Stop if the entry was truncated, or if the entry plus
+		 * its terminating NUL would not fit in what is left of buf.
+		 */
+		if (slen >= (int)sizeof(tmpstr) || len + slen >= maxlen)
+			break;
+		strcpy(buf + len, tmpstr);
+		len += slen;
+	}
+	spin_unlock(&svc_xprt_class_lock);
+
+	return len;
+}
+
+/*
+ * kref release function for a transport (passed to kref_put() by
+ * svc_xprt_put()).  Releases the cached auth entry if the transport has
+ * one, hands the transport to the provider's xpo_free method and then
+ * drops the reference on the module that owns the transport class.
+ */
+static void svc_xprt_free(struct kref *kref)
+{
+ struct svc_xprt *xprt =
+ container_of(kref, struct svc_xprt, xpt_ref);
+ /* Fetch the owner now; xprt is not touched again after xpo_free() */
+ struct module *owner = xprt->xpt_class->xcl_owner;
+ if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)
+ && xprt->xpt_auth_cache != NULL)
+ svcauth_unix_info_release(xprt->xpt_auth_cache);
+ xprt->xpt_ops->xpo_free(xprt);
+ module_put(owner);
+}
+
+/*
+ * Drop a reference on @xprt.  svc_xprt_free() is the kref release
+ * function.
+ */
+void svc_xprt_put(struct svc_xprt *xprt)
+{
+ kref_put(&xprt->xpt_ref, svc_xprt_free);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_put);
+
+/*
+ * Called by transport drivers to initialize the transport independent
+ * portion of the transport instance.
+ *
+ * @xprt is zeroed first, so every field not assigned below starts out
+ * as zero.  The class, its ops and @serv are recorded, the reference
+ * count, list heads, mutex and spinlock are initialized, and the
+ * transport is left with XPT_BUSY set.
+ */
+void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt,
+ struct svc_serv *serv)
+{
+ memset(xprt, 0, sizeof(*xprt));
+ xprt->xpt_class = xcl;
+ xprt->xpt_ops = xcl->xcl_ops;
+ kref_init(&xprt->xpt_ref);
+ xprt->xpt_server = serv;
+ INIT_LIST_HEAD(&xprt->xpt_list);
+ INIT_LIST_HEAD(&xprt->xpt_ready);
+ INIT_LIST_HEAD(&xprt->xpt_deferred);
+ mutex_init(&xprt->xpt_mutex);
+ spin_lock_init(&xprt->xpt_lock);
+ /* Starts out busy; svc_create_xprt()/svc_xprt_received() clear it */
+ set_bit(XPT_BUSY, &xprt->xpt_flags);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_init);
+
+/*
+ * Create a transport of the class named @xprt_name for @serv, using an
+ * AF_INET / INADDR_ANY address with the given @port, and add it to
+ * serv->sv_permsocks.  @flags is passed through to the class's
+ * xpo_create method.
+ *
+ * Returns the value of svc_xprt_local_port() for the new transport on
+ * success, the error encoded in the pointer returned by xpo_create if
+ * creation fails, or -ENOENT if no class of that name is registered or
+ * a reference on the class's owner module could not be taken.
+ */
+int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
+ int flags)
+{
+ struct svc_xprt_class *xcl;
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = INADDR_ANY,
+ .sin_port = htons(port),
+ };
+ dprintk("svc: creating transport %s[%d]\n", xprt_name, port);
+ spin_lock(&svc_xprt_class_lock);
+ list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
+ struct svc_xprt *newxprt;
+
+ if (strcmp(xprt_name, xcl->xcl_name))
+ continue;
+
+ if (!try_module_get(xcl->xcl_owner))
+ goto err;
+
+ /*
+ * xpo_create is called without the class lock; the module
+ * reference taken above is held across the call.
+ */
+ spin_unlock(&svc_xprt_class_lock);
+ newxprt = xcl->xcl_ops->
+ xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin),
+ flags);
+ if (IS_ERR(newxprt)) {
+ module_put(xcl->xcl_owner);
+ return PTR_ERR(newxprt);
+ }
+
+ /* Permanent transport: not XPT_TEMP, lives on sv_permsocks */
+ clear_bit(XPT_TEMP, &newxprt->xpt_flags);
+ spin_lock_bh(&serv->sv_lock);
+ list_add(&newxprt->xpt_list, &serv->sv_permsocks);
+ spin_unlock_bh(&serv->sv_lock);
+ /* Release the XPT_BUSY bit set by svc_xprt_init() */
+ clear_bit(XPT_BUSY, &newxprt->xpt_flags);
+ return svc_xprt_local_port(newxprt);
+ }
+ err:
+ spin_unlock(&svc_xprt_class_lock);
+ dprintk("svc: transport %s not found\n", xprt_name);
+ return -ENOENT;
+}
+EXPORT_SYMBOL_GPL(svc_create_xprt);
+
+/*
+ * Record the transport's addresses in the request: the remote address
+ * (and its length) goes to rq_addr/rq_addrlen, the local address to
+ * rq_daddr.
+ */
+void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt)
+{
+	struct sockaddr *local = (struct sockaddr *)&xprt->xpt_local;
+
+	memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen);
+	rqstp->rq_addrlen = xprt->xpt_remotelen;
+
+	/*
+	 * The destination address of the request is kept so that the
+	 * source address of RPC replies/callbacks can be bound later.
+	 * Address families other than IPv4/IPv6 leave rq_daddr untouched.
+	 */
+	if (local->sa_family == AF_INET)
+		rqstp->rq_daddr.addr =
+			((struct sockaddr_in *)local)->sin_addr;
+	else if (local->sa_family == AF_INET6)
+		rqstp->rq_daddr.addr6 =
+			((struct sockaddr_in6 *)local)->sin6_addr;
+}
+EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs);
+
+/**
+ * svc_print_addr - Format rq_addr field for printing
+ * @rqstp: svc_rqst struct containing address to print
+ * @buf: target buffer for formatted address
+ * @len: length of target buffer
+ *
+ * Passes svc_addr(@rqstp), @buf and @len to __svc_print_addr() and
+ * returns that function's result.
+ */
+char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
+{
+ return __svc_print_addr(svc_addr(rqstp), buf, len);
+}
+EXPORT_SYMBOL_GPL(svc_print_addr);
+
+/*
+ * Queue up an idle server thread. Must have pool->sp_lock held.
+ * Note: this is really a stack rather than a queue, so that we only
+ * use as many different threads as we need, and the rest don't pollute
+ * the cache.
+ *
+ * The thread is linked into pool->sp_threads through rqstp->rq_list.
+ */
+static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
+{
+ list_add(&rqstp->rq_list, &pool->sp_threads);
+}
+
+/*
+ * Dequeue an nfsd thread. Must have pool->sp_lock held.
+ *
+ * Unlinks rqstp->rq_list from whatever list it is on; @pool itself is
+ * not referenced here.
+ */
+static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
+{
+ list_del(&rqstp->rq_list);
+}
+
+/*
+ * Queue up a transport with data pending. If there are idle nfsd
+ * processes, wake 'em up.
+ *
+ * Nothing is done unless at least one of XPT_CONN, XPT_DATA, XPT_CLOSE
+ * or XPT_DEFERRED is set, and nothing is done for a transport marked
+ * XPT_DEAD or one that already has XPT_BUSY set.  Otherwise XPT_BUSY is
+ * claimed and the transport is either handed to the first idle thread
+ * of the pool selected for the current CPU, or appended to that pool's
+ * sp_sockets list.  When neither XPT_CONN nor XPT_CLOSE is set and
+ * xpo_has_wspace() reports no space, XPT_BUSY is released again and the
+ * transport is not queued.
+ */
+void svc_xprt_enqueue(struct svc_xprt *xprt)
+{
+ struct svc_serv *serv = xprt->xpt_server;
+ struct svc_pool *pool;
+ struct svc_rqst *rqstp;
+ int cpu;
+
+ /* Unlocked early exits; XPT_DEAD is tested again under sp_lock */
+ if (!(xprt->xpt_flags &
+ ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED))))
+ return;
+ if (test_bit(XPT_DEAD, &xprt->xpt_flags))
+ return;
+
+ /* Pick the pool for the CPU we are currently running on */
+ cpu = get_cpu();
+ pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
+ put_cpu();
+
+ spin_lock_bh(&pool->sp_lock);
+
+ /* Sanity check only: the condition is reported, not corrected */
+ if (!list_empty(&pool->sp_threads) &&
+ !list_empty(&pool->sp_sockets))
+ printk(KERN_ERR
+ "svc_xprt_enqueue: "
+ "threads and transports both waiting??\n");
+
+ if (test_bit(XPT_DEAD, &xprt->xpt_flags)) {
+ /* Don't enqueue dead transports */
+ dprintk("svc: transport %p is dead, not enqueued\n", xprt);
+ goto out_unlock;
+ }
+
+ /* Mark transport as busy. It will remain in this state until
+ * the provider calls svc_xprt_received. We update XPT_BUSY
+ * atomically because it also guards against trying to enqueue
+ * the transport twice.
+ */
+ if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) {
+ /* Don't enqueue transport while already enqueued */
+ dprintk("svc: transport %p busy, not enqueued\n", xprt);
+ goto out_unlock;
+ }
+ BUG_ON(xprt->xpt_pool != NULL);
+ xprt->xpt_pool = pool;
+
+ /* Handle pending connection */
+ if (test_bit(XPT_CONN, &xprt->xpt_flags))
+ goto process;
+
+ /* Handle close in-progress */
+ if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
+ goto process;
+
+ /* Check if we have space to reply to a request */
+ if (!xprt->xpt_ops->xpo_has_wspace(xprt)) {
+ /* Don't enqueue while not enough space for reply */
+ dprintk("svc: no write space, transport %p not enqueued\n",
+ xprt);
+ /* Undo the claim made above before backing out */
+ xprt->xpt_pool = NULL;
+ clear_bit(XPT_BUSY, &xprt->xpt_flags);
+ goto out_unlock;
+ }
+
+ process:
+ if (!list_empty(&pool->sp_threads)) {
+ /* An idle thread exists: hand the transport to it directly */
+ rqstp = list_entry(pool->sp_threads.next,
+ struct svc_rqst,
+ rq_list);
+ dprintk("svc: transport %p served by daemon %p\n",
+ xprt, rqstp);
+ svc_thread_dequeue(pool, rqstp);
+ if (rqstp->rq_xprt)
+ printk(KERN_ERR
+ "svc_xprt_enqueue: server %p, rq_xprt=%p!\n",
+ rqstp, rqstp->rq_xprt);
+ rqstp->rq_xprt = xprt;
+ /* The thread holds its own reference on the transport */
+ svc_xprt_get(xprt);
+ /* Reserve room for a maximum-sized reply on this transport */
+ rqstp->rq_reserved = serv->sv_max_mesg;
+ atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
+ BUG_ON(xprt->xpt_pool != pool);
+ wake_up(&rqstp->rq_wait);
+ } else {
+ /* No idle thread: leave it for the next svc_recv() caller */
+ dprintk("svc: transport %p put into queue\n", xprt);
+ list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
+ BUG_ON(xprt->xpt_pool != pool);
+ }
+
+out_unlock:
+ spin_unlock_bh(&pool->sp_lock);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
+
+/*
+ * Take the transport at the head of pool->sp_sockets off the list and
+ * return it, or return NULL when the list is empty.  The caller must
+ * hold pool->sp_lock.
+ */
+static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
+{
+	struct svc_xprt *first = NULL;
+
+	if (!list_empty(&pool->sp_sockets)) {
+		first = list_entry(pool->sp_sockets.next,
+				   struct svc_xprt, xpt_ready);
+		list_del_init(&first->xpt_ready);
+
+		dprintk("svc: transport %p dequeued, inuse=%d\n",
+			first, atomic_read(&first->xpt_ref.refcount));
+	}
+
+	return first;
+}
+
+/*
+ * svc_xprt_received conditionally queues the transport for processing
+ * by another thread. The caller must hold the XPT_BUSY bit and must
+ * not thereafter touch transport data.
+ *
+ * Note: XPT_DATA only gets cleared when a read-attempt finds no (or
+ * insufficient) data.
+ */
+void svc_xprt_received(struct svc_xprt *xprt)
+{
+ BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags));
+ /*
+ * svc_xprt_enqueue() does BUG_ON(xpt_pool != NULL) right after it
+ * claims XPT_BUSY, so xpt_pool is reset before the bit is dropped.
+ */
+ xprt->xpt_pool = NULL;
+ clear_bit(XPT_BUSY, &xprt->xpt_flags);
+ svc_xprt_enqueue(xprt);
+}
+EXPORT_SYMBOL_GPL(svc_xprt_received);
+
+/**
+ * svc_reserve - change the space reserved for the reply to a request.
+ * @rqstp: The request in question
+ * @space: new max space to reserve
+ *
+ * Every request holds a reservation on the output queue of its
+ * transport so that the reply is sure to fit.  This function lowers the
+ * reservation to the space already used in the reply head plus @space.
+ * A request for more than is currently reserved is ignored.
+ */
+void svc_reserve(struct svc_rqst *rqstp, int space)
+{
+	struct svc_xprt *xprt;
+
+	space += rqstp->rq_res.head[0].iov_len;
+
+	/* The reservation can only shrink */
+	if (space >= rqstp->rq_reserved)
+		return;
+
+	xprt = rqstp->rq_xprt;
+	atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved);
+	rqstp->rq_reserved = space;
+
+	/* Re-evaluate the transport now that less space is reserved */
+	svc_xprt_enqueue(xprt);
+}
+EXPORT_SYMBOL(svc_reserve);
+
+/*
+ * Finish with the transport attached to @rqstp: call the provider's
+ * xpo_release_rqst method, free the response pages, give back the
+ * request's reply-space reservation, detach the transport from @rqstp
+ * and drop a reference on the transport.
+ */
+static void svc_xprt_release(struct svc_rqst *rqstp)
+{
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+
+ rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
+
+ svc_free_res_pages(rqstp);
+ rqstp->rq_res.page_len = 0;
+ rqstp->rq_res.page_base = 0;
+
+ /* Reset response buffer and release
+ * the reservation.
+ * But first, check that enough space was reserved
+ * for the reply, otherwise we have a bug!
+ */
+ if ((rqstp->rq_res.len) > rqstp->rq_reserved)
+ printk(KERN_ERR "RPC request reserved %d but used %d\n",
+ rqstp->rq_reserved,
+ rqstp->rq_res.len);
+
+ rqstp->rq_res.head[0].iov_len = 0;
+ /* With the head length zeroed, this asks for a reservation of 0 */
+ svc_reserve(rqstp, 0);
+ rqstp->rq_xprt = NULL;
+
+ svc_xprt_put(xprt);
+}
+
+/*
+ * External function to wake up a server waiting for data
+ * This really only makes sense for services like lockd
+ * which have exactly one thread anyway.
+ *
+ * For every pool of @serv, the first thread on sp_threads (if any) is
+ * woken.  The thread stays on the idle list and no transport is
+ * assigned to it.
+ */
+void svc_wake_up(struct svc_serv *serv)
+{
+ struct svc_rqst *rqstp;
+ unsigned int i;
+ struct svc_pool *pool;
+
+ for (i = 0; i < serv->sv_nrpools; i++) {
+ pool = &serv->sv_pools[i];
+
+ spin_lock_bh(&pool->sp_lock);
+ if (!list_empty(&pool->sp_threads)) {
+ rqstp = list_entry(pool->sp_threads.next,
+ struct svc_rqst,
+ rq_list);
+ dprintk("svc: daemon %p woken up.\n", rqstp);
+ /*
+ svc_thread_dequeue(pool, rqstp);
+ rqstp->rq_xprt = NULL;
+ */
+ wake_up(&rqstp->rq_wait);
+ }
+ spin_unlock_bh(&pool->sp_lock);
+ }
+}
+EXPORT_SYMBOL(svc_wake_up);
+
+/*
+ * Return non-zero if the port number stored in @sin is below PROT_SOCK.
+ * Only AF_INET and AF_INET6 addresses are understood; any other address
+ * family yields 0.
+ */
+int svc_port_is_privileged(struct sockaddr *sin)
+{
+	unsigned short port;
+
+	switch (sin->sa_family) {
+	case AF_INET:
+		port = ntohs(((struct sockaddr_in *)sin)->sin_port);
+		break;
+	case AF_INET6:
+		port = ntohs(((struct sockaddr_in6 *)sin)->sin6_port);
+		break;
+	default:
+		return 0;
+	}
+	return port < PROT_SOCK;
+}
+
+/*
+ * Keep the number of temporary connections bounded.  Once sv_tmpcnt
+ * exceeds (sv_nrthreads + 3) * 20, one connection is marked XPT_CLOSE
+ * and queued so that a server thread closes it.
+ *
+ * There's no point in trying to do random drop here for DoS
+ * prevention.  An NFS client does 1 reconnect in 15 seconds; an
+ * attacker can easily beat that.
+ *
+ * The only somewhat efficient mechanism would be to drop old
+ * connections from the same IP first.  But right now we don't even
+ * record the client IP in svc_sock.
+ */
+static void svc_check_conn_limits(struct svc_serv *serv)
+{
+	struct svc_xprt *victim = NULL;
+
+	if (serv->sv_tmpcnt <= (serv->sv_nrthreads+3)*20)
+		return;
+
+	spin_lock_bh(&serv->sv_lock);
+	if (!list_empty(&serv->sv_tempsocks)) {
+		/* Try to help the admin */
+		if (net_ratelimit())
+			printk(KERN_NOTICE "%s: too many open "
+			       "connections, consider increasing the "
+			       "number of nfsd threads\n",
+			       serv->sv_name);
+		/*
+		 * Always select the oldest connection, i.e. the tail of
+		 * sv_tempsocks.  It's not fair, but so is life.
+		 */
+		victim = list_entry(serv->sv_tempsocks.prev,
+				    struct svc_xprt,
+				    xpt_list);
+		set_bit(XPT_CLOSE, &victim->xpt_flags);
+		/* Hold a reference across the unlocked enqueue below */
+		svc_xprt_get(victim);
+	}
+	spin_unlock_bh(&serv->sv_lock);
+
+	if (victim == NULL)
+		return;
+
+	svc_xprt_enqueue(victim);
+	svc_xprt_put(victim);
+}
+
+/*
+ * Receive the next request on any transport.  This code is carefully
+ * organised not to touch any cachelines in the shared svc_serv
+ * structure, only cachelines in the local svc_pool.
+ *
+ * The request's page array is (re)filled first.  Then a ready transport
+ * is taken from the pool, or the thread sleeps for up to @timeout
+ * (passed to schedule_timeout()) waiting for svc_xprt_enqueue() to hand
+ * it one.
+ *
+ * Returns the length produced by the transport's xpo_recvfrom method or
+ * by svc_deferred_recv(), -EINTR if the thread was signalled, or
+ * -EAGAIN when there is nothing to process: the wait ended without a
+ * transport, a close or an accept was handled, or the receive returned
+ * 0 or -EAGAIN.
+ */
+int svc_recv(struct svc_rqst *rqstp, long timeout)
+{
+	struct svc_xprt *xprt = NULL;
+	struct svc_serv *serv = rqstp->rq_server;
+	struct svc_pool *pool = rqstp->rq_pool;
+	int len, i;
+	int pages;
+	struct xdr_buf *arg;
+	DECLARE_WAITQUEUE(wait, current);
+
+	dprintk("svc: server %p waiting for data (to = %ld)\n",
+		rqstp, timeout);
+
+	if (rqstp->rq_xprt)
+		printk(KERN_ERR
+			"svc_recv: service %p, transport not NULL!\n",
+			 rqstp);
+	if (waitqueue_active(&rqstp->rq_wait))
+		printk(KERN_ERR
+			"svc_recv: service %p, wait queue active!\n",
+			 rqstp);
+
+	/* now allocate needed pages.  If we get a failure, sleep briefly */
+	pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
+	/* Check the bound before rq_pages[0..pages] is written below */
+	BUG_ON(pages >= RPCSVC_MAXPAGES);
+	for (i = 0; i < pages ; i++)
+		while (rqstp->rq_pages[i] == NULL) {
+			struct page *p = alloc_page(GFP_KERNEL);
+			if (!p) {
+				int j = msecs_to_jiffies(500);
+				schedule_timeout_uninterruptible(j);
+			}
+			rqstp->rq_pages[i] = p;
+		}
+	rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
+
+	/* Make arg->head point to first page and arg->pages point to rest */
+	arg = &rqstp->rq_arg;
+	arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
+	arg->head[0].iov_len = PAGE_SIZE;
+	arg->pages = rqstp->rq_pages + 1;
+	arg->page_base = 0;
+	/* save at least one page for response */
+	arg->page_len = (pages-2)*PAGE_SIZE;
+	arg->len = (pages-1)*PAGE_SIZE;
+	arg->tail[0].iov_len = 0;
+
+	try_to_freeze();
+	cond_resched();
+	if (signalled())
+		return -EINTR;
+
+	spin_lock_bh(&pool->sp_lock);
+	xprt = svc_xprt_dequeue(pool);
+	if (xprt) {
+		/* A transport was already queued: take it, with a reference */
+		rqstp->rq_xprt = xprt;
+		svc_xprt_get(xprt);
+		rqstp->rq_reserved = serv->sv_max_mesg;
+		atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
+	} else {
+		/* No data pending. Go to sleep */
+		svc_thread_enqueue(pool, rqstp);
+
+		/*
+		 * We have to be able to interrupt this wait
+		 * to bring down the daemons ...
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&rqstp->rq_wait, &wait);
+		spin_unlock_bh(&pool->sp_lock);
+
+		schedule_timeout(timeout);
+
+		try_to_freeze();
+
+		spin_lock_bh(&pool->sp_lock);
+		remove_wait_queue(&rqstp->rq_wait, &wait);
+
+		/* svc_xprt_enqueue() sets rq_xprt when it picks this thread */
+		xprt = rqstp->rq_xprt;
+		if (!xprt) {
+			svc_thread_dequeue(pool, rqstp);
+			spin_unlock_bh(&pool->sp_lock);
+			dprintk("svc: server %p, no data yet\n", rqstp);
+			return signalled()? -EINTR : -EAGAIN;
+		}
+	}
+	spin_unlock_bh(&pool->sp_lock);
+
+	len = 0;
+	if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) {
+		dprintk("svc_recv: found XPT_CLOSE\n");
+		svc_delete_xprt(xprt);
+	} else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) {
+		struct svc_xprt *newxpt;
+		newxpt = xprt->xpt_ops->xpo_accept(xprt);
+		if (newxpt) {
+			/*
+			 * We know this module_get will succeed because the
+			 * listener holds a reference too
+			 */
+			__module_get(newxpt->xpt_class->xcl_owner);
+			svc_check_conn_limits(xprt->xpt_server);
+			spin_lock_bh(&serv->sv_lock);
+			set_bit(XPT_TEMP, &newxpt->xpt_flags);
+			list_add(&newxpt->xpt_list, &serv->sv_tempsocks);
+			serv->sv_tmpcnt++;
+			if (serv->sv_temptimer.function == NULL) {
+				/* setup timer to age temp transports */
+				setup_timer(&serv->sv_temptimer,
+					    svc_age_temp_xprts,
+					    (unsigned long)serv);
+				mod_timer(&serv->sv_temptimer,
+					  jiffies + svc_conn_age_period * HZ);
+			}
+			spin_unlock_bh(&serv->sv_lock);
+			svc_xprt_received(newxpt);
+		}
+		svc_xprt_received(xprt);
+	} else {
+		dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n",
+			rqstp, pool->sp_id, xprt,
+			atomic_read(&xprt->xpt_ref.refcount));
+		/* Deferred requests are replayed before new data is read */
+		rqstp->rq_deferred = svc_deferred_dequeue(xprt);
+		if (rqstp->rq_deferred) {
+			svc_xprt_received(xprt);
+			len = svc_deferred_recv(rqstp);
+		} else
+			len = xprt->xpt_ops->xpo_recvfrom(rqstp);
+		dprintk("svc: got len=%d\n", len);
+	}
+
+	/* No data, incomplete (TCP) read, or accept() */
+	if (len == 0 || len == -EAGAIN) {
+		rqstp->rq_res.len = 0;
+		svc_xprt_release(rqstp);
+		return -EAGAIN;
+	}
+	/* The transport was used: unmark it for svc_age_temp_xprts() */
+	clear_bit(XPT_OLD, &xprt->xpt_flags);
+
+	rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
+	rqstp->rq_chandle.defer = svc_defer;
+
+	if (serv->sv_stats)
+		serv->sv_stats->netcnt++;
+	return len;
+}
+EXPORT_SYMBOL(svc_recv);
+
+/*
+ * Drop request: no reply is sent; the transport attached to @rqstp is
+ * simply released via svc_xprt_release().
+ */
+void svc_drop(struct svc_rqst *rqstp)
+{
+ dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
+ svc_xprt_release(rqstp);
+}
+EXPORT_SYMBOL(svc_drop);
+
+/*
+ * Return reply to client.
+ *
+ * Returns -EFAULT if no transport is attached to @rqstp.  Otherwise the
+ * reply is passed to the transport's xpo_sendto method under xpt_mutex
+ * (skipped with -ENOTCONN if the transport is XPT_DEAD) and the
+ * transport is released.  -ECONNREFUSED, -ENOTCONN and -EAGAIN are
+ * reported to the caller as 0; any other result is returned unchanged.
+ */
+int svc_send(struct svc_rqst *rqstp)
+{
+ struct svc_xprt *xprt;
+ int len;
+ struct xdr_buf *xb;
+
+ xprt = rqstp->rq_xprt;
+ if (!xprt)
+ return -EFAULT;
+
+ /* release the receive skb before sending the reply */
+ rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
+
+ /* calculate over-all length */
+ xb = &rqstp->rq_res;
+ xb->len = xb->head[0].iov_len +
+ xb->page_len +
+ xb->tail[0].iov_len;
+
+ /* Grab mutex to serialize outgoing data. */
+ mutex_lock(&xprt->xpt_mutex);
+ if (test_bit(XPT_DEAD, &xprt->xpt_flags))
+ len = -ENOTCONN;
+ else
+ len = xprt->xpt_ops->xpo_sendto(rqstp);
+ mutex_unlock(&xprt->xpt_mutex);
+ svc_xprt_release(rqstp);
+
+ /* These errors are not passed on to the caller */
+ if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
+ return 0;
+ return len;
+}
+
+/*
+ * Timer function to close old temporary transports, using
+ * a mark-and-sweep algorithm.
+ *
+ * @closure is the svc_serv pointer cast to unsigned long.  If sv_lock
+ * cannot be taken immediately the timer is re-armed for one second
+ * later; otherwise it is re-armed for svc_conn_age_period seconds.
+ */
+static void svc_age_temp_xprts(unsigned long closure)
+{
+ struct svc_serv *serv = (struct svc_serv *)closure;
+ struct svc_xprt *xprt;
+ struct list_head *le, *next;
+ LIST_HEAD(to_be_aged);
+
+ dprintk("svc_age_temp_xprts\n");
+
+ if (!spin_trylock_bh(&serv->sv_lock)) {
+ /* busy, try again 1 sec later */
+ dprintk("svc_age_temp_xprts: busy\n");
+ mod_timer(&serv->sv_temptimer, jiffies + HZ);
+ return;
+ }
+
+ list_for_each_safe(le, next, &serv->sv_tempsocks) {
+ xprt = list_entry(le, struct svc_xprt, xpt_list);
+
+ /* First time through, just mark it OLD. Second time
+ * through, close it. */
+ if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags))
+ continue;
+ /* Leave alone anything that is referenced elsewhere or busy */
+ if (atomic_read(&xprt->xpt_ref.refcount) > 1
+ || test_bit(XPT_BUSY, &xprt->xpt_flags))
+ continue;
+ /* Reference held until the transport has been enqueued below */
+ svc_xprt_get(xprt);
+ list_move(le, &to_be_aged);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ set_bit(XPT_DETACHED, &xprt->xpt_flags);
+ }
+ spin_unlock_bh(&serv->sv_lock);
+
+ while (!list_empty(&to_be_aged)) {
+ le = to_be_aged.next;
+ /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */
+ list_del_init(le);
+ xprt = list_entry(le, struct svc_xprt, xpt_list);
+
+ dprintk("queuing xprt %p for closing\n", xprt);
+
+ /* a thread will dequeue and close it soon */
+ svc_xprt_enqueue(xprt);
+ svc_xprt_put(xprt);
+ }
+
+ mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
+}
+
+/*
+ * Remove a dead transport
+ *
+ * Calls the provider's xpo_detach method, takes the transport off its
+ * svc_serv list unless XPT_DETACHED was already set, and, for the first
+ * caller to set XPT_DEAD, adjusts sv_tmpcnt for XPT_TEMP transports and
+ * drops one reference.
+ */
+void svc_delete_xprt(struct svc_xprt *xprt)
+{
+ struct svc_serv *serv = xprt->xpt_server;
+
+ dprintk("svc: svc_delete_xprt(%p)\n", xprt);
+ xprt->xpt_ops->xpo_detach(xprt);
+
+ spin_lock_bh(&serv->sv_lock);
+ if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags))
+ list_del_init(&xprt->xpt_list);
+ /*
+ * We used to delete the transport from whichever list
+ * its sk_xprt.xpt_ready node was on, but we don't actually
+ * need to. This is because the only time we're called
+ * while still attached to a queue, the queue itself
+ * is about to be destroyed (in svc_destroy).
+ */
+ if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) {
+ /* The caller must hold a reference besides the one put here */
+ BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2);
+ if (test_bit(XPT_TEMP, &xprt->xpt_flags))
+ serv->sv_tmpcnt--;
+ svc_xprt_put(xprt);
+ }
+ spin_unlock_bh(&serv->sv_lock);
+}
+
+/*
+ * Mark @xprt for closing.  If XPT_BUSY can be claimed here the
+ * transport is deleted right away; otherwise XPT_CLOSE is left set for
+ * whoever currently holds XPT_BUSY.
+ */
+void svc_close_xprt(struct svc_xprt *xprt)
+{
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
+ /* someone else will have to effect the close */
+ return;
+
+ /* Extra reference: svc_delete_xprt() drops one and requires two */
+ svc_xprt_get(xprt);
+ svc_delete_xprt(xprt);
+ clear_bit(XPT_BUSY, &xprt->xpt_flags);
+ svc_xprt_put(xprt);
+}
+EXPORT_SYMBOL_GPL(svc_close_xprt);
+
+/*
+ * Close every transport on @xprt_list.  A transport found with XPT_BUSY
+ * set is taken off its xpt_ready list and has the bit cleared first, so
+ * that svc_close_xprt() can claim it.
+ */
+void svc_close_all(struct list_head *xprt_list)
+{
+ struct svc_xprt *xprt;
+ struct svc_xprt *tmp;
+
+ /* _safe iteration: closing a transport unlinks it from xprt_list */
+ list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) {
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ if (test_bit(XPT_BUSY, &xprt->xpt_flags)) {
+ /* Waiting to be processed, but no threads left,
+ * So just remove it from the waiting list
+ */
+ list_del_init(&xprt->xpt_ready);
+ clear_bit(XPT_BUSY, &xprt->xpt_flags);
+ }
+ svc_close_xprt(xprt);
+ }
+}
+
+/*
+ * Handle defer and revisit of requests
+ */
+
+/*
+ * Revisit callback installed by svc_defer() in dr->handle.revisit.
+ *
+ * With @too_many set the deferred request is discarded.  Otherwise it
+ * is put on the transport's xpt_deferred list, XPT_DEFERRED is set and
+ * the transport is enqueued.  Either way the transport reference taken
+ * in svc_defer() is dropped.
+ */
+static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
+{
+ struct svc_deferred_req *dr =
+ container_of(dreq, struct svc_deferred_req, handle);
+ struct svc_xprt *xprt = dr->xprt;
+
+ if (too_many) {
+ svc_xprt_put(xprt);
+ kfree(dr);
+ return;
+ }
+ dprintk("revisit queued\n");
+ dr->xprt = NULL;
+ spin_lock(&xprt->xpt_lock);
+ list_add(&dr->handle.recent, &xprt->xpt_deferred);
+ spin_unlock(&xprt->xpt_lock);
+ set_bit(XPT_DEFERRED, &xprt->xpt_flags);
+ svc_xprt_enqueue(xprt);
+ svc_xprt_put(xprt);
+}
+
+/*
+ * Save the request off for later processing. The request buffer looks
+ * like this:
+ *
+ * <xprt-header><rpc-header><rpc-pagelist><rpc-tail>
+ *
+ * This code can only handle requests that consist of an xprt-header
+ * and rpc-header.
+ *
+ * Returns the cache_deferred_req handle embedded in the saved request,
+ * or NULL if the request has page data or the allocation fails.  A
+ * reference on the request's transport is taken and stored in the
+ * saved request.
+ */
+static struct cache_deferred_req *svc_defer(struct cache_req *req)
+{
+ struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
+ struct svc_deferred_req *dr;
+
+ if (rqstp->rq_arg.page_len)
+ return NULL; /* if more than a page, give up FIXME */
+ if (rqstp->rq_deferred) {
+ /* Already a replayed deferred request: reuse its saved copy */
+ dr = rqstp->rq_deferred;
+ rqstp->rq_deferred = NULL;
+ } else {
+ size_t skip;
+ size_t size;
+ /* FIXME maybe discard if size too large */
+ size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len;
+ dr = kmalloc(size, GFP_KERNEL);
+ if (dr == NULL)
+ return NULL;
+
+ dr->handle.owner = rqstp->rq_server;
+ dr->prot = rqstp->rq_prot;
+ memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen);
+ dr->addrlen = rqstp->rq_addrlen;
+ dr->daddr = rqstp->rq_daddr;
+ /* argslen counts 4-byte units; it is shifted back on use */
+ dr->argslen = rqstp->rq_arg.len >> 2;
+ dr->xprt_hlen = rqstp->rq_xprt_hlen;
+
+ /* back up head to the start of the buffer and copy */
+ skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
+ memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip,
+ dr->argslen << 2);
+ }
+ /* Dropped again in svc_revisit() */
+ svc_xprt_get(rqstp->rq_xprt);
+ dr->xprt = rqstp->rq_xprt;
+
+ dr->handle.revisit = svc_revisit;
+ return &dr->handle;
+}
+
+/*
+ * recv data from a deferred request into an active one
+ *
+ * Points the request's argument head at the data saved in
+ * rqstp->rq_deferred and restores the protocol, addresses and transport
+ * header length recorded by svc_defer().  Returns the saved length in
+ * bytes minus the transport header length.
+ */
+static int svc_deferred_recv(struct svc_rqst *rqstp)
+{
+ struct svc_deferred_req *dr = rqstp->rq_deferred;
+
+ /* setup iov_base past transport header */
+ /* NOTE(review): presumably dr->args is addressed in 32-bit words,
+ * hence the >>2 on the byte count xprt_hlen - confirm */
+ rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2);
+ /* The iov_len does not include the transport header bytes */
+ rqstp->rq_arg.head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen;
+ rqstp->rq_arg.page_len = 0;
+ /* The rq_arg.len includes the transport header bytes */
+ rqstp->rq_arg.len = dr->argslen<<2;
+ rqstp->rq_prot = dr->prot;
+ memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
+ rqstp->rq_addrlen = dr->addrlen;
+ /* Save off transport header len in case we get deferred again */
+ rqstp->rq_xprt_hlen = dr->xprt_hlen;
+ rqstp->rq_daddr = dr->daddr;
+ rqstp->rq_respages = rqstp->rq_pages;
+ return (dr->argslen<<2) - dr->xprt_hlen;
+}
+
+
+/*
+ * Take the first deferred request off xprt->xpt_deferred and return it,
+ * or return NULL if XPT_DEFERRED is not set or the list is empty.
+ *
+ * XPT_DEFERRED is cleared under xpt_lock and set again whenever a
+ * request was removed, so the flag stays set until a call finds the
+ * list empty.
+ */
+static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
+{
+ struct svc_deferred_req *dr = NULL;
+
+ /* Unlocked check: nothing to do unless a revisit has been queued */
+ if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags))
+ return NULL;
+ spin_lock(&xprt->xpt_lock);
+ clear_bit(XPT_DEFERRED, &xprt->xpt_flags);
+ if (!list_empty(&xprt->xpt_deferred)) {
+ dr = list_entry(xprt->xpt_deferred.next,
+ struct svc_deferred_req,
+ handle.recent);
+ list_del_init(&dr->handle.recent);
+ set_bit(XPT_DEFERRED, &xprt->xpt_flags);
+ }
+ spin_unlock(&xprt->xpt_lock);
+ return dr;
+}
+
+/*
+ * Look up, on @serv's list of permanent transports, the first transport
+ * whose class name equals @xcl_name and which also matches the address
+ * family @af and local port @port.
+ *
+ * AF_UNSPEC for @af and 0 for @port act as wild-cards.
+ *
+ * Returns the matching transport with a reference taken on it via
+ * svc_xprt_get(), or NULL if @serv or @xcl_name is NULL or nothing
+ * matches.
+ */
+struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name,
+			       int af, int port)
+{
+	struct svc_xprt *cur;
+	struct svc_xprt *match = NULL;
+
+	/* Sanity check the args */
+	if (serv == NULL || xcl_name == NULL)
+		return NULL;
+
+	spin_lock_bh(&serv->sv_lock);
+	list_for_each_entry(cur, &serv->sv_permsocks, xpt_list) {
+		if (strcmp(cur->xpt_class->xcl_name, xcl_name) == 0 &&
+		    (af == AF_UNSPEC || af == cur->xpt_local.ss_family) &&
+		    (port == 0 || port == svc_xprt_local_port(cur))) {
+			match = cur;
+			svc_xprt_get(cur);
+			break;
+		}
+	}
+	spin_unlock_bh(&serv->sv_lock);
+	return match;
+}
+EXPORT_SYMBOL_GPL(svc_find_xprt);
+
+/*
+ * Fill @buf with one "<class name> <local port>\n" line for each
+ * permanent transport of @serv and return the number of characters
+ * written (the terminating NUL not counted).
+ *
+ * A @buflen of zero turns off overflow checking of the target buffer;
+ * otherwise output stops before the first line that would not fit.
+ * Returns 0 if @serv is NULL.
+ */
+int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen)
+{
+	struct svc_xprt *xprt;
+	char line[64];
+	int used = 0;
+	int n;
+
+	/* Sanity check args */
+	if (serv == NULL)
+		return 0;
+
+	spin_lock_bh(&serv->sv_lock);
+	list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
+		n = snprintf(line, sizeof(line),
+			     "%s %d\n", xprt->xpt_class->xcl_name,
+			     svc_xprt_local_port(xprt));
+		/* A truncated entry is replaced by a fixed error string */
+		if (n >= sizeof(line))
+			strcpy(line, "name-too-long\n");
+		n = strlen(line);
+		/* Stop rather than run past the end of the target buffer */
+		if (buflen && (n + used >= buflen))
+			break;
+		strcpy(buf+used, line);
+		used += n;
+	}
+	spin_unlock_bh(&serv->sv_lock);
+	return used;
+}
+EXPORT_SYMBOL_GPL(svc_xprt_names);
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index af7c5f0..8a73cbb 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -57,11 +57,13 @@
rqstp->rq_authop = aops;
return aops->accept(rqstp, authp);
}
+EXPORT_SYMBOL(svc_authenticate);
int svc_set_client(struct svc_rqst *rqstp)
{
return rqstp->rq_authop->set_client(rqstp);
}
+EXPORT_SYMBOL(svc_set_client);
/* A request, which was authenticated, has now executed.
* Time to finalise the credentials and verifier
@@ -93,6 +95,7 @@
spin_unlock(&authtab_lock);
return rv;
}
+EXPORT_SYMBOL(svc_auth_register);
void
svc_auth_unregister(rpc_authflavor_t flavor)
@@ -129,6 +132,7 @@
spin_unlock(&auth_domain_lock);
}
}
+EXPORT_SYMBOL(auth_domain_put);
struct auth_domain *
auth_domain_lookup(char *name, struct auth_domain *new)
@@ -153,8 +157,10 @@
spin_unlock(&auth_domain_lock);
return new;
}
+EXPORT_SYMBOL(auth_domain_lookup);
struct auth_domain *auth_domain_find(char *name)
{
return auth_domain_lookup(name, NULL);
}
+EXPORT_SYMBOL(auth_domain_find);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 41147941..3c64051 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -63,6 +63,7 @@
rv = auth_domain_lookup(name, &new->h);
}
}
+EXPORT_SYMBOL(unix_domain_find);
static void svcauth_unix_domain_release(struct auth_domain *dom)
{
@@ -340,6 +341,7 @@
else
return -ENOMEM;
}
+EXPORT_SYMBOL(auth_unix_add_addr);
int auth_unix_forget_old(struct auth_domain *dom)
{
@@ -351,6 +353,7 @@
udom->addr_changes++;
return 0;
}
+EXPORT_SYMBOL(auth_unix_forget_old);
struct auth_domain *auth_unix_lookup(struct in_addr addr)
{
@@ -375,50 +378,56 @@
cache_put(&ipm->h, &ip_map_cache);
return rv;
}
+EXPORT_SYMBOL(auth_unix_lookup);
void svcauth_unix_purge(void)
{
cache_purge(&ip_map_cache);
}
+EXPORT_SYMBOL(svcauth_unix_purge);
static inline struct ip_map *
ip_map_cached_get(struct svc_rqst *rqstp)
{
- struct ip_map *ipm;
- struct svc_sock *svsk = rqstp->rq_sock;
- spin_lock(&svsk->sk_lock);
- ipm = svsk->sk_info_authunix;
- if (ipm != NULL) {
- if (!cache_valid(&ipm->h)) {
- /*
- * The entry has been invalidated since it was
- * remembered, e.g. by a second mount from the
- * same IP address.
- */
- svsk->sk_info_authunix = NULL;
- spin_unlock(&svsk->sk_lock);
- cache_put(&ipm->h, &ip_map_cache);
- return NULL;
+ struct ip_map *ipm = NULL;
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+
+ if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
+ spin_lock(&xprt->xpt_lock);
+ ipm = xprt->xpt_auth_cache;
+ if (ipm != NULL) {
+ if (!cache_valid(&ipm->h)) {
+ /*
+ * The entry has been invalidated since it was
+ * remembered, e.g. by a second mount from the
+ * same IP address.
+ */
+ xprt->xpt_auth_cache = NULL;
+ spin_unlock(&xprt->xpt_lock);
+ cache_put(&ipm->h, &ip_map_cache);
+ return NULL;
+ }
+ cache_get(&ipm->h);
}
- cache_get(&ipm->h);
+ spin_unlock(&xprt->xpt_lock);
}
- spin_unlock(&svsk->sk_lock);
return ipm;
}
static inline void
ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm)
{
- struct svc_sock *svsk = rqstp->rq_sock;
+ struct svc_xprt *xprt = rqstp->rq_xprt;
- spin_lock(&svsk->sk_lock);
- if (svsk->sk_sock->type == SOCK_STREAM &&
- svsk->sk_info_authunix == NULL) {
- /* newly cached, keep the reference */
- svsk->sk_info_authunix = ipm;
- ipm = NULL;
+ if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
+ spin_lock(&xprt->xpt_lock);
+ if (xprt->xpt_auth_cache == NULL) {
+ /* newly cached, keep the reference */
+ xprt->xpt_auth_cache = ipm;
+ ipm = NULL;
+ }
+ spin_unlock(&xprt->xpt_lock);
}
- spin_unlock(&svsk->sk_lock);
if (ipm)
cache_put(&ipm->h, &ip_map_cache);
}
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index c75bffe..1d3e5fc 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -5,7 +5,7 @@
*
* The server scheduling algorithm does not always distribute the load
* evenly when servicing a single client. May need to modify the
- * svc_sock_enqueue procedure...
+ * svc_xprt_enqueue procedure...
*
* TCP support is largely untested and may be a little slow. The problem
* is that we currently do two separate recvfrom's, one for the 4-byte
@@ -48,72 +48,40 @@
#include <linux/sunrpc/svcsock.h>
#include <linux/sunrpc/stats.h>
-/* SMP locking strategy:
- *
- * svc_pool->sp_lock protects most of the fields of that pool.
- * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt.
- * when both need to be taken (rare), svc_serv->sv_lock is first.
- * BKL protects svc_serv->sv_nrthread.
- * svc_sock->sk_lock protects the svc_sock->sk_deferred list
- * and the ->sk_info_authunix cache.
- * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply.
- *
- * Some flags can be set to certain values at any time
- * providing that certain rules are followed:
- *
- * SK_CONN, SK_DATA, can be set or cleared at any time.
- * after a set, svc_sock_enqueue must be called.
- * after a clear, the socket must be read/accepted
- * if this succeeds, it must be set again.
- * SK_CLOSE can set at any time. It is never cleared.
- * sk_inuse contains a bias of '1' until SK_DEAD is set.
- * so when sk_inuse hits zero, we know the socket is dead
- * and no-one is using it.
- * SK_DEAD can only be set while SK_BUSY is held which ensures
- * no other thread will be using the socket or will try to
- * set SK_DEAD.
- *
- */
-
-#define RPCDBG_FACILITY RPCDBG_SVCSOCK
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
int *errp, int flags);
-static void svc_delete_socket(struct svc_sock *svsk);
static void svc_udp_data_ready(struct sock *, int);
static int svc_udp_recvfrom(struct svc_rqst *);
static int svc_udp_sendto(struct svc_rqst *);
-static void svc_close_socket(struct svc_sock *svsk);
+static void svc_sock_detach(struct svc_xprt *);
+static void svc_sock_free(struct svc_xprt *);
-static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk);
-static int svc_deferred_recv(struct svc_rqst *rqstp);
-static struct cache_deferred_req *svc_defer(struct cache_req *req);
-
-/* apparently the "standard" is that clients close
- * idle connections after 5 minutes, servers after
- * 6 minutes
- * http://www.connectathon.org/talks96/nfstcp.pdf
- */
-static int svc_conn_age_period = 6*60;
-
+static struct svc_xprt *svc_create_socket(struct svc_serv *, int,
+ struct sockaddr *, int, int);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key svc_key[2];
static struct lock_class_key svc_slock_key[2];
-static inline void svc_reclassify_socket(struct socket *sock)
+static void svc_reclassify_socket(struct socket *sock)
{
struct sock *sk = sock->sk;
BUG_ON(sock_owned_by_user(sk));
switch (sk->sk_family) {
case AF_INET:
sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD",
- &svc_slock_key[0], "sk_lock-AF_INET-NFSD", &svc_key[0]);
+ &svc_slock_key[0],
+ "sk_xprt.xpt_lock-AF_INET-NFSD",
+ &svc_key[0]);
break;
case AF_INET6:
sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD",
- &svc_slock_key[1], "sk_lock-AF_INET6-NFSD", &svc_key[1]);
+ &svc_slock_key[1],
+ "sk_xprt.xpt_lock-AF_INET6-NFSD",
+ &svc_key[1]);
break;
default:
@@ -121,81 +89,26 @@
}
}
#else
-static inline void svc_reclassify_socket(struct socket *sock)
+static void svc_reclassify_socket(struct socket *sock)
{
}
#endif
-static char *__svc_print_addr(struct sockaddr *addr, char *buf, size_t len)
-{
- switch (addr->sa_family) {
- case AF_INET:
- snprintf(buf, len, "%u.%u.%u.%u, port=%u",
- NIPQUAD(((struct sockaddr_in *) addr)->sin_addr),
- ntohs(((struct sockaddr_in *) addr)->sin_port));
- break;
-
- case AF_INET6:
- snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u",
- NIP6(((struct sockaddr_in6 *) addr)->sin6_addr),
- ntohs(((struct sockaddr_in6 *) addr)->sin6_port));
- break;
-
- default:
- snprintf(buf, len, "unknown address type: %d", addr->sa_family);
- break;
- }
- return buf;
-}
-
-/**
- * svc_print_addr - Format rq_addr field for printing
- * @rqstp: svc_rqst struct containing address to print
- * @buf: target buffer for formatted address
- * @len: length of target buffer
- *
- */
-char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len)
-{
- return __svc_print_addr(svc_addr(rqstp), buf, len);
-}
-EXPORT_SYMBOL_GPL(svc_print_addr);
-
-/*
- * Queue up an idle server thread. Must have pool->sp_lock held.
- * Note: this is really a stack rather than a queue, so that we only
- * use as many different threads as we need, and the rest don't pollute
- * the cache.
- */
-static inline void
-svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp)
-{
- list_add(&rqstp->rq_list, &pool->sp_threads);
-}
-
-/*
- * Dequeue an nfsd thread. Must have pool->sp_lock held.
- */
-static inline void
-svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp)
-{
- list_del(&rqstp->rq_list);
-}
-
/*
* Release an skbuff after use
*/
-static inline void
-svc_release_skb(struct svc_rqst *rqstp)
+static void svc_release_skb(struct svc_rqst *rqstp)
{
- struct sk_buff *skb = rqstp->rq_skbuff;
+ struct sk_buff *skb = rqstp->rq_xprt_ctxt;
struct svc_deferred_req *dr = rqstp->rq_deferred;
if (skb) {
- rqstp->rq_skbuff = NULL;
+ struct svc_sock *svsk =
+ container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+ rqstp->rq_xprt_ctxt = NULL;
dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
- skb_free_datagram(rqstp->rq_sock->sk_sk, skb);
+ skb_free_datagram(svsk->sk_sk, skb);
}
if (dr) {
rqstp->rq_deferred = NULL;
@@ -203,253 +116,6 @@
}
}
-/*
- * Any space to write?
- */
-static inline unsigned long
-svc_sock_wspace(struct svc_sock *svsk)
-{
- int wspace;
-
- if (svsk->sk_sock->type == SOCK_STREAM)
- wspace = sk_stream_wspace(svsk->sk_sk);
- else
- wspace = sock_wspace(svsk->sk_sk);
-
- return wspace;
-}
-
-/*
- * Queue up a socket with data pending. If there are idle nfsd
- * processes, wake 'em up.
- *
- */
-static void
-svc_sock_enqueue(struct svc_sock *svsk)
-{
- struct svc_serv *serv = svsk->sk_server;
- struct svc_pool *pool;
- struct svc_rqst *rqstp;
- int cpu;
-
- if (!(svsk->sk_flags &
- ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
- return;
- if (test_bit(SK_DEAD, &svsk->sk_flags))
- return;
-
- cpu = get_cpu();
- pool = svc_pool_for_cpu(svsk->sk_server, cpu);
- put_cpu();
-
- spin_lock_bh(&pool->sp_lock);
-
- if (!list_empty(&pool->sp_threads) &&
- !list_empty(&pool->sp_sockets))
- printk(KERN_ERR
- "svc_sock_enqueue: threads and sockets both waiting??\n");
-
- if (test_bit(SK_DEAD, &svsk->sk_flags)) {
- /* Don't enqueue dead sockets */
- dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk);
- goto out_unlock;
- }
-
- /* Mark socket as busy. It will remain in this state until the
- * server has processed all pending data and put the socket back
- * on the idle list. We update SK_BUSY atomically because
- * it also guards against trying to enqueue the svc_sock twice.
- */
- if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) {
- /* Don't enqueue socket while already enqueued */
- dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk);
- goto out_unlock;
- }
- BUG_ON(svsk->sk_pool != NULL);
- svsk->sk_pool = pool;
-
- set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
- if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2
- > svc_sock_wspace(svsk))
- && !test_bit(SK_CLOSE, &svsk->sk_flags)
- && !test_bit(SK_CONN, &svsk->sk_flags)) {
- /* Don't enqueue while not enough space for reply */
- dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n",
- svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg,
- svc_sock_wspace(svsk));
- svsk->sk_pool = NULL;
- clear_bit(SK_BUSY, &svsk->sk_flags);
- goto out_unlock;
- }
- clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
-
-
- if (!list_empty(&pool->sp_threads)) {
- rqstp = list_entry(pool->sp_threads.next,
- struct svc_rqst,
- rq_list);
- dprintk("svc: socket %p served by daemon %p\n",
- svsk->sk_sk, rqstp);
- svc_thread_dequeue(pool, rqstp);
- if (rqstp->rq_sock)
- printk(KERN_ERR
- "svc_sock_enqueue: server %p, rq_sock=%p!\n",
- rqstp, rqstp->rq_sock);
- rqstp->rq_sock = svsk;
- atomic_inc(&svsk->sk_inuse);
- rqstp->rq_reserved = serv->sv_max_mesg;
- atomic_add(rqstp->rq_reserved, &svsk->sk_reserved);
- BUG_ON(svsk->sk_pool != pool);
- wake_up(&rqstp->rq_wait);
- } else {
- dprintk("svc: socket %p put into queue\n", svsk->sk_sk);
- list_add_tail(&svsk->sk_ready, &pool->sp_sockets);
- BUG_ON(svsk->sk_pool != pool);
- }
-
-out_unlock:
- spin_unlock_bh(&pool->sp_lock);
-}
-
-/*
- * Dequeue the first socket. Must be called with the pool->sp_lock held.
- */
-static inline struct svc_sock *
-svc_sock_dequeue(struct svc_pool *pool)
-{
- struct svc_sock *svsk;
-
- if (list_empty(&pool->sp_sockets))
- return NULL;
-
- svsk = list_entry(pool->sp_sockets.next,
- struct svc_sock, sk_ready);
- list_del_init(&svsk->sk_ready);
-
- dprintk("svc: socket %p dequeued, inuse=%d\n",
- svsk->sk_sk, atomic_read(&svsk->sk_inuse));
-
- return svsk;
-}
-
-/*
- * Having read something from a socket, check whether it
- * needs to be re-enqueued.
- * Note: SK_DATA only gets cleared when a read-attempt finds
- * no (or insufficient) data.
- */
-static inline void
-svc_sock_received(struct svc_sock *svsk)
-{
- svsk->sk_pool = NULL;
- clear_bit(SK_BUSY, &svsk->sk_flags);
- svc_sock_enqueue(svsk);
-}
-
-
-/**
- * svc_reserve - change the space reserved for the reply to a request.
- * @rqstp: The request in question
- * @space: new max space to reserve
- *
- * Each request reserves some space on the output queue of the socket
- * to make sure the reply fits. This function reduces that reserved
- * space to be the amount of space used already, plus @space.
- *
- */
-void svc_reserve(struct svc_rqst *rqstp, int space)
-{
- space += rqstp->rq_res.head[0].iov_len;
-
- if (space < rqstp->rq_reserved) {
- struct svc_sock *svsk = rqstp->rq_sock;
- atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved);
- rqstp->rq_reserved = space;
-
- svc_sock_enqueue(svsk);
- }
-}
-
-/*
- * Release a socket after use.
- */
-static inline void
-svc_sock_put(struct svc_sock *svsk)
-{
- if (atomic_dec_and_test(&svsk->sk_inuse)) {
- BUG_ON(! test_bit(SK_DEAD, &svsk->sk_flags));
-
- dprintk("svc: releasing dead socket\n");
- if (svsk->sk_sock->file)
- sockfd_put(svsk->sk_sock);
- else
- sock_release(svsk->sk_sock);
- if (svsk->sk_info_authunix != NULL)
- svcauth_unix_info_release(svsk->sk_info_authunix);
- kfree(svsk);
- }
-}
-
-static void
-svc_sock_release(struct svc_rqst *rqstp)
-{
- struct svc_sock *svsk = rqstp->rq_sock;
-
- svc_release_skb(rqstp);
-
- svc_free_res_pages(rqstp);
- rqstp->rq_res.page_len = 0;
- rqstp->rq_res.page_base = 0;
-
-
- /* Reset response buffer and release
- * the reservation.
- * But first, check that enough space was reserved
- * for the reply, otherwise we have a bug!
- */
- if ((rqstp->rq_res.len) > rqstp->rq_reserved)
- printk(KERN_ERR "RPC request reserved %d but used %d\n",
- rqstp->rq_reserved,
- rqstp->rq_res.len);
-
- rqstp->rq_res.head[0].iov_len = 0;
- svc_reserve(rqstp, 0);
- rqstp->rq_sock = NULL;
-
- svc_sock_put(svsk);
-}
-
-/*
- * External function to wake up a server waiting for data
- * This really only makes sense for services like lockd
- * which have exactly one thread anyway.
- */
-void
-svc_wake_up(struct svc_serv *serv)
-{
- struct svc_rqst *rqstp;
- unsigned int i;
- struct svc_pool *pool;
-
- for (i = 0; i < serv->sv_nrpools; i++) {
- pool = &serv->sv_pools[i];
-
- spin_lock_bh(&pool->sp_lock);
- if (!list_empty(&pool->sp_threads)) {
- rqstp = list_entry(pool->sp_threads.next,
- struct svc_rqst,
- rq_list);
- dprintk("svc: daemon %p woken up.\n", rqstp);
- /*
- svc_thread_dequeue(pool, rqstp);
- rqstp->rq_sock = NULL;
- */
- wake_up(&rqstp->rq_wait);
- }
- spin_unlock_bh(&pool->sp_lock);
- }
-}
-
union svc_pktinfo_u {
struct in_pktinfo pkti;
struct in6_pktinfo pkti6;
@@ -459,7 +125,9 @@
static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
{
- switch (rqstp->rq_sock->sk_sk->sk_family) {
+ struct svc_sock *svsk =
+ container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+ switch (svsk->sk_sk->sk_family) {
case AF_INET: {
struct in_pktinfo *pki = CMSG_DATA(cmh);
@@ -489,10 +157,10 @@
/*
* Generic sendto routine
*/
-static int
-svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
+static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
{
- struct svc_sock *svsk = rqstp->rq_sock;
+ struct svc_sock *svsk =
+ container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
struct socket *sock = svsk->sk_sock;
int slen;
union {
@@ -565,7 +233,7 @@
}
out:
dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
- rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len,
+ svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
return len;
@@ -602,7 +270,7 @@
if (!serv)
return 0;
spin_lock_bh(&serv->sv_lock);
- list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) {
+ list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) {
int onelen = one_sock_name(buf+len, svsk);
if (toclose && strcmp(toclose, buf+len) == 0)
closesk = svsk;
@@ -614,7 +282,7 @@
/* Should unregister with portmap, but you cannot
* unregister just one protocol...
*/
- svc_close_socket(closesk);
+ svc_close_xprt(&closesk->sk_xprt);
else if (toclose)
return -ENOENT;
return len;
@@ -624,8 +292,7 @@
/*
* Check input queue length
*/
-static int
-svc_recv_available(struct svc_sock *svsk)
+static int svc_recv_available(struct svc_sock *svsk)
{
struct socket *sock = svsk->sk_sock;
int avail, err;
@@ -638,48 +305,31 @@
/*
* Generic recvfrom routine.
*/
-static int
-svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen)
+static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
+ int buflen)
{
- struct svc_sock *svsk = rqstp->rq_sock;
+ struct svc_sock *svsk =
+ container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
struct msghdr msg = {
.msg_flags = MSG_DONTWAIT,
};
- struct sockaddr *sin;
int len;
+ rqstp->rq_xprt_hlen = 0;
+
len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen,
msg.msg_flags);
- /* sock_recvmsg doesn't fill in the name/namelen, so we must..
- */
- memcpy(&rqstp->rq_addr, &svsk->sk_remote, svsk->sk_remotelen);
- rqstp->rq_addrlen = svsk->sk_remotelen;
-
- /* Destination address in request is needed for binding the
- * source address in RPC callbacks later.
- */
- sin = (struct sockaddr *)&svsk->sk_local;
- switch (sin->sa_family) {
- case AF_INET:
- rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr;
- break;
- case AF_INET6:
- rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr;
- break;
- }
-
dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
svsk, iov[0].iov_base, iov[0].iov_len, len);
-
return len;
}
/*
* Set socket snd and rcv buffer lengths
*/
-static inline void
-svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv)
+static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
+ unsigned int rcv)
{
#if 0
mm_segment_t oldfs;
@@ -704,16 +354,16 @@
/*
* INET callback when data has been received on the socket.
*/
-static void
-svc_udp_data_ready(struct sock *sk, int count)
+static void svc_udp_data_ready(struct sock *sk, int count)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
if (svsk) {
dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
- svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags));
- set_bit(SK_DATA, &svsk->sk_flags);
- svc_sock_enqueue(svsk);
+ svsk, sk, count,
+ test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+ svc_xprt_enqueue(&svsk->sk_xprt);
}
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
wake_up_interruptible(sk->sk_sleep);
@@ -722,15 +372,14 @@
/*
* INET callback when space is newly available on the socket.
*/
-static void
-svc_write_space(struct sock *sk)
+static void svc_write_space(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
if (svsk) {
dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
- svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags));
- svc_sock_enqueue(svsk);
+ svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
+ svc_xprt_enqueue(&svsk->sk_xprt);
}
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) {
@@ -740,10 +389,19 @@
}
}
-static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp,
- struct cmsghdr *cmh)
+/*
+ * Copy the UDP datagram's destination address to the rqstp structure.
+ * The 'destination' address in this case is the address to which the
+ * peer sent the datagram, i.e. our local address. For multihomed
+ * hosts, this can change from msg to msg. Note that only the IP
+ * address changes, the port number should remain the same.
+ */
+static void svc_udp_get_dest_address(struct svc_rqst *rqstp,
+ struct cmsghdr *cmh)
{
- switch (rqstp->rq_sock->sk_sk->sk_family) {
+ struct svc_sock *svsk =
+ container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+ switch (svsk->sk_sk->sk_family) {
case AF_INET: {
struct in_pktinfo *pki = CMSG_DATA(cmh);
rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr;
@@ -760,11 +418,11 @@
/*
* Receive a datagram from a UDP socket.
*/
-static int
-svc_udp_recvfrom(struct svc_rqst *rqstp)
+static int svc_udp_recvfrom(struct svc_rqst *rqstp)
{
- struct svc_sock *svsk = rqstp->rq_sock;
- struct svc_serv *serv = svsk->sk_server;
+ struct svc_sock *svsk =
+ container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+ struct svc_serv *serv = svsk->sk_xprt.xpt_server;
struct sk_buff *skb;
union {
struct cmsghdr hdr;
@@ -779,7 +437,7 @@
.msg_flags = MSG_DONTWAIT,
};
- if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
+ if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
/* udp sockets need large rcvbuf as all pending
* requests are still in that buffer. sndbuf must
* also be large enough that there is enough space
@@ -792,17 +450,7 @@
(serv->sv_nrthreads+3) * serv->sv_max_mesg,
(serv->sv_nrthreads+3) * serv->sv_max_mesg);
- if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) {
- svc_sock_received(svsk);
- return svc_deferred_recv(rqstp);
- }
-
- if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
- svc_delete_socket(svsk);
- return 0;
- }
-
- clear_bit(SK_DATA, &svsk->sk_flags);
+ clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
skb = NULL;
err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
0, 0, MSG_PEEK | MSG_DONTWAIT);
@@ -813,24 +461,27 @@
if (err != -EAGAIN) {
/* possibly an icmp error */
dprintk("svc: recvfrom returned error %d\n", -err);
- set_bit(SK_DATA, &svsk->sk_flags);
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
}
- svc_sock_received(svsk);
+ svc_xprt_received(&svsk->sk_xprt);
return -EAGAIN;
}
- rqstp->rq_addrlen = sizeof(rqstp->rq_addr);
+ len = svc_addr_len(svc_addr(rqstp));
+ if (len < 0)
+ return len;
+ rqstp->rq_addrlen = len;
if (skb->tstamp.tv64 == 0) {
skb->tstamp = ktime_get_real();
/* Don't enable netstamp, sunrpc doesn't
need that much accuracy */
}
svsk->sk_sk->sk_stamp = skb->tstamp;
- set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
/*
* Maybe more packets - kick another thread ASAP.
*/
- svc_sock_received(svsk);
+ svc_xprt_received(&svsk->sk_xprt);
len = skb->len - sizeof(struct udphdr);
rqstp->rq_arg.len = len;
@@ -861,13 +512,14 @@
skb_free_datagram(svsk->sk_sk, skb);
} else {
/* we can use it in-place */
- rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr);
+ rqstp->rq_arg.head[0].iov_base = skb->data +
+ sizeof(struct udphdr);
rqstp->rq_arg.head[0].iov_len = len;
if (skb_checksum_complete(skb)) {
skb_free_datagram(svsk->sk_sk, skb);
return 0;
}
- rqstp->rq_skbuff = skb;
+ rqstp->rq_xprt_ctxt = skb;
}
rqstp->rq_arg.page_base = 0;
@@ -900,27 +552,81 @@
return error;
}
-static void
-svc_udp_init(struct svc_sock *svsk)
+static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp)
+{
+}
+
+static int svc_udp_has_wspace(struct svc_xprt *xprt)
+{
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ struct svc_serv *serv = xprt->xpt_server;
+ unsigned long required;
+
+ /*
+ * Set the SOCK_NOSPACE flag before checking the available
+ * sock space.
+ */
+ set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+ required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
+ if (required*2 > sock_wspace(svsk->sk_sk))
+ return 0;
+ clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+ return 1;
+}
+
+static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
+{
+ BUG();
+ return NULL;
+}
+
+static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
+ struct sockaddr *sa, int salen,
+ int flags)
+{
+ return svc_create_socket(serv, IPPROTO_UDP, sa, salen, flags);
+}
+
+static struct svc_xprt_ops svc_udp_ops = {
+ .xpo_create = svc_udp_create,
+ .xpo_recvfrom = svc_udp_recvfrom,
+ .xpo_sendto = svc_udp_sendto,
+ .xpo_release_rqst = svc_release_skb,
+ .xpo_detach = svc_sock_detach,
+ .xpo_free = svc_sock_free,
+ .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
+ .xpo_has_wspace = svc_udp_has_wspace,
+ .xpo_accept = svc_udp_accept,
+};
+
+static struct svc_xprt_class svc_udp_class = {
+ .xcl_name = "udp",
+ .xcl_owner = THIS_MODULE,
+ .xcl_ops = &svc_udp_ops,
+ .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
+};
+
+static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
{
int one = 1;
mm_segment_t oldfs;
+ svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv);
+ clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
svsk->sk_sk->sk_write_space = svc_write_space;
- svsk->sk_recvfrom = svc_udp_recvfrom;
- svsk->sk_sendto = svc_udp_sendto;
/* initialise setting must have enough space to
* receive and respond to one request.
* svc_udp_recvfrom will re-adjust if necessary
*/
svc_sock_setbufsize(svsk->sk_sock,
- 3 * svsk->sk_server->sv_max_mesg,
- 3 * svsk->sk_server->sv_max_mesg);
+ 3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
+ 3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
- set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */
- set_bit(SK_CHNGBUF, &svsk->sk_flags);
+ /* data might have come in before data_ready set up */
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+ set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
oldfs = get_fs();
set_fs(KERNEL_DS);
@@ -934,8 +640,7 @@
* A data_ready event on a listening socket means there's a connection
* pending. Do not use state_change as a substitute for it.
*/
-static void
-svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
+static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
@@ -954,8 +659,8 @@
*/
if (sk->sk_state == TCP_LISTEN) {
if (svsk) {
- set_bit(SK_CONN, &svsk->sk_flags);
- svc_sock_enqueue(svsk);
+ set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+ svc_xprt_enqueue(&svsk->sk_xprt);
} else
printk("svc: socket %p: no user data\n", sk);
}
@@ -967,8 +672,7 @@
/*
* A state change on a connected socket means it's dying or dead.
*/
-static void
-svc_tcp_state_change(struct sock *sk)
+static void svc_tcp_state_change(struct sock *sk)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
@@ -978,51 +682,36 @@
if (!svsk)
printk("svc: socket %p: no user data\n", sk);
else {
- set_bit(SK_CLOSE, &svsk->sk_flags);
- svc_sock_enqueue(svsk);
+ set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
+ svc_xprt_enqueue(&svsk->sk_xprt);
}
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
wake_up_interruptible_all(sk->sk_sleep);
}
-static void
-svc_tcp_data_ready(struct sock *sk, int count)
+static void svc_tcp_data_ready(struct sock *sk, int count)
{
struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
dprintk("svc: socket %p TCP data ready (svsk %p)\n",
sk, sk->sk_user_data);
if (svsk) {
- set_bit(SK_DATA, &svsk->sk_flags);
- svc_sock_enqueue(svsk);
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
+ svc_xprt_enqueue(&svsk->sk_xprt);
}
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
wake_up_interruptible(sk->sk_sleep);
}
-static inline int svc_port_is_privileged(struct sockaddr *sin)
-{
- switch (sin->sa_family) {
- case AF_INET:
- return ntohs(((struct sockaddr_in *)sin)->sin_port)
- < PROT_SOCK;
- case AF_INET6:
- return ntohs(((struct sockaddr_in6 *)sin)->sin6_port)
- < PROT_SOCK;
- default:
- return 0;
- }
-}
-
/*
* Accept a TCP connection
*/
-static void
-svc_tcp_accept(struct svc_sock *svsk)
+static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
{
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
struct sockaddr_storage addr;
struct sockaddr *sin = (struct sockaddr *) &addr;
- struct svc_serv *serv = svsk->sk_server;
+ struct svc_serv *serv = svsk->sk_xprt.xpt_server;
struct socket *sock = svsk->sk_sock;
struct socket *newsock;
struct svc_sock *newsvsk;
@@ -1031,9 +720,9 @@
dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
if (!sock)
- return;
+ return NULL;
- clear_bit(SK_CONN, &svsk->sk_flags);
+ clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
err = kernel_accept(sock, &newsock, O_NONBLOCK);
if (err < 0) {
if (err == -ENOMEM)
@@ -1042,11 +731,9 @@
else if (err != -EAGAIN && net_ratelimit())
printk(KERN_WARNING "%s: accept failed (err %d)!\n",
serv->sv_name, -err);
- return;
+ return NULL;
}
-
- set_bit(SK_CONN, &svsk->sk_flags);
- svc_sock_enqueue(svsk);
+ set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
err = kernel_getpeername(newsock, sin, &slen);
if (err < 0) {
@@ -1077,106 +764,42 @@
if (!(newsvsk = svc_setup_socket(serv, newsock, &err,
(SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY))))
goto failed;
- memcpy(&newsvsk->sk_remote, sin, slen);
- newsvsk->sk_remotelen = slen;
+ svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
err = kernel_getsockname(newsock, sin, &slen);
if (unlikely(err < 0)) {
dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
slen = offsetof(struct sockaddr, sa_data);
}
- memcpy(&newsvsk->sk_local, sin, slen);
-
- svc_sock_received(newsvsk);
-
- /* make sure that we don't have too many active connections.
- * If we have, something must be dropped.
- *
- * There's no point in trying to do random drop here for
- * DoS prevention. The NFS clients does 1 reconnect in 15
- * seconds. An attacker can easily beat that.
- *
- * The only somewhat efficient mechanism would be if drop
- * old connections from the same IP first. But right now
- * we don't even record the client IP in svc_sock.
- */
- if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) {
- struct svc_sock *svsk = NULL;
- spin_lock_bh(&serv->sv_lock);
- if (!list_empty(&serv->sv_tempsocks)) {
- if (net_ratelimit()) {
- /* Try to help the admin */
- printk(KERN_NOTICE "%s: too many open TCP "
- "sockets, consider increasing the "
- "number of nfsd threads\n",
- serv->sv_name);
- printk(KERN_NOTICE
- "%s: last TCP connect from %s\n",
- serv->sv_name, __svc_print_addr(sin,
- buf, sizeof(buf)));
- }
- /*
- * Always select the oldest socket. It's not fair,
- * but so is life
- */
- svsk = list_entry(serv->sv_tempsocks.prev,
- struct svc_sock,
- sk_list);
- set_bit(SK_CLOSE, &svsk->sk_flags);
- atomic_inc(&svsk->sk_inuse);
- }
- spin_unlock_bh(&serv->sv_lock);
-
- if (svsk) {
- svc_sock_enqueue(svsk);
- svc_sock_put(svsk);
- }
-
- }
+ svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
if (serv->sv_stats)
serv->sv_stats->nettcpconn++;
- return;
+ return &newsvsk->sk_xprt;
failed:
sock_release(newsock);
- return;
+ return NULL;
}
/*
* Receive data from a TCP socket.
*/
-static int
-svc_tcp_recvfrom(struct svc_rqst *rqstp)
+static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
{
- struct svc_sock *svsk = rqstp->rq_sock;
- struct svc_serv *serv = svsk->sk_server;
+ struct svc_sock *svsk =
+ container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
+ struct svc_serv *serv = svsk->sk_xprt.xpt_server;
int len;
struct kvec *vec;
int pnum, vlen;
dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
- svsk, test_bit(SK_DATA, &svsk->sk_flags),
- test_bit(SK_CONN, &svsk->sk_flags),
- test_bit(SK_CLOSE, &svsk->sk_flags));
+ svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags),
+ test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags),
+ test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags));
- if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) {
- svc_sock_received(svsk);
- return svc_deferred_recv(rqstp);
- }
-
- if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
- svc_delete_socket(svsk);
- return 0;
- }
-
- if (svsk->sk_sk->sk_state == TCP_LISTEN) {
- svc_tcp_accept(svsk);
- svc_sock_received(svsk);
- return 0;
- }
-
- if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
+ if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
/* sndbuf needs to have room for one request
* per thread, otherwise we can stall even when the
* network isn't a bottleneck.
@@ -1193,7 +816,7 @@
(serv->sv_nrthreads+3) * serv->sv_max_mesg,
3 * serv->sv_max_mesg);
- clear_bit(SK_DATA, &svsk->sk_flags);
+ clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
/* Receive data. If we haven't got the record length yet, get
* the next four bytes. Otherwise try to gobble up as much as
@@ -1212,7 +835,7 @@
if (len < want) {
dprintk("svc: short recvfrom while reading record length (%d of %lu)\n",
len, want);
- svc_sock_received(svsk);
+ svc_xprt_received(&svsk->sk_xprt);
return -EAGAIN; /* record header not complete */
}
@@ -1248,11 +871,11 @@
if (len < svsk->sk_reclen) {
dprintk("svc: incomplete TCP record (%d of %d)\n",
len, svsk->sk_reclen);
- svc_sock_received(svsk);
+ svc_xprt_received(&svsk->sk_xprt);
return -EAGAIN; /* record not complete */
}
len = svsk->sk_reclen;
- set_bit(SK_DATA, &svsk->sk_flags);
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
vec = rqstp->rq_vec;
vec[0] = rqstp->rq_arg.head[0];
@@ -1281,30 +904,31 @@
rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
}
- rqstp->rq_skbuff = NULL;
+ rqstp->rq_xprt_ctxt = NULL;
rqstp->rq_prot = IPPROTO_TCP;
/* Reset TCP read info */
svsk->sk_reclen = 0;
svsk->sk_tcplen = 0;
- svc_sock_received(svsk);
+ svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt);
+ svc_xprt_received(&svsk->sk_xprt);
if (serv->sv_stats)
serv->sv_stats->nettcpcnt++;
return len;
err_delete:
- svc_delete_socket(svsk);
+ set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
return -EAGAIN;
error:
if (len == -EAGAIN) {
dprintk("RPC: TCP recvfrom got EAGAIN\n");
- svc_sock_received(svsk);
+ svc_xprt_received(&svsk->sk_xprt);
} else {
printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
- svsk->sk_server->sv_name, -len);
+ svsk->sk_xprt.xpt_server->sv_name, -len);
goto err_delete;
}
@@ -1314,8 +938,7 @@
/*
* Send out data on TCP socket.
*/
-static int
-svc_tcp_sendto(struct svc_rqst *rqstp)
+static int svc_tcp_sendto(struct svc_rqst *rqstp)
{
struct xdr_buf *xbufp = &rqstp->rq_res;
int sent;
@@ -1328,35 +951,109 @@
reclen = htonl(0x80000000|((xbufp->len ) - 4));
memcpy(xbufp->head[0].iov_base, &reclen, 4);
- if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags))
+ if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags))
return -ENOTCONN;
sent = svc_sendto(rqstp, &rqstp->rq_res);
if (sent != xbufp->len) {
- printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n",
- rqstp->rq_sock->sk_server->sv_name,
+ printk(KERN_NOTICE
+ "rpc-srv/tcp: %s: %s %d when sending %d bytes "
+ "- shutting down socket\n",
+ rqstp->rq_xprt->xpt_server->sv_name,
(sent<0)?"got error":"sent only",
sent, xbufp->len);
- set_bit(SK_CLOSE, &rqstp->rq_sock->sk_flags);
- svc_sock_enqueue(rqstp->rq_sock);
+ set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags);
+ svc_xprt_enqueue(rqstp->rq_xprt);
sent = -EAGAIN;
}
return sent;
}
-static void
-svc_tcp_init(struct svc_sock *svsk)
+/*
+ * Prepare the reply header for a TCP response.  A zero word is put
+ * into the first kvec of the reply buffer as a placeholder for the
+ * 4-byte record length.
+ */
+static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp)
+{
+	/* reserve room for the record length at the start of the reply */
+	svc_putnl(&rqstp->rq_res.head[0], 0);
+}
+
+/*
+ * Report whether the TCP socket behind @xprt has enough send space
+ * for another reply.  Returns 1 when there is room, 0 otherwise.
+ */
+static int svc_tcp_has_wspace(struct svc_xprt *xprt)
+{
+	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+	int needed;
+	int avail;
+
+	/*
+	 * SOCK_NOSPACE is raised before the available space is sampled
+	 * and is only dropped again once both checks below have passed.
+	 */
+	set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+
+	/* space already reserved on this transport plus one maximum message */
+	needed = atomic_read(&xprt->xpt_reserved) + xprt->xpt_server->sv_max_mesg;
+	avail = sk_stream_wspace(svsk->sk_sk);
+
+	if (avail < sk_stream_min_wspace(svsk->sk_sk) || needed * 2 > avail)
+		return 0;
+
+	clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
+	return 1;
+}
+
+/*
+ * Create a TCP transport: forwards all arguments to svc_create_socket()
+ * with the protocol fixed to IPPROTO_TCP and returns its result
+ * unchanged.  Installed as .xpo_create in svc_tcp_ops.
+ */
+static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
+ struct sockaddr *sa, int salen,
+ int flags)
+{
+ return svc_create_socket(serv, IPPROTO_TCP, sa, salen, flags);
+}
+
+/*
+ * Transport operations for TCP sockets.  Referenced by svc_tcp_class
+ * through its .xcl_ops member.
+ */
+static struct svc_xprt_ops svc_tcp_ops = {
+ .xpo_create = svc_tcp_create,
+ .xpo_recvfrom = svc_tcp_recvfrom,
+ .xpo_sendto = svc_tcp_sendto,
+ .xpo_release_rqst = svc_release_skb,
+ .xpo_detach = svc_sock_detach,
+ .xpo_free = svc_sock_free,
+ .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr,
+ .xpo_has_wspace = svc_tcp_has_wspace,
+ .xpo_accept = svc_tcp_accept,
+};
+
+/*
+ * Transport class descriptor for TCP, named "tcp".  It is passed to
+ * svc_reg_xprt_class()/svc_unreg_xprt_class() and to svc_xprt_init()
+ * when a TCP svc_sock is set up.
+ */
+static struct svc_xprt_class svc_tcp_class = {
+ .xcl_name = "tcp",
+ .xcl_owner = THIS_MODULE,
+ .xcl_ops = &svc_tcp_ops,
+ .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
+};
+
+/*
+ * Register the socket-based transport classes (TCP first, then UDP)
+ * via svc_reg_xprt_class().
+ */
+void svc_init_xprt_sock(void)
+{
+ svc_reg_xprt_class(&svc_tcp_class);
+ svc_reg_xprt_class(&svc_udp_class);
+}
+
+/*
+ * Counterpart of svc_init_xprt_sock(): unregister the TCP and UDP
+ * transport classes via svc_unreg_xprt_class().
+ */
+void svc_cleanup_xprt_sock(void)
+{
+ svc_unreg_xprt_class(&svc_tcp_class);
+ svc_unreg_xprt_class(&svc_udp_class);
+}
+
+static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
{
struct sock *sk = svsk->sk_sk;
struct tcp_sock *tp = tcp_sk(sk);
- svsk->sk_recvfrom = svc_tcp_recvfrom;
- svsk->sk_sendto = svc_tcp_sendto;
-
+ svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv);
+ set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
if (sk->sk_state == TCP_LISTEN) {
dprintk("setting up TCP socket for listening\n");
+ set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
sk->sk_data_ready = svc_tcp_listen_data_ready;
- set_bit(SK_CONN, &svsk->sk_flags);
+ set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
} else {
dprintk("setting up TCP socket for reading\n");
sk->sk_state_change = svc_tcp_state_change;
@@ -1373,18 +1070,17 @@
* svc_tcp_recvfrom will re-adjust if necessary
*/
svc_sock_setbufsize(svsk->sk_sock,
- 3 * svsk->sk_server->sv_max_mesg,
- 3 * svsk->sk_server->sv_max_mesg);
+ 3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
+ 3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
- set_bit(SK_CHNGBUF, &svsk->sk_flags);
- set_bit(SK_DATA, &svsk->sk_flags);
+ set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
+ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
if (sk->sk_state != TCP_ESTABLISHED)
- set_bit(SK_CLOSE, &svsk->sk_flags);
+ set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
}
}
-void
-svc_sock_update_bufs(struct svc_serv *serv)
+void svc_sock_update_bufs(struct svc_serv *serv)
{
/*
* The number of server threads has changed. Update
@@ -1395,232 +1091,18 @@
spin_lock_bh(&serv->sv_lock);
list_for_each(le, &serv->sv_permsocks) {
struct svc_sock *svsk =
- list_entry(le, struct svc_sock, sk_list);
- set_bit(SK_CHNGBUF, &svsk->sk_flags);
+ list_entry(le, struct svc_sock, sk_xprt.xpt_list);
+ set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
}
list_for_each(le, &serv->sv_tempsocks) {
struct svc_sock *svsk =
- list_entry(le, struct svc_sock, sk_list);
- set_bit(SK_CHNGBUF, &svsk->sk_flags);
+ list_entry(le, struct svc_sock, sk_xprt.xpt_list);
+ set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
}
spin_unlock_bh(&serv->sv_lock);
}
/*
- * Receive the next request on any socket. This code is carefully
- * organised not to touch any cachelines in the shared svc_serv
- * structure, only cachelines in the local svc_pool.
- */
-int
-svc_recv(struct svc_rqst *rqstp, long timeout)
-{
- struct svc_sock *svsk = NULL;
- struct svc_serv *serv = rqstp->rq_server;
- struct svc_pool *pool = rqstp->rq_pool;
- int len, i;
- int pages;
- struct xdr_buf *arg;
- DECLARE_WAITQUEUE(wait, current);
-
- dprintk("svc: server %p waiting for data (to = %ld)\n",
- rqstp, timeout);
-
- if (rqstp->rq_sock)
- printk(KERN_ERR
- "svc_recv: service %p, socket not NULL!\n",
- rqstp);
- if (waitqueue_active(&rqstp->rq_wait))
- printk(KERN_ERR
- "svc_recv: service %p, wait queue active!\n",
- rqstp);
-
-
- /* now allocate needed pages. If we get a failure, sleep briefly */
- pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
- for (i=0; i < pages ; i++)
- while (rqstp->rq_pages[i] == NULL) {
- struct page *p = alloc_page(GFP_KERNEL);
- if (!p)
- schedule_timeout_uninterruptible(msecs_to_jiffies(500));
- rqstp->rq_pages[i] = p;
- }
- rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
- BUG_ON(pages >= RPCSVC_MAXPAGES);
-
- /* Make arg->head point to first page and arg->pages point to rest */
- arg = &rqstp->rq_arg;
- arg->head[0].iov_base = page_address(rqstp->rq_pages[0]);
- arg->head[0].iov_len = PAGE_SIZE;
- arg->pages = rqstp->rq_pages + 1;
- arg->page_base = 0;
- /* save at least one page for response */
- arg->page_len = (pages-2)*PAGE_SIZE;
- arg->len = (pages-1)*PAGE_SIZE;
- arg->tail[0].iov_len = 0;
-
- try_to_freeze();
- cond_resched();
- if (signalled())
- return -EINTR;
-
- spin_lock_bh(&pool->sp_lock);
- if ((svsk = svc_sock_dequeue(pool)) != NULL) {
- rqstp->rq_sock = svsk;
- atomic_inc(&svsk->sk_inuse);
- rqstp->rq_reserved = serv->sv_max_mesg;
- atomic_add(rqstp->rq_reserved, &svsk->sk_reserved);
- } else {
- /* No data pending. Go to sleep */
- svc_thread_enqueue(pool, rqstp);
-
- /*
- * We have to be able to interrupt this wait
- * to bring down the daemons ...
- */
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&rqstp->rq_wait, &wait);
- spin_unlock_bh(&pool->sp_lock);
-
- schedule_timeout(timeout);
-
- try_to_freeze();
-
- spin_lock_bh(&pool->sp_lock);
- remove_wait_queue(&rqstp->rq_wait, &wait);
-
- if (!(svsk = rqstp->rq_sock)) {
- svc_thread_dequeue(pool, rqstp);
- spin_unlock_bh(&pool->sp_lock);
- dprintk("svc: server %p, no data yet\n", rqstp);
- return signalled()? -EINTR : -EAGAIN;
- }
- }
- spin_unlock_bh(&pool->sp_lock);
-
- dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n",
- rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse));
- len = svsk->sk_recvfrom(rqstp);
- dprintk("svc: got len=%d\n", len);
-
- /* No data, incomplete (TCP) read, or accept() */
- if (len == 0 || len == -EAGAIN) {
- rqstp->rq_res.len = 0;
- svc_sock_release(rqstp);
- return -EAGAIN;
- }
- svsk->sk_lastrecv = get_seconds();
- clear_bit(SK_OLD, &svsk->sk_flags);
-
- rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp));
- rqstp->rq_chandle.defer = svc_defer;
-
- if (serv->sv_stats)
- serv->sv_stats->netcnt++;
- return len;
-}
-
-/*
- * Drop request
- */
-void
-svc_drop(struct svc_rqst *rqstp)
-{
- dprintk("svc: socket %p dropped request\n", rqstp->rq_sock);
- svc_sock_release(rqstp);
-}
-
-/*
- * Return reply to client.
- */
-int
-svc_send(struct svc_rqst *rqstp)
-{
- struct svc_sock *svsk;
- int len;
- struct xdr_buf *xb;
-
- if ((svsk = rqstp->rq_sock) == NULL) {
- printk(KERN_WARNING "NULL socket pointer in %s:%d\n",
- __FILE__, __LINE__);
- return -EFAULT;
- }
-
- /* release the receive skb before sending the reply */
- svc_release_skb(rqstp);
-
- /* calculate over-all length */
- xb = & rqstp->rq_res;
- xb->len = xb->head[0].iov_len +
- xb->page_len +
- xb->tail[0].iov_len;
-
- /* Grab svsk->sk_mutex to serialize outgoing data. */
- mutex_lock(&svsk->sk_mutex);
- if (test_bit(SK_DEAD, &svsk->sk_flags))
- len = -ENOTCONN;
- else
- len = svsk->sk_sendto(rqstp);
- mutex_unlock(&svsk->sk_mutex);
- svc_sock_release(rqstp);
-
- if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
- return 0;
- return len;
-}
-
-/*
- * Timer function to close old temporary sockets, using
- * a mark-and-sweep algorithm.
- */
-static void
-svc_age_temp_sockets(unsigned long closure)
-{
- struct svc_serv *serv = (struct svc_serv *)closure;
- struct svc_sock *svsk;
- struct list_head *le, *next;
- LIST_HEAD(to_be_aged);
-
- dprintk("svc_age_temp_sockets\n");
-
- if (!spin_trylock_bh(&serv->sv_lock)) {
- /* busy, try again 1 sec later */
- dprintk("svc_age_temp_sockets: busy\n");
- mod_timer(&serv->sv_temptimer, jiffies + HZ);
- return;
- }
-
- list_for_each_safe(le, next, &serv->sv_tempsocks) {
- svsk = list_entry(le, struct svc_sock, sk_list);
-
- if (!test_and_set_bit(SK_OLD, &svsk->sk_flags))
- continue;
- if (atomic_read(&svsk->sk_inuse) > 1 || test_bit(SK_BUSY, &svsk->sk_flags))
- continue;
- atomic_inc(&svsk->sk_inuse);
- list_move(le, &to_be_aged);
- set_bit(SK_CLOSE, &svsk->sk_flags);
- set_bit(SK_DETACHED, &svsk->sk_flags);
- }
- spin_unlock_bh(&serv->sv_lock);
-
- while (!list_empty(&to_be_aged)) {
- le = to_be_aged.next;
- /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */
- list_del_init(le);
- svsk = list_entry(le, struct svc_sock, sk_list);
-
- dprintk("queuing svsk %p for closing, %lu seconds old\n",
- svsk, get_seconds() - svsk->sk_lastrecv);
-
- /* a thread will dequeue and close it soon */
- svc_sock_enqueue(svsk);
- svc_sock_put(svsk);
- }
-
- mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
-}
-
-/*
* Initialize socket for RPC use and create svc_sock struct
* XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
*/
@@ -1631,7 +1113,6 @@
struct svc_sock *svsk;
struct sock *inet;
int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
- int is_temporary = flags & SVC_SOCK_TEMPORARY;
dprintk("svc: svc_setup_socket %p\n", sock);
if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
@@ -1651,44 +1132,18 @@
return NULL;
}
- set_bit(SK_BUSY, &svsk->sk_flags);
inet->sk_user_data = svsk;
svsk->sk_sock = sock;
svsk->sk_sk = inet;
svsk->sk_ostate = inet->sk_state_change;
svsk->sk_odata = inet->sk_data_ready;
svsk->sk_owspace = inet->sk_write_space;
- svsk->sk_server = serv;
- atomic_set(&svsk->sk_inuse, 1);
- svsk->sk_lastrecv = get_seconds();
- spin_lock_init(&svsk->sk_lock);
- INIT_LIST_HEAD(&svsk->sk_deferred);
- INIT_LIST_HEAD(&svsk->sk_ready);
- mutex_init(&svsk->sk_mutex);
/* Initialize the socket */
if (sock->type == SOCK_DGRAM)
- svc_udp_init(svsk);
+ svc_udp_init(svsk, serv);
else
- svc_tcp_init(svsk);
-
- spin_lock_bh(&serv->sv_lock);
- if (is_temporary) {
- set_bit(SK_TEMP, &svsk->sk_flags);
- list_add(&svsk->sk_list, &serv->sv_tempsocks);
- serv->sv_tmpcnt++;
- if (serv->sv_temptimer.function == NULL) {
- /* setup timer to age temp sockets */
- setup_timer(&serv->sv_temptimer, svc_age_temp_sockets,
- (unsigned long)serv);
- mod_timer(&serv->sv_temptimer,
- jiffies + svc_conn_age_period * HZ);
- }
- } else {
- clear_bit(SK_TEMP, &svsk->sk_flags);
- list_add(&svsk->sk_list, &serv->sv_permsocks);
- }
- spin_unlock_bh(&serv->sv_lock);
+ svc_tcp_init(svsk, serv);
dprintk("svc: svc_setup_socket created %p (inet %p)\n",
svsk, svsk->sk_sk);
@@ -1717,7 +1172,16 @@
else {
svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS);
if (svsk) {
- svc_sock_received(svsk);
+ struct sockaddr_storage addr;
+ struct sockaddr *sin = (struct sockaddr *)&addr;
+ int salen;
+ if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0)
+ svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
+ clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags);
+ spin_lock_bh(&serv->sv_lock);
+ list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks);
+ spin_unlock_bh(&serv->sv_lock);
+ svc_xprt_received(&svsk->sk_xprt);
err = 0;
}
}
@@ -1733,14 +1197,19 @@
/*
* Create socket for RPC service.
*/
-static int svc_create_socket(struct svc_serv *serv, int protocol,
- struct sockaddr *sin, int len, int flags)
+static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
+ int protocol,
+ struct sockaddr *sin, int len,
+ int flags)
{
struct svc_sock *svsk;
struct socket *sock;
int error;
int type;
char buf[RPC_MAX_ADDRBUFLEN];
+ struct sockaddr_storage addr;
+ struct sockaddr *newsin = (struct sockaddr *)&addr;
+ int newlen;
dprintk("svc: svc_create_socket(%s, %d, %s)\n",
serv->sv_program->pg_name, protocol,
@@ -1749,13 +1218,13 @@
if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
printk(KERN_WARNING "svc: only UDP and TCP "
"sockets supported\n");
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
}
type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
error = sock_create_kern(sin->sa_family, type, protocol, &sock);
if (error < 0)
- return error;
+ return ERR_PTR(error);
svc_reclassify_socket(sock);
@@ -1765,203 +1234,55 @@
if (error < 0)
goto bummer;
+ newlen = len;
+ error = kernel_getsockname(sock, newsin, &newlen);
+ if (error < 0)
+ goto bummer;
+
if (protocol == IPPROTO_TCP) {
if ((error = kernel_listen(sock, 64)) < 0)
goto bummer;
}
if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) {
- svc_sock_received(svsk);
- return ntohs(inet_sk(svsk->sk_sk)->sport);
+ svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
+ return (struct svc_xprt *)svsk;
}
bummer:
dprintk("svc: svc_create_socket error = %d\n", -error);
sock_release(sock);
- return error;
+ return ERR_PTR(error);
}
/*
- * Remove a dead socket
+ * Detach the svc_sock from the socket so that no
+ * more callbacks occur.
*/
-static void
-svc_delete_socket(struct svc_sock *svsk)
+static void svc_sock_detach(struct svc_xprt *xprt)
{
- struct svc_serv *serv;
- struct sock *sk;
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ struct sock *sk = svsk->sk_sk;
- dprintk("svc: svc_delete_socket(%p)\n", svsk);
+ dprintk("svc: svc_sock_detach(%p)\n", svsk);
- serv = svsk->sk_server;
- sk = svsk->sk_sk;
-
+ /* put back the old socket callbacks */
sk->sk_state_change = svsk->sk_ostate;
sk->sk_data_ready = svsk->sk_odata;
sk->sk_write_space = svsk->sk_owspace;
-
- spin_lock_bh(&serv->sv_lock);
-
- if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags))
- list_del_init(&svsk->sk_list);
- /*
- * We used to delete the svc_sock from whichever list
- * it's sk_ready node was on, but we don't actually
- * need to. This is because the only time we're called
- * while still attached to a queue, the queue itself
- * is about to be destroyed (in svc_destroy).
- */
- if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) {
- BUG_ON(atomic_read(&svsk->sk_inuse)<2);
- atomic_dec(&svsk->sk_inuse);
- if (test_bit(SK_TEMP, &svsk->sk_flags))
- serv->sv_tmpcnt--;
- }
-
- spin_unlock_bh(&serv->sv_lock);
-}
-
-static void svc_close_socket(struct svc_sock *svsk)
-{
- set_bit(SK_CLOSE, &svsk->sk_flags);
- if (test_and_set_bit(SK_BUSY, &svsk->sk_flags))
- /* someone else will have to effect the close */
- return;
-
- atomic_inc(&svsk->sk_inuse);
- svc_delete_socket(svsk);
- clear_bit(SK_BUSY, &svsk->sk_flags);
- svc_sock_put(svsk);
-}
-
-void svc_force_close_socket(struct svc_sock *svsk)
-{
- set_bit(SK_CLOSE, &svsk->sk_flags);
- if (test_bit(SK_BUSY, &svsk->sk_flags)) {
- /* Waiting to be processed, but no threads left,
- * So just remove it from the waiting list
- */
- list_del_init(&svsk->sk_ready);
- clear_bit(SK_BUSY, &svsk->sk_flags);
- }
- svc_close_socket(svsk);
-}
-
-/**
- * svc_makesock - Make a socket for nfsd and lockd
- * @serv: RPC server structure
- * @protocol: transport protocol to use
- * @port: port to use
- * @flags: requested socket characteristics
- *
- */
-int svc_makesock(struct svc_serv *serv, int protocol, unsigned short port,
- int flags)
-{
- struct sockaddr_in sin = {
- .sin_family = AF_INET,
- .sin_addr.s_addr = INADDR_ANY,
- .sin_port = htons(port),
- };
-
- dprintk("svc: creating socket proto = %d\n", protocol);
- return svc_create_socket(serv, protocol, (struct sockaddr *) &sin,
- sizeof(sin), flags);
}
/*
- * Handle defer and revisit of requests
+ * Free the svc_sock's socket resources and the svc_sock itself.
*/
-
-static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
+static void svc_sock_free(struct svc_xprt *xprt)
{
- struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle);
- struct svc_sock *svsk;
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ dprintk("svc: svc_sock_free(%p)\n", svsk);
- if (too_many) {
- svc_sock_put(dr->svsk);
- kfree(dr);
- return;
- }
- dprintk("revisit queued\n");
- svsk = dr->svsk;
- dr->svsk = NULL;
- spin_lock(&svsk->sk_lock);
- list_add(&dr->handle.recent, &svsk->sk_deferred);
- spin_unlock(&svsk->sk_lock);
- set_bit(SK_DEFERRED, &svsk->sk_flags);
- svc_sock_enqueue(svsk);
- svc_sock_put(svsk);
-}
-
-static struct cache_deferred_req *
-svc_defer(struct cache_req *req)
-{
- struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
- int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len);
- struct svc_deferred_req *dr;
-
- if (rqstp->rq_arg.page_len)
- return NULL; /* if more than a page, give up FIXME */
- if (rqstp->rq_deferred) {
- dr = rqstp->rq_deferred;
- rqstp->rq_deferred = NULL;
- } else {
- int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
- /* FIXME maybe discard if size too large */
- dr = kmalloc(size, GFP_KERNEL);
- if (dr == NULL)
- return NULL;
-
- dr->handle.owner = rqstp->rq_server;
- dr->prot = rqstp->rq_prot;
- memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen);
- dr->addrlen = rqstp->rq_addrlen;
- dr->daddr = rqstp->rq_daddr;
- dr->argslen = rqstp->rq_arg.len >> 2;
- memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2);
- }
- atomic_inc(&rqstp->rq_sock->sk_inuse);
- dr->svsk = rqstp->rq_sock;
-
- dr->handle.revisit = svc_revisit;
- return &dr->handle;
-}
-
-/*
- * recv data from a deferred request into an active one
- */
-static int svc_deferred_recv(struct svc_rqst *rqstp)
-{
- struct svc_deferred_req *dr = rqstp->rq_deferred;
-
- rqstp->rq_arg.head[0].iov_base = dr->args;
- rqstp->rq_arg.head[0].iov_len = dr->argslen<<2;
- rqstp->rq_arg.page_len = 0;
- rqstp->rq_arg.len = dr->argslen<<2;
- rqstp->rq_prot = dr->prot;
- memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen);
- rqstp->rq_addrlen = dr->addrlen;
- rqstp->rq_daddr = dr->daddr;
- rqstp->rq_respages = rqstp->rq_pages;
- return dr->argslen<<2;
-}
-
-
-static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
-{
- struct svc_deferred_req *dr = NULL;
-
- if (!test_bit(SK_DEFERRED, &svsk->sk_flags))
- return NULL;
- spin_lock(&svsk->sk_lock);
- clear_bit(SK_DEFERRED, &svsk->sk_flags);
- if (!list_empty(&svsk->sk_deferred)) {
- dr = list_entry(svsk->sk_deferred.next,
- struct svc_deferred_req,
- handle.recent);
- list_del_init(&dr->handle.recent);
- set_bit(SK_DEFERRED, &svsk->sk_flags);
- }
- spin_unlock(&svsk->sk_lock);
- return dr;
+ if (svsk->sk_sock->file)
+ sockfd_put(svsk->sk_sock);
+ else
+ sock_release(svsk->sk_sock);
+ kfree(svsk);
}
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index bada7de..0f8c439 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -18,6 +18,7 @@
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/stats.h>
+#include <linux/sunrpc/svc_xprt.h>
/*
* Declare the debug flags here
@@ -55,6 +56,30 @@
}
}
+/*
+ * sysctl handler for the read-only "transports" file.
+ *
+ * The text produced by svc_print_xprts() is handed to user space in a
+ * single read: a read at a non-zero offset, or with a zero-length
+ * buffer, reports end of file.  Writes are rejected with -EINVAL.
+ *
+ * On success 0 is returned, *lenp is set to the number of bytes copied
+ * and *ppos is advanced by the same amount.
+ */
+static int proc_do_xprt(ctl_table *table, int write, struct file *file,
+			void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	char tmpbuf[256];
+	int len;
+
+	if ((*ppos && !write) || !*lenp) {
+		*lenp = 0;
+		return 0;
+	}
+	if (write)
+		return -EINVAL;
+
+	len = svc_print_xprts(tmpbuf, sizeof(tmpbuf));
+	/* defensive: never treat a negative return as a byte count */
+	if (len < 0)
+		return len;
+	/* do not write past the end of the caller's buffer */
+	if ((size_t)len > *lenp)
+		len = *lenp;
+
+	if (!access_ok(VERIFY_WRITE, buffer, len))
+		return -EFAULT;
+	if (__copy_to_user(buffer, tmpbuf, len))
+		return -EFAULT;
+
+	/* report the bytes transferred, not the space left over */
+	*lenp = len;
+	*ppos += len;
+	return 0;
+}
+
static int
proc_dodebug(ctl_table *table, int write, struct file *file,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -147,6 +172,12 @@
.mode = 0644,
.proc_handler = &proc_dodebug
},
+ {
+ .procname = "transports",
+ .maxlen = 256,
+ .mode = 0444,
+ .proc_handler = &proc_do_xprt,
+ },
{ .ctl_name = 0 }
};
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 5426406..995c3fd 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -96,11 +96,13 @@
EXPORT_SYMBOL(xdr_encode_string);
__be32 *
-xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen)
+xdr_decode_string_inplace(__be32 *p, char **sp,
+ unsigned int *lenp, unsigned int maxlen)
{
- unsigned int len;
+ u32 len;
- if ((len = ntohl(*p++)) > maxlen)
+ len = ntohl(*p++);
+ if (len > maxlen)
return NULL;
*lenp = len;
*sp = (char *) p;
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 264f0fe..5a8f268 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,3 +1,8 @@
obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o
xprtrdma-y := transport.o rpc_rdma.o verbs.o
+
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o
+
+svcrdma-y := svc_rdma.o svc_rdma_transport.o \
+ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
new file mode 100644
index 0000000..88c0ca2
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+/*
+ * RPC/RDMA parameters
+ *
+ * Each tunable is followed by the lower and upper bounds that its
+ * entry in svcrdma_parm_table hands to proc_dointvec_minmax through
+ * .extra1/.extra2.
+ */
+unsigned int svcrdma_ord = RPCRDMA_ORD;
+static unsigned int min_ord = 1;
+static unsigned int max_ord = 4096;
+unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
+static unsigned int min_max_requests = 4;
+static unsigned int max_max_requests = 16384;
+unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
+static unsigned int min_max_inline = 4096;
+static unsigned int max_max_inline = 65536;
+
+/*
+ * Statistics counters.  Each one is exposed as a sysctl file of the
+ * same name in svcrdma_parm_table and handled by read_reset_stat().
+ */
+atomic_t rdma_stat_recv;
+atomic_t rdma_stat_read;
+atomic_t rdma_stat_write;
+atomic_t rdma_stat_sq_starve;
+atomic_t rdma_stat_rq_starve;
+atomic_t rdma_stat_rq_poll;
+atomic_t rdma_stat_rq_prod;
+atomic_t rdma_stat_sq_poll;
+atomic_t rdma_stat_sq_prod;
+
+/*
+ * This function implements reading and resetting an atomic_t stat
+ * variable through read/write to a proc file. Any write to the file
+ * resets the associated statistic to zero. Any read returns its
+ * current value, formatted as a decimal number followed by a newline.
+ *
+ * table->data must point at the atomic_t to report; -EINVAL is
+ * returned when it is NULL and -EFAULT when the copy to user space
+ * fails.  On a successful read *lenp is set to the number of bytes
+ * copied and *ppos is advanced past them.
+ */
+static int read_reset_stat(ctl_table *table, int write,
+			   struct file *filp, void __user *buffer, size_t *lenp,
+			   loff_t *ppos)
+{
+	atomic_t *stat = (atomic_t *)table->data;
+
+	if (!stat)
+		return -EINVAL;
+
+	if (write)
+		atomic_set(stat, 0);
+	else {
+		char str_buf[32];
+		char *data;
+		int len = snprintf(str_buf, sizeof(str_buf), "%d\n",
+				   atomic_read(stat));
+		if (len >= sizeof(str_buf))
+			return -EFAULT;
+		len = strlen(str_buf);
+		if (*ppos > len) {
+			*lenp = 0;
+			return 0;
+		}
+		/* resume at the reader's offset, not at the start of the text */
+		data = &str_buf[*ppos];
+		len -= *ppos;
+		if (len > *lenp)
+			len = *lenp;
+		if (len && copy_to_user(buffer, data, len))
+			return -EFAULT;
+		*lenp = len;
+		*ppos += len;
+	}
+	return 0;
+}
+
+/* Handle from register_sysctl_table(); NULL while nothing is registered. */
+static struct ctl_table_header *svcrdma_table_header;
+
+/*
+ * Entries of the "svc_rdma" sysctl directory: three range-checked
+ * tunables followed by the statistics counters.
+ */
+static ctl_table svcrdma_parm_table[] = {
+ {
+ .procname = "max_requests",
+ .data = &svcrdma_max_requests,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_max_requests,
+ .extra2 = &max_max_requests
+ },
+ {
+ .procname = "max_req_size",
+ .data = &svcrdma_max_req_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_max_inline,
+ .extra2 = &max_max_inline
+ },
+ {
+ .procname = "max_outbound_read_requests",
+ .data = &svcrdma_ord,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &min_ord,
+ .extra2 = &max_ord,
+ },
+
+ /* Statistics: a read reports the counter, any write resets it to zero. */
+ {
+ .procname = "rdma_stat_read",
+ .data = &rdma_stat_read,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = &read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_recv",
+ .data = &rdma_stat_recv,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = &read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_write",
+ .data = &rdma_stat_write,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = &read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_sq_starve",
+ .data = &rdma_stat_sq_starve,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = &read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_rq_starve",
+ .data = &rdma_stat_rq_starve,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = &read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_rq_poll",
+ .data = &rdma_stat_rq_poll,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = &read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_rq_prod",
+ .data = &rdma_stat_rq_prod,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = &read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_sq_poll",
+ .data = &rdma_stat_sq_poll,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = &read_reset_stat,
+ },
+ {
+ .procname = "rdma_stat_sq_prod",
+ .data = &rdma_stat_sq_prod,
+ .maxlen = sizeof(atomic_t),
+ .mode = 0644,
+ .proc_handler = &read_reset_stat,
+ },
+ {
+ .ctl_name = 0,
+ },
+};
+
+/* The "svc_rdma" directory; its children are the svcrdma_parm_table entries. */
+static ctl_table svcrdma_table[] = {
+ {
+ .procname = "svc_rdma",
+ .mode = 0555,
+ .child = svcrdma_parm_table
+ },
+ {
+ .ctl_name = 0,
+ },
+};
+
+/*
+ * Root of the tree handed to register_sysctl_table(): a "sunrpc"
+ * directory (CTL_SUNRPC) containing svcrdma_table.
+ */
+static ctl_table svcrdma_root_table[] = {
+ {
+ .ctl_name = CTL_SUNRPC,
+ .procname = "sunrpc",
+ .mode = 0555,
+ .child = svcrdma_table
+ },
+ {
+ .ctl_name = 0,
+ },
+};
+
+/*
+ * Module exit handler: unregister the sysctl table if one was
+ * registered (clearing the saved handle), then unregister the RDMA
+ * transport class.
+ */
+void svc_rdma_cleanup(void)
+{
+ dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
+ if (svcrdma_table_header) {
+ unregister_sysctl_table(svcrdma_table_header);
+ svcrdma_table_header = NULL;
+ }
+ svc_unreg_xprt_class(&svc_rdma_class);
+}
+
+/*
+ * Module init handler: log the configured parameters, register the
+ * sysctl tree unless a handle is already held, and register the RDMA
+ * transport class.  Always returns 0.
+ *
+ * NOTE(review): the result of register_sysctl_table() is not checked
+ * here, so a failed registration goes unreported - confirm that this
+ * is intended.
+ */
+int svc_rdma_init(void)
+{
+ dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
+ dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
+ dprintk("\tmax_requests : %d\n", svcrdma_max_requests);
+ dprintk("\tsq_depth : %d\n",
+ svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
+ dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
+ if (!svcrdma_table_header)
+ svcrdma_table_header =
+ register_sysctl_table(svcrdma_root_table);
+
+ /* Register RDMA with the SVC transport switch */
+ svc_reg_xprt_class(&svc_rdma_class);
+ return 0;
+}
+MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
+MODULE_DESCRIPTION("SVC RDMA Transport");
+MODULE_LICENSE("Dual BSD/GPL");
+module_init(svc_rdma_init);
+module_exit(svc_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
new file mode 100644
index 0000000..9530ef2
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/debug.h>
+#include <asm/unaligned.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+/*
+ * Decode a read chunk list in place, converting every entry to host byte
+ * order. The expected format is as follows:
+ *	discrim    : xdr_one
+ *	position   : u32 offset into XDR stream
+ *	handle     : u32 RKEY
+ *	length     : u32 segment length
+ *	offset     : u64 offset in the remote memory region
+ *	. . .
+ *	end-of-list: xdr_zero
+ *
+ * @va:    first word of the list
+ * @vaend: end of the bytes that may be examined
+ *
+ * Returns the address of the word following the list terminator, or NULL
+ * if the list does not fit in front of @vaend.
+ */
+static u32 *decode_read_list(u32 *va, u32 *vaend)
+{
+	struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
+
+	for (;;) {
+		u64 ch_offset;
+
+		/*
+		 * The discriminator has to lie in front of vaend before it
+		 * may be looked at; rc_position is the word right after it.
+		 */
+		if ((unsigned long)&ch->rc_position > (unsigned long)vaend) {
+			dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
+			return NULL;
+		}
+		if (ch->rc_discrim == xdr_zero)
+			break;
+
+		/* A non-terminator entry must fit completely */
+		if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
+		    (unsigned long)vaend) {
+			dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
+			return NULL;
+		}
+
+		ch->rc_discrim = ntohl(ch->rc_discrim);
+		ch->rc_position = ntohl(ch->rc_position);
+		ch->rc_target.rs_handle = ntohl(ch->rc_target.rs_handle);
+		ch->rc_target.rs_length = ntohl(ch->rc_target.rs_length);
+		/* rs_offset may be unaligned, so store it with put_unaligned */
+		va = (u32 *)&ch->rc_target.rs_offset;
+		xdr_decode_hyper(va, &ch_offset);
+		put_unaligned(ch_offset, (u64 *)va);
+		ch++;
+	}
+
+	/* Step over the xdr_zero terminator */
+	return (u32 *)&ch->rc_position;
+}
+
+/*
+ * Walk an already decoded read chunk list and report how many chunks it
+ * holds (*ch_count) and the sum of their segment lengths (*byte_count).
+ * The walk stops at the first entry whose discriminator is zero. The chunk
+ * list has already been verified to fit within the RPCRDMA header.
+ */
+void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
+			       int *ch_count, int *byte_count)
+{
+	*byte_count = 0;
+	*ch_count = 0;
+
+	while (ch->rc_discrim != 0) {
+		*byte_count += ch->rc_target.rs_length;
+		*ch_count += 1;
+		ch++;
+	}
+}
+
+/*
+ * Decode a write chunk list in place, converting it to host byte order.
+ * The expected format is as follows:
+ *	discrim : xdr_one
+ *	nchunks : <count>
+ *	handle  : u32 RKEY              ---+
+ *	length  : u32 <len of segment>     |
+ *	offset  : remote va                + <count>
+ *	. . .                              |
+ *	                                ---+
+ *
+ * @va:    first word of the list
+ * @vaend: end of the bytes that may be examined
+ *
+ * Returns the address of the word following the list (and its terminator),
+ * or NULL if the array does not fit in front of @vaend.
+ */
+static u32 *decode_write_list(u32 *va, u32 *vaend)
+{
+	int ch_no;
+	unsigned long avail;
+	struct rpcrdma_write_array *ary =
+		(struct rpcrdma_write_array *)va;
+
+	/* Check for not write-array */
+	if (ary->wc_discrim == xdr_zero)
+		return (u32 *)&ary->wc_nchunks;
+
+	if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
+	    (unsigned long)vaend) {
+		dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
+		return NULL;
+	}
+	ary->wc_discrim = ntohl(ary->wc_discrim);
+	ary->wc_nchunks = ntohl(ary->wc_nchunks);
+
+	/*
+	 * wc_nchunks comes off the wire. Compare it with the number of
+	 * chunks that fit in the remaining bytes rather than multiplying it
+	 * out, so a huge count cannot wrap the arithmetic. The check above
+	 * guarantees wc_array[0] does not lie beyond vaend.
+	 */
+	avail = (unsigned long)vaend - (unsigned long)&ary->wc_array[0];
+	if (ary->wc_nchunks > avail / sizeof(struct rpcrdma_write_chunk)) {
+		dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
+			ary, ary->wc_nchunks, vaend);
+		return NULL;
+	}
+	for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) {
+		u64 ch_offset;
+
+		ary->wc_array[ch_no].wc_target.rs_handle =
+			ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
+		ary->wc_array[ch_no].wc_target.rs_length =
+			ntohl(ary->wc_array[ch_no].wc_target.rs_length);
+		/* rs_offset may be unaligned, so store it with put_unaligned */
+		va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
+		xdr_decode_hyper(va, &ch_offset);
+		put_unaligned(ch_offset, (u64 *)va);
+	}
+
+	/*
+	 * rs_length is the 2nd 4B field in wc_target and taking its
+	 * address skips the list terminator
+	 */
+	return (u32 *)&ary->wc_array[ch_no].wc_target.rs_length;
+}
+
+/*
+ * Decode the reply array in place, converting it to host byte order. The
+ * layout is that of a write array (discriminator, chunk count, then
+ * handle/length/offset segments), but no terminator word is skipped
+ * after it.
+ *
+ * @va:    first word of the array
+ * @vaend: end of the bytes that may be examined
+ *
+ * Returns the address just past the array, or NULL if the array does not
+ * fit in front of @vaend.
+ */
+static u32 *decode_reply_array(u32 *va, u32 *vaend)
+{
+	int ch_no;
+	unsigned long avail;
+	struct rpcrdma_write_array *ary =
+		(struct rpcrdma_write_array *)va;
+
+	/* Check for no reply-array */
+	if (ary->wc_discrim == xdr_zero)
+		return (u32 *)&ary->wc_nchunks;
+
+	if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
+	    (unsigned long)vaend) {
+		dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
+		return NULL;
+	}
+	ary->wc_discrim = ntohl(ary->wc_discrim);
+	ary->wc_nchunks = ntohl(ary->wc_nchunks);
+
+	/*
+	 * wc_nchunks comes off the wire. Compare it with the number of
+	 * chunks that fit in the remaining bytes rather than multiplying it
+	 * out, so a huge count cannot wrap the arithmetic. The check above
+	 * guarantees wc_array[0] does not lie beyond vaend.
+	 */
+	avail = (unsigned long)vaend - (unsigned long)&ary->wc_array[0];
+	if (ary->wc_nchunks > avail / sizeof(struct rpcrdma_write_chunk)) {
+		dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
+			ary, ary->wc_nchunks, vaend);
+		return NULL;
+	}
+	for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) {
+		u64 ch_offset;
+
+		ary->wc_array[ch_no].wc_target.rs_handle =
+			ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
+		ary->wc_array[ch_no].wc_target.rs_length =
+			ntohl(ary->wc_array[ch_no].wc_target.rs_length);
+		/* rs_offset may be unaligned, so store it with put_unaligned */
+		va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
+		xdr_decode_hyper(va, &ch_offset);
+		put_unaligned(ch_offset, (u64 *)va);
+	}
+
+	return (u32 *)&ary->wc_array[ch_no];
+}
+
+/*
+ * Decode, in place, the RPC-over-RDMA header found at the start of
+ * rqstp->rq_arg.head[0] and advance head[0] past it.
+ *
+ * @rdma_req: receives the address of the header. It is stored before any
+ *            return, so it is valid on error returns as well.
+ * @rqstp:    request whose rq_arg holds the received message
+ *
+ * Returns the header length in bytes on success, -EINVAL if the message
+ * is too short or its chunk lists do not fit in the received data, and
+ * -ENOSYS if the protocol version is not RPCRDMA_VERSION. head[0] is only
+ * modified on success.
+ */
+int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
+			    struct svc_rqst *rqstp)
+{
+	struct rpcrdma_msg *rmsgp;
+	u32 *va;
+	u32 *vaend;
+	u32 hdr_len;
+
+	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+
+	/*
+	 * Hand the header back up front: callers use it to build an error
+	 * reply after -ENOSYS and to look for a read list after a padded
+	 * message, so it must be set on every return path.
+	 */
+	*rdma_req = rmsgp;
+
+	/* Verify that there's enough bytes for header + something */
+	if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) {
+		dprintk("svcrdma: header too short = %d\n",
+			rqstp->rq_arg.len);
+		return -EINVAL;
+	}
+
+	/* Decode the header */
+	rmsgp->rm_xid = ntohl(rmsgp->rm_xid);
+	rmsgp->rm_vers = ntohl(rmsgp->rm_vers);
+	rmsgp->rm_credit = ntohl(rmsgp->rm_credit);
+	rmsgp->rm_type = ntohl(rmsgp->rm_type);
+
+	if (rmsgp->rm_vers != RPCRDMA_VERSION)
+		return -ENOSYS;
+
+	/* Pull in the extra for the padded case and bump our pointer */
+	if (rmsgp->rm_type == RDMA_MSGP) {
+		rmsgp->rm_body.rm_padded.rm_align =
+			ntohl(rmsgp->rm_body.rm_padded.rm_align);
+		rmsgp->rm_body.rm_padded.rm_thresh =
+			ntohl(rmsgp->rm_body.rm_padded.rm_thresh);
+
+		va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
+		hdr_len = (u32)((unsigned long)va - (unsigned long)rmsgp);
+		/* Validate before touching head[0] */
+		if (hdr_len > rqstp->rq_arg.len)
+			return -EINVAL;
+		rqstp->rq_arg.head[0].iov_base = va;
+		rqstp->rq_arg.head[0].iov_len -= hdr_len;
+		return hdr_len;
+	}
+
+	/* The chunk list may contain either a read chunk list or a write
+	 * chunk list and a reply chunk list.
+	 */
+	va = &rmsgp->rm_body.rm_chunks[0];
+	vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+	va = decode_read_list(va, vaend);
+	if (!va)
+		return -EINVAL;
+	va = decode_write_list(va, vaend);
+	if (!va)
+		return -EINVAL;
+	va = decode_reply_array(va, vaend);
+	if (!va)
+		return -EINVAL;
+
+	/*
+	 * As in the padded case, a header that claims more bytes than were
+	 * received is malformed.
+	 */
+	hdr_len = (unsigned long)va - (unsigned long)rmsgp;
+	if (hdr_len > rqstp->rq_arg.len)
+		return -EINVAL;
+
+	rqstp->rq_arg.head[0].iov_base = va;
+	rqstp->rq_arg.head[0].iov_len -= hdr_len;
+
+	return hdr_len;
+}
+
+/*
+ * Re-locate the RPC message behind an RPC-over-RDMA header that has
+ * already been decoded (fields are read in host byte order and without
+ * bounds checks). head[0] of rq_arg is advanced past the header.
+ *
+ * Returns the header length in bytes.
+ */
+int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp)
+{
+	struct rpcrdma_msg *rmsgp;
+	struct rpcrdma_read_chunk *rd;
+	struct rpcrdma_write_array *wr;
+	u32 *p;
+	u32 hdrlen;
+
+	dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n",
+		rqstp);
+	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+
+	if (rmsgp->rm_type == RDMA_MSGP) {
+		/* Padded case: the message starts at rm_pempty[4] */
+		p = &rmsgp->rm_body.rm_padded.rm_pempty[4];
+	} else {
+		/*
+		 * Skip all chunks to find RPC msg. These were previously
+		 * processed
+		 */
+
+		/* Read list: step to the terminator, then over it */
+		rd = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+		while (rd->rc_discrim != xdr_zero)
+			rd++;
+		p = (u32 *)&rd->rc_position;
+
+		/*
+		 * Write list: rs_length is the 2nd 4B field in wc_target and
+		 * taking its address skips the list terminator
+		 */
+		wr = (struct rpcrdma_write_array *)p;
+		if (wr->wc_discrim != xdr_zero)
+			p = (u32 *)&wr->wc_array[wr->wc_nchunks].wc_target.rs_length;
+		else
+			p = (u32 *)&wr->wc_nchunks;
+
+		/* Reply array: no terminator to skip */
+		wr = (struct rpcrdma_write_array *)p;
+		if (wr->wc_discrim != xdr_zero)
+			p = (u32 *)&wr->wc_array[wr->wc_nchunks];
+		else
+			p = (u32 *)&wr->wc_nchunks;
+	}
+
+	rqstp->rq_arg.head[0].iov_base = p;
+	hdrlen = (unsigned long)p - (unsigned long)rmsgp;
+	rqstp->rq_arg.head[0].iov_len -= hdrlen;
+
+	return hdrlen;
+}
+
+/*
+ * Encode an RDMA_ERROR reply at @va: xid and version copied from @rmsgp,
+ * the transport's sc_max_requests as the credit value, the RDMA_ERROR type
+ * and the error code. For ERR_VERS, RPCRDMA_VERSION is appended twice.
+ *
+ * Returns the number of bytes written.
+ */
+int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
+			      struct rpcrdma_msg *rmsgp,
+			      enum rpcrdma_errcode err, u32 *va)
+{
+	int nwords = 0;
+
+	va[nwords++] = htonl(rmsgp->rm_xid);
+	va[nwords++] = htonl(rmsgp->rm_vers);
+	va[nwords++] = htonl(xprt->sc_max_requests);
+	va[nwords++] = htonl(RDMA_ERROR);
+	va[nwords++] = htonl(err);
+	if (err == ERR_VERS) {
+		va[nwords++] = htonl(RPCRDMA_VERSION);
+		va[nwords++] = htonl(RPCRDMA_VERSION);
+	}
+
+	return (int)((unsigned long)&va[nwords] - (unsigned long)va);
+}
+
+/*
+ * Compute the length in bytes of an encoded (network byte order) reply
+ * header by stepping over its write list and reply array.
+ */
+int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
+{
+	struct rpcrdma_write_array *ary;
+	void *next;
+
+	/* There is no read-list in a reply, so start at rm_chunks[1] */
+	ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
+
+	/* Write list: an absent list is one word, a present one ends in a
+	 * terminator word that is skipped via rs_length
+	 */
+	if (!ary->wc_discrim)
+		next = &ary->wc_nchunks;
+	else
+		next = &ary->wc_array[ntohl(ary->wc_nchunks)].
+			wc_target.rs_length;
+
+	/* Reply array: same layout, but without a terminator */
+	ary = (struct rpcrdma_write_array *)next;
+	if (!ary->wc_discrim)
+		next = &ary->wc_nchunks;
+	else
+		next = &ary->wc_array[ntohl(ary->wc_nchunks)];
+
+	return (unsigned long)next - (unsigned long)rmsgp;
+}
+
+/*
+ * Lay out the framing of a reply that carries a write list of @chunks
+ * entries: empty read list, write-array discriminator and count, the
+ * write-list terminator and an empty reply array. The chunk entries
+ * themselves are not written here.
+ */
+void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
+{
+	struct rpcrdma_write_array *ary =
+		(struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1];
+	struct rpcrdma_segment *past_end = &ary->wc_array[chunks].wc_target;
+
+	/* no read-list */
+	rmsgp->rm_body.rm_chunks[0] = xdr_zero;
+
+	/* write-array discrim and count */
+	ary->wc_discrim = xdr_one;
+	ary->wc_nchunks = htonl(chunks);
+
+	/* write-list terminator, then reply-array discriminator */
+	past_end->rs_handle = xdr_zero;
+	past_end->rs_length = xdr_zero;
+}
+
+/* Mark @ary as present and record its chunk count in network byte order. */
+void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
+				     int chunks)
+{
+	ary->wc_nchunks = htonl(chunks);
+	ary->wc_discrim = xdr_one;
+}
+
+/*
+ * Fill in entry @chunk_no of @ary with the given handle, offset and
+ * length, all converted to network byte order.
+ */
+void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
+				     int chunk_no,
+				     u32 rs_handle, u64 rs_offset,
+				     u32 write_len)
+{
+	struct rpcrdma_segment *target = &ary->wc_array[chunk_no].wc_target;
+
+	xdr_encode_hyper((u32 *)&target->rs_offset, rs_offset);
+	target->rs_length = htonl(write_len);
+	target->rs_handle = htonl(rs_handle);
+}
+
+/*
+ * Build the fixed part of a reply header in @rdma_resp: xid and version
+ * are taken from the (host byte order) request @rdma_argp, the credit
+ * value from the transport's sc_max_requests, and all three chunk lists
+ * start out empty.
+ */
+void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
+				      struct rpcrdma_msg *rdma_argp,
+				      struct rpcrdma_msg *rdma_resp,
+				      enum rpcrdma_proc rdma_type)
+{
+	int i;
+
+	rdma_resp->rm_xid = htonl(rdma_argp->rm_xid);
+	rdma_resp->rm_vers = htonl(rdma_argp->rm_vers);
+	rdma_resp->rm_credit = htonl(xprt->sc_max_requests);
+	rdma_resp->rm_type = htonl(rdma_type);
+
+	/* Encode <nul> chunks lists */
+	for (i = 0; i < 3; i++)
+		rdma_resp->rm_body.rm_chunks[i] = xdr_zero;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
new file mode 100644
index 0000000..ab54a73
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/spinlock.h>
+#include <asm/unaligned.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+/*
+ * Replace the pages in the rq_pages array with the pages from the SGE in
+ * the RDMA_RECV completion and describe the received bytes in rq_arg.
+ * The SGL should contain full pages up until the last one.
+ *
+ * @rqstp:      request whose rq_pages / rq_arg / rq_respages are set up
+ * @ctxt:       receive context supplying pages[], sge[] and count
+ * @byte_count: number of bytes received
+ *
+ * On return ctxt->count is the number of context pages that were moved
+ * into rq_pages; the remaining context pages have been released.
+ */
+static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *ctxt,
+ u32 byte_count)
+{
+ struct page *page;
+ u32 bc;
+ int sge_no;
+
+ /* Swap the page in the SGE with the page in rq_pages[0] */
+ page = ctxt->pages[0];
+ put_page(rqstp->rq_pages[0]);
+ rqstp->rq_pages[0] = page;
+
+ /* Set up the XDR head: it covers at most the first SGE */
+ rqstp->rq_arg.head[0].iov_base = page_address(page);
+ rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length);
+ rqstp->rq_arg.len = byte_count;
+ rqstp->rq_arg.buflen = byte_count;
+
+ /* Compute bytes past head in the SGL */
+ bc = byte_count - rqstp->rq_arg.head[0].iov_len;
+
+ /* If data remains, store it in the pagelist */
+ rqstp->rq_arg.page_len = bc;
+ rqstp->rq_arg.page_base = 0;
+ rqstp->rq_arg.pages = &rqstp->rq_pages[1];
+ sge_no = 1;
+ while (bc && sge_no < ctxt->count) {
+ /* Drop the request's page and take over the received one */
+ page = ctxt->pages[sge_no];
+ put_page(rqstp->rq_pages[sge_no]);
+ rqstp->rq_pages[sge_no] = page;
+ bc -= min(bc, ctxt->sge[sge_no].length);
+ rqstp->rq_arg.buflen += ctxt->sge[sge_no].length;
+ sge_no++;
+ }
+ /* Response pages start after the last page holding argument data */
+ rqstp->rq_respages = &rqstp->rq_pages[sge_no];
+
+ /* We should never run out of SGE because the limit is defined to
+ * support the max allowed RPC data length
+ */
+ BUG_ON(bc && (sge_no == ctxt->count));
+ BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len)
+ != byte_count);
+ BUG_ON(rqstp->rq_arg.len != byte_count);
+
+ /* If not all pages were used from the SGL, free the remaining ones */
+ bc = sge_no; /* bc is reused here to remember the number of pages kept */
+ while (sge_no < ctxt->count) {
+ page = ctxt->pages[sge_no++];
+ put_page(page);
+ }
+ ctxt->count = bc;
+
+ /* Set up tail */
+ rqstp->rq_arg.tail[0].iov_base = NULL;
+ rqstp->rq_arg.tail[0].iov_len = 0;
+}
+
+/* Range of entries in the SGE array that map one read chunk. */
+struct chunk_sge {
+ int start; /* index of the first sge for this chunk */
+ int count; /* number of sge entries for this chunk */
+};
+
+/* Encode a read-chunk-list as an array of IB SGE
+ *
+ * Assumptions:
+ * - chunk[0]->position points to pages[0] at an offset of 0
+ * - pages[] is not physically or virtually contiguous and consists of
+ * PAGE_SIZE elements.
+ *
+ * Input:
+ * - head is the context whose arg/pages/count are set up to describe the
+ * request once the read data has arrived.
+ * - byte_count is the total number of bytes in the read list.
+ * - ch_count is not referenced in the body.
+ *
+ * Output:
+ * - sge array pointing into pages[] array.
+ * - chunk_sge array specifying sge index and count for each
+ * chunk in the read list
+ *
+ * Returns the number of sge entries that were filled in.
+ */
+static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head,
+ struct rpcrdma_msg *rmsgp,
+ struct ib_sge *sge,
+ struct chunk_sge *ch_sge_ary,
+ int ch_count,
+ int byte_count)
+{
+ int sge_no;
+ int sge_bytes;
+ int page_off;
+ int page_no;
+ int ch_bytes;
+ int ch_no;
+ struct rpcrdma_read_chunk *ch;
+
+ sge_no = 0;
+ page_no = 0;
+ page_off = 0;
+ ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+ ch_no = 0;
+ ch_bytes = ch->rc_target.rs_length;
+ /* Save the request's head/tail and start the page list after the
+ * pages already held by head
+ */
+ head->arg.head[0] = rqstp->rq_arg.head[0];
+ head->arg.tail[0] = rqstp->rq_arg.tail[0];
+ head->arg.pages = &head->pages[head->count];
+ head->sge[0].length = head->count; /* save count of hdr pages */
+ head->arg.page_base = 0;
+ head->arg.page_len = ch_bytes;
+ head->arg.len = rqstp->rq_arg.len + ch_bytes;
+ head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
+ head->count++;
+ ch_sge_ary[0].start = 0;
+ while (byte_count) {
+ /* One SGE never crosses a page or a chunk boundary */
+ sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
+ sge[sge_no].addr =
+ ib_dma_map_page(xprt->sc_cm_id->device,
+ rqstp->rq_arg.pages[page_no],
+ page_off, sge_bytes,
+ DMA_FROM_DEVICE);
+ sge[sge_no].length = sge_bytes;
+ sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+ /*
+ * Don't bump head->count here because the same page
+ * may be used by multiple SGE.
+ */
+ head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
+ rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
+
+ byte_count -= sge_bytes;
+ ch_bytes -= sge_bytes;
+ sge_no++;
+ /*
+ * If all bytes for this chunk have been mapped to an
+ * SGE, move to the next chunk
+ */
+ if (ch_bytes == 0) {
+ ch_sge_ary[ch_no].count =
+ sge_no - ch_sge_ary[ch_no].start;
+ ch_no++;
+ ch++;
+ /*
+ * NOTE(review): after the last chunk this still
+ * writes ch_sge_ary[ch_no].start and reads rs_length
+ * from the entry following the last chunk; confirm
+ * both locations are always valid to touch.
+ */
+ ch_sge_ary[ch_no].start = sge_no;
+ ch_bytes = ch->rc_target.rs_length;
+ /* If bytes remaining account for next chunk */
+ if (byte_count) {
+ head->arg.page_len += ch_bytes;
+ head->arg.len += ch_bytes;
+ head->arg.buflen += ch_bytes;
+ }
+ }
+ /*
+ * If this SGE consumed all of the page, move to the
+ * next page
+ */
+ if ((sge_bytes + page_off) == PAGE_SIZE) {
+ page_no++;
+ page_off = 0;
+ /*
+ * If there are still bytes left to map, bump
+ * the page count
+ */
+ if (byte_count)
+ head->count++;
+ } else
+ page_off += sge_bytes;
+ }
+ BUG_ON(byte_count != 0);
+ return sge_no;
+}
+
+/*
+ * Record the first @count entries of @sge (address and length) in @ctxt
+ * and advance *@sgl_offset by the total number of bytes they cover.
+ */
+static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt,
+			      struct ib_sge *sge,
+			      u64 *sgl_offset,
+			      int count)
+{
+	struct ib_sge *src = sge;
+	int n;
+
+	ctxt->count = count;
+	for (n = 0; n < count; n++, src++) {
+		ctxt->sge[n].addr = src->addr;
+		ctxt->sge[n].length = src->length;
+		*sgl_offset += src->length;
+	}
+}
+
+/*
+ * Return how many of @sge_count SGEs may be placed in a single RDMA_READ
+ * work request on @xprt: one for an iWARP device, otherwise at most
+ * xprt->sc_max_sge.
+ */
+static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
+{
+	/*
+	 * RDMA_TRANSPORT_IWARP is an enumerator rather than a preprocessor
+	 * macro, so an #ifdef on it is never true; the transport type has
+	 * to be tested at run time.
+	 */
+	if ((RDMA_TRANSPORT_IWARP ==
+	     rdma_node_get_transport(xprt->sc_cm_id->device->node_type))
+	    && sge_count > 1)
+		return 1;
+
+	return min_t(int, sge_count, xprt->sc_max_sge);
+}
+
+/*
+ * Use RDMA_READ to read data from the advertised client buffer into the
+ * XDR stream starting at rq_arg.head[0].iov_base.
+ * Each chunk in the array
+ * contains the following fields:
+ * discrim - '1', This isn't used for data placement
+ * position - The xdr stream offset (the same for every chunk)
+ * handle - RMR for client memory region
+ * length - data transfer length
+ * offset - 64 bit tagged offset in remote memory region
+ *
+ * On our side, we need to read into a pagelist. The first page immediately
+ * follows the RPC header.
+ *
+ * This function returns 1 when the reads were posted. The data is not yet
+ * in the pagelist and therefore the RPC request must be deferred. The
+ * I/O completion will enqueue the transport again and
+ * svc_rdma_recvfrom will complete the request.
+ *
+ * It returns 0 if the message carries no read list, and also if posting a
+ * read failed; in the failure case XPT_CLOSE is set on the transport and
+ * the read contexts built so far are released.
+ *
+ * NOTE: The ctxt must not be touched after the last WR has been posted
+ * because the I/O completion processing may occur on another
+ * processor and free / modify the context. Do not touch!
+ */
+static int rdma_read_xdr(struct svcxprt_rdma *xprt,
+ struct rpcrdma_msg *rmsgp,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *hdr_ctxt)
+{
+ struct ib_send_wr read_wr;
+ int err = 0;
+ int ch_no;
+ struct ib_sge *sge;
+ int ch_count;
+ int byte_count;
+ int sge_count;
+ u64 sgl_offset;
+ struct rpcrdma_read_chunk *ch;
+ struct svc_rdma_op_ctxt *ctxt = NULL;
+ struct svc_rdma_op_ctxt *head;
+ struct svc_rdma_op_ctxt *tmp_sge_ctxt;
+ struct svc_rdma_op_ctxt *tmp_ch_ctxt;
+ struct chunk_sge *ch_sge_ary;
+
+ /* If no read list is present, return 0 */
+ ch = svc_rdma_get_read_chunk(rmsgp);
+ if (!ch)
+ return 0;
+
+ /*
+ * Allocate temporary contexts to keep SGE. The sge[] storage of two
+ * scratch contexts is borrowed: one as the SGE array and one, recast,
+ * as the chunk_sge array (hence the size check below).
+ */
+ BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge));
+ tmp_sge_ctxt = svc_rdma_get_context(xprt);
+ sge = tmp_sge_ctxt->sge;
+ tmp_ch_ctxt = svc_rdma_get_context(xprt);
+ ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge;
+
+ svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
+ sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp,
+ sge, ch_sge_ary,
+ ch_count, byte_count);
+ /* head is the first context in the chain of read contexts */
+ head = svc_rdma_get_context(xprt);
+ sgl_offset = 0;
+ ch_no = 0;
+
+ /* One or more RDMA_READ work requests are posted per chunk */
+ for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+ ch->rc_discrim != 0; ch++, ch_no++) {
+next_sge:
+ /* First pass uses head; later passes append a new context */
+ if (!ctxt)
+ ctxt = head;
+ else {
+ ctxt->next = svc_rdma_get_context(xprt);
+ ctxt = ctxt->next;
+ }
+ ctxt->next = NULL;
+ ctxt->direction = DMA_FROM_DEVICE;
+ clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
+ clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+ if ((ch+1)->rc_discrim == 0) {
+ /*
+ * Checked in sq_cq_reap to see if we need to
+ * be enqueued
+ */
+ set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+ /* Close the ring: this ctxt -> hdr_ctxt -> head */
+ ctxt->next = hdr_ctxt;
+ hdr_ctxt->next = head;
+ }
+
+ /* Prepare READ WR */
+ memset(&read_wr, 0, sizeof read_wr);
+ ctxt->wr_op = IB_WR_RDMA_READ;
+ read_wr.wr_id = (unsigned long)ctxt;
+ read_wr.opcode = IB_WR_RDMA_READ;
+ read_wr.send_flags = IB_SEND_SIGNALED;
+ read_wr.wr.rdma.rkey = ch->rc_target.rs_handle;
+ /* sgl_offset is how much of this chunk earlier WRs covered */
+ read_wr.wr.rdma.remote_addr =
+ get_unaligned(&(ch->rc_target.rs_offset)) +
+ sgl_offset;
+ read_wr.sg_list = &sge[ch_sge_ary[ch_no].start];
+ read_wr.num_sge =
+ rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count);
+ rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start],
+ &sgl_offset,
+ read_wr.num_sge);
+
+ /* Post the read */
+ err = svc_rdma_send(xprt, &read_wr);
+ if (err) {
+ printk(KERN_ERR "svcrdma: Error posting send = %d\n",
+ err);
+ /*
+ * Break the circular list so free knows when
+ * to stop if the error happened to occur on
+ * the last read
+ */
+ ctxt->next = NULL;
+ goto out;
+ }
+ atomic_inc(&rdma_stat_read);
+
+ /*
+ * If the WR could not take every SGE of this chunk, consume
+ * the ones just posted and issue another WR for the rest.
+ */
+ if (read_wr.num_sge < ch_sge_ary[ch_no].count) {
+ ch_sge_ary[ch_no].count -= read_wr.num_sge;
+ ch_sge_ary[ch_no].start += read_wr.num_sge;
+ goto next_sge;
+ }
+ sgl_offset = 0;
+ err = 0;
+ }
+
+ out:
+ /* The scratch contexts are no longer needed on either path */
+ svc_rdma_put_context(tmp_sge_ctxt, 0);
+ svc_rdma_put_context(tmp_ch_ctxt, 0);
+
+ /* Detach arg pages. svc_recv will replenish them */
+ for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
+ rqstp->rq_pages[ch_no] = NULL;
+
+ /*
+ * Detach res pages. svc_release must see a resused count of
+ * zero or it will attempt to put them.
+ */
+ while (rqstp->rq_resused)
+ rqstp->rq_respages[--rqstp->rq_resused] = NULL;
+
+ if (err) {
+ printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err);
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ /* Free the linked list of read contexts */
+ while (head != NULL) {
+ ctxt = head->next;
+ svc_rdma_put_context(head, 1);
+ head = ctxt;
+ }
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Finish a request whose RDMA_READs have completed: move the saved pages
+ * and xdr_buf description from the context following @data into @rqstp,
+ * release the whole context ring and return the number of argument bytes
+ * (head + pages + tail).
+ *
+ * NOTE(review): @data is presumably the last RDMA_READ context, whose
+ * next pointer leads to the context holding the saved request state;
+ * confirm against the code that queues it.
+ */
+static int rdma_read_complete(struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *data)
+{
+ struct svc_rdma_op_ctxt *head = data->next;
+ int page_no;
+ int ret;
+
+ BUG_ON(!head);
+
+ /* Copy RPC pages */
+ for (page_no = 0; page_no < head->count; page_no++) {
+ put_page(rqstp->rq_pages[page_no]);
+ rqstp->rq_pages[page_no] = head->pages[page_no];
+ }
+ /* Point rq_arg.pages past header; sge[0].length holds that count */
+ rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length];
+ rqstp->rq_arg.page_len = head->arg.page_len;
+ rqstp->rq_arg.page_base = head->arg.page_base;
+
+ /* rq_respages starts after the last arg page */
+ rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+ rqstp->rq_resused = 0;
+
+ /* Rebuild rq_arg head and tail. */
+ rqstp->rq_arg.head[0] = head->arg.head[0];
+ rqstp->rq_arg.tail[0] = head->arg.tail[0];
+ rqstp->rq_arg.len = head->arg.len;
+ rqstp->rq_arg.buflen = head->arg.buflen;
+
+ /* XXX: What should this be? */
+ rqstp->rq_prot = IPPROTO_MAX;
+
+ /*
+ * Free the contexts we used to build the RDMA_READ. We have
+ * to be careful here because the context list uses the same
+ * next pointer used to chain the contexts associated with the
+ * RDMA_READ
+ */
+ data->next = NULL; /* terminate circular list */
+ do {
+ /* data is reused as the cursor for the next context */
+ data = head->next;
+ svc_rdma_put_context(head, 0);
+ head = data;
+ } while (head != NULL);
+
+ ret = rqstp->rq_arg.head[0].iov_len
+ + rqstp->rq_arg.page_len
+ + rqstp->rq_arg.tail[0].iov_len;
+ dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, "
+ "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
+ ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
+ rqstp->rq_arg.head[0].iov_len);
+
+ /* Indicate that we've consumed an RQ credit */
+ rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
+ svc_xprt_received(rqstp->rq_xprt);
+ return ret;
+}
+
+/*
+ * Set up the rqstp thread context to point to the RQ buffer. If
+ * necessary, pull additional data from the client with an RDMA_READ
+ * request.
+ *
+ * Completed RDMA_READs (sc_read_complete_q) are served before newly
+ * received messages (sc_rq_dto_q).
+ *
+ * Returns the number of argument bytes (head + pages + tail) for a
+ * request that is ready to be processed, or 0 when there is nothing to
+ * process, the request was deferred until its RDMA_READs complete, or the
+ * transport is being closed.
+ */
+int svc_rdma_recvfrom(struct svc_rqst *rqstp)
+{
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ struct svcxprt_rdma *rdma_xprt =
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
+ struct svc_rdma_op_ctxt *ctxt = NULL;
+ struct rpcrdma_msg *rmsgp;
+ int ret = 0;
+ int len;
+
+ dprintk("svcrdma: rqstp=%p\n", rqstp);
+
+ /*
+ * The rq_xprt_ctxt indicates if we've consumed an RQ credit
+ * or not. It is used in the rdma xpo_release_rqst function to
+ * determine whether or not to return an RQ WQE to the RQ.
+ */
+ rqstp->rq_xprt_ctxt = NULL;
+
+ /* A finished RDMA_READ takes priority over new receives */
+ spin_lock_bh(&rdma_xprt->sc_read_complete_lock);
+ if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
+ ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
+ struct svc_rdma_op_ctxt,
+ dto_q);
+ list_del_init(&ctxt->dto_q);
+ }
+ spin_unlock_bh(&rdma_xprt->sc_read_complete_lock);
+ if (ctxt)
+ return rdma_read_complete(rqstp, ctxt);
+
+ /* Otherwise take the next received message, if any */
+ spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
+ if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
+ ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
+ struct svc_rdma_op_ctxt,
+ dto_q);
+ list_del_init(&ctxt->dto_q);
+ } else {
+ atomic_inc(&rdma_stat_rq_starve);
+ clear_bit(XPT_DATA, &xprt->xpt_flags);
+ ctxt = NULL;
+ }
+ spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+ if (!ctxt) {
+ /* This is the EAGAIN path. The svc_recv routine will
+ * return -EAGAIN, the nfsd thread will go to call into
+ * svc_recv again and we shouldn't be on the active
+ * transport list
+ */
+ if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
+ goto close_out;
+
+ BUG_ON(ret);
+ goto out;
+ }
+ dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
+ ctxt, rdma_xprt, rqstp, ctxt->wc_status);
+ BUG_ON(ctxt->wc_status != IB_WC_SUCCESS);
+ atomic_inc(&rdma_stat_recv);
+
+ /* Build up the XDR from the receive buffers. */
+ rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len);
+
+ /* Decode the RDMA header. */
+ len = svc_rdma_xdr_decode_req(&rmsgp, rqstp);
+ rqstp->rq_xprt_hlen = len;
+
+ /* If the request is invalid, reply with an error */
+ if (len < 0) {
+ /*
+ * NOTE(review): rmsgp is only meaningful here if
+ * svc_rdma_xdr_decode_req stores it before returning
+ * -ENOSYS; confirm.
+ */
+ if (len == -ENOSYS)
+ (void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
+ goto close_out;
+ }
+
+ /* Read read-list data. If we would need to wait, defer
+ * it. Note that in this case, we don't return the RQ credit
+ * until after the read completes.
+ */
+ if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) {
+ svc_xprt_received(xprt);
+ return 0;
+ }
+
+ /* Indicate we've consumed an RQ credit */
+ rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
+
+ ret = rqstp->rq_arg.head[0].iov_len
+ + rqstp->rq_arg.page_len
+ + rqstp->rq_arg.tail[0].iov_len;
+ /* The pages now belong to rqstp, so release the context only */
+ svc_rdma_put_context(ctxt, 0);
+ out:
+ dprintk("svcrdma: ret = %d, rq_arg.len =%d, "
+ "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
+ ret, rqstp->rq_arg.len,
+ rqstp->rq_arg.head[0].iov_base,
+ rqstp->rq_arg.head[0].iov_len);
+ rqstp->rq_prot = IPPROTO_MAX;
+ svc_xprt_copy_addrs(rqstp, xprt);
+ svc_xprt_received(xprt);
+ return ret;
+
+ close_out:
+ if (ctxt) {
+ svc_rdma_put_context(ctxt, 1);
+ /* Indicate we've consumed an RQ credit */
+ rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
+ }
+ dprintk("svcrdma: transport %p is closing\n", xprt);
+ /*
+ * Set the close bit and enqueue it. svc_recv will see the
+ * close bit and call svc_xprt_delete
+ */
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ svc_xprt_received(xprt);
+ return 0;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
new file mode 100644
index 0000000..3e32194
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/spinlock.h>
+#include <asm/unaligned.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+/* Encode an XDR as an array of IB SGE, DMA-mapping each piece for
+ * exactly the number of bytes recorded in that SGE's length.
+ * Assumptions:
+ * - head[0] is physically contiguous.
+ * - tail[0] is physically contiguous.
+ * - pages[] is not physically or virtually contiguous and consists of
+ * PAGE_SIZE elements.
+ *
+ * Output (sge is returned; *sge_count includes the reserved SGE[0]):
+ * SGE[0] reserved for RPCRDMA header
+ * SGE[1] data from xdr->head[]
+ * SGE[2..sge_count-2] data from xdr->pages[]
+ * SGE[sge_count-1] data from xdr->tail.
+ *
+ */
+static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt,
+ struct xdr_buf *xdr,
+ struct ib_sge *sge,
+ int *sge_count)
+{
+ /* Max we need is the length of the XDR / pagesize + one for
+ * head + one for tail + one for RPCRDMA header
+ */
+ int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
+ int sge_no;
+ u32 byte_count = xdr->len;
+ u32 sge_bytes;
+ u32 page_bytes;
+ int page_off;
+ int page_no;
+
+ /* Skip the first sge, this is for the RPCRDMA header */
+ sge_no = 1;
+
+ /* Head SGE: clamp to the XDR length first, then map that many bytes */
+ sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len);
+ sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device,
+ xdr->head[0].iov_base,
+ sge_bytes,
+ DMA_TO_DEVICE);
+ byte_count -= sge_bytes;
+ sge[sge_no].length = sge_bytes;
+ sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+ sge_no++;
+
+ /* pages SGE: clamp to page, XDR and page list before mapping */
+ page_no = 0;
+ page_bytes = xdr->page_len;
+ page_off = xdr->page_base;
+ while (byte_count && page_bytes) {
+ sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off));
+ sge_bytes = min(sge_bytes, page_bytes);
+ sge[sge_no].addr =
+ ib_dma_map_page(xprt->sc_cm_id->device,
+ xdr->pages[page_no], page_off,
+ sge_bytes, DMA_TO_DEVICE);
+ byte_count -= sge_bytes;
+ page_bytes -= sge_bytes;
+ sge[sge_no].length = sge_bytes;
+ sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+
+ sge_no++;
+ page_no++;
+ page_off = 0; /* reset for next time through loop */
+ }
+
+ /* Tail SGE: mapped for the clamped length as well */
+ if (byte_count && xdr->tail[0].iov_len) {
+ sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len);
+ sge[sge_no].addr =
+ ib_dma_map_single(xprt->sc_cm_id->device,
+ xdr->tail[0].iov_base,
+ sge_bytes,
+ DMA_TO_DEVICE);
+ byte_count -= sge_bytes;
+ sge[sge_no].length = sge_bytes;
+ sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+ sge_no++;
+ }
+
+ BUG_ON(sge_no > sge_max); /* NOTE(review): checked only after the writes */
+ BUG_ON(byte_count != 0); /* every byte of xdr->len must be covered */
+
+ *sge_count = sge_no;
+ return sge;
+}
+/* Post one signaled RDMA_WRITE that copies write_len bytes of the reply,
+ * starting xdr_off bytes into xdr_sge[1..], to remote address 'to' using
+ * rkey 'rmr'. Returns 0, or -EIO if svc_rdma_send() fails. Assumptions:
+ * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
+ */
+static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
+ u32 rmr, u64 to,
+ u32 xdr_off, int write_len,
+ struct ib_sge *xdr_sge, int sge_count)
+{
+ struct svc_rdma_op_ctxt *tmp_sge_ctxt;
+ struct ib_send_wr write_wr;
+ struct ib_sge *sge;
+ int xdr_sge_no;
+ int sge_no;
+ int sge_bytes;
+ int sge_off;
+ int bc;
+ struct svc_rdma_op_ctxt *ctxt;
+ int ret = 0;
+
+ BUG_ON(sge_count >= 32); /* NOTE(review): 32 is presumably the sge[] capacity - confirm */
+ dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
+ "write_len=%d, xdr_sge=%p, sge_count=%d\n",
+ rmr, to, xdr_off, write_len, xdr_sge, sge_count);
+
+ ctxt = svc_rdma_get_context(xprt);
+ ctxt->count = 0; /* this WR's context owns no pages or mappings */
+ tmp_sge_ctxt = svc_rdma_get_context(xprt);
+ sge = tmp_sge_ctxt->sge; /* second context is used only for its sge[] array */
+
+ /* Find the SGE associated with xdr_off; SGE[0] is skipped */
+ for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count;
+ xdr_sge_no++) {
+ if (xdr_sge[xdr_sge_no].length > bc)
+ break;
+ bc -= xdr_sge[xdr_sge_no].length;
+ }
+
+ sge_off = bc; /* offset of xdr_off inside the SGE found above */
+ bc = write_len;
+ sge_no = 0;
+
+ /* Copy SGEs into the scratch list until write_len bytes are covered */
+ while (bc != 0 && xdr_sge_no < sge_count) {
+ sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off;
+ sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey;
+ sge_bytes = min((size_t)bc,
+ (size_t)(xdr_sge[xdr_sge_no].length-sge_off));
+ sge[sge_no].length = sge_bytes;
+
+ sge_off = 0; /* only the first copied SGE starts mid-entry */
+ sge_no++;
+ xdr_sge_no++;
+ bc -= sge_bytes;
+ }
+
+ BUG_ON(bc != 0);
+ BUG_ON(xdr_sge_no > sge_count);
+
+ /* Prepare WRITE WR */
+ memset(&write_wr, 0, sizeof write_wr);
+ ctxt->wr_op = IB_WR_RDMA_WRITE;
+ write_wr.wr_id = (unsigned long)ctxt;
+ write_wr.sg_list = &sge[0];
+ write_wr.num_sge = sge_no;
+ write_wr.opcode = IB_WR_RDMA_WRITE;
+ write_wr.send_flags = IB_SEND_SIGNALED;
+ write_wr.wr.rdma.rkey = rmr;
+ write_wr.wr.rdma.remote_addr = to;
+
+ /* Post It; on success sq_cq_reap() releases ctxt at completion */
+ atomic_inc(&rdma_stat_write);
+ if (svc_rdma_send(xprt, &write_wr)) {
+ svc_rdma_put_context(ctxt, 1);
+ /* Fatal error, close transport */
+ ret = -EIO;
+ }
+ svc_rdma_put_context(tmp_sge_ctxt, 0); /* scratch context returned on both paths */
+ return ret;
+}
+/* Push rq_res pages and tail to the client through its write chunks. */
+static int send_write_chunks(struct svcxprt_rdma *xprt,
+ struct rpcrdma_msg *rdma_argp,
+ struct rpcrdma_msg *rdma_resp,
+ struct svc_rqst *rqstp,
+ struct ib_sge *sge,
+ int sge_count)
+{
+ u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
+ int write_len;
+ int max_write;
+ u32 xdr_off;
+ int chunk_off;
+ int chunk_no;
+ struct rpcrdma_write_array *arg_ary;
+ struct rpcrdma_write_array *res_ary;
+ int ret;
+
+ arg_ary = svc_rdma_get_write_array(rdma_argp);
+ if (!arg_ary)
+ return 0; /* no write array in the request: nothing sent by RDMA */
+ res_ary = (struct rpcrdma_write_array *)
+ &rdma_resp->rm_body.rm_chunks[1];
+
+ max_write = xprt->sc_max_sge * PAGE_SIZE; /* per-WR byte limit used below */
+
+ /* Write chunks start at the pagelist */
+ for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
+ xfer_len && chunk_no < arg_ary->wc_nchunks;
+ chunk_no++) {
+ struct rpcrdma_segment *arg_ch;
+ u64 rs_offset;
+
+ arg_ch = &arg_ary->wc_array[chunk_no].wc_target;
+ write_len = min(xfer_len, arg_ch->rs_length);
+
+ /* Prepare the response chunk given the length actually
+ * written */
+ rs_offset = get_unaligned(&(arg_ch->rs_offset));
+ svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
+ arg_ch->rs_handle,
+ rs_offset,
+ write_len);
+ chunk_off = 0;
+ while (write_len) { /* split the chunk into WRs of at most max_write */
+ int this_write;
+ this_write = min(write_len, max_write);
+ ret = send_write(xprt, rqstp,
+ arg_ch->rs_handle,
+ rs_offset + chunk_off,
+ xdr_off,
+ this_write,
+ sge,
+ sge_count);
+ if (ret) {
+ dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
+ ret);
+ return -EIO;
+ }
+ chunk_off += this_write;
+ xdr_off += this_write;
+ xfer_len -= this_write;
+ write_len -= this_write;
+ }
+ }
+ /* Update the response with the number of chunks actually used */
+ svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no);
+ /* NOTE(review): the full page+tail length is returned even if xfer_len != 0 */
+ return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
+}
+/* Push the whole of rq_res to the client through its reply chunk array. */
+static int send_reply_chunks(struct svcxprt_rdma *xprt,
+ struct rpcrdma_msg *rdma_argp,
+ struct rpcrdma_msg *rdma_resp,
+ struct svc_rqst *rqstp,
+ struct ib_sge *sge,
+ int sge_count)
+{
+ u32 xfer_len = rqstp->rq_res.len;
+ int write_len;
+ int max_write;
+ u32 xdr_off;
+ int chunk_no;
+ int chunk_off;
+ struct rpcrdma_segment *ch;
+ struct rpcrdma_write_array *arg_ary;
+ struct rpcrdma_write_array *res_ary;
+ int ret;
+
+ arg_ary = svc_rdma_get_reply_array(rdma_argp);
+ if (!arg_ary)
+ return 0; /* no reply array in the request: nothing sent by RDMA */
+ /* XXX: need to fix when reply lists occur with read-list and or
+ * write-list */
+ res_ary = (struct rpcrdma_write_array *)
+ &rdma_resp->rm_body.rm_chunks[2];
+
+ max_write = xprt->sc_max_sge * PAGE_SIZE; /* per-WR byte limit used below */
+
+ /* xdr offset starts at RPC message */
+ for (xdr_off = 0, chunk_no = 0;
+ xfer_len && chunk_no < arg_ary->wc_nchunks;
+ chunk_no++) {
+ u64 rs_offset;
+ ch = &arg_ary->wc_array[chunk_no].wc_target;
+ write_len = min(xfer_len, ch->rs_length);
+
+
+ /* Prepare the reply chunk given the length actually
+ * written */
+ rs_offset = get_unaligned(&(ch->rs_offset));
+ svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
+ ch->rs_handle, rs_offset,
+ write_len);
+ chunk_off = 0;
+ while (write_len) { /* split the chunk into WRs of at most max_write */
+ int this_write;
+
+ this_write = min(write_len, max_write);
+ ret = send_write(xprt, rqstp,
+ ch->rs_handle,
+ rs_offset + chunk_off,
+ xdr_off,
+ this_write,
+ sge,
+ sge_count);
+ if (ret) {
+ dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
+ ret);
+ return -EIO;
+ }
+ chunk_off += this_write;
+ xdr_off += this_write;
+ xfer_len -= this_write;
+ write_len -= this_write;
+ }
+ }
+ /* Update the response with the number of chunks actually used */
+ svc_rdma_xdr_encode_reply_array(res_ary, chunk_no);
+
+ return rqstp->rq_res.len; /* svc_rdma_sendto() subtracts this from inline_bytes */
+}
+
+/* This function prepares the portion of the RPCRDMA message to be
+ * sent in the RDMA_SEND. This function is called after data sent via
+ * RDMA has already been transmitted. There are three cases:
+ * - The RPCRDMA header, RPC header, and payload are all sent in a
+ * single RDMA_SEND. This is the "inline" case.
+ * - The RPCRDMA header and some portion of the RPC header and data
+ * are sent via this RDMA_SEND and another portion of the data is
+ * sent via RDMA.
+ * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC
+ * header and data are all transmitted via RDMA.
+ * In all three cases, this function maps the already-encoded RPCRDMA
+ * header page into sge[0], and the 'byte_count' parameter indicates how
+ * much of the XDR to include in this RDMA_SEND. Returns the result of
+ * svc_rdma_send(); on failure the context and its pages are released.
+ */
+static int send_reply(struct svcxprt_rdma *rdma,
+ struct svc_rqst *rqstp,
+ struct page *page,
+ struct rpcrdma_msg *rdma_resp,
+ struct svc_rdma_op_ctxt *ctxt,
+ int sge_count,
+ int byte_count)
+{
+ struct ib_send_wr send_wr;
+ int sge_no;
+ int sge_bytes;
+ int page_no;
+ int ret;
+
+ /* Prepare the context: pages[0] holds the RPCRDMA header page */
+ ctxt->pages[0] = page;
+ ctxt->count = 1;
+
+ /* Prepare the SGE for the RPCRDMA Header */
+ ctxt->sge[0].addr =
+ ib_dma_map_page(rdma->sc_cm_id->device,
+ page, 0, PAGE_SIZE, DMA_TO_DEVICE);
+ ctxt->direction = DMA_TO_DEVICE;
+ ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
+ ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey;
+
+ /* Determine how many of our SGE are to be transmitted */
+ for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) {
+ sge_bytes = min((size_t)ctxt->sge[sge_no].length,
+ (size_t)byte_count);
+ byte_count -= sge_bytes;
+ }
+ BUG_ON(byte_count != 0); /* the SGEs must cover all inline bytes */
+
+ /* Save all respages in the ctxt and remove them from the
+ * respages array. They are our pages until the I/O
+ * completes.
+ */
+ for (page_no = 0; page_no < rqstp->rq_resused; page_no++) {
+ ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
+ ctxt->count++;
+ rqstp->rq_respages[page_no] = NULL;
+ }
+
+ BUG_ON(sge_no > rdma->sc_max_sge);
+ memset(&send_wr, 0, sizeof send_wr);
+ ctxt->wr_op = IB_WR_SEND; /* sq_cq_reap() switches on this at completion */
+ send_wr.wr_id = (unsigned long)ctxt;
+ send_wr.sg_list = ctxt->sge;
+ send_wr.num_sge = sge_no;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = svc_rdma_send(rdma, &send_wr);
+ if (ret) /* post failed: release the context and its pages here */
+ svc_rdma_put_context(ctxt, 1);
+
+ return ret;
+}
+/* xpo_prep_reply_hdr hook: this transport has nothing to do here. */
+void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
+{
+}
+
+/* Return the start of an xdr buffer: head[0].iov_base stepped back by
+ * the bytes of xdr->len that are not covered by head, pages and tail.
+ * svc_rdma_sendto() uses the result as the RPCRDMA request header. */
+static void *xdr_start(struct xdr_buf *xdr)
+{
+ return xdr->head[0].iov_base -
+ (xdr->len -
+ xdr->page_len -
+ xdr->tail[0].iov_len -
+ xdr->head[0].iov_len);
+}
+/* xpo_sendto method: transmit the reply held in rqstp->rq_res. */
+int svc_rdma_sendto(struct svc_rqst *rqstp)
+{
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ struct svcxprt_rdma *rdma =
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
+ struct rpcrdma_msg *rdma_argp;
+ struct rpcrdma_msg *rdma_resp;
+ struct rpcrdma_write_array *reply_ary;
+ enum rpcrdma_proc reply_type;
+ int ret;
+ int inline_bytes;
+ struct ib_sge *sge;
+ int sge_count = 0;
+ struct page *res_page;
+ struct svc_rdma_op_ctxt *ctxt;
+
+ dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
+
+ /* Get the RDMA request header. */
+ rdma_argp = xdr_start(&rqstp->rq_arg);
+
+ /* Build an SGE for the XDR */
+ ctxt = svc_rdma_get_context(rdma);
+ ctxt->direction = DMA_TO_DEVICE;
+ sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count);
+
+ inline_bytes = rqstp->rq_res.len; /* reduced below by what goes out via RDMA */
+
+ /* Create the RDMA response header */
+ res_page = svc_rdma_get_page();
+ rdma_resp = page_address(res_page);
+ reply_ary = svc_rdma_get_reply_array(rdma_argp);
+ if (reply_ary)
+ reply_type = RDMA_NOMSG;
+ else
+ reply_type = RDMA_MSG;
+ svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
+ rdma_resp, reply_type);
+
+ /* Send any write-chunk data and build resp write-list */
+ ret = send_write_chunks(rdma, rdma_argp, rdma_resp,
+ rqstp, sge, sge_count);
+ if (ret < 0) {
+ printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
+ ret);
+ goto error;
+ }
+ inline_bytes -= ret;
+
+ /* Send any reply-list data and update resp reply-list */
+ ret = send_reply_chunks(rdma, rdma_argp, rdma_resp,
+ rqstp, sge, sge_count);
+ if (ret < 0) {
+ printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
+ ret);
+ goto error;
+ }
+ inline_bytes -= ret;
+ /* send_reply() takes over ctxt and res_page, even when it fails */
+ ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count,
+ inline_bytes);
+ dprintk("svcrdma: send_reply returns %d\n", ret);
+ return ret;
+ error:
+ svc_rdma_put_context(ctxt, 0); /* NOTE(review): count is still 0, so xdr_to_sge mappings look unreleased - confirm */
+ put_page(res_page);
+ return ret;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
new file mode 100644
index 0000000..f09444c
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -0,0 +1,1080 @@
+/*
+ * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/spinlock.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
+ struct sockaddr *sa, int salen,
+ int flags);
+static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
+static void svc_rdma_release_rqst(struct svc_rqst *);
+static void rdma_destroy_xprt(struct svcxprt_rdma *xprt);
+static void dto_tasklet_func(unsigned long data);
+static void svc_rdma_detach(struct svc_xprt *xprt);
+static void svc_rdma_free(struct svc_xprt *xprt);
+static int svc_rdma_has_wspace(struct svc_xprt *xprt);
+static void rq_cq_reap(struct svcxprt_rdma *xprt);
+static void sq_cq_reap(struct svcxprt_rdma *xprt);
+/* One tasklet and one queue of transports with CQ work; dto_lock guards the queue */
+DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL);
+static DEFINE_SPINLOCK(dto_lock);
+static LIST_HEAD(dto_xprt_q);
+/* Transport method table referenced by svc_rdma_class below */
+static struct svc_xprt_ops svc_rdma_ops = {
+ .xpo_create = svc_rdma_create,
+ .xpo_recvfrom = svc_rdma_recvfrom,
+ .xpo_sendto = svc_rdma_sendto,
+ .xpo_release_rqst = svc_rdma_release_rqst,
+ .xpo_detach = svc_rdma_detach,
+ .xpo_free = svc_rdma_free,
+ .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
+ .xpo_has_wspace = svc_rdma_has_wspace,
+ .xpo_accept = svc_rdma_accept,
+};
+/* Transport class descriptor for the "rdma" transport name */
+struct svc_xprt_class svc_rdma_class = {
+ .xcl_name = "rdma",
+ .xcl_owner = THIS_MODULE,
+ .xcl_ops = &svc_rdma_ops,
+ .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,
+};
+/* Grow the context cache towards min(cnt + bump, max); returns 1 if any context was added */
+static int rdma_bump_context_cache(struct svcxprt_rdma *xprt)
+{
+ int target;
+ int at_least_one = 0;
+ struct svc_rdma_op_ctxt *ctxt;
+
+ target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump,
+ xprt->sc_ctxt_max);
+ /* NOTE(review): target is computed before sc_ctxt_lock is taken */
+ spin_lock_bh(&xprt->sc_ctxt_lock);
+ while (xprt->sc_ctxt_cnt < target) {
+ xprt->sc_ctxt_cnt++; /* count the new entry before dropping the lock */
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
+ /* the GFP_KERNEL allocation is made with the lock dropped */
+ ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
+
+ spin_lock_bh(&xprt->sc_ctxt_lock);
+ if (ctxt) {
+ at_least_one = 1;
+ ctxt->next = xprt->sc_ctxt_head;
+ xprt->sc_ctxt_head = ctxt;
+ } else {
+ /* kmalloc failed...give up for now */
+ xprt->sc_ctxt_cnt--; /* undo the increment made above */
+ break;
+ }
+ }
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
+ dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n",
+ xprt->sc_ctxt_max, xprt->sc_ctxt_cnt);
+ return at_least_one;
+}
+/* Take a context off xprt's free list, growing the cache or sleeping until one exists */
+struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+{
+ struct svc_rdma_op_ctxt *ctxt;
+
+ while (1) {
+ spin_lock_bh(&xprt->sc_ctxt_lock);
+ if (unlikely(xprt->sc_ctxt_head == NULL)) {
+ /* Try to bump my cache. */
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
+
+ if (rdma_bump_context_cache(xprt))
+ continue; /* something was added: retry the pop */
+
+ printk(KERN_INFO "svcrdma: sleeping waiting for "
+ "context memory on xprt=%p\n",
+ xprt);
+ schedule_timeout_uninterruptible(msecs_to_jiffies(500));
+ continue; /* never fails: loops until a context is available */
+ }
+ ctxt = xprt->sc_ctxt_head;
+ xprt->sc_ctxt_head = ctxt->next;
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
+ ctxt->xprt = xprt; /* svc_rdma_put_context() uses this back pointer */
+ INIT_LIST_HEAD(&ctxt->dto_q);
+ ctxt->count = 0; /* no pages or SGEs owned yet */
+ break;
+ }
+ return ctxt;
+}
+/* Return ctxt to its xprt's cache: unmap sge[0..count), then free pages if free_pages is set */
+void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
+{
+ struct svcxprt_rdma *xprt;
+ int i;
+
+ BUG_ON(!ctxt);
+ xprt = ctxt->xprt;
+ /* Unmap before the pages can be freed, using the ib_dma_* wrapper */
+ for (i = 0; i < ctxt->count; i++)
+ ib_dma_unmap_single(xprt->sc_cm_id->device,
+ ctxt->sge[i].addr,
+ ctxt->sge[i].length,
+ ctxt->direction);
+ if (free_pages)
+ for (i = 0; i < ctxt->count; i++)
+ put_page(ctxt->pages[i]);
+ spin_lock_bh(&xprt->sc_ctxt_lock);
+ ctxt->next = xprt->sc_ctxt_head;
+ xprt->sc_ctxt_head = ctxt;
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
+}
+
+/* ib_cq event handler: any CQ event marks the transport XPT_CLOSE */
+static void cq_event_handler(struct ib_event *event, void *context)
+{
+ struct svc_xprt *xprt = context;
+ dprintk("svcrdma: received CQ event id=%d, context=%p\n",
+ event->event, context);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+}
+
+/* QP event handler: log benign events, mark XPT_CLOSE for all others */
+static void qp_event_handler(struct ib_event *event, void *context)
+{
+ struct svc_xprt *xprt = context;
+
+ switch (event->event) {
+ /* These are considered benign events */
+ case IB_EVENT_PATH_MIG:
+ case IB_EVENT_COMM_EST:
+ case IB_EVENT_SQ_DRAINED:
+ case IB_EVENT_QP_LAST_WQE_REACHED:
+ dprintk("svcrdma: QP event %d received for QP=%p\n",
+ event->event, event->element.qp);
+ break;
+ /* These are considered fatal events; unlisted events share this path */
+ case IB_EVENT_PATH_MIG_ERR:
+ case IB_EVENT_QP_FATAL:
+ case IB_EVENT_QP_REQ_ERR:
+ case IB_EVENT_QP_ACCESS_ERR:
+ case IB_EVENT_DEVICE_FATAL:
+ default:
+ dprintk("svcrdma: QP ERROR event %d received for QP=%p, "
+ "closing transport\n",
+ event->event, event->element.qp);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ break;
+ }
+}
+
+/*
+ * Data Transfer Operation Tasklet
+ *
+ * Walks a list of transports with I/O pending, removing entries as
+ * they are added to the server's I/O pending list. Two bits indicate
+ * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave
+ * spinlock that serializes access to the transport list with the RQ
+ * and SQ interrupt handlers.
+ */
+static void dto_tasklet_func(unsigned long data)
+{
+ struct svcxprt_rdma *xprt;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dto_lock, flags);
+ while (!list_empty(&dto_xprt_q)) {
+ xprt = list_entry(dto_xprt_q.next,
+ struct svcxprt_rdma, sc_dto_q);
+ list_del_init(&xprt->sc_dto_q); /* handlers re-add only when this is empty */
+ spin_unlock_irqrestore(&dto_lock, flags);
+
+ if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) {
+ ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
+ rq_cq_reap(xprt);
+ set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+ /*
+ * If data arrived before established event,
+ * don't enqueue. This defers RPC I/O until the
+ * RDMA connection is complete.
+ */
+ if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
+ svc_xprt_enqueue(&xprt->sc_xprt);
+ }
+
+ if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) {
+ ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
+ sq_cq_reap(xprt);
+ }
+
+ spin_lock_irqsave(&dto_lock, flags); /* retake before re-testing the queue */
+ }
+ spin_unlock_irqrestore(&dto_lock, flags);
+}
+
+/*
+ * Receive Queue Completion Handler
+ *
+ * Since an RQ completion handler is called on interrupt context, we
+ * need to defer the handling of the I/O to a tasklet
+ */
+static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+ struct svcxprt_rdma *xprt = cq_context;
+ unsigned long flags;
+
+ /*
+ * Set the bit regardless of whether or not it's on the list
+ * because it may be on the list already due to an SQ
+ * completion.
+ */
+ set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags);
+
+ /*
+ * If this transport is not already on the DTO transport queue,
+ * add it
+ */
+ spin_lock_irqsave(&dto_lock, flags);
+ if (list_empty(&xprt->sc_dto_q))
+ list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
+ spin_unlock_irqrestore(&dto_lock, flags);
+
+ /* The CQ is polled later by rq_cq_reap(), called from the tasklet. */
+ tasklet_schedule(&dto_tasklet);
+}
+
+/*
+ * rq_cq_reap - Process the RQ CQ.
+ *
+ * Take all completing WC off the CQE and enqueue the associated DTO
+ * context on the dto_q for the transport.
+ */
+static void rq_cq_reap(struct svcxprt_rdma *xprt)
+{
+ int ret;
+ struct ib_wc wc;
+ struct svc_rdma_op_ctxt *ctxt = NULL;
+
+ atomic_inc(&rdma_stat_rq_poll);
+
+ spin_lock_bh(&xprt->sc_rq_dto_lock);
+ while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) {
+ ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
+ ctxt->wc_status = wc.status;
+ ctxt->byte_len = wc.byte_len;
+ if (wc.status != IB_WC_SUCCESS) {
+ /* Close the transport and release the failed context and its pages */
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ svc_rdma_put_context(ctxt, 1);
+ continue;
+ }
+ list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
+ }
+ spin_unlock_bh(&xprt->sc_rq_dto_lock);
+
+ if (ctxt) /* at least one completion was polled, successful or not */
+ atomic_inc(&rdma_stat_rq_prod);
+}
+
+/*
+ * sq_cq_reap - Process the SQ CQ; called from dto_tasklet_func().
+ */
+static void sq_cq_reap(struct svcxprt_rdma *xprt)
+{
+ struct svc_rdma_op_ctxt *ctxt = NULL;
+ struct ib_wc wc;
+ struct ib_cq *cq = xprt->sc_sq_cq;
+ int ret;
+
+ atomic_inc(&rdma_stat_sq_poll);
+ while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
+ ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
+ xprt = ctxt->xprt;
+
+ if (wc.status != IB_WC_SUCCESS)
+ /* Close the transport */
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+
+ /* Decrement used SQ WR count */
+ atomic_dec(&xprt->sc_sq_count);
+ wake_up(&xprt->sc_send_wait);
+
+ switch (ctxt->wr_op) {
+ case IB_WR_SEND:
+ case IB_WR_RDMA_WRITE:
+ svc_rdma_put_context(ctxt, 1); /* sends and writes are finished: free pages too */
+ break;
+
+ case IB_WR_RDMA_READ: /* NOTE(review): contexts without LAST_CTXT are not released here */
+ if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
+ set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+ set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags);
+ spin_lock_bh(&xprt->sc_read_complete_lock);
+ list_add_tail(&ctxt->dto_q,
+ &xprt->sc_read_complete_q);
+ spin_unlock_bh(&xprt->sc_read_complete_lock);
+ svc_xprt_enqueue(&xprt->sc_xprt);
+ }
+ break;
+
+ default: /* logged only; the context is not released on this path */
+ printk(KERN_ERR "svcrdma: unexpected completion type, "
+ "opcode=%d, status=%d\n",
+ wc.opcode, wc.status);
+ break;
+ }
+ }
+
+ if (ctxt) /* at least one completion was polled */
+ atomic_inc(&rdma_stat_sq_prod);
+}
+/* Send Queue Completion Handler: flag SQ work and defer it to the tasklet */
+static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+ struct svcxprt_rdma *xprt = cq_context;
+ unsigned long flags;
+
+ /*
+ * Set the bit regardless of whether or not it's on the list
+ * because it may be on the list already due to an RQ
+ * completion.
+ */
+ set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags);
+
+ /*
+ * If this transport is not already on the DTO transport queue,
+ * add it
+ */
+ spin_lock_irqsave(&dto_lock, flags);
+ if (list_empty(&xprt->sc_dto_q))
+ list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
+ spin_unlock_irqrestore(&dto_lock, flags);
+
+ /* The CQ is polled later by sq_cq_reap(), called from the tasklet. */
+ tasklet_schedule(&dto_tasklet);
+}
+/* Record the cache limits on xprt and pre-allocate up to ctxt_count contexts. */
+static void create_context_cache(struct svcxprt_rdma *xprt,
+				 int ctxt_count, int ctxt_bump, int ctxt_max)
+{
+	struct svc_rdma_op_ctxt *fresh;
+	int remaining;
+
+	xprt->sc_ctxt_head = NULL;
+	xprt->sc_ctxt_cnt = 0;
+	xprt->sc_ctxt_bump = ctxt_bump;
+	xprt->sc_ctxt_max = ctxt_max;
+	for (remaining = ctxt_count; remaining > 0; remaining--) {
+		fresh = kmalloc(sizeof(*fresh), GFP_KERNEL);
+		if (!fresh)
+			continue;	/* failed allocations are simply not counted */
+		fresh->next = xprt->sc_ctxt_head;
+		xprt->sc_ctxt_head = fresh;
+		xprt->sc_ctxt_cnt++;
+	}
+}
+/* Free every context on the NULL-terminated ->next chain that starts
+ * at ctxt; an empty list is a no-op. */
+static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt)
+{
+	struct svc_rdma_op_ctxt *doomed;
+
+	/* Save each link before the node holding it is freed */
+	while (ctxt) {
+		doomed = ctxt;
+		ctxt = doomed->next;
+		kfree(doomed);
+	}
+}
+/* Allocate and initialise a transport; non-listeners also get a context cache. NULL on failure. */
+static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
+ int listener)
+{
+ struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);
+
+ if (!cma_xprt)
+ return NULL;
+ svc_xprt_init(&svc_rdma_class, &cma_xprt->sc_xprt, serv);
+ INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
+ INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
+ INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
+ INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
+ init_waitqueue_head(&cma_xprt->sc_send_wait);
+
+ spin_lock_init(&cma_xprt->sc_lock);
+ spin_lock_init(&cma_xprt->sc_read_complete_lock);
+ spin_lock_init(&cma_xprt->sc_ctxt_lock);
+ spin_lock_init(&cma_xprt->sc_rq_dto_lock);
+
+ cma_xprt->sc_ord = svcrdma_ord;
+ /* limits are copied from the svcrdma_* variables */
+ cma_xprt->sc_max_req_size = svcrdma_max_req_size;
+ cma_xprt->sc_max_requests = svcrdma_max_requests;
+ cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
+ atomic_set(&cma_xprt->sc_sq_count, 0);
+
+ if (!listener) {
+ int reqs = cma_xprt->sc_max_requests;
+ create_context_cache(cma_xprt,
+ reqs << 1, /* starting size */
+ reqs, /* bump amount */
+ reqs +
+ cma_xprt->sc_sq_depth +
+ RPCRDMA_MAX_THREADS + 1); /* max */
+ if (!cma_xprt->sc_ctxt_head) { /* not a single context could be allocated */
+ kfree(cma_xprt);
+ return NULL;
+ }
+ clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
+ } else
+ set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
+
+ return cma_xprt;
+}
+/* Allocate one page, sleeping 1000 ms between attempts until it succeeds. */
+struct page *svc_rdma_get_page(void)
+{
+ struct page *page;
+
+ while ((page = alloc_page(GFP_KERNEL)) == NULL) {
+ /* If we can't get memory, wait a bit and try again */
+ printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 "
+ "ms.\n");
+ schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
+ }
+ return page;
+}
+/* Post one receive WR backed by new DMA-mapped pages; returns ib_post_recv()'s result. */
+int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
+{
+ struct ib_recv_wr recv_wr, *bad_recv_wr;
+ struct svc_rdma_op_ctxt *ctxt;
+ struct page *page;
+ int sge_no;
+ int buflen = 0;
+ int ret;
+
+ ctxt = svc_rdma_get_context(xprt);
+ ctxt->direction = DMA_FROM_DEVICE;
+ for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
+ BUG_ON(sge_no >= xprt->sc_max_sge);
+ page = svc_rdma_get_page();
+ ctxt->pages[sge_no] = page;
+ ctxt->sge[sge_no].addr = /* stored directly: no narrower temporary */
+ ib_dma_map_page(xprt->sc_cm_id->device,
+ page, 0, PAGE_SIZE,
+ DMA_FROM_DEVICE);
+ ctxt->sge[sge_no].length = PAGE_SIZE;
+ ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey;
+ buflen += PAGE_SIZE;
+ }
+ ctxt->count = sge_no;
+ recv_wr.next = NULL;
+ recv_wr.sg_list = &ctxt->sge[0];
+ recv_wr.num_sge = ctxt->count;
+ recv_wr.wr_id = (u64)(unsigned long)ctxt;
+
+ ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
+ if (ret) /* post failed: unmap and free the pages, return ctxt to the cache */
+ svc_rdma_put_context(ctxt, 1);
+ return ret;
+}
+
+/*
+ * This function handles the CONNECT_REQUEST event on a listening
+ * endpoint. It is passed the cma_id for the _new_ connection. The context in
+ * this cma_id is inherited from the listening cma_id and is the svc_xprt
+ * structure for the listening endpoint.
+ *
+ * This function creates a new xprt for the new connection and enqueues it on
+ * the accept queue for the listen xprt. When the listen thread is kicked, it
+ * will call the recvfrom method on the listen xprt which will accept the new
+ * connection.
+ */
+static void handle_connect_req(struct rdma_cm_id *new_cma_id)
+{
+ struct svcxprt_rdma *listen_xprt = new_cma_id->context;
+ struct svcxprt_rdma *newxprt;
+
+ /* Create a new transport */
+ newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0);
+ if (!newxprt) {
+ dprintk("svcrdma: failed to create new transport\n");
+ return; /* new_cma_id is left untouched on this path */
+ }
+ newxprt->sc_cm_id = new_cma_id;
+ new_cma_id->context = newxprt; /* the new id now points at its own xprt */
+ dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
+ newxprt, newxprt->sc_cm_id, listen_xprt);
+
+ /*
+ * Enqueue the new transport on the accept queue of the listening
+ * transport
+ */
+ spin_lock_bh(&listen_xprt->sc_lock);
+ list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q);
+ spin_unlock_bh(&listen_xprt->sc_lock);
+
+ /*
+ * Can't use svc_xprt_received here because we are not on a
+ * rqstp thread
+ */
+ set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags);
+ svc_xprt_enqueue(&listen_xprt->sc_xprt);
+}
+
+/*
+ * Handles events generated on the listening endpoint. These events will
+ * either be incoming connect requests or adapter removal events.
+ */
+static int rdma_listen_handler(struct rdma_cm_id *cma_id,
+ struct rdma_cm_event *event)
+{
+ struct svcxprt_rdma *xprt = cma_id->context;
+ int ret = 0; /* never changed below: the handler always returns 0 */
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
+ "event=%d\n", cma_id, cma_id->context, event->event);
+ handle_connect_req(cma_id);
+ break;
+
+ case RDMA_CM_EVENT_ESTABLISHED:
+ /* Accept complete */
+ dprintk("svcrdma: Connection completed on LISTEN xprt=%p, "
+ "cm_id=%p\n", xprt, cma_id);
+ break;
+
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n",
+ xprt, cma_id);
+ if (xprt)
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ break;
+
+ default:
+ dprintk("svcrdma: Unexpected event on listening endpoint %p, "
+ "event=%d\n", cma_id, event->event);
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * CM event handler for connected (DTO) endpoints. It is installed on a
+ * cma_id by svc_rdma_accept in place of the listen handler.
+ *
+ * ESTABLISHED clears RDMAXPRT_CONN_PENDING and enqueues the transport.
+ * DISCONNECTED and DEVICE_REMOVAL mark the transport XPT_CLOSE and enqueue
+ * it. Any other event is only logged.
+ *
+ * Always returns 0.
+ */
+static int rdma_cma_handler(struct rdma_cm_id *cma_id,
+			    struct rdma_cm_event *event)
+{
+	struct svc_xprt *xprt = cma_id->context;
+	struct svcxprt_rdma *rdma;
+
+	rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_ESTABLISHED:
+		/* Accept complete */
+		dprintk("svcrdma: Connection completed on DTO xprt=%p, "
+			"cm_id=%p\n", xprt, cma_id);
+		clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags);
+		svc_xprt_enqueue(xprt);
+		return 0;
+	case RDMA_CM_EVENT_DISCONNECTED:
+		dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n",
+			xprt, cma_id);
+		break;
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, "
+			"event=%d\n", cma_id, xprt, event->event);
+		break;
+	default:
+		dprintk("svcrdma: Unexpected event on DTO endpoint %p, "
+			"event=%d\n", cma_id, event->event);
+		return 0;
+	}
+
+	/* Disconnect and device removal both close the transport */
+	if (xprt != NULL) {
+		set_bit(XPT_CLOSE, &xprt->xpt_flags);
+		svc_xprt_enqueue(xprt);
+	}
+	return 0;
+}
+
+/*
+ * Create a listening RDMA service endpoint.
+ *
+ * Allocates a listener transport, creates a CM identifier whose events are
+ * delivered to rdma_listen_handler, binds it to @sa and starts listening
+ * with a backlog of RPCRDMA_LISTEN_BACKLOG.
+ *
+ * @serv:  service the new transport belongs to
+ * @sa:    local address to bind to
+ * @salen: length of @sa, passed on to svc_xprt_set_local
+ * @flags: not used by this transport
+ *
+ * Returns the svc_xprt embedded in the new transport, or an ERR_PTR()
+ * carrying a negative errno. On failure everything acquired so far is
+ * released.
+ */
+static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
+					struct sockaddr *sa, int salen,
+					int flags)
+{
+	struct rdma_cm_id *listen_id;
+	struct svcxprt_rdma *cma_xprt;
+	int ret;
+
+	dprintk("svcrdma: Creating RDMA socket\n");
+
+	cma_xprt = rdma_create_xprt(serv, 1);
+	if (!cma_xprt)
+		return ERR_PTR(-ENOMEM);	/* errno must be negative for IS_ERR() */
+
+	listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP);
+	if (IS_ERR(listen_id)) {
+		dprintk("svcrdma: rdma_create_id failed = %ld\n",
+			PTR_ERR(listen_id));
+		ret = PTR_ERR(listen_id);
+		goto err_xprt;
+	}
+
+	ret = rdma_bind_addr(listen_id, sa);
+	if (ret) {
+		dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret);
+		goto err_id;
+	}
+	cma_xprt->sc_cm_id = listen_id;
+
+	ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
+	if (ret) {
+		dprintk("svcrdma: rdma_listen failed = %d\n", ret);
+		/* Must bail out: the cm_id is about to be destroyed */
+		goto err_id;
+	}
+
+	/*
+	 * We need to use the address from the cm_id in case the
+	 * caller specified 0 for the port number.
+	 */
+	sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr;
+	svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);
+
+	return &cma_xprt->sc_xprt;
+
+ err_id:
+	/* Destroy the id first so its handler can no longer see cma_xprt */
+	rdma_destroy_id(listen_id);
+ err_xprt:
+	/*
+	 * rdma_destroy_xprt only releases the resources hanging off the
+	 * transport; the structure itself is freed separately, exactly as
+	 * svc_rdma_free does.
+	 */
+	rdma_destroy_xprt(cma_xprt);
+	kfree(cma_xprt);
+	return ERR_PTR(ret);
+}
+
+/*
+ * This is the xpo_recvfrom function for listening endpoints. Its
+ * purpose is to accept incoming connections. The CMA callback handler
+ * has already created a new transport and attached it to the new CMA
+ * ID.
+ *
+ * There is a queue of pending connections hung on the listening
+ * transport. This queue contains the new svcxprt_rdma structures. This
+ * function takes one structure off the accept_q, sets up its PD, CQs,
+ * QP and DMA MR, posts the receive buffers and accepts the connection.
+ *
+ * Returns the svc_xprt of the accepted connection, or NULL if the queue
+ * was empty or the connection could not be set up. On failure the cm_id
+ * and the new transport are destroyed.
+ */
+static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
+{
+	struct svcxprt_rdma *listen_rdma;
+	struct svcxprt_rdma *newxprt = NULL;
+	struct rdma_conn_param conn_param;
+	struct ib_qp_init_attr qp_attr;
+	struct ib_device_attr devattr;
+	struct sockaddr *sa;
+	int ret;
+	int i;
+
+	listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	clear_bit(XPT_CONN, &xprt->xpt_flags);
+	/* Get the next entry off the accept list */
+	spin_lock_bh(&listen_rdma->sc_lock);
+	if (!list_empty(&listen_rdma->sc_accept_q)) {
+		newxprt = list_entry(listen_rdma->sc_accept_q.next,
+				     struct svcxprt_rdma, sc_accept_q);
+		list_del_init(&newxprt->sc_accept_q);
+	}
+	/* More connections pending: make sure we get called again */
+	if (!list_empty(&listen_rdma->sc_accept_q))
+		set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags);
+	spin_unlock_bh(&listen_rdma->sc_lock);
+	if (!newxprt)
+		return NULL;
+
+	dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
+		newxprt, newxprt->sc_cm_id);
+
+	ret = ib_query_device(newxprt->sc_cm_id->device, &devattr);
+	if (ret) {
+		dprintk("svcrdma: could not query device attributes on "
+			"device %p, rc=%d\n", newxprt->sc_cm_id->device, ret);
+		goto errout;
+	}
+
+	/* Qualify the transport resource defaults with the
+	 * capabilities of this particular device */
+	newxprt->sc_max_sge = min((size_t)devattr.max_sge,
+				  (size_t)RPCSVC_MAXPAGES);
+	newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
+				       (size_t)svcrdma_max_requests);
+	newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
+
+	newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom,
+			      (size_t)svcrdma_ord);
+
+	/*
+	 * The IS_ERR() paths below record the real error in ret so that
+	 * the errout message does not report a stale rc=0.
+	 */
+	newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
+	if (IS_ERR(newxprt->sc_pd)) {
+		ret = PTR_ERR(newxprt->sc_pd);
+		dprintk("svcrdma: error creating PD for connect request\n");
+		goto errout;
+	}
+	newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device,
+					 sq_comp_handler,
+					 cq_event_handler,
+					 newxprt,
+					 newxprt->sc_sq_depth,
+					 0);
+	if (IS_ERR(newxprt->sc_sq_cq)) {
+		ret = PTR_ERR(newxprt->sc_sq_cq);
+		dprintk("svcrdma: error creating SQ CQ for connect request\n");
+		goto errout;
+	}
+	newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device,
+					 rq_comp_handler,
+					 cq_event_handler,
+					 newxprt,
+					 newxprt->sc_max_requests,
+					 0);
+	if (IS_ERR(newxprt->sc_rq_cq)) {
+		ret = PTR_ERR(newxprt->sc_rq_cq);
+		dprintk("svcrdma: error creating RQ CQ for connect request\n");
+		goto errout;
+	}
+
+	memset(&qp_attr, 0, sizeof qp_attr);
+	qp_attr.event_handler = qp_event_handler;
+	qp_attr.qp_context = &newxprt->sc_xprt;
+	qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
+	qp_attr.cap.max_recv_wr = newxprt->sc_max_requests;
+	qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
+	qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
+	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	qp_attr.qp_type = IB_QPT_RC;
+	qp_attr.send_cq = newxprt->sc_sq_cq;
+	qp_attr.recv_cq = newxprt->sc_rq_cq;
+	dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n"
+		" cm_id->device=%p, sc_pd->device=%p\n"
+		" cap.max_send_wr = %d\n"
+		" cap.max_recv_wr = %d\n"
+		" cap.max_send_sge = %d\n"
+		" cap.max_recv_sge = %d\n",
+		newxprt->sc_cm_id, newxprt->sc_pd,
+		newxprt->sc_cm_id->device, newxprt->sc_pd->device,
+		qp_attr.cap.max_send_wr,
+		qp_attr.cap.max_recv_wr,
+		qp_attr.cap.max_send_sge,
+		qp_attr.cap.max_recv_sge);
+
+	ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
+	if (ret) {
+		/*
+		 * XXX: This is a hack. We need a xx_request_qp interface
+		 * that will adjust the qp_attr's with a best-effort
+		 * number
+		 */
+		qp_attr.cap.max_send_sge -= 2;
+		qp_attr.cap.max_recv_sge -= 2;
+		ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd,
+				     &qp_attr);
+		if (ret) {
+			dprintk("svcrdma: failed to create QP, ret=%d\n", ret);
+			goto errout;
+		}
+		/*
+		 * sc_max_sge is the single limit used for both directions,
+		 * so it must not exceed either of the granted values.
+		 */
+		newxprt->sc_max_sge = min(qp_attr.cap.max_send_sge,
+					  qp_attr.cap.max_recv_sge);
+		newxprt->sc_sq_depth = qp_attr.cap.max_send_wr;
+		newxprt->sc_max_requests = qp_attr.cap.max_recv_wr;
+	}
+	newxprt->sc_qp = newxprt->sc_cm_id->qp;
+
+	/* Register all of physical memory */
+	newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd,
+					    IB_ACCESS_LOCAL_WRITE |
+					    IB_ACCESS_REMOTE_WRITE);
+	if (IS_ERR(newxprt->sc_phys_mr)) {
+		ret = PTR_ERR(newxprt->sc_phys_mr);
+		dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret);
+		goto errout;
+	}
+
+	/* Post receive buffers */
+	for (i = 0; i < newxprt->sc_max_requests; i++) {
+		ret = svc_rdma_post_recv(newxprt);
+		if (ret) {
+			dprintk("svcrdma: failure posting receive buffers\n");
+			goto errout;
+		}
+	}
+
+	/* Swap out the handler */
+	newxprt->sc_cm_id->event_handler = rdma_cma_handler;
+
+	/* Accept Connection */
+	set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
+	memset(&conn_param, 0, sizeof conn_param);
+	conn_param.responder_resources = 0;
+	conn_param.initiator_depth = newxprt->sc_ord;
+	ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
+	if (ret) {
+		dprintk("svcrdma: failed to accept new connection, ret=%d\n",
+			ret);
+		goto errout;
+	}
+
+	dprintk("svcrdma: new connection %p accepted with the following "
+		"attributes:\n"
+		" local_ip : %d.%d.%d.%d\n"
+		" local_port : %d\n"
+		" remote_ip : %d.%d.%d.%d\n"
+		" remote_port : %d\n"
+		" max_sge : %d\n"
+		" sq_depth : %d\n"
+		" max_requests : %d\n"
+		" ord : %d\n",
+		newxprt,
+		NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id->
+			 route.addr.src_addr)->sin_addr.s_addr),
+		ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
+		       route.addr.src_addr)->sin_port),
+		NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id->
+			 route.addr.dst_addr)->sin_addr.s_addr),
+		ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id->
+		       route.addr.dst_addr)->sin_port),
+		newxprt->sc_max_sge,
+		newxprt->sc_sq_depth,
+		newxprt->sc_max_requests,
+		newxprt->sc_ord);
+
+	/* Set the local and remote addresses in the transport */
+	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
+	svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
+	svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+
+	/* Arm both CQs only once the transport is fully set up */
+	ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
+	ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
+	return &newxprt->sc_xprt;
+
+ errout:
+	dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret);
+	rdma_destroy_id(newxprt->sc_cm_id);
+	/*
+	 * rdma_destroy_xprt skips handles that are NULL or ERR_PTR, so it
+	 * is safe at any stage of the setup above. The transport was
+	 * already taken off the accept queue and is not returned to the
+	 * caller, so the structure itself is freed here, as svc_rdma_free
+	 * does.
+	 */
+	rdma_destroy_xprt(newxprt);
+	kfree(newxprt);
+	return NULL;
+}
+
+/*
+ * Post an RQ WQE to the RQ when the rqst is being released. This
+ * effectively returns an RQ credit to the client. The rq_xprt_ctxt
+ * will be null if the request is deferred due to an RDMA_READ or the
+ * transport had no data ready (EAGAIN). Note that an RPC deferred in
+ * svc_process will still return the credit; this is because the data
+ * is copied and no longer consumes a WQE/WC.
+ *
+ * rq_xprt_ctxt is always cleared on return. A failure to post the
+ * receive is only logged.
+ */
+static void svc_rdma_release_rqst(struct svc_rqst *rqstp)
+{
+	struct svcxprt_rdma *rdma;
+	int rc;
+
+	rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt);
+
+	if (rqstp->rq_xprt_ctxt != NULL) {
+		/* A non-null context must be the transport itself */
+		BUG_ON(rqstp->rq_xprt_ctxt != rdma);
+		rc = svc_rdma_post_recv(rdma);
+		if (rc != 0)
+			dprintk("svcrdma: failed to post an RQ WQE error=%d\n",
+				rc);
+	}
+
+	rqstp->rq_xprt_ctxt = NULL;
+}
+
+/*
+ * Detach the transport from its connection: disconnect and destroy the
+ * cm_id, then remove the transport from the DTO list if it is queued
+ * there.
+ */
+static void svc_rdma_detach(struct svc_xprt *xprt)
+{
+	unsigned long irq_flags;
+	struct svcxprt_rdma *rdma;
+
+	rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	dprintk("svc: svc_rdma_detach(%p)\n", xprt);
+
+	/*
+	 * Shutdown the connection. This will ensure we don't get any
+	 * more events from the provider.
+	 */
+	rdma_disconnect(rdma->sc_cm_id);
+	rdma_destroy_id(rdma->sc_cm_id);
+
+	/* We may already be on the DTO list; dto_lock protects it */
+	spin_lock_irqsave(&dto_lock, irq_flags);
+	if (!list_empty(&rdma->sc_dto_q))
+		list_del_init(&rdma->sc_dto_q);
+	spin_unlock_irqrestore(&dto_lock, irq_flags);
+}
+
+/*
+ * Release the resources held by the transport that embeds @xprt and free
+ * the svcxprt_rdma structure itself.
+ */
+static void svc_rdma_free(struct svc_xprt *xprt)
+{
+	/*
+	 * Use container_of like the rest of this file rather than a raw
+	 * cast, which is only correct while sc_xprt is the first member.
+	 */
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
+	dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
+	rdma_destroy_xprt(rdma);
+	kfree(rdma);
+}
+
+/*
+ * Release the verbs resources and the context cache attached to @xprt.
+ *
+ * Every handle is checked for both NULL and ERR_PTR before it is
+ * destroyed, so this can be called on a partially set up transport. The
+ * svcxprt_rdma structure itself is not freed here.
+ */
+static void rdma_destroy_xprt(struct svcxprt_rdma *xprt)
+{
+	/* Queue pair first, then the completion queues */
+	if (!IS_ERR(xprt->sc_qp) && xprt->sc_qp != NULL)
+		ib_destroy_qp(xprt->sc_qp);
+
+	if (!IS_ERR(xprt->sc_sq_cq) && xprt->sc_sq_cq != NULL)
+		ib_destroy_cq(xprt->sc_sq_cq);
+
+	if (!IS_ERR(xprt->sc_rq_cq) && xprt->sc_rq_cq != NULL)
+		ib_destroy_cq(xprt->sc_rq_cq);
+
+	/* The memory region is released before the protection domain */
+	if (!IS_ERR(xprt->sc_phys_mr) && xprt->sc_phys_mr != NULL)
+		ib_dereg_mr(xprt->sc_phys_mr);
+
+	if (!IS_ERR(xprt->sc_pd) && xprt->sc_pd != NULL)
+		ib_dealloc_pd(xprt->sc_pd);
+
+	destroy_context_cache(xprt->sc_ctxt_head);
+}
+
+/*
+ * Report whether the transport has room to send a reply.
+ *
+ * Returns 0 if fewer than 3 SQ WR are free (the number treated here as
+ * the minimum for a simple response) or if somebody is already waiting
+ * for SQ space; returns 1 otherwise.
+ */
+static int svc_rdma_has_wspace(struct svc_xprt *xprt)
+{
+	struct svcxprt_rdma *rdma;
+
+	rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
+	/* Not enough free SQ WR for even a simple response */
+	if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3))
+		return 0;
+
+	/* Space is available only if nobody is already waiting on the SQ */
+	return !waitqueue_active(&rdma->sc_send_wait);
+}
+
+/*
+ * Post a signaled send work request on the transport's QP.
+ *
+ * If the transport is already marked XPT_CLOSE nothing is posted and 0 is
+ * returned. If the SQ is full, completed SQ WR are reaped and the caller
+ * sleeps until an entry becomes available. sc_sq_count is bumped only
+ * when the post succeeds.
+ *
+ * @wr must have send_flags == IB_SEND_SIGNALED and its wr_id must point
+ * at a svc_rdma_op_ctxt whose wr_op matches wr->opcode; both are enforced
+ * with BUG_ON.
+ *
+ * Returns the result of ib_post_send.
+ */
+int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
+{
+	struct ib_send_wr *failed_wr;
+	int rc;
+
+	if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
+		return 0;
+
+	BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
+	BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op !=
+		wr->opcode);
+
+	/* Leave this loop with sc_lock held and at least one free SQ entry */
+	for (;;) {
+		spin_lock_bh(&xprt->sc_lock);
+		if (xprt->sc_sq_depth != atomic_read(&xprt->sc_sq_count))
+			break;
+		spin_unlock_bh(&xprt->sc_lock);
+
+		atomic_inc(&rdma_stat_sq_starve);
+		/* See if we can reap some SQ WR */
+		sq_cq_reap(xprt);
+
+		/* Wait until SQ WR available if SQ still full */
+		wait_event(xprt->sc_send_wait,
+			   atomic_read(&xprt->sc_sq_count) <
+			   xprt->sc_sq_depth);
+	}
+
+	/* Post, and account for the SQ WR only if the post was accepted */
+	rc = ib_post_send(xprt->sc_qp, wr, &failed_wr);
+	if (rc)
+		dprintk("svcrdma: failed to post SQ WR rc=%d, "
+			"sc_sq_count=%d, sc_sq_depth=%d\n",
+			rc, atomic_read(&xprt->sc_sq_count),
+			xprt->sc_sq_depth);
+	else
+		atomic_inc(&xprt->sc_sq_count);
+	spin_unlock_bh(&xprt->sc_lock);
+
+	return rc;
+}
+
+/*
+ * Encode an error reply for @rmsgp with code @err into a fresh page and
+ * post it as a single-SGE signaled SEND.
+ *
+ * Returns the result of svc_rdma_send. If the post fails, the context is
+ * handed back with svc_rdma_put_context(ctxt, 1).
+ */
+int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
+			enum rpcrdma_errcode err)
+{
+	struct svc_rdma_op_ctxt *ctxt;
+	struct ib_send_wr wr;
+	struct ib_sge sge;
+	struct page *pg;
+	u32 *buf;
+	int len;
+	int rc;
+
+	pg = svc_rdma_get_page();
+	buf = page_address(pg);
+
+	/* XDR encode error */
+	len = svc_rdma_xdr_encode_error(xprt, rmsgp, err, buf);
+
+	/*
+	 * Prepare SGE for local address.
+	 * NOTE(review): the page is the source of a SEND, so DMA_TO_DEVICE
+	 * looks like the expected direction - confirm against the unmap
+	 * path before changing.
+	 */
+	sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
+				   pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
+	sge.lkey = xprt->sc_phys_mr->lkey;
+	sge.length = len;
+
+	/* The context carries the page so it can be found from the WR */
+	ctxt = svc_rdma_get_context(xprt);
+	ctxt->count = 1;
+	ctxt->pages[0] = pg;
+	ctxt->wr_op = IB_WR_SEND;
+
+	/* Prepare SEND WR */
+	memset(&wr, 0, sizeof wr);
+	wr.wr_id = (unsigned long)ctxt;
+	wr.sg_list = &sge;
+	wr.num_sge = 1;
+	wr.opcode = IB_WR_SEND;
+	wr.send_flags = IB_SEND_SIGNALED;
+
+	/* Post It */
+	rc = svc_rdma_send(xprt, &wr);
+	if (rc) {
+		dprintk("svcrdma: Error posting send = %d\n", rc);
+		svc_rdma_put_context(ctxt, 1);
+	}
+
+	return rc;
+}