[SK_BUFF]: Convert skb->tail to sk_buff_data_t

So that it is also an offset from skb->head, reduces its size from 8 to 4 bytes
on 64bit architectures, allowing us to combine the 4 bytes hole left by the
layer headers conversion, reducing struct sk_buff size to 256 bytes, i.e. 4
64byte cachelines, and since the sk_buff slab cache is SLAB_HWCACHE_ALIGN...
:-)

Many calculations that previously required that skb->{transport,network,
mac}_header be first converted to a pointer now can be done directly, being
meaningful as offsets or pointers.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index 959c306..63146d3 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -418,17 +418,19 @@
 int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
 {
 	int i;
-	struct rtattr * top_start = (struct rtattr*) skb->tail;
-	struct rtattr * list_start;
+	u8 *tail;
+	struct rtattr *top_start = (struct rtattr *)skb_tail_pointer(skb);
+	struct rtattr *list_start;
 
 	RTA_PUT(skb, tlv, 0, NULL);
 	RTA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr);
 
-	list_start = (struct rtattr *) skb->tail;
+	list_start = (struct rtattr *)skb_tail_pointer(skb);
 	RTA_PUT(skb, TCA_EMATCH_TREE_LIST, 0, NULL);
 
+	tail = skb_tail_pointer(skb);
 	for (i = 0; i < tree->hdr.nmatches; i++) {
-		struct rtattr *match_start = (struct rtattr*) skb->tail;
+		struct rtattr *match_start = (struct rtattr *)tail;
 		struct tcf_ematch *em = tcf_em_get_match(tree, i);
 		struct tcf_ematch_hdr em_hdr = {
 			.kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER,
@@ -447,11 +449,12 @@
 		} else if (em->datalen > 0)
 			RTA_PUT_NOHDR(skb, em->datalen, (void *) em->data);
 
-		match_start->rta_len = skb->tail - (u8*) match_start;
+		tail = skb_tail_pointer(skb);
+		match_start->rta_len = tail - (u8 *)match_start;
 	}
 
-	list_start->rta_len = skb->tail - (u8 *) list_start;
-	top_start->rta_len = skb->tail - (u8 *) top_start;
+	list_start->rta_len = tail - (u8 *)list_start;
+	top_start->rta_len = tail - (u8 *)top_start;
 
 	return 0;