From 81559f9ad3d88c033e4ec3b6468012dbfda3b31d Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Tue, 18 Oct 2011 13:33:02 +0300
Subject: crypto: twofish-x86_64-3way - add lrw support

Patch adds LRW support for twofish-x86_64-3way by using lrw_crypt(). Patch has
been tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (twofish-3way/twofish-asm speed ratios):

Intel Celeron T1600 (fam:6, model:15, step:13):

size	lrw-enc	lrw-dec
16B	0.99x	1.00x
64B	1.17x	1.17x
256B	1.26x	1.27x
1024B	1.30x	1.31x
8192B	1.31x	1.32x

AMD Phenom II 1055T (fam:16, model:10):

size	lrw-enc	lrw-dec
16B	1.06x	1.01x
64B	1.08x	1.14x
256B	1.19x	1.20x
1024B	1.21x	1.22x
8192B	1.23x	1.24x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish_glue_3way.c | 135 ++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 5ede9c4..fa9151d 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -32,6 +32,11 @@
 #include <crypto/algapi.h>
 #include <crypto/twofish.h>
 #include <crypto/b128ops.h>
+#include <crypto/lrw.h>
+
+#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
+#define HAS_LRW
+#endif
 
 /* regular block cipher functions from twofish_x86_64 module */
 asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
@@ -432,6 +437,124 @@ static struct crypto_alg blk_ctr_alg = {
 	},
 };
 
+#ifdef HAS_LRW
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = TF_BLOCK_SIZE;
+	struct twofish_ctx *ctx = priv;
+	int i;
+
+	if (nbytes == 3 * bsize) {
+		twofish_enc_blk_3way(ctx, srcdst, srcdst);
+		return;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		twofish_enc_blk(ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = TF_BLOCK_SIZE;
+	struct twofish_ctx *ctx = priv;
+	int i;
+
+	if (nbytes == 3 * bsize) {
+		twofish_dec_blk_3way(ctx, srcdst, srcdst);
+		return;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		twofish_dec_blk(ctx, srcdst, srcdst);
+}
+
+struct twofish_lrw_ctx {
+	struct lrw_table_ctx lrw_table;
+	struct twofish_ctx twofish_ctx;
+};
+
+static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+	int err;
+
+	err = __twofish_setkey(&ctx->twofish_ctx, key, keylen - TF_BLOCK_SIZE,
+			       &tfm->crt_flags);
+	if (err)
+		return err;
+
+	return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[3];
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &ctx->twofish_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+
+	return lrw_crypt(desc, dst, src, nbytes, &req);
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[3];
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &ctx->twofish_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+
+	return lrw_crypt(desc, dst, src, nbytes, &req);
+}
+
+static void lrw_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	lrw_free_table(&ctx->lrw_table);
+}
+
+static struct crypto_alg blk_lrw_alg = {
+	.cra_name		= "lrw(twofish)",
+	.cra_driver_name	= "lrw-twofish-3way",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_lrw_alg.cra_list),
+	.cra_exit		= lrw_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE + TF_BLOCK_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE + TF_BLOCK_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= lrw_twofish_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+};
+
+#endif
+
 int __init init(void)
 {
 	int err;
@@ -445,9 +568,18 @@ int __init init(void)
 	err = crypto_register_alg(&blk_ctr_alg);
 	if (err)
 		goto ctr_err;
+#ifdef HAS_LRW
+	err = crypto_register_alg(&blk_lrw_alg);
+	if (err)
+		goto blk_lrw_err;
+#endif
 
 	return 0;
 
+#ifdef HAS_LRW
+blk_lrw_err:
+	crypto_unregister_alg(&blk_ctr_alg);
+#endif
 ctr_err:
 	crypto_unregister_alg(&blk_cbc_alg);
 cbc_err:
@@ -458,6 +590,9 @@ ecb_err:
 
 void __exit fini(void)
 {
+#ifdef HAS_LRW
+	crypto_unregister_alg(&blk_lrw_alg);
+#endif
 	crypto_unregister_alg(&blk_ctr_alg);
 	crypto_unregister_alg(&blk_cbc_alg);
 	crypto_unregister_alg(&blk_ecb_alg);
-- 
cgit v1.1


From bae6d3038b7faff187f4207448a40b9912cf787d Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Tue, 18 Oct 2011 13:33:43 +0300
Subject: crypto: twofish-x86_64-3way - add xts support

Patch adds XTS support for twofish-x86_64-3way by using xts_crypt(). Patch has
been tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (twofish-3way/twofish-asm speed ratios):

Intel Celeron T1600 (fam:6, model:15, step:13):

size    xts-enc xts-dec
16B     0.98x   1.00x
64B     1.14x   1.15x
256B    1.23x   1.25x
1024B   1.26x   1.29x
8192B   1.28x   1.30x

AMD Phenom II 1055T (fam:16, model:10):

size    xts-enc xts-dec
16B     1.03x   1.03x
64B     1.13x   1.16x
256B    1.20x   1.20x
1024B   1.22x   1.22x
8192B   1.22x   1.21x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish_glue_3way.c | 119 +++++++++++++++++++++++++++++++++++-
 1 file changed, 117 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index fa9151d..954f59e 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -33,11 +33,16 @@
 #include <crypto/twofish.h>
 #include <crypto/b128ops.h>
 #include <crypto/lrw.h>
+#include <crypto/xts.h>
 
 #if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
 #define HAS_LRW
 #endif
 
+#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
+#define HAS_XTS
+#endif
+
 /* regular block cipher functions from twofish_x86_64 module */
 asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
 				const u8 *src);
@@ -437,7 +442,7 @@ static struct crypto_alg blk_ctr_alg = {
 	},
 };
 
-#ifdef HAS_LRW
+#if defined(HAS_LRW) || defined(HAS_XTS)
 
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
@@ -469,6 +474,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 		twofish_dec_blk(ctx, srcdst, srcdst);
 }
 
+#endif
+
+#ifdef HAS_LRW
+
 struct twofish_lrw_ctx {
 	struct lrw_table_ctx lrw_table;
 	struct twofish_ctx twofish_ctx;
@@ -555,6 +564,99 @@ static struct crypto_alg blk_lrw_alg = {
 
 #endif
 
+#ifdef HAS_XTS
+
+struct twofish_xts_ctx {
+	struct twofish_ctx tweak_ctx;
+	struct twofish_ctx crypt_ctx;
+};
+
+static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+	u32 *flags = &tfm->crt_flags;
+	int err;
+
+	/* key consists of keys of equal size concatenated, therefore
+	 * the length must be even
+	 */
+	if (keylen % 2) {
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	/* first half of xts-key is for crypt */
+	err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+	if (err)
+		return err;
+
+	/* second half of xts-key is for tweak */
+	return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
+				flags);
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[3];
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
+		.crypt_ctx = &ctx->crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+
+	return xts_crypt(desc, dst, src, nbytes, &req);
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[3];
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
+		.crypt_ctx = &ctx->crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+
+	return xts_crypt(desc, dst, src, nbytes, &req);
+}
+
+static struct crypto_alg blk_xts_alg = {
+	.cra_name		= "xts(twofish)",
+	.cra_driver_name	= "xts-twofish-3way",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_xts_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE * 2,
+			.max_keysize	= TF_MAX_KEY_SIZE * 2,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= xts_twofish_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+};
+
+#endif
+
 int __init init(void)
 {
 	int err;
@@ -573,13 +675,23 @@ int __init init(void)
 	if (err)
 		goto blk_lrw_err;
 #endif
+#ifdef HAS_XTS
+	err = crypto_register_alg(&blk_xts_alg);
+	if (err)
+		goto blk_xts_err;
+#endif
 
 	return 0;
 
+#ifdef HAS_XTS
+	crypto_unregister_alg(&blk_xts_alg);
+blk_xts_err:
+#endif
 #ifdef HAS_LRW
+	crypto_unregister_alg(&blk_lrw_alg);
 blk_lrw_err:
-	crypto_unregister_alg(&blk_ctr_alg);
 #endif
+	crypto_unregister_alg(&blk_ctr_alg);
 ctr_err:
 	crypto_unregister_alg(&blk_cbc_alg);
 cbc_err:
@@ -590,6 +702,9 @@ ecb_err:
 
 void __exit fini(void)
 {
+#ifdef HAS_XTS
+	crypto_unregister_alg(&blk_xts_alg);
+#endif
 #ifdef HAS_LRW
 	crypto_unregister_alg(&blk_lrw_alg);
 #endif
-- 
cgit v1.1


From ff14c1d01576fb839a925a42596582f6c68a1a1a Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 1 Nov 2011 15:58:16 +0200
Subject: x86, mm: Use MAX_DMA_PFN for ZONE_DMA on 32-bit

Use MAX_DMA_PFN which represents the 16 MB ISA DMA limit on
32-bit x86 just like we do on 64-bit.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-1-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 29f7c6d9..434c97d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -679,8 +679,7 @@ static void __init zone_sizes_init(void)
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 #ifdef CONFIG_ZONE_DMA
-	max_zone_pfns[ZONE_DMA] =
-		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 #ifdef CONFIG_HIGHMEM
-- 
cgit v1.1


From 4c0b2e5f8940fec7cbeafcf641fecd5e746329c5 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 1 Nov 2011 15:58:17 +0200
Subject: x86, mm: Move zone init from paging_init() on 64-bit

This patch introduces a zone_sizes_init() helper function on
64-bit to make it more similar to 32-bit init.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-2-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bbaaa00..3ddda59 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -612,7 +612,7 @@ void __init initmem_init(void)
 }
 #endif
 
-void __init paging_init(void)
+static void __init zone_sizes_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 
@@ -623,6 +623,11 @@ void __init paging_init(void)
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 	max_zone_pfns[ZONE_NORMAL] = max_pfn;
 
+	free_area_init_nodes(max_zone_pfns);
+}
+
+void __init paging_init(void)
+{
 	sparse_memory_present_with_active_regions(MAX_NUMNODES);
 	sparse_init();
 
@@ -634,7 +639,7 @@ void __init paging_init(void)
 	 */
 	node_clear_state(0, N_NORMAL_MEMORY);
 
-	free_area_init_nodes(max_zone_pfns);
+	zone_sizes_init();
 }
 
 /*
-- 
cgit v1.1


From e4794640ca408acda18eb31b126f58a58803b9c9 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 1 Nov 2011 15:58:18 +0200
Subject: x86, mm: Use max_pfn instead of highend_pfn

The 'highend_pfn' variable is always set to 'max_pfn' so just
use the latter directly.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-3-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 434c97d..5ac0118 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -683,7 +683,7 @@ static void __init zone_sizes_init(void)
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 #ifdef CONFIG_HIGHMEM
-	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
+	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
 #endif
 
 	free_area_init_nodes(max_zone_pfns);
-- 
cgit v1.1


From 80b3cac97bc14fdf839d967602e599cbf82ea336 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 1 Nov 2011 15:58:19 +0200
Subject: x86, mm: Wrap ZONE_DMA32 with CONFIG_ZONE_DMA32

In preparation for unifying 32-bit and 64-bit zone_sizes_init()
make sure ZONE_DMA32 is wrapped in CONFIG_ZONE_DMA32.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Arun Sharma <asharma@fb.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-4-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3ddda59..a9214e6 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -620,7 +620,9 @@ static void __init zone_sizes_init(void)
 #ifdef CONFIG_ZONE_DMA
 	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 #endif
+#ifdef CONFIG_ZONE_DMA32
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+#endif
 	max_zone_pfns[ZONE_NORMAL] = max_pfn;
 
 	free_area_init_nodes(max_zone_pfns);
-- 
cgit v1.1


From ece838b6257412647197c072fe59dfc6615df144 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 1 Nov 2011 15:58:20 +0200
Subject: x86, mm: Use max_low_pfn for ZONE_NORMAL on 64-bit

64-bit has no highmem so max_low_pfn is always the same as
'max_pfn'.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-5-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a9214e6..f6b1f08 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -623,7 +623,7 @@ static void __init zone_sizes_init(void)
 #ifdef CONFIG_ZONE_DMA32
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 #endif
-	max_zone_pfns[ZONE_NORMAL] = max_pfn;
+	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 
 	free_area_init_nodes(max_zone_pfns);
 }
-- 
cgit v1.1


From 248b52b97da7a712d2263a51d8d84c959f38ef75 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 1 Nov 2011 15:58:21 +0200
Subject: x86, mm: Prepare zone_sizes_init() for unification

Make 32-bit and 64-bit zone_sizes_init() identical in
preparation for unification.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-6-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 4 ++++
 arch/x86/mm/init_64.c | 3 +++
 2 files changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 5ac0118..27455b9 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -677,10 +677,14 @@ void __init initmem_init(void)
 static void __init zone_sizes_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
+
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 #ifdef CONFIG_ZONE_DMA
 	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
 #endif
+#ifdef CONFIG_ZONE_DMA32
+        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+#endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 #ifdef CONFIG_HIGHMEM
 	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index f6b1f08..06c4360 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -624,6 +624,9 @@ static void __init zone_sizes_init(void)
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 #endif
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
+#endif
 
 	free_area_init_nodes(max_zone_pfns);
 }
-- 
cgit v1.1


From 176239153049a023d060ce95b05f7ef31667e362 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 1 Nov 2011 15:58:22 +0200
Subject: x86, mm: Unify zone_sizes_init()

Now that zone_sizes_init() is identical on 32-bit and 64-bit,
move the code to arch/x86/mm/init.c and use it for both
architectures.

Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
Link: http://lkml.kernel.org/r/1320155902-10424-7-git-send-email-penberg@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/init.h |  2 ++
 arch/x86/mm/init.c          | 23 +++++++++++++++++++++++
 arch/x86/mm/init_32.c       | 19 -------------------
 arch/x86/mm/init_64.c       | 19 -------------------
 4 files changed, 25 insertions(+), 38 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 8dbe353..adcc0ae 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -5,6 +5,8 @@
 extern void __init early_ioremap_page_table_range_init(void);
 #endif
 
+extern void __init zone_sizes_init(void);
+
 extern unsigned long __init
 kernel_physical_mapping_init(unsigned long start,
 			     unsigned long end,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 87488b9..2426b60 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -3,6 +3,7 @@
 #include <linux/ioport.h>
 #include <linux/swap.h>
 #include <linux/memblock.h>
+#include <linux/bootmem.h>	/* for max_low_pfn */
 
 #include <asm/cacheflush.h>
 #include <asm/e820.h>
@@ -15,6 +16,7 @@
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/proto.h>
+#include <asm/dma.h>		/* for MAX_DMA_PFN */
 
 unsigned long __initdata pgt_buf_start;
 unsigned long __meminitdata pgt_buf_end;
@@ -392,3 +394,24 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	free_init_pages("initrd memory", start, PAGE_ALIGN(end));
 }
 #endif
+
+void __init zone_sizes_init(void)
+{
+	unsigned long max_zone_pfns[MAX_NR_ZONES];
+
+	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+
+#ifdef CONFIG_ZONE_DMA
+	max_zone_pfns[ZONE_DMA]		= MAX_DMA_PFN;
+#endif
+#ifdef CONFIG_ZONE_DMA32
+	max_zone_pfns[ZONE_DMA32]	= MAX_DMA32_PFN;
+#endif
+	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn;
+#endif
+
+	free_area_init_nodes(max_zone_pfns);
+}
+
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 27455b9..3bebaed 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -674,25 +674,6 @@ void __init initmem_init(void)
 }
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
-static void __init zone_sizes_init(void)
-{
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-#ifdef CONFIG_ZONE_DMA
-	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-#endif
-#ifdef CONFIG_ZONE_DMA32
-        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-#endif
-	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
-	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
-#endif
-
-	free_area_init_nodes(max_zone_pfns);
-}
-
 void __init setup_bootmem_allocator(void)
 {
 	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 06c4360..6fcce7d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -612,25 +612,6 @@ void __init initmem_init(void)
 }
 #endif
 
-static void __init zone_sizes_init(void)
-{
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-#ifdef CONFIG_ZONE_DMA
-	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-#endif
-#ifdef CONFIG_ZONE_DMA32
-	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-#endif
-	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
-	max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
-#endif
-
-	free_area_init_nodes(max_zone_pfns);
-}
-
 void __init paging_init(void)
 {
 	sparse_memory_present_with_active_regions(MAX_NUMNODES);
-- 
cgit v1.1


From bcb71abe7d4c5a0d0368c67da0a7def4fc73497a Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Fri, 21 Oct 2011 15:56:24 -0400
Subject: iommu: Add option to group multi-function devices

The option iommu=group_mf indicates the that the iommu driver should
expose all functions of a multi-function PCI device as the same
iommu_device_group.  This is useful for disallowing individual functions
being exposed as independent devices to userspace as there are often
hidden dependencies.  Virtual functions are not affected by this option.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/iommu.h |  1 +
 arch/x86/kernel/pci-dma.c    | 11 +++++++++++
 2 files changed, 12 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 345c99c..dffc38e 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -5,6 +5,7 @@ extern struct dma_map_ops nommu_dma_ops;
 extern int force_iommu, no_iommu;
 extern int iommu_detected;
 extern int iommu_pass_through;
+extern int iommu_group_mf;
 
 /* 10 seconds */
 #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 80dc793..1c4d769 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -45,6 +45,15 @@ int iommu_detected __read_mostly = 0;
  */
 int iommu_pass_through __read_mostly;
 
+/*
+ * Group multi-function PCI devices into a single device-group for the
+ * iommu_device_group interface.  This tells the iommu driver to pretend
+ * it cannot distinguish between functions of a device, exposing only one
+ * group for the device.  Useful for disallowing use of individual PCI
+ * functions from userspace drivers.
+ */
+int iommu_group_mf __read_mostly;
+
 extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
 
 /* Dummy device used for NULL arguments (normally ISA). */
@@ -169,6 +178,8 @@ static __init int iommu_setup(char *p)
 #endif
 		if (!strncmp(p, "pt", 2))
 			iommu_pass_through = 1;
+		if (!strncmp(p, "group_mf", 8))
+			iommu_group_mf = 1;
 
 		gart_parse_options(p);
 
-- 
cgit v1.1


From b82e324b3c46a554595c12b45465d1943a57326c Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Thu, 10 Nov 2011 13:18:09 +0000
Subject: serial, mfd: don't hardcode the console

Add support to specify which HSU port to use as an early console. This can
be selected by passing "earlyprintk=hsu<n>" on the kernel command line. By
default port 0 is still used.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/x86/include/asm/mrst.h                |  2 +-
 arch/x86/kernel/early_printk.c             |  2 +-
 arch/x86/platform/mrst/early_printk_mrst.c | 16 ++++++++++++----
 3 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 719f00b..4707760 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -51,7 +51,7 @@ extern struct console early_mrst_console;
 extern void mrst_early_console_init(void);
 
 extern struct console early_hsu_console;
-extern void hsu_early_console_init(void);
+extern void hsu_early_console_init(const char *);
 
 extern void intel_scu_devices_create(void);
 extern void intel_scu_devices_destroy(void);
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index cd28a35..9d42a52 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -247,7 +247,7 @@ static int __init setup_early_printk(char *buf)
 		}
 
 		if (!strncmp(buf, "hsu", 3)) {
-			hsu_early_console_init();
+			hsu_early_console_init(buf + 3);
 			early_console_register(&early_hsu_console, keep);
 		}
 #endif
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 25bfdbb..3c6e328 100644
--- a/arch/x86/platform/mrst/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -245,16 +245,24 @@ struct console early_mrst_console = {
  * Following is the early console based on Medfield HSU (High
  * Speed UART) device.
  */
-#define HSU_PORT2_PADDR		0xffa28180
+#define HSU_PORT_BASE		0xffa28080
 
 static void __iomem *phsu;
 
-void hsu_early_console_init(void)
+void hsu_early_console_init(const char *s)
 {
+	unsigned long paddr, port = 0;
 	u8 lcr;
 
-	phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
-							HSU_PORT2_PADDR);
+	/*
+	 * Select the early HSU console port if specified by user in the
+	 * kernel command line.
+	 */
+	if (*s && !kstrtoul(s, 10, &port))
+		port = clamp_val(port, 0, 2);
+
+	paddr = HSU_PORT_BASE + port * 0x80;
+	phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr);
 
 	/* Disable FIFO */
 	writeb(0x0, phsu + UART_FCR);
-- 
cgit v1.1


From b7641d2c83aa10031bf45afd82619bfaaedcbc6f Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 11 Nov 2011 15:43:02 -0800
Subject: x86-64, syscall: Adjust comment spacing and remove typo

Adjust spacing for comment so that it matches the multiline comment
style used in the rest of the kernel, and remove word duplication.

It is not really clear what version of gcc this refers to, but the
extra & doesn't cause any harm, so there is no reason to remove it.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/syscall_64.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index de87d60..0edfafa 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -21,9 +21,9 @@ extern void sys_ni_syscall(void);
 
 const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 	/*
-	*Smells like a like a compiler bug -- it doesn't work
-	*when the & below is removed.
-	*/
+	 * Smells like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
 	[0 ... __NR_syscall_max] = &sys_ni_syscall,
 #include <asm/unistd_64.h>
 };
-- 
cgit v1.1


From e79a7fccfb2ab10f8753ac634a1c8473e870ae6c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 11 Nov 2011 15:48:42 -0800
Subject: x86-64, ia32: Move compat_ni_syscall into C and its own file

Move compat_ni_syscall out of ia32entry.S and into its own .c file.
Although this is a trivial function, it is not performance-critical,
and this will simplify further cleanups.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/ia32/Makefile    | 1 +
 arch/x86/ia32/ia32entry.S | 3 ---
 arch/x86/ia32/nosyscall.c | 7 +++++++
 3 files changed, 8 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/ia32/nosyscall.c

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index 52d0ccf..eea9a1c 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,6 +3,7 @@
 #
 
 obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
+obj-$(CONFIG_IA32_EMULATION) += nosyscall.o
 
 sysv-$(CONFIG_SYSVIPC) := ipc32.o
 obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a6253ec..59538a77 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -453,9 +453,6 @@ ia32_badsys:
 	movq $-ENOSYS,%rax
 	jmp ia32_sysret
 
-quiet_ni_syscall:
-	movq $-ENOSYS,%rax
-	ret
 	CFI_ENDPROC
 	
 	.macro PTREGSCALL label, func, arg
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c
new file mode 100644
index 0000000..51ecd5b
--- /dev/null
+++ b/arch/x86/ia32/nosyscall.c
@@ -0,0 +1,7 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+long compat_ni_syscall(void)
+{
+	return -ENOSYS;
+}
-- 
cgit v1.1


From d181764ccf6207e02abb95fb3052639b947f4833 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 11 Nov 2011 15:55:49 -0800
Subject: x86: Machine-readable syscall tables and scripts to process them

Create a simple set of syscall tables and scripts to turn them into
both header files (unistd_*.h) and macros for generating the system
call tables.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/syscalls/Makefile       |  43 +++++
 arch/x86/syscalls/syscall_32.tbl | 357 +++++++++++++++++++++++++++++++++++++++
 arch/x86/syscalls/syscall_64.tbl | 320 +++++++++++++++++++++++++++++++++++
 arch/x86/syscalls/syscallhdr.sh  |  36 ++++
 arch/x86/syscalls/syscalltbl.sh  |  15 ++
 5 files changed, 771 insertions(+)
 create mode 100644 arch/x86/syscalls/Makefile
 create mode 100644 arch/x86/syscalls/syscall_32.tbl
 create mode 100644 arch/x86/syscalls/syscall_64.tbl
 create mode 100644 arch/x86/syscalls/syscallhdr.sh
 create mode 100644 arch/x86/syscalls/syscalltbl.sh

(limited to 'arch/x86')

diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile
new file mode 100644
index 0000000..564b247
--- /dev/null
+++ b/arch/x86/syscalls/Makefile
@@ -0,0 +1,43 @@
+out := $(obj)/../include/generated/asm
+
+# Create output directory if not already present
+_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)')
+
+syscall32 := $(srctree)/$(src)/syscall_32.tbl
+syscall64 := $(srctree)/$(src)/syscall_64.tbl
+
+syshdr := $(srctree)/$(src)/syscallhdr.sh
+systbl := $(srctree)/$(src)/syscalltbl.sh
+
+quiet_cmd_syshdr = SYSHDR  $@
+      cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' $< $@ \
+		   $(syshdr_abi_$(basetarget)) $(syshdr_pfx_$(basetarget))
+quiet_cmd_systbl = SYSTBL  $@
+      cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
+
+syshdr_abi_unistd_32 := i386
+$(out)/unistd_32.h: $(syscall32) $(syshdr)
+	$(call if_changed,syshdr)
+
+syshdr_abi_unistd_32_ia32 := i386
+syshdr_pfx_unistd_32_ia32 := ia32_
+$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr)
+	$(call if_changed,syshdr)
+
+syshdr_abi_unistd_64 := 64
+$(out)/unistd_64.h: $(syscall64) $(syshdr)
+	$(call if_changed,syshdr)
+
+$(out)/syscalls_32.h: $(syscall32) $(systbl)
+	$(call if_changed,systbl)
+$(out)/syscalls_64.h: $(syscall64) $(systbl)
+	$(call if_changed,systbl)
+
+syshdr-y			+= unistd_32.h unistd_64.h
+syshdr-y			+= syscalls_32.h
+syshdr-$(CONFIG_X86_64)		+= unistd_32_ia32.h
+syshdr-$(CONFIG_X86_64)		+= syscalls_64.h
+
+targets	+= $(syshdr-y)
+
+all: $(addprefix $(out)/,$(targets))
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
new file mode 100644
index 0000000..ce98e28
--- /dev/null
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -0,0 +1,357 @@
+#
+# 32-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point> <compat entry point>
+#
+# The abi is always "i386" for this file.
+#
+0	i386	restart_syscall		sys_restart_syscall
+1	i386	exit			sys_exit
+2	i386	fork			ptregs_fork			stub32_fork
+3	i386	read			sys_read
+4	i386	write			sys_write
+5	i386	open			sys_open			compat_sys_open
+6	i386	close			sys_close
+7	i386	waitpid			sys_waitpid			sys32_waitpid
+8	i386	creat			sys_creat
+9	i386	link			sys_link
+10	i386	unlink			sys_unlink
+11	i386	execve			ptregs_execve			stub32_execve
+12	i386	chdir			sys_chdir
+13	i386	time			sys_time			compat_sys_time
+14	i386	mknod			sys_mknod
+15	i386	chmod			sys_chmod
+16	i386	lchown			sys_lchown16
+17	i386	break
+18	i386	oldstat			sys_stat
+19	i386	lseek			sys_lseek			sys32_lseek
+20	i386	getpid			sys_getpid
+21	i386	mount			sys_mount			compat_sys_mount
+22	i386	umount			sys_oldumount
+23	i386	setuid			sys_setuid16
+24	i386	getuid			sys_getuid16
+25	i386	stime			sys_stime			compat_sys_stime
+26	i386	ptrace			sys_ptrace			compat_sys_ptrace
+27	i386	alarm			sys_alarm
+28	i386	oldfstat		sys_fstat
+29	i386	pause			sys_pause
+30	i386	utime			sys_utime			compat_sys_utime
+31	i386	stty
+32	i386	gtty
+33	i386	access			sys_access
+34	i386	nice			sys_nice
+35	i386	ftime
+36	i386	sync			sys_sync
+37	i386	kill			sys_kill			sys32_kill
+38	i386	rename			sys_rename
+39	i386	mkdir			sys_mkdir
+40	i386	rmdir			sys_rmdir
+41	i386	dup			sys_dup
+42	i386	pipe			sys_pipe
+43	i386	times			sys_times			compat_sys_times
+44	i386	prof
+45	i386	brk			sys_brk
+46	i386	setgid			sys_setgid16
+47	i386	getgid			sys_getgid16
+48	i386	signal			sys_signal
+49	i386	geteuid			sys_geteuid16
+50	i386	getegid			sys_getegid16
+51	i386	acct			sys_acct
+52	i386	umount2			sys_umount
+53	i386	lock
+54	i386	ioctl			sys_ioctl			compat_sys_ioctl
+55	i386	fcntl			sys_fcntl			compat_sys_fcntl64
+56	i386	mpx
+57	i386	setpgid			sys_setpgid
+58	i386	ulimit
+59	i386	oldolduname		sys_olduname
+60	i386	umask			sys_umask
+61	i386	chroot			sys_chroot
+62	i386	ustat			sys_ustat			compat_sys_ustat
+63	i386	dup2			sys_dup2
+64	i386	getppid			sys_getppid
+65	i386	getpgrp			sys_getpgrp
+66	i386	setsid			sys_setsid
+67	i386	sigaction		sys_sigaction			sys32_sigaction
+68	i386	sgetmask		sys_sgetmask
+69	i386	ssetmask		sys_ssetmask
+70	i386	setreuid		sys_setreuid16
+71	i386	setregid		sys_setregid16
+72	i386	sigsuspend		sys_sigsuspend			sys32_sigsuspend
+73	i386	sigpending		sys_sigpending			compat_sys_sigpending
+74	i386	sethostname		sys_sethostname
+75	i386	setrlimit		sys_setrlimit			compat_sys_setrlimit
+76	i386	getrlimit		sys_old_getrlimit		compat_sys_old_getrlimit
+77	i386	getrusage		sys_getrusage			compat_sys_getrusage
+78	i386	gettimeofday		sys_gettimeofday		compat_sys_gettimeofday
+79	i386	settimeofday		sys_settimeofday		compat_sys_settimeofday
+80	i386	getgroups		sys_getgroups16
+81	i386	setgroups		sys_setgroups16
+82	i386	select			sys_old_select			compat_sys_old_select
+83	i386	symlink			sys_symlink
+84	i386	oldlstat		sys_lstat
+85	i386	readlink		sys_readlink
+86	i386	uselib			sys_uselib
+87	i386	swapon			sys_swapon
+88	i386	reboot			sys_reboot
+89	i386	readdir			sys_old_readdir			compat_sys_old_readdir
+90	i386	mmap			sys_old_mmap			sys32_mmap
+91	i386	munmap			sys_munmap
+92	i386	truncate		sys_truncate
+93	i386	ftruncate		sys_ftruncate
+94	i386	fchmod			sys_fchmod
+95	i386	fchown			sys_fchown16
+96	i386	getpriority		sys_getpriority
+97	i386	setpriority		sys_setpriority
+98	i386	profil
+99	i386	statfs			sys_statfs			compat_sys_statfs
+100	i386	fstatfs			sys_fstatfs			compat_sys_fstatfs
+101	i386	ioperm			sys_ioperm
+102	i386	socketcall		sys_socketcall			compat_sys_socketcall
+103	i386	syslog			sys_syslog
+104	i386	setitimer		sys_setitimer			compat_sys_setitimer
+105	i386	getitimer		sys_getitimer			compat_sys_getitimer
+106	i386	stat			sys_newstat			compat_sys_newstat
+107	i386	lstat			sys_newlstat			compat_sys_newlstat
+108	i386	fstat			sys_newfstat			compat_sys_newfstat
+109	i386	olduname		sys_uname
+110	i386	iopl			ptregs_iopl			stub32_iopl
+111	i386	vhangup			sys_vhangup
+112	i386	idle
+113	i386	vm86old			ptregs_vm86old			sys32_vm86_warning
+114	i386	wait4			sys_wait4			compat_sys_wait4
+115	i386	swapoff			sys_swapoff
+116	i386	sysinfo			sys_sysinfo			compat_sys_sysinfo
+117	i386	ipc			sys_ipc				sys32_ipc
+118	i386	fsync			sys_fsync
+119	i386	sigreturn		ptregs_sigreturn		stub32_sigreturn
+120	i386	clone			ptregs_clone			stub32_clone
+121	i386	setdomainname		sys_setdomainname
+122	i386	uname			sys_newuname
+123	i386	modify_ldt		sys_modify_ldt
+124	i386	adjtimex		sys_adjtimex			compat_sys_adjtimex
+125	i386	mprotect		sys_mprotect			sys32_mprotect
+126	i386	sigprocmask		sys_sigprocmask			compat_sys_sigprocmask
+127	i386	create_module
+128	i386	init_module		sys_init_module
+129	i386	delete_module		sys_delete_module
+130	i386	get_kernel_syms
+131	i386	quotactl		sys_quotactl			sys32_quotactl
+132	i386	getpgid			sys_getpgid
+133	i386	fchdir			sys_fchdir
+134	i386	bdflush			sys_bdflush
+135	i386	sysfs			sys_sysfs
+136	i386	personality		sys_personality
+137	i386	afs_syscall
+138	i386	setfsuid		sys_setfsuid16
+139	i386	setfsgid		sys_setfsgid16
+140	i386	_llseek			sys_llseek
+141	i386	getdents		sys_getdents			compat_sys_getdents
+142	i386	_newselect		sys_select			compat_sys_select
+143	i386	flock			sys_flock
+144	i386	msync			sys_msync
+145	i386	readv			sys_readv			compat_sys_readv
+146	i386	writev			sys_writev			compat_sys_writev
+147	i386	getsid			sys_getsid
+148	i386	fdatasync		sys_fdatasync
+149	i386	_sysctl			sys_sysctl			compat_sys_sysctl
+150	i386	mlock			sys_mlock
+151	i386	munlock			sys_munlock
+152	i386	mlockall		sys_mlockall
+153	i386	munlockall		sys_munlockall
+154	i386	sched_setparam		sys_sched_setparam
+155	i386	sched_getparam		sys_sched_getparam
+156	i386	sched_setscheduler	sys_sched_setscheduler
+157	i386	sched_getscheduler	sys_sched_getscheduler
+158	i386	sched_yield		sys_sched_yield
+159	i386	sched_get_priority_max	sys_sched_get_priority_max
+160	i386	sched_get_priority_min	sys_sched_get_priority_min
+161	i386	sched_rr_get_interval	sys_sched_rr_get_interval	sys32_sched_rr_get_interval
+162	i386	nanosleep		sys_nanosleep			compat_sys_nanosleep
+163	i386	mremap			sys_mremap
+164	i386	setresuid		sys_setresuid16
+165	i386	getresuid		sys_getresuid16
+166	i386	vm86			ptregs_vm86			sys32_vm86_warning
+167	i386	query_module
+168	i386	poll			sys_poll
+169	i386	nfsservctl
+170	i386	setresgid		sys_setresgid16
+171	i386	getresgid		sys_getresgid16
+172	i386	prctl			sys_prctl
+173	i386	rt_sigreturn		ptregs_rt_sigreturn		stub32_rt_sigreturn
+174	i386	rt_sigaction		sys_rt_sigaction		sys32_rt_sigaction
+175	i386	rt_sigprocmask		sys_rt_sigprocmask		sys32_rt_sigprocmask
+176	i386	rt_sigpending		sys_rt_sigpending		sys32_rt_sigpending
+177	i386	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait
+178	i386	rt_sigqueueinfo		sys_rt_sigqueueinfo		sys32_rt_sigqueueinfo
+179	i386	rt_sigsuspend		sys_rt_sigsuspend
+180	i386	pread64			sys_pread64			sys32_pread
+181	i386	pwrite64		sys_pwrite64			sys32_pwrite
+182	i386	chown			sys_chown16
+183	i386	getcwd			sys_getcwd
+184	i386	capget			sys_capget
+185	i386	capset			sys_capset
+186	i386	sigaltstack		ptregs_sigaltstack		stub32_sigaltstack
+187	i386	sendfile		sys_sendfile			sys32_sendfile
+188	i386	getpmsg
+189	i386	putpmsg
+190	i386	vfork			ptregs_vfork			stub32_vfork
+191	i386	ugetrlimit		sys_getrlimit			compat_sys_getrlimit
+192	i386	mmap2			sys_mmap_pgoff
+193	i386	truncate64		sys_truncate64			sys32_truncate64
+194	i386	ftruncate64		sys_ftruncate64			sys32_ftruncate64
+195	i386	stat64			sys_stat64			sys32_stat64
+196	i386	lstat64			sys_lstat64			sys32_lstat64
+197	i386	fstat64			sys_fstat64			sys32_fstat64
+198	i386	lchown32		sys_lchown
+199	i386	getuid32		sys_getuid
+200	i386	getgid32		sys_getgid
+201	i386	geteuid32		sys_geteuid
+202	i386	getegid32		sys_getegid
+203	i386	setreuid32		sys_setreuid
+204	i386	setregid32		sys_setregid
+205	i386	getgroups32		sys_getgroups
+206	i386	setgroups32		sys_setgroups
+207	i386	fchown32		sys_fchown
+208	i386	setresuid32		sys_setresuid
+209	i386	getresuid32		sys_getresuid
+210	i386	setresgid32		sys_setresgid
+211	i386	getresgid32		sys_getresgid
+212	i386	chown32			sys_chown
+213	i386	setuid32		sys_setuid
+214	i386	setgid32		sys_setgid
+215	i386	setfsuid32		sys_setfsuid
+216	i386	setfsgid32		sys_setfsgid
+217	i386	pivot_root		sys_pivot_root
+218	i386	mincore			sys_mincore
+219	i386	madvise			sys_madvise
+220	i386	getdents64		sys_getdents64			compat_sys_getdents64
+221	i386	fcntl64			sys_fcntl64			compat_sys_fcntl64
+# 222 is unused
+# 223 is unused
+224	i386	gettid			sys_gettid
+225	i386	readahead		sys_readahead			sys32_readahead
+226	i386	setxattr		sys_setxattr
+227	i386	lsetxattr		sys_lsetxattr
+228	i386	fsetxattr		sys_fsetxattr
+229	i386	getxattr		sys_getxattr
+230	i386	lgetxattr		sys_lgetxattr
+231	i386	fgetxattr		sys_fgetxattr
+232	i386	listxattr		sys_listxattr
+233	i386	llistxattr		sys_llistxattr
+234	i386	flistxattr		sys_flistxattr
+235	i386	removexattr		sys_removexattr
+236	i386	lremovexattr		sys_lremovexattr
+237	i386	fremovexattr		sys_fremovexattr
+238	i386	tkill			sys_tkill
+239	i386	sendfile64		sys_sendfile64
+240	i386	futex			sys_futex			compat_sys_futex
+241	i386	sched_setaffinity	sys_sched_setaffinity		compat_sys_sched_setaffinity
+242	i386	sched_getaffinity	sys_sched_getaffinity		compat_sys_sched_getaffinity
+243	i386	set_thread_area		sys_set_thread_area
+244	i386	get_thread_area		sys_get_thread_area
+245	i386	io_setup		sys_io_setup			compat_sys_io_setup
+246	i386	io_destroy		sys_io_destroy
+247	i386	io_getevents		sys_io_getevents		compat_sys_io_getevents
+248	i386	io_submit		sys_io_submit			compat_sys_io_submit
+249	i386	io_cancel		sys_io_cancel
+250	i386	fadvise64		sys_fadvise64			sys32_fadvise64
+# 251 is available for reuse (was briefly sys_set_zone_reclaim)
+252	i386	exit_group		sys_exit_group
+253	i386	lookup_dcookie		sys_lookup_dcookie		sys32_lookup_dcookie
+254	i386	epoll_create		sys_epoll_create
+255	i386	epoll_ctl		sys_epoll_ctl
+256	i386	epoll_wait		sys_epoll_wait
+257	i386	remap_file_pages	sys_remap_file_pages
+258	i386	set_tid_address		sys_set_tid_address
+259	i386	timer_create		sys_timer_create		compat_sys_timer_create
+260	i386	timer_settime		sys_timer_settime		compat_sys_timer_settime
+261	i386	timer_gettime		sys_timer_gettime		compat_sys_timer_gettime
+262	i386	timer_getoverrun	sys_timer_getoverrun
+263	i386	timer_delete		sys_timer_delete
+264	i386	clock_settime		sys_clock_settime		compat_sys_clock_settime
+265	i386	clock_gettime		sys_clock_gettime		compat_sys_clock_gettime
+266	i386	clock_getres		sys_clock_getres		compat_sys_clock_getres
+267	i386	clock_nanosleep		sys_clock_nanosleep		compat_sys_clock_nanosleep
+268	i386	statfs64		sys_statfs64			compat_sys_statfs64
+269	i386	fstatfs64		sys_fstatfs64			compat_sys_fstatfs64
+270	i386	tgkill			sys_tgkill
+271	i386	utimes			sys_utimes			compat_sys_utimes
+272	i386	fadvise64_64		sys_fadvise64_64		sys32_fadvise64_64
+273	i386	vserver
+274	i386	mbind			sys_mbind
+275	i386	get_mempolicy		sys_get_mempolicy		compat_sys_get_mempolicy
+276	i386	set_mempolicy		sys_set_mempolicy
+277	i386	mq_open			sys_mq_open			compat_sys_mq_open
+278	i386	mq_unlink		sys_mq_unlink
+279	i386	mq_timedsend		sys_mq_timedsend		compat_sys_mq_timedsend
+280	i386	mq_timedreceive		sys_mq_timedreceive		compat_sys_mq_timedreceive
+281	i386	mq_notify		sys_mq_notify			compat_sys_mq_notify
+282	i386	mq_getsetaddr		sys_mq_getsetattr		compat_sys_mq_getsetattr
+283	i386	kexec_load		sys_kexec_load			compat_sys_kexec_load
+284	i386	waitid			sys_waitid			compat_sys_waitid
+# 285 sys_setaltroot
+286	i386	add_key			sys_add_key
+287	i386	request_key		sys_request_key
+288	i386	keyctl			sys_keyctl
+289	i386	ioprio_set		sys_ioprio_set
+290	i386	ioprio_get		sys_ioprio_get
+291	i386	inotify_init		sys_inotify_init
+292	i386	inotify_add_watch	sys_inotify_add_watch
+293	i386	inotify_rm_watch	sys_inotify_rm_watch
+294	i386	migrate_pages		sys_migrate_pages
+295	i386	openat			sys_openat			compat_sys_openat
+296	i386	mkdirat			sys_mkdirat
+297	i386	mknodat			sys_mknodat
+298	i386	fchownat		sys_fchownat
+299	i386	futimesat		sys_futimesat			compat_sys_futimesat
+300	i386	fstatat64		sys_fstatat64			sys32_fstatat
+301	i386	unlinkat		sys_unlinkat
+302	i386	renameat		sys_renameat
+303	i386	linkat			sys_linkat
+304	i386	symlinkat		sys_symlinkat
+305	i386	readlinkat		sys_readlinkat
+306	i386	fchmodat		sys_fchmodat
+307	i386	faccessat		sys_faccessat
+308	i386	pselect6		sys_pselect6			compat_sys_pselect6
+309	i386	ppoll			sys_ppoll			compat_sys_ppoll
+310	i386	unshare			sys_unshare
+311	i386	set_robust_list		sys_set_robust_list		compat_sys_set_robust_list
+312	i386	get_robust_list		sys_get_robust_list		compat_sys_get_robust_list
+313	i386	splice			sys_splice
+314	i386	sync_file_range		sys_sync_file_range		sys32_sync_file_range
+315	i386	tee			sys_tee
+316	i386	vmsplice		sys_vmsplice			compat_sys_vmsplice
+317	i386	move_pages		sys_move_pages			compat_sys_move_pages
+318	i386	getcpu			sys_getcpu
+319	i386	epoll_pwait		sys_epoll_pwait
+320	i386	utimensat		sys_utimensat			compat_sys_utimensat
+321	i386	signalfd		sys_signalfd			compat_sys_signalfd
+322	i386	timerfd_create		sys_timerfd_create
+323	i386	eventfd			sys_eventfd
+324	i386	fallocate		sys_fallocate			sys32_fallocate
+325	i386	timerfd_settime		sys_timerfd_settime		compat_sys_timerfd_settime
+326	i386	timerfd_gettime		sys_timerfd_gettime		compat_sys_timerfd_gettime
+327	i386	signalfd4		sys_signalfd4			compat_sys_signalfd4
+328	i386	eventfd2		sys_eventfd2
+329	i386	epoll_create1		sys_epoll_create1
+330	i386	dup3			sys_dup3
+331	i386	pipe2			sys_pipe2
+332	i386	inotify_init1		sys_inotify_init1
+333	i386	preadv			sys_preadv			compat_sys_preadv
+334	i386	pwritev			sys_pwritev			compat_sys_pwritev
+335	i386	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo		compat_sys_rt_tgsigqueueinfo
+336	i386	perf_event_open		sys_perf_event_open
+337	i386	recvmmsg		sys_recvmmsg			compat_sys_recvmmsg
+338	i386	fanotify_init		sys_fanotify_init
+339	i386	fanotify_mark		sys_fanotify_mark		sys32_fanotify_mark
+340	i386	prlimit64		sys_prlimit64
+341	i386	name_to_handle_at	sys_name_to_handle_at
+342	i386	open_by_handle_at	sys_open_by_handle_at		compat_sys_open_by_handle_at
+343	i386	clock_adjtime		sys_clock_adjtime		compat_sys_clock_adjtime
+344	i386	syncfs			sys_syncfs
+345	i386	sendmmsg		sys_sendmmsg			compat_sys_sendmmsg
+346	i386	setns			sys_setns
+347	i386	process_vm_readv	sys_process_vm_readv		compat_sys_process_vm_readv
+348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
new file mode 100644
index 0000000..b440a8f
--- /dev/null
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -0,0 +1,320 @@
+#
+# 64-bit system call numbers and entry vectors
+#
+# The format is:
+# <number> <abi> <name> <entry point>
+#
+# The abi is always "64" for this file (for now.)
+#
+0	64	read			sys_read
+1	64	write			sys_write
+2	64	open			sys_open
+3	64	close			sys_close
+4	64	stat			sys_newstat
+5	64	fstat			sys_newfstat
+6	64	lstat			sys_newlstat
+7	64	poll			sys_poll
+8	64	lseek			sys_lseek
+9	64	mmap			sys_mmap
+10	64	mprotect		sys_mprotect
+11	64	munmap			sys_munmap
+12	64	brk			sys_brk
+13	64	rt_sigaction		sys_rt_sigaction
+14	64	rt_sigprocmask		sys_rt_sigprocmask
+15	64	rt_sigreturn		stub_rt_sigreturn
+16	64	ioctl			sys_ioctl
+17	64	pread64			sys_pread64
+18	64	pwrite64		sys_pwrite64
+19	64	readv			sys_readv
+20	64	writev			sys_writev
+21	64	access			sys_access
+22	64	pipe			sys_pipe
+23	64	select			sys_select
+24	64	sched_yield		sys_sched_yield
+25	64	mremap			sys_mremap
+26	64	msync			sys_msync
+27	64	mincore			sys_mincore
+28	64	madvise			sys_madvise
+29	64	shmget			sys_shmget
+30	64	shmat			sys_shmat
+31	64	shmctl			sys_shmctl
+32	64	dup			sys_dup
+33	64	dup2			sys_dup2
+34	64	pause			sys_pause
+35	64	nanosleep		sys_nanosleep
+36	64	getitimer		sys_getitimer
+37	64	alarm			sys_alarm
+38	64	setitimer		sys_setitimer
+39	64	getpid			sys_getpid
+40	64	sendfile		sys_sendfile64
+41	64	socket			sys_socket
+42	64	connect			sys_connect
+43	64	accept			sys_accept
+44	64	sendto			sys_sendto
+45	64	recvfrom		sys_recvfrom
+46	64	sendmsg			sys_sendmsg
+47	64	recvmsg			sys_recvmsg
+48	64	shutdown		sys_shutdown
+49	64	bind			sys_bind
+50	64	listen			sys_listen
+51	64	getsockname		sys_getsockname
+52	64	getpeername		sys_getpeername
+53	64	socketpair		sys_socketpair
+54	64	setsockopt		sys_setsockopt
+55	64	getsockopt		sys_getsockopt
+56	64	clone			stub_clone
+57	64	fork			stub_fork
+58	64	vfork			stub_vfork
+59	64	execve			stub_execve
+60	64	exit			sys_exit
+61	64	wait4			sys_wait4
+62	64	kill			sys_kill
+63	64	uname			sys_newuname
+64	64	semget			sys_semget
+65	64	semop			sys_semop
+66	64	semctl			sys_semctl
+67	64	shmdt			sys_shmdt
+68	64	msgget			sys_msgget
+69	64	msgsnd			sys_msgsnd
+70	64	msgrcv			sys_msgrcv
+71	64	msgctl			sys_msgctl
+72	64	fcntl			sys_fcntl
+73	64	flock			sys_flock
+74	64	fsync			sys_fsync
+75	64	fdatasync		sys_fdatasync
+76	64	truncate		sys_truncate
+77	64	ftruncate		sys_ftruncate
+78	64	getdents		sys_getdents
+79	64	getcwd			sys_getcwd
+80	64	chdir			sys_chdir
+81	64	fchdir			sys_fchdir
+82	64	rename			sys_rename
+83	64	mkdir			sys_mkdir
+84	64	rmdir			sys_rmdir
+85	64	creat			sys_creat
+86	64	link			sys_link
+87	64	unlink			sys_unlink
+88	64	symlink			sys_symlink
+89	64	readlink		sys_readlink
+90	64	chmod			sys_chmod
+91	64	fchmod			sys_fchmod
+92	64	chown			sys_chown
+93	64	fchown			sys_fchown
+94	64	lchown			sys_lchown
+95	64	umask			sys_umask
+96	64	gettimeofday		sys_gettimeofday
+97	64	getrlimit		sys_getrlimit
+98	64	getrusage		sys_getrusage
+99	64	sysinfo			sys_sysinfo
+100	64	times			sys_times
+101	64	ptrace			sys_ptrace
+102	64	getuid			sys_getuid
+103	64	syslog			sys_syslog
+104	64	getgid			sys_getgid
+105	64	setuid			sys_setuid
+106	64	setgid			sys_setgid
+107	64	geteuid			sys_geteuid
+108	64	getegid			sys_getegid
+109	64	setpgid			sys_setpgid
+110	64	getppid			sys_getppid
+111	64	getpgrp			sys_getpgrp
+112	64	setsid			sys_setsid
+113	64	setreuid		sys_setreuid
+114	64	setregid		sys_setregid
+115	64	getgroups		sys_getgroups
+116	64	setgroups		sys_setgroups
+117	64	setresuid		sys_setresuid
+118	64	getresuid		sys_getresuid
+119	64	setresgid		sys_setresgid
+120	64	getresgid		sys_getresgid
+121	64	getpgid			sys_getpgid
+122	64	setfsuid		sys_setfsuid
+123	64	setfsgid		sys_setfsgid
+124	64	getsid			sys_getsid
+125	64	capget			sys_capget
+126	64	capset			sys_capset
+127	64	rt_sigpending		sys_rt_sigpending
+128	64	rt_sigtimedwait		sys_rt_sigtimedwait
+129	64	rt_sigqueueinfo		sys_rt_sigqueueinfo
+130	64	rt_sigsuspend		sys_rt_sigsuspend
+131	64	sigaltstack		stub_sigaltstack
+132	64	utime			sys_utime
+133	64	mknod			sys_mknod
+134	64	uselib
+135	64	personality		sys_personality
+136	64	ustat			sys_ustat
+137	64	statfs			sys_statfs
+138	64	fstatfs			sys_fstatfs
+139	64	sysfs			sys_sysfs
+140	64	getpriority		sys_getpriority
+141	64	setpriority		sys_setpriority
+142	64	sched_setparam		sys_sched_setparam
+143	64	sched_getparam		sys_sched_getparam
+144	64	sched_setscheduler	sys_sched_setscheduler
+145	64	sched_getscheduler	sys_sched_getscheduler
+146	64	sched_get_priority_max	sys_sched_get_priority_max
+147	64	sched_get_priority_min	sys_sched_get_priority_min
+148	64	sched_rr_get_interval	sys_sched_rr_get_interval
+149	64	mlock			sys_mlock
+150	64	munlock			sys_munlock
+151	64	mlockall		sys_mlockall
+152	64	munlockall		sys_munlockall
+153	64	vhangup			sys_vhangup
+154	64	modify_ldt		sys_modify_ldt
+155	64	pivot_root		sys_pivot_root
+156	64	_sysctl			sys_sysctl
+157	64	prctl			sys_prctl
+158	64	arch_prctl		sys_arch_prctl
+159	64	adjtimex		sys_adjtimex
+160	64	setrlimit		sys_setrlimit
+161	64	chroot			sys_chroot
+162	64	sync			sys_sync
+163	64	acct			sys_acct
+164	64	settimeofday		sys_settimeofday
+165	64	mount			sys_mount
+166	64	umount2			sys_umount
+167	64	swapon			sys_swapon
+168	64	swapoff			sys_swapoff
+169	64	reboot			sys_reboot
+170	64	sethostname		sys_sethostname
+171	64	setdomainname		sys_setdomainname
+172	64	iopl			stub_iopl
+173	64	ioperm			sys_ioperm
+174	64	create_module
+175	64	init_module		sys_init_module
+176	64	delete_module		sys_delete_module
+177	64	get_kernel_syms
+178	64	query_module
+179	64	quotactl		sys_quotactl
+180	64	nfsservctl
+181	64	getpmsg
+182	64	putpmsg
+183	64	afs_syscall
+184	64	tuxcall
+185	64	security
+186	64	gettid			sys_gettid
+187	64	readahead		sys_readahead
+188	64	setxattr		sys_setxattr
+189	64	lsetxattr		sys_lsetxattr
+190	64	fsetxattr		sys_fsetxattr
+191	64	getxattr		sys_getxattr
+192	64	lgetxattr		sys_lgetxattr
+193	64	fgetxattr		sys_fgetxattr
+194	64	listxattr		sys_listxattr
+195	64	llistxattr		sys_llistxattr
+196	64	flistxattr		sys_flistxattr
+197	64	removexattr		sys_removexattr
+198	64	lremovexattr		sys_lremovexattr
+199	64	fremovexattr		sys_fremovexattr
+200	64	tkill			sys_tkill
+201	64	time			sys_time
+202	64	futex			sys_futex
+203	64	sched_setaffinity	sys_sched_setaffinity
+204	64	sched_getaffinity	sys_sched_getaffinity
+205	64	set_thread_area
+206	64	io_setup		sys_io_setup
+207	64	io_destroy		sys_io_destroy
+208	64	io_getevents		sys_io_getevents
+209	64	io_submit		sys_io_submit
+210	64	io_cancel		sys_io_cancel
+211	64	get_thread_area
+212	64	lookup_dcookie		sys_lookup_dcookie
+213	64	epoll_create		sys_epoll_create
+214	64	epoll_ctl_old
+215	64	epoll_wait_old
+216	64	remap_file_pages	sys_remap_file_pages
+217	64	getdents64		sys_getdents64
+218	64	set_tid_address		sys_set_tid_address
+219	64	restart_syscall		sys_restart_syscall
+220	64	semtimedop		sys_semtimedop
+221	64	fadvise64		sys_fadvise64
+222	64	timer_create		sys_timer_create
+223	64	timer_settime		sys_timer_settime
+224	64	timer_gettime		sys_timer_gettime
+225	64	timer_getoverrun	sys_timer_getoverrun
+226	64	timer_delete		sys_timer_delete
+227	64	clock_settime		sys_clock_settime
+228	64	clock_gettime		sys_clock_gettime
+229	64	clock_getres		sys_clock_getres
+230	64	clock_nanosleep		sys_clock_nanosleep
+231	64	exit_group		sys_exit_group
+232	64	epoll_wait		sys_epoll_wait
+233	64	epoll_ctl		sys_epoll_ctl
+234	64	tgkill			sys_tgkill
+235	64	utimes			sys_utimes
+236	64	vserver
+237	64	mbind			sys_mbind
+238	64	set_mempolicy		sys_set_mempolicy
+239	64	get_mempolicy		sys_get_mempolicy
+240	64	mq_open			sys_mq_open
+241	64	mq_unlink		sys_mq_unlink
+242	64	mq_timedsend		sys_mq_timedsend
+243	64	mq_timedreceive		sys_mq_timedreceive
+244	64	mq_notify		sys_mq_notify
+245	64	mq_getsetattr		sys_mq_getsetattr
+246	64	kexec_load		sys_kexec_load
+247	64	waitid			sys_waitid
+248	64	add_key			sys_add_key
+249	64	request_key		sys_request_key
+250	64	keyctl			sys_keyctl
+251	64	ioprio_set		sys_ioprio_set
+252	64	ioprio_get		sys_ioprio_get
+253	64	inotify_init		sys_inotify_init
+254	64	inotify_add_watch	sys_inotify_add_watch
+255	64	inotify_rm_watch	sys_inotify_rm_watch
+256	64	migrate_pages		sys_migrate_pages
+257	64	openat			sys_openat
+258	64	mkdirat			sys_mkdirat
+259	64	mknodat			sys_mknodat
+260	64	fchownat		sys_fchownat
+261	64	futimesat		sys_futimesat
+262	64	newfstatat		sys_newfstatat
+263	64	unlinkat		sys_unlinkat
+264	64	renameat		sys_renameat
+265	64	linkat			sys_linkat
+266	64	symlinkat		sys_symlinkat
+267	64	readlinkat		sys_readlinkat
+268	64	fchmodat		sys_fchmodat
+269	64	faccessat		sys_faccessat
+270	64	pselect6		sys_pselect6
+271	64	ppoll			sys_ppoll
+272	64	unshare			sys_unshare
+273	64	set_robust_list		sys_set_robust_list
+274	64	get_robust_list		sys_get_robust_list
+275	64	splice			sys_splice
+276	64	tee			sys_tee
+277	64	sync_file_range		sys_sync_file_range
+278	64	vmsplice		sys_vmsplice
+279	64	move_pages		sys_move_pages
+280	64	utimensat		sys_utimensat
+281	64	epoll_pwait		sys_epoll_pwait
+282	64	signalfd		sys_signalfd
+283	64	timerfd_create		sys_timerfd_create
+284	64	eventfd			sys_eventfd
+285	64	fallocate		sys_fallocate
+286	64	timerfd_settime		sys_timerfd_settime
+287	64	timerfd_gettime		sys_timerfd_gettime
+288	64	accept4			sys_accept4
+289	64	signalfd4		sys_signalfd4
+290	64	eventfd2		sys_eventfd2
+291	64	epoll_create1		sys_epoll_create1
+292	64	dup3			sys_dup3
+293	64	pipe2			sys_pipe2
+294	64	inotify_init1		sys_inotify_init1
+295	64	preadv			sys_preadv
+296	64	pwritev			sys_pwritev
+297	64	rt_tgsigqueueinfo	sys_rt_tgsigqueueinfo
+298	64	perf_event_open		sys_perf_event_open
+299	64	recvmmsg		sys_recvmmsg
+300	64	fanotify_init		sys_fanotify_init
+301	64	fanotify_mark		sys_fanotify_mark
+302	64	prlimit64		sys_prlimit64
+303	64	name_to_handle_at	sys_name_to_handle_at
+304	64	open_by_handle_at	sys_open_by_handle_at
+305	64	clock_adjtime		sys_clock_adjtime
+306	64	syncfs			sys_syncfs
+307	64	sendmmsg		sys_sendmmsg
+308	64	setns			sys_setns
+309	64	getcpu			sys_getcpu
+310	64	process_vm_readv	sys_process_vm_readv
+311	64	process_vm_writev	sys_process_vm_writev
diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh
new file mode 100644
index 0000000..0d473ff
--- /dev/null
+++ b/arch/x86/syscalls/syscallhdr.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+in="$1"
+out="$2"
+my_abis=`echo "$3" | tr ',' ' '`
+prefix="$4"
+offset="$5"
+
+fileguard=_ASM_X86_`basename "$out" | sed \
+    -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
+    -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
+
+in_list () {
+    local x
+    for x in $1; do
+	if [ x"$x" = x"$2" ]; then
+	    return 0
+	fi
+    done
+    return 1
+}
+
+grep '^[0-9]' "$in" | sort -n | (
+    echo "#ifndef ${fileguard}"
+    echo "#define ${fileguard} 1"
+    echo ""
+
+    while read nr abi name entry ; do
+	if in_list "$my_abis" "$abi"; then
+	    echo "#define __NR_${prefix}${name}" $((nr+offset))
+        fi
+    done
+
+    echo ""
+    echo "#endif /* ${fileguard} */"
+) > "$out"
diff --git a/arch/x86/syscalls/syscalltbl.sh b/arch/x86/syscalls/syscalltbl.sh
new file mode 100644
index 0000000..0e7f8ec
--- /dev/null
+++ b/arch/x86/syscalls/syscalltbl.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+in="$1"
+out="$2"
+
+grep '^[0-9]' "$in" | sort -n | (
+    while read nr abi name entry compat; do
+	abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
+	if [ -n "$compat" ]; then
+	    echo "__SYSCALL_${abi}($nr, $entry, $compat)"
+	elif [ -n "$entry" ]; then
+	    echo "__SYSCALL_${abi}($nr, $entry, $entry)"
+	fi
+    done
+) > "$out"
-- 
cgit v1.1


From 303395ac3bf3e2cb488435537d416bc840438fcb Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 11 Nov 2011 16:07:41 -0800
Subject: x86: Generate system call tables and unistd_*.h from tables

Generate system call tables and unistd_*.h automatically from the
tables in arch/x86/syscalls.  All other information, like NR_syscalls,
is auto-generated, some of which is in asm-offsets_*.c.

This allows us to keep all the system call information in one place,
and allows for kernel space and user space to see different
information; this is currently used for the ia32 system call numbers
when building the 64-bit kernel, but will be used by the x32 ABI in
the near future.

This also removes some gratuitious differences between i386, x86-64
and ia32; in particular, now all system call tables are generated with
the same mechanism.

Cc: H. J. Lu <hjl.tools@gmail.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Michal Marek <mmarek@suse.cz>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/Makefile                  |   6 +
 arch/x86/ia32/Makefile             |   2 +-
 arch/x86/ia32/ia32entry.S          | 356 ------------------
 arch/x86/ia32/syscall_ia32.c       |  25 ++
 arch/x86/include/asm/Kbuild        |   5 +-
 arch/x86/include/asm/ia32_unistd.h |  13 +-
 arch/x86/include/asm/unistd.h      |  54 ++-
 arch/x86/include/asm/unistd_32.h   | 401 --------------------
 arch/x86/include/asm/unistd_64.h   | 732 -------------------------------------
 arch/x86/kernel/Makefile           |   3 +-
 arch/x86/kernel/asm-offsets_32.c   |   8 +
 arch/x86/kernel/asm-offsets_64.c   |  19 +-
 arch/x86/kernel/entry_32.S         |  37 +-
 arch/x86/kernel/syscall_32.c       |  25 ++
 arch/x86/kernel/syscall_64.c       |  14 +-
 arch/x86/kernel/syscall_table_32.S | 350 ------------------
 16 files changed, 154 insertions(+), 1896 deletions(-)
 create mode 100644 arch/x86/ia32/syscall_ia32.c
 delete mode 100644 arch/x86/include/asm/unistd_32.h
 delete mode 100644 arch/x86/include/asm/unistd_64.h
 create mode 100644 arch/x86/kernel/syscall_32.c
 delete mode 100644 arch/x86/kernel/syscall_table_32.S

(limited to 'arch/x86')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b02e509..209ba12 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -118,6 +118,12 @@ KBUILD_CFLAGS += $(mflags-y)
 KBUILD_AFLAGS += $(mflags-y)
 
 ###
+# Syscall table generation
+
+archheaders:
+	$(Q)$(MAKE) $(build)=arch/x86/syscalls all
+
+###
 # Kernel objects
 
 head-y := arch/x86/kernel/head_$(BITS).o
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index eea9a1c..455646e 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
-obj-$(CONFIG_IA32_EMULATION) += nosyscall.o
+obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o
 
 sysv-$(CONFIG_SYSVIPC) := ipc32.o
 obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 59538a77..72f853a 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -27,8 +27,6 @@
 
 	.section .entry.text, "ax"
 
-#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
-
 	.macro IA32_ARG_FIXUP noebp=0
 	movl	%edi,%r8d
 	.if \noebp
@@ -496,357 +494,3 @@ ENTRY(ia32_ptregs_common)
 	jmp  ia32_sysret	/* misbalances the return cache */
 	CFI_ENDPROC
 END(ia32_ptregs_common)
-
-	.section .rodata,"a"
-	.align 8
-ia32_sys_call_table:
-	.quad sys_restart_syscall
-	.quad sys_exit
-	.quad stub32_fork
-	.quad sys_read
-	.quad sys_write
-	.quad compat_sys_open		/* 5 */
-	.quad sys_close
-	.quad sys32_waitpid
-	.quad sys_creat
-	.quad sys_link
-	.quad sys_unlink		/* 10 */
-	.quad stub32_execve
-	.quad sys_chdir
-	.quad compat_sys_time
-	.quad sys_mknod
-	.quad sys_chmod		/* 15 */
-	.quad sys_lchown16
-	.quad quiet_ni_syscall			/* old break syscall holder */
-	.quad sys_stat
-	.quad sys32_lseek
-	.quad sys_getpid		/* 20 */
-	.quad compat_sys_mount	/* mount  */
-	.quad sys_oldumount	/* old_umount  */
-	.quad sys_setuid16
-	.quad sys_getuid16
-	.quad compat_sys_stime	/* stime */		/* 25 */
-	.quad compat_sys_ptrace	/* ptrace */
-	.quad sys_alarm
-	.quad sys_fstat	/* (old)fstat */
-	.quad sys_pause
-	.quad compat_sys_utime	/* 30 */
-	.quad quiet_ni_syscall	/* old stty syscall holder */
-	.quad quiet_ni_syscall	/* old gtty syscall holder */
-	.quad sys_access
-	.quad sys_nice	
-	.quad quiet_ni_syscall	/* 35 */	/* old ftime syscall holder */
-	.quad sys_sync
-	.quad sys32_kill
-	.quad sys_rename
-	.quad sys_mkdir
-	.quad sys_rmdir		/* 40 */
-	.quad sys_dup
-	.quad sys_pipe
-	.quad compat_sys_times
-	.quad quiet_ni_syscall			/* old prof syscall holder */
-	.quad sys_brk		/* 45 */
-	.quad sys_setgid16
-	.quad sys_getgid16
-	.quad sys_signal
-	.quad sys_geteuid16
-	.quad sys_getegid16	/* 50 */
-	.quad sys_acct
-	.quad sys_umount			/* new_umount */
-	.quad quiet_ni_syscall			/* old lock syscall holder */
-	.quad compat_sys_ioctl
-	.quad compat_sys_fcntl64		/* 55 */
-	.quad quiet_ni_syscall			/* old mpx syscall holder */
-	.quad sys_setpgid
-	.quad quiet_ni_syscall			/* old ulimit syscall holder */
-	.quad sys_olduname
-	.quad sys_umask		/* 60 */
-	.quad sys_chroot
-	.quad compat_sys_ustat
-	.quad sys_dup2
-	.quad sys_getppid
-	.quad sys_getpgrp		/* 65 */
-	.quad sys_setsid
-	.quad sys32_sigaction
-	.quad sys_sgetmask
-	.quad sys_ssetmask
-	.quad sys_setreuid16	/* 70 */
-	.quad sys_setregid16
-	.quad sys32_sigsuspend
-	.quad compat_sys_sigpending
-	.quad sys_sethostname
-	.quad compat_sys_setrlimit	/* 75 */
-	.quad compat_sys_old_getrlimit	/* old_getrlimit */
-	.quad compat_sys_getrusage
-	.quad compat_sys_gettimeofday
-	.quad compat_sys_settimeofday
-	.quad sys_getgroups16	/* 80 */
-	.quad sys_setgroups16
-	.quad compat_sys_old_select
-	.quad sys_symlink
-	.quad sys_lstat
-	.quad sys_readlink		/* 85 */
-	.quad sys_uselib
-	.quad sys_swapon
-	.quad sys_reboot
-	.quad compat_sys_old_readdir
-	.quad sys32_mmap		/* 90 */
-	.quad sys_munmap
-	.quad sys_truncate
-	.quad sys_ftruncate
-	.quad sys_fchmod
-	.quad sys_fchown16		/* 95 */
-	.quad sys_getpriority
-	.quad sys_setpriority
-	.quad quiet_ni_syscall			/* old profil syscall holder */
-	.quad compat_sys_statfs
-	.quad compat_sys_fstatfs		/* 100 */
-	.quad sys_ioperm
-	.quad compat_sys_socketcall
-	.quad sys_syslog
-	.quad compat_sys_setitimer
-	.quad compat_sys_getitimer	/* 105 */
-	.quad compat_sys_newstat
-	.quad compat_sys_newlstat
-	.quad compat_sys_newfstat
-	.quad sys_uname
-	.quad stub32_iopl		/* 110 */
-	.quad sys_vhangup
-	.quad quiet_ni_syscall	/* old "idle" system call */
-	.quad sys32_vm86_warning	/* vm86old */ 
-	.quad compat_sys_wait4
-	.quad sys_swapoff		/* 115 */
-	.quad compat_sys_sysinfo
-	.quad sys32_ipc
-	.quad sys_fsync
-	.quad stub32_sigreturn
-	.quad stub32_clone		/* 120 */
-	.quad sys_setdomainname
-	.quad sys_newuname
-	.quad sys_modify_ldt
-	.quad compat_sys_adjtimex
-	.quad sys32_mprotect		/* 125 */
-	.quad compat_sys_sigprocmask
-	.quad quiet_ni_syscall		/* create_module */
-	.quad sys_init_module
-	.quad sys_delete_module
-	.quad quiet_ni_syscall		/* 130  get_kernel_syms */
-	.quad sys32_quotactl
-	.quad sys_getpgid
-	.quad sys_fchdir
-	.quad quiet_ni_syscall	/* bdflush */
-	.quad sys_sysfs		/* 135 */
-	.quad sys_personality
-	.quad quiet_ni_syscall	/* for afs_syscall */
-	.quad sys_setfsuid16
-	.quad sys_setfsgid16
-	.quad sys_llseek		/* 140 */
-	.quad compat_sys_getdents
-	.quad compat_sys_select
-	.quad sys_flock
-	.quad sys_msync
-	.quad compat_sys_readv		/* 145 */
-	.quad compat_sys_writev
-	.quad sys_getsid
-	.quad sys_fdatasync
-	.quad compat_sys_sysctl	/* sysctl */
-	.quad sys_mlock		/* 150 */
-	.quad sys_munlock
-	.quad sys_mlockall
-	.quad sys_munlockall
-	.quad sys_sched_setparam
-	.quad sys_sched_getparam   /* 155 */
-	.quad sys_sched_setscheduler
-	.quad sys_sched_getscheduler
-	.quad sys_sched_yield
-	.quad sys_sched_get_priority_max
-	.quad sys_sched_get_priority_min  /* 160 */
-	.quad sys32_sched_rr_get_interval
-	.quad compat_sys_nanosleep
-	.quad sys_mremap
-	.quad sys_setresuid16
-	.quad sys_getresuid16	/* 165 */
-	.quad sys32_vm86_warning	/* vm86 */ 
-	.quad quiet_ni_syscall	/* query_module */
-	.quad sys_poll
-	.quad quiet_ni_syscall /* old nfsservctl */
-	.quad sys_setresgid16	/* 170 */
-	.quad sys_getresgid16
-	.quad sys_prctl
-	.quad stub32_rt_sigreturn
-	.quad sys32_rt_sigaction
-	.quad sys32_rt_sigprocmask	/* 175 */
-	.quad sys32_rt_sigpending
-	.quad compat_sys_rt_sigtimedwait
-	.quad sys32_rt_sigqueueinfo
-	.quad sys_rt_sigsuspend
-	.quad sys32_pread		/* 180 */
-	.quad sys32_pwrite
-	.quad sys_chown16
-	.quad sys_getcwd
-	.quad sys_capget
-	.quad sys_capset
-	.quad stub32_sigaltstack
-	.quad sys32_sendfile
-	.quad quiet_ni_syscall		/* streams1 */
-	.quad quiet_ni_syscall		/* streams2 */
-	.quad stub32_vfork            /* 190 */
-	.quad compat_sys_getrlimit
-	.quad sys_mmap_pgoff
-	.quad sys32_truncate64
-	.quad sys32_ftruncate64
-	.quad sys32_stat64		/* 195 */
-	.quad sys32_lstat64
-	.quad sys32_fstat64
-	.quad sys_lchown
-	.quad sys_getuid
-	.quad sys_getgid		/* 200 */
-	.quad sys_geteuid
-	.quad sys_getegid
-	.quad sys_setreuid
-	.quad sys_setregid
-	.quad sys_getgroups	/* 205 */
-	.quad sys_setgroups
-	.quad sys_fchown
-	.quad sys_setresuid
-	.quad sys_getresuid
-	.quad sys_setresgid	/* 210 */
-	.quad sys_getresgid
-	.quad sys_chown
-	.quad sys_setuid
-	.quad sys_setgid
-	.quad sys_setfsuid		/* 215 */
-	.quad sys_setfsgid
-	.quad sys_pivot_root
-	.quad sys_mincore
-	.quad sys_madvise
-	.quad compat_sys_getdents64	/* 220 getdents64 */
-	.quad compat_sys_fcntl64	
-	.quad quiet_ni_syscall		/* tux */
-	.quad quiet_ni_syscall    	/* security */
-	.quad sys_gettid	
-	.quad sys32_readahead	/* 225 */
-	.quad sys_setxattr
-	.quad sys_lsetxattr
-	.quad sys_fsetxattr
-	.quad sys_getxattr
-	.quad sys_lgetxattr	/* 230 */
-	.quad sys_fgetxattr
-	.quad sys_listxattr
-	.quad sys_llistxattr
-	.quad sys_flistxattr
-	.quad sys_removexattr	/* 235 */
-	.quad sys_lremovexattr
-	.quad sys_fremovexattr
-	.quad sys_tkill
-	.quad sys_sendfile64 
-	.quad compat_sys_futex		/* 240 */
-	.quad compat_sys_sched_setaffinity
-	.quad compat_sys_sched_getaffinity
-	.quad sys_set_thread_area
-	.quad sys_get_thread_area
-	.quad compat_sys_io_setup	/* 245 */
-	.quad sys_io_destroy
-	.quad compat_sys_io_getevents
-	.quad compat_sys_io_submit
-	.quad sys_io_cancel
-	.quad sys32_fadvise64		/* 250 */
-	.quad quiet_ni_syscall 	/* free_huge_pages */
-	.quad sys_exit_group
-	.quad sys32_lookup_dcookie
-	.quad sys_epoll_create
-	.quad sys_epoll_ctl		/* 255 */
-	.quad sys_epoll_wait
-	.quad sys_remap_file_pages
-	.quad sys_set_tid_address
-	.quad compat_sys_timer_create
-	.quad compat_sys_timer_settime	/* 260 */
-	.quad compat_sys_timer_gettime
-	.quad sys_timer_getoverrun
-	.quad sys_timer_delete
-	.quad compat_sys_clock_settime
-	.quad compat_sys_clock_gettime	/* 265 */
-	.quad compat_sys_clock_getres
-	.quad compat_sys_clock_nanosleep
-	.quad compat_sys_statfs64
-	.quad compat_sys_fstatfs64
-	.quad sys_tgkill		/* 270 */
-	.quad compat_sys_utimes
-	.quad sys32_fadvise64_64
-	.quad quiet_ni_syscall	/* sys_vserver */
-	.quad sys_mbind
-	.quad compat_sys_get_mempolicy	/* 275 */
-	.quad sys_set_mempolicy
-	.quad compat_sys_mq_open
-	.quad sys_mq_unlink
-	.quad compat_sys_mq_timedsend
-	.quad compat_sys_mq_timedreceive	/* 280 */
-	.quad compat_sys_mq_notify
-	.quad compat_sys_mq_getsetattr
-	.quad compat_sys_kexec_load	/* reserved for kexec */
-	.quad compat_sys_waitid
-	.quad quiet_ni_syscall		/* 285: sys_altroot */
-	.quad sys_add_key
-	.quad sys_request_key
-	.quad sys_keyctl
-	.quad sys_ioprio_set
-	.quad sys_ioprio_get		/* 290 */
-	.quad sys_inotify_init
-	.quad sys_inotify_add_watch
-	.quad sys_inotify_rm_watch
-	.quad sys_migrate_pages
-	.quad compat_sys_openat		/* 295 */
-	.quad sys_mkdirat
-	.quad sys_mknodat
-	.quad sys_fchownat
-	.quad compat_sys_futimesat
-	.quad sys32_fstatat		/* 300 */
-	.quad sys_unlinkat
-	.quad sys_renameat
-	.quad sys_linkat
-	.quad sys_symlinkat
-	.quad sys_readlinkat		/* 305 */
-	.quad sys_fchmodat
-	.quad sys_faccessat
-	.quad compat_sys_pselect6
-	.quad compat_sys_ppoll
-	.quad sys_unshare		/* 310 */
-	.quad compat_sys_set_robust_list
-	.quad compat_sys_get_robust_list
-	.quad sys_splice
-	.quad sys32_sync_file_range
-	.quad sys_tee			/* 315 */
-	.quad compat_sys_vmsplice
-	.quad compat_sys_move_pages
-	.quad sys_getcpu
-	.quad sys_epoll_pwait
-	.quad compat_sys_utimensat	/* 320 */
-	.quad compat_sys_signalfd
-	.quad sys_timerfd_create
-	.quad sys_eventfd
-	.quad sys32_fallocate
-	.quad compat_sys_timerfd_settime	/* 325 */
-	.quad compat_sys_timerfd_gettime
-	.quad compat_sys_signalfd4
-	.quad sys_eventfd2
-	.quad sys_epoll_create1
-	.quad sys_dup3				/* 330 */
-	.quad sys_pipe2
-	.quad sys_inotify_init1
-	.quad compat_sys_preadv
-	.quad compat_sys_pwritev
-	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
-	.quad sys_perf_event_open
-	.quad compat_sys_recvmmsg
-	.quad sys_fanotify_init
-	.quad sys32_fanotify_mark
-	.quad sys_prlimit64		/* 340 */
-	.quad sys_name_to_handle_at
-	.quad compat_sys_open_by_handle_at
-	.quad compat_sys_clock_adjtime
-	.quad sys_syncfs
-	.quad compat_sys_sendmmsg	/* 345 */
-	.quad sys_setns
-	.quad compat_sys_process_vm_readv
-	.quad compat_sys_process_vm_writev
-ia32_syscall_end:
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
new file mode 100644
index 0000000..d04d3db
--- /dev/null
+++ b/arch/x86/ia32/syscall_ia32.c
@@ -0,0 +1,25 @@
+/* System call table for ia32 emulation. */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <asm/asm-offsets.h>
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ;
+#include <asm/syscalls_32.h>
+#undef __SYSCALL_I386
+
+#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
+
+typedef void (*sys_call_ptr_t)(void);
+
+extern void compat_ni_syscall(void);
+
+const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
+	/*
+	 * Smells like a like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
+	[0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
+#include <asm/syscalls_32.h>
+};
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 6fa90a8..b57e6a4 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -19,7 +19,8 @@ header-y += processor-flags.h
 header-y += ptrace-abi.h
 header-y += sigcontext32.h
 header-y += ucontext.h
-header-y += unistd_32.h
-header-y += unistd_64.h
 header-y += vm86.h
 header-y += vsyscall.h
+
+genhdr-y += unistd_32.h
+genhdr-y += unistd_64.h
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
index 976f6ec..b0d5716 100644
--- a/arch/x86/include/asm/ia32_unistd.h
+++ b/arch/x86/include/asm/ia32_unistd.h
@@ -2,17 +2,10 @@
 #define _ASM_X86_IA32_UNISTD_H
 
 /*
- * This file contains the system call numbers of the ia32 port,
+ * This file contains the system call numbers of the ia32 compat ABI,
  * this is for the kernel only.
- * Only add syscalls here where some part of the kernel needs to know
- * the number. This should be otherwise in sync with asm-x86/unistd_32.h. -AK
  */
-
-#define __NR_ia32_restart_syscall 0
-#define __NR_ia32_exit		  1
-#define __NR_ia32_read		  3
-#define __NR_ia32_write		  4
-#define __NR_ia32_sigreturn	119
-#define __NR_ia32_rt_sigreturn	173
+#define __SYSCALL_ia32_NR(x) (x)
+#include <asm/unistd_32_ia32.h>
 
 #endif /* _ASM_X86_IA32_UNISTD_H */
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index 2a58ed3..b4a3db7 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -1,13 +1,59 @@
+#ifndef _ASM_X86_UNISTD_H
+#define _ASM_X86_UNISTD_H 1
+
 #ifdef __KERNEL__
 # ifdef CONFIG_X86_32
-#  include "unistd_32.h"
+
+#  include <asm/unistd_32.h>
+#  define __ARCH_WANT_IPC_PARSE_VERSION
+#  define __ARCH_WANT_STAT64
+#  define __ARCH_WANT_SYS_OLD_MMAP
+#  define __ARCH_WANT_SYS_OLD_SELECT
+
 # else
-#  include "unistd_64.h"
+
+#  include <asm/unistd_64.h>
+#  define __ARCH_WANT_COMPAT_SYS_TIME
+
 # endif
+
+# define __ARCH_WANT_OLD_READDIR
+# define __ARCH_WANT_OLD_STAT
+# define __ARCH_WANT_SYS_ALARM
+# define __ARCH_WANT_SYS_FADVISE64
+# define __ARCH_WANT_SYS_GETHOSTNAME
+# define __ARCH_WANT_SYS_GETPGRP
+# define __ARCH_WANT_SYS_LLSEEK
+# define __ARCH_WANT_SYS_NICE
+# define __ARCH_WANT_SYS_OLDUMOUNT
+# define __ARCH_WANT_SYS_OLD_GETRLIMIT
+# define __ARCH_WANT_SYS_OLD_UNAME
+# define __ARCH_WANT_SYS_PAUSE
+# define __ARCH_WANT_SYS_RT_SIGACTION
+# define __ARCH_WANT_SYS_RT_SIGSUSPEND
+# define __ARCH_WANT_SYS_SGETMASK
+# define __ARCH_WANT_SYS_SIGNAL
+# define __ARCH_WANT_SYS_SIGPENDING
+# define __ARCH_WANT_SYS_SIGPROCMASK
+# define __ARCH_WANT_SYS_SOCKETCALL
+# define __ARCH_WANT_SYS_TIME
+# define __ARCH_WANT_SYS_UTIME
+# define __ARCH_WANT_SYS_WAITPID
+
+/*
+ * "Conditional" syscalls
+ *
+ * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
+ * but it doesn't work on all toolchains, so we just do it by hand
+ */
+# define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
+
 #else
 # ifdef __i386__
-#  include "unistd_32.h"
+#  include <asm/unistd_32.h>
 # else
-#  include "unistd_64.h"
+#  include <asm/unistd_64.h>
 # endif
 #endif
+
+#endif /* _ASM_X86_UNISTD_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
deleted file mode 100644
index 599c77d..0000000
--- a/arch/x86/include/asm/unistd_32.h
+++ /dev/null
@@ -1,401 +0,0 @@
-#ifndef _ASM_X86_UNISTD_32_H
-#define _ASM_X86_UNISTD_32_H
-
-/*
- * This file contains the system call numbers.
- */
-
-#define __NR_restart_syscall      0
-#define __NR_exit		  1
-#define __NR_fork		  2
-#define __NR_read		  3
-#define __NR_write		  4
-#define __NR_open		  5
-#define __NR_close		  6
-#define __NR_waitpid		  7
-#define __NR_creat		  8
-#define __NR_link		  9
-#define __NR_unlink		 10
-#define __NR_execve		 11
-#define __NR_chdir		 12
-#define __NR_time		 13
-#define __NR_mknod		 14
-#define __NR_chmod		 15
-#define __NR_lchown		 16
-#define __NR_break		 17
-#define __NR_oldstat		 18
-#define __NR_lseek		 19
-#define __NR_getpid		 20
-#define __NR_mount		 21
-#define __NR_umount		 22
-#define __NR_setuid		 23
-#define __NR_getuid		 24
-#define __NR_stime		 25
-#define __NR_ptrace		 26
-#define __NR_alarm		 27
-#define __NR_oldfstat		 28
-#define __NR_pause		 29
-#define __NR_utime		 30
-#define __NR_stty		 31
-#define __NR_gtty		 32
-#define __NR_access		 33
-#define __NR_nice		 34
-#define __NR_ftime		 35
-#define __NR_sync		 36
-#define __NR_kill		 37
-#define __NR_rename		 38
-#define __NR_mkdir		 39
-#define __NR_rmdir		 40
-#define __NR_dup		 41
-#define __NR_pipe		 42
-#define __NR_times		 43
-#define __NR_prof		 44
-#define __NR_brk		 45
-#define __NR_setgid		 46
-#define __NR_getgid		 47
-#define __NR_signal		 48
-#define __NR_geteuid		 49
-#define __NR_getegid		 50
-#define __NR_acct		 51
-#define __NR_umount2		 52
-#define __NR_lock		 53
-#define __NR_ioctl		 54
-#define __NR_fcntl		 55
-#define __NR_mpx		 56
-#define __NR_setpgid		 57
-#define __NR_ulimit		 58
-#define __NR_oldolduname	 59
-#define __NR_umask		 60
-#define __NR_chroot		 61
-#define __NR_ustat		 62
-#define __NR_dup2		 63
-#define __NR_getppid		 64
-#define __NR_getpgrp		 65
-#define __NR_setsid		 66
-#define __NR_sigaction		 67
-#define __NR_sgetmask		 68
-#define __NR_ssetmask		 69
-#define __NR_setreuid		 70
-#define __NR_setregid		 71
-#define __NR_sigsuspend		 72
-#define __NR_sigpending		 73
-#define __NR_sethostname	 74
-#define __NR_setrlimit		 75
-#define __NR_getrlimit		 76   /* Back compatible 2Gig limited rlimit */
-#define __NR_getrusage		 77
-#define __NR_gettimeofday	 78
-#define __NR_settimeofday	 79
-#define __NR_getgroups		 80
-#define __NR_setgroups		 81
-#define __NR_select		 82
-#define __NR_symlink		 83
-#define __NR_oldlstat		 84
-#define __NR_readlink		 85
-#define __NR_uselib		 86
-#define __NR_swapon		 87
-#define __NR_reboot		 88
-#define __NR_readdir		 89
-#define __NR_mmap		 90
-#define __NR_munmap		 91
-#define __NR_truncate		 92
-#define __NR_ftruncate		 93
-#define __NR_fchmod		 94
-#define __NR_fchown		 95
-#define __NR_getpriority	 96
-#define __NR_setpriority	 97
-#define __NR_profil		 98
-#define __NR_statfs		 99
-#define __NR_fstatfs		100
-#define __NR_ioperm		101
-#define __NR_socketcall		102
-#define __NR_syslog		103
-#define __NR_setitimer		104
-#define __NR_getitimer		105
-#define __NR_stat		106
-#define __NR_lstat		107
-#define __NR_fstat		108
-#define __NR_olduname		109
-#define __NR_iopl		110
-#define __NR_vhangup		111
-#define __NR_idle		112
-#define __NR_vm86old		113
-#define __NR_wait4		114
-#define __NR_swapoff		115
-#define __NR_sysinfo		116
-#define __NR_ipc		117
-#define __NR_fsync		118
-#define __NR_sigreturn		119
-#define __NR_clone		120
-#define __NR_setdomainname	121
-#define __NR_uname		122
-#define __NR_modify_ldt		123
-#define __NR_adjtimex		124
-#define __NR_mprotect		125
-#define __NR_sigprocmask	126
-#define __NR_create_module	127
-#define __NR_init_module	128
-#define __NR_delete_module	129
-#define __NR_get_kernel_syms	130
-#define __NR_quotactl		131
-#define __NR_getpgid		132
-#define __NR_fchdir		133
-#define __NR_bdflush		134
-#define __NR_sysfs		135
-#define __NR_personality	136
-#define __NR_afs_syscall	137 /* Syscall for Andrew File System */
-#define __NR_setfsuid		138
-#define __NR_setfsgid		139
-#define __NR__llseek		140
-#define __NR_getdents		141
-#define __NR__newselect		142
-#define __NR_flock		143
-#define __NR_msync		144
-#define __NR_readv		145
-#define __NR_writev		146
-#define __NR_getsid		147
-#define __NR_fdatasync		148
-#define __NR__sysctl		149
-#define __NR_mlock		150
-#define __NR_munlock		151
-#define __NR_mlockall		152
-#define __NR_munlockall		153
-#define __NR_sched_setparam		154
-#define __NR_sched_getparam		155
-#define __NR_sched_setscheduler		156
-#define __NR_sched_getscheduler		157
-#define __NR_sched_yield		158
-#define __NR_sched_get_priority_max	159
-#define __NR_sched_get_priority_min	160
-#define __NR_sched_rr_get_interval	161
-#define __NR_nanosleep		162
-#define __NR_mremap		163
-#define __NR_setresuid		164
-#define __NR_getresuid		165
-#define __NR_vm86		166
-#define __NR_query_module	167
-#define __NR_poll		168
-#define __NR_nfsservctl		169
-#define __NR_setresgid		170
-#define __NR_getresgid		171
-#define __NR_prctl              172
-#define __NR_rt_sigreturn	173
-#define __NR_rt_sigaction	174
-#define __NR_rt_sigprocmask	175
-#define __NR_rt_sigpending	176
-#define __NR_rt_sigtimedwait	177
-#define __NR_rt_sigqueueinfo	178
-#define __NR_rt_sigsuspend	179
-#define __NR_pread64		180
-#define __NR_pwrite64		181
-#define __NR_chown		182
-#define __NR_getcwd		183
-#define __NR_capget		184
-#define __NR_capset		185
-#define __NR_sigaltstack	186
-#define __NR_sendfile		187
-#define __NR_getpmsg		188	/* some people actually want streams */
-#define __NR_putpmsg		189	/* some people actually want streams */
-#define __NR_vfork		190
-#define __NR_ugetrlimit		191	/* SuS compliant getrlimit */
-#define __NR_mmap2		192
-#define __NR_truncate64		193
-#define __NR_ftruncate64	194
-#define __NR_stat64		195
-#define __NR_lstat64		196
-#define __NR_fstat64		197
-#define __NR_lchown32		198
-#define __NR_getuid32		199
-#define __NR_getgid32		200
-#define __NR_geteuid32		201
-#define __NR_getegid32		202
-#define __NR_setreuid32		203
-#define __NR_setregid32		204
-#define __NR_getgroups32	205
-#define __NR_setgroups32	206
-#define __NR_fchown32		207
-#define __NR_setresuid32	208
-#define __NR_getresuid32	209
-#define __NR_setresgid32	210
-#define __NR_getresgid32	211
-#define __NR_chown32		212
-#define __NR_setuid32		213
-#define __NR_setgid32		214
-#define __NR_setfsuid32		215
-#define __NR_setfsgid32		216
-#define __NR_pivot_root		217
-#define __NR_mincore		218
-#define __NR_madvise		219
-#define __NR_madvise1		219	/* delete when C lib stub is removed */
-#define __NR_getdents64		220
-#define __NR_fcntl64		221
-/* 223 is unused */
-#define __NR_gettid		224
-#define __NR_readahead		225
-#define __NR_setxattr		226
-#define __NR_lsetxattr		227
-#define __NR_fsetxattr		228
-#define __NR_getxattr		229
-#define __NR_lgetxattr		230
-#define __NR_fgetxattr		231
-#define __NR_listxattr		232
-#define __NR_llistxattr		233
-#define __NR_flistxattr		234
-#define __NR_removexattr	235
-#define __NR_lremovexattr	236
-#define __NR_fremovexattr	237
-#define __NR_tkill		238
-#define __NR_sendfile64		239
-#define __NR_futex		240
-#define __NR_sched_setaffinity	241
-#define __NR_sched_getaffinity	242
-#define __NR_set_thread_area	243
-#define __NR_get_thread_area	244
-#define __NR_io_setup		245
-#define __NR_io_destroy		246
-#define __NR_io_getevents	247
-#define __NR_io_submit		248
-#define __NR_io_cancel		249
-#define __NR_fadvise64		250
-/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */
-#define __NR_exit_group		252
-#define __NR_lookup_dcookie	253
-#define __NR_epoll_create	254
-#define __NR_epoll_ctl		255
-#define __NR_epoll_wait		256
-#define __NR_remap_file_pages	257
-#define __NR_set_tid_address	258
-#define __NR_timer_create	259
-#define __NR_timer_settime	(__NR_timer_create+1)
-#define __NR_timer_gettime	(__NR_timer_create+2)
-#define __NR_timer_getoverrun	(__NR_timer_create+3)
-#define __NR_timer_delete	(__NR_timer_create+4)
-#define __NR_clock_settime	(__NR_timer_create+5)
-#define __NR_clock_gettime	(__NR_timer_create+6)
-#define __NR_clock_getres	(__NR_timer_create+7)
-#define __NR_clock_nanosleep	(__NR_timer_create+8)
-#define __NR_statfs64		268
-#define __NR_fstatfs64		269
-#define __NR_tgkill		270
-#define __NR_utimes		271
-#define __NR_fadvise64_64	272
-#define __NR_vserver		273
-#define __NR_mbind		274
-#define __NR_get_mempolicy	275
-#define __NR_set_mempolicy	276
-#define __NR_mq_open 		277
-#define __NR_mq_unlink		(__NR_mq_open+1)
-#define __NR_mq_timedsend	(__NR_mq_open+2)
-#define __NR_mq_timedreceive	(__NR_mq_open+3)
-#define __NR_mq_notify		(__NR_mq_open+4)
-#define __NR_mq_getsetattr	(__NR_mq_open+5)
-#define __NR_kexec_load		283
-#define __NR_waitid		284
-/* #define __NR_sys_setaltroot	285 */
-#define __NR_add_key		286
-#define __NR_request_key	287
-#define __NR_keyctl		288
-#define __NR_ioprio_set		289
-#define __NR_ioprio_get		290
-#define __NR_inotify_init	291
-#define __NR_inotify_add_watch	292
-#define __NR_inotify_rm_watch	293
-#define __NR_migrate_pages	294
-#define __NR_openat		295
-#define __NR_mkdirat		296
-#define __NR_mknodat		297
-#define __NR_fchownat		298
-#define __NR_futimesat		299
-#define __NR_fstatat64		300
-#define __NR_unlinkat		301
-#define __NR_renameat		302
-#define __NR_linkat		303
-#define __NR_symlinkat		304
-#define __NR_readlinkat		305
-#define __NR_fchmodat		306
-#define __NR_faccessat		307
-#define __NR_pselect6		308
-#define __NR_ppoll		309
-#define __NR_unshare		310
-#define __NR_set_robust_list	311
-#define __NR_get_robust_list	312
-#define __NR_splice		313
-#define __NR_sync_file_range	314
-#define __NR_tee		315
-#define __NR_vmsplice		316
-#define __NR_move_pages		317
-#define __NR_getcpu		318
-#define __NR_epoll_pwait	319
-#define __NR_utimensat		320
-#define __NR_signalfd		321
-#define __NR_timerfd_create	322
-#define __NR_eventfd		323
-#define __NR_fallocate		324
-#define __NR_timerfd_settime	325
-#define __NR_timerfd_gettime	326
-#define __NR_signalfd4		327
-#define __NR_eventfd2		328
-#define __NR_epoll_create1	329
-#define __NR_dup3		330
-#define __NR_pipe2		331
-#define __NR_inotify_init1	332
-#define __NR_preadv		333
-#define __NR_pwritev		334
-#define __NR_rt_tgsigqueueinfo	335
-#define __NR_perf_event_open	336
-#define __NR_recvmmsg		337
-#define __NR_fanotify_init	338
-#define __NR_fanotify_mark	339
-#define __NR_prlimit64		340
-#define __NR_name_to_handle_at	341
-#define __NR_open_by_handle_at  342
-#define __NR_clock_adjtime	343
-#define __NR_syncfs             344
-#define __NR_sendmmsg		345
-#define __NR_setns		346
-#define __NR_process_vm_readv	347
-#define __NR_process_vm_writev	348
-
-#ifdef __KERNEL__
-
-#define NR_syscalls 349
-
-#define __ARCH_WANT_IPC_PARSE_VERSION
-#define __ARCH_WANT_OLD_READDIR
-#define __ARCH_WANT_OLD_STAT
-#define __ARCH_WANT_STAT64
-#define __ARCH_WANT_SYS_ALARM
-#define __ARCH_WANT_SYS_GETHOSTNAME
-#define __ARCH_WANT_SYS_IPC
-#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
-#define __ARCH_WANT_SYS_SIGNAL
-#define __ARCH_WANT_SYS_TIME
-#define __ARCH_WANT_SYS_UTIME
-#define __ARCH_WANT_SYS_WAITPID
-#define __ARCH_WANT_SYS_SOCKETCALL
-#define __ARCH_WANT_SYS_FADVISE64
-#define __ARCH_WANT_SYS_GETPGRP
-#define __ARCH_WANT_SYS_LLSEEK
-#define __ARCH_WANT_SYS_NICE
-#define __ARCH_WANT_SYS_OLD_GETRLIMIT
-#define __ARCH_WANT_SYS_OLD_UNAME
-#define __ARCH_WANT_SYS_OLD_MMAP
-#define __ARCH_WANT_SYS_OLD_SELECT
-#define __ARCH_WANT_SYS_OLDUMOUNT
-#define __ARCH_WANT_SYS_SIGPENDING
-#define __ARCH_WANT_SYS_SIGPROCMASK
-#define __ARCH_WANT_SYS_RT_SIGACTION
-#define __ARCH_WANT_SYS_RT_SIGSUSPEND
-
-/*
- * "Conditional" syscalls
- *
- * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
- * but it doesn't work on all toolchains, so we just do it by hand
- */
-#ifndef cond_syscall
-#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
-#endif
-
-#endif /* __KERNEL__ */
-#endif /* _ASM_X86_UNISTD_32_H */
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
deleted file mode 100644
index 0431f19..0000000
--- a/arch/x86/include/asm/unistd_64.h
+++ /dev/null
@@ -1,732 +0,0 @@
-#ifndef _ASM_X86_UNISTD_64_H
-#define _ASM_X86_UNISTD_64_H
-
-#ifndef __SYSCALL
-#define __SYSCALL(a, b)
-#endif
-
-/*
- * This file contains the system call numbers.
- *
- * Note: holes are not allowed.
- */
-
-/* at least 8 syscall per cacheline */
-#define __NR_read				0
-__SYSCALL(__NR_read, sys_read)
-#define __NR_write				1
-__SYSCALL(__NR_write, sys_write)
-#define __NR_open				2
-__SYSCALL(__NR_open, sys_open)
-#define __NR_close				3
-__SYSCALL(__NR_close, sys_close)
-#define __NR_stat				4
-__SYSCALL(__NR_stat, sys_newstat)
-#define __NR_fstat				5
-__SYSCALL(__NR_fstat, sys_newfstat)
-#define __NR_lstat				6
-__SYSCALL(__NR_lstat, sys_newlstat)
-#define __NR_poll				7
-__SYSCALL(__NR_poll, sys_poll)
-
-#define __NR_lseek				8
-__SYSCALL(__NR_lseek, sys_lseek)
-#define __NR_mmap				9
-__SYSCALL(__NR_mmap, sys_mmap)
-#define __NR_mprotect				10
-__SYSCALL(__NR_mprotect, sys_mprotect)
-#define __NR_munmap				11
-__SYSCALL(__NR_munmap, sys_munmap)
-#define __NR_brk				12
-__SYSCALL(__NR_brk, sys_brk)
-#define __NR_rt_sigaction			13
-__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction)
-#define __NR_rt_sigprocmask			14
-__SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask)
-#define __NR_rt_sigreturn			15
-__SYSCALL(__NR_rt_sigreturn, stub_rt_sigreturn)
-
-#define __NR_ioctl				16
-__SYSCALL(__NR_ioctl, sys_ioctl)
-#define __NR_pread64				17
-__SYSCALL(__NR_pread64, sys_pread64)
-#define __NR_pwrite64				18
-__SYSCALL(__NR_pwrite64, sys_pwrite64)
-#define __NR_readv				19
-__SYSCALL(__NR_readv, sys_readv)
-#define __NR_writev				20
-__SYSCALL(__NR_writev, sys_writev)
-#define __NR_access				21
-__SYSCALL(__NR_access, sys_access)
-#define __NR_pipe				22
-__SYSCALL(__NR_pipe, sys_pipe)
-#define __NR_select				23
-__SYSCALL(__NR_select, sys_select)
-
-#define __NR_sched_yield			24
-__SYSCALL(__NR_sched_yield, sys_sched_yield)
-#define __NR_mremap				25
-__SYSCALL(__NR_mremap, sys_mremap)
-#define __NR_msync				26
-__SYSCALL(__NR_msync, sys_msync)
-#define __NR_mincore				27
-__SYSCALL(__NR_mincore, sys_mincore)
-#define __NR_madvise				28
-__SYSCALL(__NR_madvise, sys_madvise)
-#define __NR_shmget				29
-__SYSCALL(__NR_shmget, sys_shmget)
-#define __NR_shmat				30
-__SYSCALL(__NR_shmat, sys_shmat)
-#define __NR_shmctl				31
-__SYSCALL(__NR_shmctl, sys_shmctl)
-
-#define __NR_dup				32
-__SYSCALL(__NR_dup, sys_dup)
-#define __NR_dup2				33
-__SYSCALL(__NR_dup2, sys_dup2)
-#define __NR_pause				34
-__SYSCALL(__NR_pause, sys_pause)
-#define __NR_nanosleep				35
-__SYSCALL(__NR_nanosleep, sys_nanosleep)
-#define __NR_getitimer				36
-__SYSCALL(__NR_getitimer, sys_getitimer)
-#define __NR_alarm				37
-__SYSCALL(__NR_alarm, sys_alarm)
-#define __NR_setitimer				38
-__SYSCALL(__NR_setitimer, sys_setitimer)
-#define __NR_getpid				39
-__SYSCALL(__NR_getpid, sys_getpid)
-
-#define __NR_sendfile				40
-__SYSCALL(__NR_sendfile, sys_sendfile64)
-#define __NR_socket				41
-__SYSCALL(__NR_socket, sys_socket)
-#define __NR_connect				42
-__SYSCALL(__NR_connect, sys_connect)
-#define __NR_accept				43
-__SYSCALL(__NR_accept, sys_accept)
-#define __NR_sendto				44
-__SYSCALL(__NR_sendto, sys_sendto)
-#define __NR_recvfrom				45
-__SYSCALL(__NR_recvfrom, sys_recvfrom)
-#define __NR_sendmsg				46
-__SYSCALL(__NR_sendmsg, sys_sendmsg)
-#define __NR_recvmsg				47
-__SYSCALL(__NR_recvmsg, sys_recvmsg)
-
-#define __NR_shutdown				48
-__SYSCALL(__NR_shutdown, sys_shutdown)
-#define __NR_bind				49
-__SYSCALL(__NR_bind, sys_bind)
-#define __NR_listen				50
-__SYSCALL(__NR_listen, sys_listen)
-#define __NR_getsockname			51
-__SYSCALL(__NR_getsockname, sys_getsockname)
-#define __NR_getpeername			52
-__SYSCALL(__NR_getpeername, sys_getpeername)
-#define __NR_socketpair				53
-__SYSCALL(__NR_socketpair, sys_socketpair)
-#define __NR_setsockopt				54
-__SYSCALL(__NR_setsockopt, sys_setsockopt)
-#define __NR_getsockopt				55
-__SYSCALL(__NR_getsockopt, sys_getsockopt)
-
-#define __NR_clone				56
-__SYSCALL(__NR_clone, stub_clone)
-#define __NR_fork				57
-__SYSCALL(__NR_fork, stub_fork)
-#define __NR_vfork				58
-__SYSCALL(__NR_vfork, stub_vfork)
-#define __NR_execve				59
-__SYSCALL(__NR_execve, stub_execve)
-#define __NR_exit				60
-__SYSCALL(__NR_exit, sys_exit)
-#define __NR_wait4				61
-__SYSCALL(__NR_wait4, sys_wait4)
-#define __NR_kill				62
-__SYSCALL(__NR_kill, sys_kill)
-#define __NR_uname				63
-__SYSCALL(__NR_uname, sys_newuname)
-
-#define __NR_semget				64
-__SYSCALL(__NR_semget, sys_semget)
-#define __NR_semop				65
-__SYSCALL(__NR_semop, sys_semop)
-#define __NR_semctl				66
-__SYSCALL(__NR_semctl, sys_semctl)
-#define __NR_shmdt				67
-__SYSCALL(__NR_shmdt, sys_shmdt)
-#define __NR_msgget				68
-__SYSCALL(__NR_msgget, sys_msgget)
-#define __NR_msgsnd				69
-__SYSCALL(__NR_msgsnd, sys_msgsnd)
-#define __NR_msgrcv				70
-__SYSCALL(__NR_msgrcv, sys_msgrcv)
-#define __NR_msgctl				71
-__SYSCALL(__NR_msgctl, sys_msgctl)
-
-#define __NR_fcntl				72
-__SYSCALL(__NR_fcntl, sys_fcntl)
-#define __NR_flock				73
-__SYSCALL(__NR_flock, sys_flock)
-#define __NR_fsync				74
-__SYSCALL(__NR_fsync, sys_fsync)
-#define __NR_fdatasync				75
-__SYSCALL(__NR_fdatasync, sys_fdatasync)
-#define __NR_truncate				76
-__SYSCALL(__NR_truncate, sys_truncate)
-#define __NR_ftruncate				77
-__SYSCALL(__NR_ftruncate, sys_ftruncate)
-#define __NR_getdents				78
-__SYSCALL(__NR_getdents, sys_getdents)
-#define __NR_getcwd				79
-__SYSCALL(__NR_getcwd, sys_getcwd)
-
-#define __NR_chdir				80
-__SYSCALL(__NR_chdir, sys_chdir)
-#define __NR_fchdir				81
-__SYSCALL(__NR_fchdir, sys_fchdir)
-#define __NR_rename				82
-__SYSCALL(__NR_rename, sys_rename)
-#define __NR_mkdir				83
-__SYSCALL(__NR_mkdir, sys_mkdir)
-#define __NR_rmdir				84
-__SYSCALL(__NR_rmdir, sys_rmdir)
-#define __NR_creat				85
-__SYSCALL(__NR_creat, sys_creat)
-#define __NR_link				86
-__SYSCALL(__NR_link, sys_link)
-#define __NR_unlink				87
-__SYSCALL(__NR_unlink, sys_unlink)
-
-#define __NR_symlink				88
-__SYSCALL(__NR_symlink, sys_symlink)
-#define __NR_readlink				89
-__SYSCALL(__NR_readlink, sys_readlink)
-#define __NR_chmod				90
-__SYSCALL(__NR_chmod, sys_chmod)
-#define __NR_fchmod				91
-__SYSCALL(__NR_fchmod, sys_fchmod)
-#define __NR_chown				92
-__SYSCALL(__NR_chown, sys_chown)
-#define __NR_fchown				93
-__SYSCALL(__NR_fchown, sys_fchown)
-#define __NR_lchown				94
-__SYSCALL(__NR_lchown, sys_lchown)
-#define __NR_umask				95
-__SYSCALL(__NR_umask, sys_umask)
-
-#define __NR_gettimeofday			96
-__SYSCALL(__NR_gettimeofday, sys_gettimeofday)
-#define __NR_getrlimit				97
-__SYSCALL(__NR_getrlimit, sys_getrlimit)
-#define __NR_getrusage				98
-__SYSCALL(__NR_getrusage, sys_getrusage)
-#define __NR_sysinfo				99
-__SYSCALL(__NR_sysinfo, sys_sysinfo)
-#define __NR_times				100
-__SYSCALL(__NR_times, sys_times)
-#define __NR_ptrace				101
-__SYSCALL(__NR_ptrace, sys_ptrace)
-#define __NR_getuid				102
-__SYSCALL(__NR_getuid, sys_getuid)
-#define __NR_syslog				103
-__SYSCALL(__NR_syslog, sys_syslog)
-
-/* at the very end the stuff that never runs during the benchmarks */
-#define __NR_getgid				104
-__SYSCALL(__NR_getgid, sys_getgid)
-#define __NR_setuid				105
-__SYSCALL(__NR_setuid, sys_setuid)
-#define __NR_setgid				106
-__SYSCALL(__NR_setgid, sys_setgid)
-#define __NR_geteuid				107
-__SYSCALL(__NR_geteuid, sys_geteuid)
-#define __NR_getegid				108
-__SYSCALL(__NR_getegid, sys_getegid)
-#define __NR_setpgid				109
-__SYSCALL(__NR_setpgid, sys_setpgid)
-#define __NR_getppid				110
-__SYSCALL(__NR_getppid, sys_getppid)
-#define __NR_getpgrp				111
-__SYSCALL(__NR_getpgrp, sys_getpgrp)
-
-#define __NR_setsid				112
-__SYSCALL(__NR_setsid, sys_setsid)
-#define __NR_setreuid				113
-__SYSCALL(__NR_setreuid, sys_setreuid)
-#define __NR_setregid				114
-__SYSCALL(__NR_setregid, sys_setregid)
-#define __NR_getgroups				115
-__SYSCALL(__NR_getgroups, sys_getgroups)
-#define __NR_setgroups				116
-__SYSCALL(__NR_setgroups, sys_setgroups)
-#define __NR_setresuid				117
-__SYSCALL(__NR_setresuid, sys_setresuid)
-#define __NR_getresuid				118
-__SYSCALL(__NR_getresuid, sys_getresuid)
-#define __NR_setresgid				119
-__SYSCALL(__NR_setresgid, sys_setresgid)
-
-#define __NR_getresgid				120
-__SYSCALL(__NR_getresgid, sys_getresgid)
-#define __NR_getpgid				121
-__SYSCALL(__NR_getpgid, sys_getpgid)
-#define __NR_setfsuid				122
-__SYSCALL(__NR_setfsuid, sys_setfsuid)
-#define __NR_setfsgid				123
-__SYSCALL(__NR_setfsgid, sys_setfsgid)
-#define __NR_getsid				124
-__SYSCALL(__NR_getsid, sys_getsid)
-#define __NR_capget				125
-__SYSCALL(__NR_capget, sys_capget)
-#define __NR_capset				126
-__SYSCALL(__NR_capset, sys_capset)
-
-#define __NR_rt_sigpending			127
-__SYSCALL(__NR_rt_sigpending, sys_rt_sigpending)
-#define __NR_rt_sigtimedwait			128
-__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait)
-#define __NR_rt_sigqueueinfo			129
-__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo)
-#define __NR_rt_sigsuspend			130
-__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend)
-#define __NR_sigaltstack			131
-__SYSCALL(__NR_sigaltstack, stub_sigaltstack)
-#define __NR_utime				132
-__SYSCALL(__NR_utime, sys_utime)
-#define __NR_mknod				133
-__SYSCALL(__NR_mknod, sys_mknod)
-
-/* Only needed for a.out */
-#define __NR_uselib				134
-__SYSCALL(__NR_uselib, sys_ni_syscall)
-#define __NR_personality			135
-__SYSCALL(__NR_personality, sys_personality)
-
-#define __NR_ustat				136
-__SYSCALL(__NR_ustat, sys_ustat)
-#define __NR_statfs				137
-__SYSCALL(__NR_statfs, sys_statfs)
-#define __NR_fstatfs				138
-__SYSCALL(__NR_fstatfs, sys_fstatfs)
-#define __NR_sysfs				139
-__SYSCALL(__NR_sysfs, sys_sysfs)
-
-#define __NR_getpriority			140
-__SYSCALL(__NR_getpriority, sys_getpriority)
-#define __NR_setpriority			141
-__SYSCALL(__NR_setpriority, sys_setpriority)
-#define __NR_sched_setparam			142
-__SYSCALL(__NR_sched_setparam, sys_sched_setparam)
-#define __NR_sched_getparam			143
-__SYSCALL(__NR_sched_getparam, sys_sched_getparam)
-#define __NR_sched_setscheduler			144
-__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler)
-#define __NR_sched_getscheduler			145
-__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler)
-#define __NR_sched_get_priority_max		146
-__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
-#define __NR_sched_get_priority_min		147
-__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
-#define __NR_sched_rr_get_interval		148
-__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval)
-
-#define __NR_mlock				149
-__SYSCALL(__NR_mlock, sys_mlock)
-#define __NR_munlock				150
-__SYSCALL(__NR_munlock, sys_munlock)
-#define __NR_mlockall				151
-__SYSCALL(__NR_mlockall, sys_mlockall)
-#define __NR_munlockall				152
-__SYSCALL(__NR_munlockall, sys_munlockall)
-
-#define __NR_vhangup				153
-__SYSCALL(__NR_vhangup, sys_vhangup)
-
-#define __NR_modify_ldt				154
-__SYSCALL(__NR_modify_ldt, sys_modify_ldt)
-
-#define __NR_pivot_root				155
-__SYSCALL(__NR_pivot_root, sys_pivot_root)
-
-#define __NR__sysctl				156
-__SYSCALL(__NR__sysctl, sys_sysctl)
-
-#define __NR_prctl				157
-__SYSCALL(__NR_prctl, sys_prctl)
-#define __NR_arch_prctl				158
-__SYSCALL(__NR_arch_prctl, sys_arch_prctl)
-
-#define __NR_adjtimex				159
-__SYSCALL(__NR_adjtimex, sys_adjtimex)
-
-#define __NR_setrlimit				160
-__SYSCALL(__NR_setrlimit, sys_setrlimit)
-
-#define __NR_chroot				161
-__SYSCALL(__NR_chroot, sys_chroot)
-
-#define __NR_sync				162
-__SYSCALL(__NR_sync, sys_sync)
-
-#define __NR_acct				163
-__SYSCALL(__NR_acct, sys_acct)
-
-#define __NR_settimeofday			164
-__SYSCALL(__NR_settimeofday, sys_settimeofday)
-
-#define __NR_mount				165
-__SYSCALL(__NR_mount, sys_mount)
-#define __NR_umount2				166
-__SYSCALL(__NR_umount2, sys_umount)
-
-#define __NR_swapon				167
-__SYSCALL(__NR_swapon, sys_swapon)
-#define __NR_swapoff				168
-__SYSCALL(__NR_swapoff, sys_swapoff)
-
-#define __NR_reboot				169
-__SYSCALL(__NR_reboot, sys_reboot)
-
-#define __NR_sethostname			170
-__SYSCALL(__NR_sethostname, sys_sethostname)
-#define __NR_setdomainname			171
-__SYSCALL(__NR_setdomainname, sys_setdomainname)
-
-#define __NR_iopl				172
-__SYSCALL(__NR_iopl, stub_iopl)
-#define __NR_ioperm				173
-__SYSCALL(__NR_ioperm, sys_ioperm)
-
-#define __NR_create_module			174
-__SYSCALL(__NR_create_module, sys_ni_syscall)
-#define __NR_init_module			175
-__SYSCALL(__NR_init_module, sys_init_module)
-#define __NR_delete_module			176
-__SYSCALL(__NR_delete_module, sys_delete_module)
-#define __NR_get_kernel_syms			177
-__SYSCALL(__NR_get_kernel_syms, sys_ni_syscall)
-#define __NR_query_module			178
-__SYSCALL(__NR_query_module, sys_ni_syscall)
-
-#define __NR_quotactl				179
-__SYSCALL(__NR_quotactl, sys_quotactl)
-
-#define __NR_nfsservctl				180
-__SYSCALL(__NR_nfsservctl, sys_ni_syscall)
-
-/* reserved for LiS/STREAMS */
-#define __NR_getpmsg				181
-__SYSCALL(__NR_getpmsg, sys_ni_syscall)
-#define __NR_putpmsg				182
-__SYSCALL(__NR_putpmsg, sys_ni_syscall)
-
-/* reserved for AFS */
-#define __NR_afs_syscall			183
-__SYSCALL(__NR_afs_syscall, sys_ni_syscall)
-
-/* reserved for tux */
-#define __NR_tuxcall				184
-__SYSCALL(__NR_tuxcall, sys_ni_syscall)
-
-#define __NR_security				185
-__SYSCALL(__NR_security, sys_ni_syscall)
-
-#define __NR_gettid				186
-__SYSCALL(__NR_gettid, sys_gettid)
-
-#define __NR_readahead				187
-__SYSCALL(__NR_readahead, sys_readahead)
-#define __NR_setxattr				188
-__SYSCALL(__NR_setxattr, sys_setxattr)
-#define __NR_lsetxattr				189
-__SYSCALL(__NR_lsetxattr, sys_lsetxattr)
-#define __NR_fsetxattr				190
-__SYSCALL(__NR_fsetxattr, sys_fsetxattr)
-#define __NR_getxattr				191
-__SYSCALL(__NR_getxattr, sys_getxattr)
-#define __NR_lgetxattr				192
-__SYSCALL(__NR_lgetxattr, sys_lgetxattr)
-#define __NR_fgetxattr				193
-__SYSCALL(__NR_fgetxattr, sys_fgetxattr)
-#define __NR_listxattr				194
-__SYSCALL(__NR_listxattr, sys_listxattr)
-#define __NR_llistxattr				195
-__SYSCALL(__NR_llistxattr, sys_llistxattr)
-#define __NR_flistxattr				196
-__SYSCALL(__NR_flistxattr, sys_flistxattr)
-#define __NR_removexattr			197
-__SYSCALL(__NR_removexattr, sys_removexattr)
-#define __NR_lremovexattr			198
-__SYSCALL(__NR_lremovexattr, sys_lremovexattr)
-#define __NR_fremovexattr			199
-__SYSCALL(__NR_fremovexattr, sys_fremovexattr)
-#define __NR_tkill				200
-__SYSCALL(__NR_tkill, sys_tkill)
-#define __NR_time				201
-__SYSCALL(__NR_time, sys_time)
-#define __NR_futex				202
-__SYSCALL(__NR_futex, sys_futex)
-#define __NR_sched_setaffinity			203
-__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity)
-#define __NR_sched_getaffinity			204
-__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity)
-#define __NR_set_thread_area			205
-__SYSCALL(__NR_set_thread_area, sys_ni_syscall)	/* use arch_prctl */
-#define __NR_io_setup				206
-__SYSCALL(__NR_io_setup, sys_io_setup)
-#define __NR_io_destroy				207
-__SYSCALL(__NR_io_destroy, sys_io_destroy)
-#define __NR_io_getevents			208
-__SYSCALL(__NR_io_getevents, sys_io_getevents)
-#define __NR_io_submit				209
-__SYSCALL(__NR_io_submit, sys_io_submit)
-#define __NR_io_cancel				210
-__SYSCALL(__NR_io_cancel, sys_io_cancel)
-#define __NR_get_thread_area			211
-__SYSCALL(__NR_get_thread_area, sys_ni_syscall)	/* use arch_prctl */
-#define __NR_lookup_dcookie			212
-__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie)
-#define __NR_epoll_create			213
-__SYSCALL(__NR_epoll_create, sys_epoll_create)
-#define __NR_epoll_ctl_old			214
-__SYSCALL(__NR_epoll_ctl_old, sys_ni_syscall)
-#define __NR_epoll_wait_old			215
-__SYSCALL(__NR_epoll_wait_old, sys_ni_syscall)
-#define __NR_remap_file_pages			216
-__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages)
-#define __NR_getdents64				217
-__SYSCALL(__NR_getdents64, sys_getdents64)
-#define __NR_set_tid_address			218
-__SYSCALL(__NR_set_tid_address, sys_set_tid_address)
-#define __NR_restart_syscall			219
-__SYSCALL(__NR_restart_syscall, sys_restart_syscall)
-#define __NR_semtimedop				220
-__SYSCALL(__NR_semtimedop, sys_semtimedop)
-#define __NR_fadvise64				221
-__SYSCALL(__NR_fadvise64, sys_fadvise64)
-#define __NR_timer_create			222
-__SYSCALL(__NR_timer_create, sys_timer_create)
-#define __NR_timer_settime			223
-__SYSCALL(__NR_timer_settime, sys_timer_settime)
-#define __NR_timer_gettime			224
-__SYSCALL(__NR_timer_gettime, sys_timer_gettime)
-#define __NR_timer_getoverrun			225
-__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
-#define __NR_timer_delete			226
-__SYSCALL(__NR_timer_delete, sys_timer_delete)
-#define __NR_clock_settime			227
-__SYSCALL(__NR_clock_settime, sys_clock_settime)
-#define __NR_clock_gettime			228
-__SYSCALL(__NR_clock_gettime, sys_clock_gettime)
-#define __NR_clock_getres			229
-__SYSCALL(__NR_clock_getres, sys_clock_getres)
-#define __NR_clock_nanosleep			230
-__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep)
-#define __NR_exit_group				231
-__SYSCALL(__NR_exit_group, sys_exit_group)
-#define __NR_epoll_wait				232
-__SYSCALL(__NR_epoll_wait, sys_epoll_wait)
-#define __NR_epoll_ctl				233
-__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl)
-#define __NR_tgkill				234
-__SYSCALL(__NR_tgkill, sys_tgkill)
-#define __NR_utimes				235
-__SYSCALL(__NR_utimes, sys_utimes)
-#define __NR_vserver				236
-__SYSCALL(__NR_vserver, sys_ni_syscall)
-#define __NR_mbind				237
-__SYSCALL(__NR_mbind, sys_mbind)
-#define __NR_set_mempolicy			238
-__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy)
-#define __NR_get_mempolicy			239
-__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy)
-#define __NR_mq_open				240
-__SYSCALL(__NR_mq_open, sys_mq_open)
-#define __NR_mq_unlink				241
-__SYSCALL(__NR_mq_unlink, sys_mq_unlink)
-#define __NR_mq_timedsend			242
-__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend)
-#define __NR_mq_timedreceive			243
-__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive)
-#define __NR_mq_notify				244
-__SYSCALL(__NR_mq_notify, sys_mq_notify)
-#define __NR_mq_getsetattr			245
-__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
-#define __NR_kexec_load				246
-__SYSCALL(__NR_kexec_load, sys_kexec_load)
-#define __NR_waitid				247
-__SYSCALL(__NR_waitid, sys_waitid)
-#define __NR_add_key				248
-__SYSCALL(__NR_add_key, sys_add_key)
-#define __NR_request_key			249
-__SYSCALL(__NR_request_key, sys_request_key)
-#define __NR_keyctl				250
-__SYSCALL(__NR_keyctl, sys_keyctl)
-#define __NR_ioprio_set				251
-__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
-#define __NR_ioprio_get				252
-__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
-#define __NR_inotify_init			253
-__SYSCALL(__NR_inotify_init, sys_inotify_init)
-#define __NR_inotify_add_watch			254
-__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
-#define __NR_inotify_rm_watch			255
-__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
-#define __NR_migrate_pages			256
-__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
-#define __NR_openat				257
-__SYSCALL(__NR_openat, sys_openat)
-#define __NR_mkdirat				258
-__SYSCALL(__NR_mkdirat, sys_mkdirat)
-#define __NR_mknodat				259
-__SYSCALL(__NR_mknodat, sys_mknodat)
-#define __NR_fchownat				260
-__SYSCALL(__NR_fchownat, sys_fchownat)
-#define __NR_futimesat				261
-__SYSCALL(__NR_futimesat, sys_futimesat)
-#define __NR_newfstatat				262
-__SYSCALL(__NR_newfstatat, sys_newfstatat)
-#define __NR_unlinkat				263
-__SYSCALL(__NR_unlinkat, sys_unlinkat)
-#define __NR_renameat				264
-__SYSCALL(__NR_renameat, sys_renameat)
-#define __NR_linkat				265
-__SYSCALL(__NR_linkat, sys_linkat)
-#define __NR_symlinkat				266
-__SYSCALL(__NR_symlinkat, sys_symlinkat)
-#define __NR_readlinkat				267
-__SYSCALL(__NR_readlinkat, sys_readlinkat)
-#define __NR_fchmodat				268
-__SYSCALL(__NR_fchmodat, sys_fchmodat)
-#define __NR_faccessat				269
-__SYSCALL(__NR_faccessat, sys_faccessat)
-#define __NR_pselect6				270
-__SYSCALL(__NR_pselect6, sys_pselect6)
-#define __NR_ppoll				271
-__SYSCALL(__NR_ppoll,	sys_ppoll)
-#define __NR_unshare				272
-__SYSCALL(__NR_unshare,	sys_unshare)
-#define __NR_set_robust_list			273
-__SYSCALL(__NR_set_robust_list, sys_set_robust_list)
-#define __NR_get_robust_list			274
-__SYSCALL(__NR_get_robust_list, sys_get_robust_list)
-#define __NR_splice				275
-__SYSCALL(__NR_splice, sys_splice)
-#define __NR_tee				276
-__SYSCALL(__NR_tee, sys_tee)
-#define __NR_sync_file_range			277
-__SYSCALL(__NR_sync_file_range, sys_sync_file_range)
-#define __NR_vmsplice				278
-__SYSCALL(__NR_vmsplice, sys_vmsplice)
-#define __NR_move_pages				279
-__SYSCALL(__NR_move_pages, sys_move_pages)
-#define __NR_utimensat				280
-__SYSCALL(__NR_utimensat, sys_utimensat)
-#define __NR_epoll_pwait			281
-__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait)
-#define __NR_signalfd				282
-__SYSCALL(__NR_signalfd, sys_signalfd)
-#define __NR_timerfd_create			283
-__SYSCALL(__NR_timerfd_create, sys_timerfd_create)
-#define __NR_eventfd				284
-__SYSCALL(__NR_eventfd, sys_eventfd)
-#define __NR_fallocate				285
-__SYSCALL(__NR_fallocate, sys_fallocate)
-#define __NR_timerfd_settime			286
-__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
-#define __NR_timerfd_gettime			287
-__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
-#define __NR_accept4				288
-__SYSCALL(__NR_accept4, sys_accept4)
-#define __NR_signalfd4				289
-__SYSCALL(__NR_signalfd4, sys_signalfd4)
-#define __NR_eventfd2				290
-__SYSCALL(__NR_eventfd2, sys_eventfd2)
-#define __NR_epoll_create1			291
-__SYSCALL(__NR_epoll_create1, sys_epoll_create1)
-#define __NR_dup3				292
-__SYSCALL(__NR_dup3, sys_dup3)
-#define __NR_pipe2				293
-__SYSCALL(__NR_pipe2, sys_pipe2)
-#define __NR_inotify_init1			294
-__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
-#define __NR_preadv				295
-__SYSCALL(__NR_preadv, sys_preadv)
-#define __NR_pwritev				296
-__SYSCALL(__NR_pwritev, sys_pwritev)
-#define __NR_rt_tgsigqueueinfo			297
-__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
-#define __NR_perf_event_open			298
-__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
-#define __NR_recvmmsg				299
-__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
-#define __NR_fanotify_init			300
-__SYSCALL(__NR_fanotify_init, sys_fanotify_init)
-#define __NR_fanotify_mark			301
-__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
-#define __NR_prlimit64				302
-__SYSCALL(__NR_prlimit64, sys_prlimit64)
-#define __NR_name_to_handle_at			303
-__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
-#define __NR_open_by_handle_at			304
-__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
-#define __NR_clock_adjtime			305
-__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
-#define __NR_syncfs                             306
-__SYSCALL(__NR_syncfs, sys_syncfs)
-#define __NR_sendmmsg				307
-__SYSCALL(__NR_sendmmsg, sys_sendmmsg)
-#define __NR_setns				308
-__SYSCALL(__NR_setns, sys_setns)
-#define __NR_getcpu				309
-__SYSCALL(__NR_getcpu, sys_getcpu)
-#define __NR_process_vm_readv			310
-__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
-#define __NR_process_vm_writev			311
-__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
-
-#ifndef __NO_STUBS
-#define __ARCH_WANT_OLD_READDIR
-#define __ARCH_WANT_OLD_STAT
-#define __ARCH_WANT_SYS_ALARM
-#define __ARCH_WANT_SYS_GETHOSTNAME
-#define __ARCH_WANT_SYS_PAUSE
-#define __ARCH_WANT_SYS_SGETMASK
-#define __ARCH_WANT_SYS_SIGNAL
-#define __ARCH_WANT_SYS_UTIME
-#define __ARCH_WANT_SYS_WAITPID
-#define __ARCH_WANT_SYS_SOCKETCALL
-#define __ARCH_WANT_SYS_FADVISE64
-#define __ARCH_WANT_SYS_GETPGRP
-#define __ARCH_WANT_SYS_LLSEEK
-#define __ARCH_WANT_SYS_NICE
-#define __ARCH_WANT_SYS_OLD_GETRLIMIT
-#define __ARCH_WANT_SYS_OLD_UNAME
-#define __ARCH_WANT_SYS_OLDUMOUNT
-#define __ARCH_WANT_SYS_SIGPENDING
-#define __ARCH_WANT_SYS_SIGPROCMASK
-#define __ARCH_WANT_SYS_RT_SIGACTION
-#define __ARCH_WANT_SYS_RT_SIGSUSPEND
-#define __ARCH_WANT_SYS_TIME
-#define __ARCH_WANT_COMPAT_SYS_TIME
-#endif	/* __NO_STUBS */
-
-#ifdef __KERNEL__
-
-#ifndef COMPILE_OFFSETS
-#include <asm/asm-offsets.h>
-#define NR_syscalls (__NR_syscall_max + 1)
-#endif
-
-/*
- * "Conditional" syscalls
- *
- * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
- * but it doesn't work on all toolchains, so we just do it by hand
- */
-#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
-#endif	/* __KERNEL__ */
-
-#endif /* _ASM_X86_UNISTD_64_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8baca3c..8c473d9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,8 @@ obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-y			+= probe_roms.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
+obj-y			+= syscall_$(BITS).o
+obj-$(CONFIG_X86_64)	+= vsyscall_64.o
 obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 395a10e..85d98ab 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -3,6 +3,11 @@
 #include <linux/lguest.h>
 #include "../../../drivers/lguest/lg.h"
 
+#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_32.h>
+};
+
 /* workaround for a warning with -Wmissing-prototypes */
 void foo(void);
 
@@ -76,4 +81,7 @@ void foo(void)
 	OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
 	OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
 #endif
+	BLANK();
+	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+	DEFINE(NR_syscalls, sizeof(syscalls));
 }
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72a119..834e897 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,11 +1,12 @@
 #include <asm/ia32.h>
 
-#define __NO_STUBS 1
-#undef __SYSCALL
-#undef _ASM_X86_UNISTD_64_H
-#define __SYSCALL(nr, sym) [nr] = 1,
-static char syscalls[] = {
-#include <asm/unistd.h>
+#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+static char syscalls_64[] = {
+#include <asm/syscalls_64.h>
+};
+#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+static char syscalls_ia32[] = {
+#include <asm/syscalls_32.h>
 };
 
 int main(void)
@@ -72,7 +73,11 @@ int main(void)
 	OFFSET(TSS_ist, tss_struct, x86_tss.ist);
 	BLANK();
 
-	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+	DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
+	DEFINE(NR_syscalls, sizeof(syscalls_64));
+
+	DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1);
+	DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
 
 	return 0;
 }
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f3f6f53..1ffcda2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -81,8 +81,6 @@
  * enough to patch inline, increasing performance.
  */
 
-#define nr_syscalls ((syscall_table_size)/4)
-
 #ifdef CONFIG_PREEMPT
 #define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
 #else
@@ -423,7 +421,7 @@ sysenter_past_esp:
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz sysenter_audit
 sysenter_do_call:
-	cmpl $(nr_syscalls), %eax
+	cmpl $(NR_syscalls), %eax
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
 	movl %eax,PT_EAX(%esp)
@@ -504,7 +502,7 @@ ENTRY(system_call)
 					# system call tracing in operation / emulation
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz syscall_trace_entry
-	cmpl $(nr_syscalls), %eax
+	cmpl $(NR_syscalls), %eax
 	jae syscall_badsys
 syscall_call:
 	call *sys_call_table(,%eax,4)
@@ -650,7 +648,7 @@ syscall_trace_entry:
 	movl %esp, %eax
 	call syscall_trace_enter
 	/* What it returned is what we'll actually use.  */
-	cmpl $(nr_syscalls), %eax
+	cmpl $(NR_syscalls), %eax
 	jnae syscall_call
 	jmp syscall_exit
 END(syscall_trace_entry)
@@ -690,29 +688,28 @@ END(syscall_badsys)
  * System calls that need a pt_regs pointer.
  */
 #define PTREGSCALL0(name) \
-	ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ;  \
 	leal 4(%esp),%eax; \
-	jmp sys_##name;
+	jmp sys_##name; \
+ENDPROC(ptregs_##name)
 
 #define PTREGSCALL1(name) \
-	ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
 	leal 4(%esp),%edx; \
 	movl (PT_EBX+4)(%esp),%eax; \
-	jmp sys_##name;
+	jmp sys_##name; \
+ENDPROC(ptregs_##name)
 
 #define PTREGSCALL2(name) \
-	ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
 	leal 4(%esp),%ecx; \
 	movl (PT_ECX+4)(%esp),%edx; \
 	movl (PT_EBX+4)(%esp),%eax; \
-	jmp sys_##name;
+	jmp sys_##name; \
+ENDPROC(ptregs_##name)
 
 #define PTREGSCALL3(name) \
-	ALIGN; \
-ptregs_##name: \
+ENTRY(ptregs_##name) ; \
 	CFI_STARTPROC; \
 	leal 4(%esp),%eax; \
 	pushl_cfi %eax; \
@@ -737,8 +734,7 @@ PTREGSCALL2(vm86)
 PTREGSCALL1(vm86old)
 
 /* Clone is an oddball.  The 4th arg is in %edi */
-	ALIGN;
-ptregs_clone:
+ENTRY(ptregs_clone)
 	CFI_STARTPROC
 	leal 4(%esp),%eax
 	pushl_cfi %eax
@@ -1209,11 +1205,6 @@ return_to_handler:
 	jmp *%ecx
 #endif
 
-.section .rodata,"a"
-#include "syscall_table_32.S"
-
-syscall_table_size=(.-sys_call_table)
-
 /*
  * Some functions should be protected against kprobes
  */
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
new file mode 100644
index 0000000..b37a573
--- /dev/null
+++ b/arch/x86/kernel/syscall_32.c
@@ -0,0 +1,25 @@
+/* System call table for i386. */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <asm/asm-offsets.h>
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_32.h>
+#undef __SYSCALL_I386
+
+#define __SYSCALL_I386(nr, sym, compat) [nr] = sym,
+
+typedef asmlinkage void (*sys_call_ptr_t)(void);
+
+extern asmlinkage void sys_ni_syscall(void);
+
+const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
+	/*
+	 * Smells like a like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
+	[0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_32.h>
+};
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 0edfafa..7ac7943 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -5,15 +5,11 @@
 #include <linux/cache.h>
 #include <asm/asm-offsets.h>
 
-#define __NO_STUBS
+#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_64.h>
+#undef __SYSCALL_64
 
-#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_UNISTD_64_H
-#include <asm/unistd_64.h>
-
-#undef __SYSCALL
-#define __SYSCALL(nr, sym) [nr] = sym,
-#undef _ASM_X86_UNISTD_64_H
+#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
 
 typedef void (*sys_call_ptr_t)(void);
 
@@ -25,5 +21,5 @@ const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 	 * when the & below is removed.
 	 */
 	[0 ... __NR_syscall_max] = &sys_ni_syscall,
-#include <asm/unistd_64.h>
+#include <asm/syscalls_64.h>
 };
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
deleted file mode 100644
index 9a0e312..0000000
--- a/arch/x86/kernel/syscall_table_32.S
+++ /dev/null
@@ -1,350 +0,0 @@
-ENTRY(sys_call_table)
-	.long sys_restart_syscall	/* 0 - old "setup()" system call, used for restarting */
-	.long sys_exit
-	.long ptregs_fork
-	.long sys_read
-	.long sys_write
-	.long sys_open		/* 5 */
-	.long sys_close
-	.long sys_waitpid
-	.long sys_creat
-	.long sys_link
-	.long sys_unlink	/* 10 */
-	.long ptregs_execve
-	.long sys_chdir
-	.long sys_time
-	.long sys_mknod
-	.long sys_chmod		/* 15 */
-	.long sys_lchown16
-	.long sys_ni_syscall	/* old break syscall holder */
-	.long sys_stat
-	.long sys_lseek
-	.long sys_getpid	/* 20 */
-	.long sys_mount
-	.long sys_oldumount
-	.long sys_setuid16
-	.long sys_getuid16
-	.long sys_stime		/* 25 */
-	.long sys_ptrace
-	.long sys_alarm
-	.long sys_fstat
-	.long sys_pause
-	.long sys_utime		/* 30 */
-	.long sys_ni_syscall	/* old stty syscall holder */
-	.long sys_ni_syscall	/* old gtty syscall holder */
-	.long sys_access
-	.long sys_nice
-	.long sys_ni_syscall	/* 35 - old ftime syscall holder */
-	.long sys_sync
-	.long sys_kill
-	.long sys_rename
-	.long sys_mkdir
-	.long sys_rmdir		/* 40 */
-	.long sys_dup
-	.long sys_pipe
-	.long sys_times
-	.long sys_ni_syscall	/* old prof syscall holder */
-	.long sys_brk		/* 45 */
-	.long sys_setgid16
-	.long sys_getgid16
-	.long sys_signal
-	.long sys_geteuid16
-	.long sys_getegid16	/* 50 */
-	.long sys_acct
-	.long sys_umount	/* recycled never used phys() */
-	.long sys_ni_syscall	/* old lock syscall holder */
-	.long sys_ioctl
-	.long sys_fcntl		/* 55 */
-	.long sys_ni_syscall	/* old mpx syscall holder */
-	.long sys_setpgid
-	.long sys_ni_syscall	/* old ulimit syscall holder */
-	.long sys_olduname
-	.long sys_umask		/* 60 */
-	.long sys_chroot
-	.long sys_ustat
-	.long sys_dup2
-	.long sys_getppid
-	.long sys_getpgrp	/* 65 */
-	.long sys_setsid
-	.long sys_sigaction
-	.long sys_sgetmask
-	.long sys_ssetmask
-	.long sys_setreuid16	/* 70 */
-	.long sys_setregid16
-	.long sys_sigsuspend
-	.long sys_sigpending
-	.long sys_sethostname
-	.long sys_setrlimit	/* 75 */
-	.long sys_old_getrlimit
-	.long sys_getrusage
-	.long sys_gettimeofday
-	.long sys_settimeofday
-	.long sys_getgroups16	/* 80 */
-	.long sys_setgroups16
-	.long sys_old_select
-	.long sys_symlink
-	.long sys_lstat
-	.long sys_readlink	/* 85 */
-	.long sys_uselib
-	.long sys_swapon
-	.long sys_reboot
-	.long sys_old_readdir
-	.long sys_old_mmap	/* 90 */
-	.long sys_munmap
-	.long sys_truncate
-	.long sys_ftruncate
-	.long sys_fchmod
-	.long sys_fchown16	/* 95 */
-	.long sys_getpriority
-	.long sys_setpriority
-	.long sys_ni_syscall	/* old profil syscall holder */
-	.long sys_statfs
-	.long sys_fstatfs	/* 100 */
-	.long sys_ioperm
-	.long sys_socketcall
-	.long sys_syslog
-	.long sys_setitimer
-	.long sys_getitimer	/* 105 */
-	.long sys_newstat
-	.long sys_newlstat
-	.long sys_newfstat
-	.long sys_uname
-	.long ptregs_iopl	/* 110 */
-	.long sys_vhangup
-	.long sys_ni_syscall	/* old "idle" system call */
-	.long ptregs_vm86old
-	.long sys_wait4
-	.long sys_swapoff	/* 115 */
-	.long sys_sysinfo
-	.long sys_ipc
-	.long sys_fsync
-	.long ptregs_sigreturn
-	.long ptregs_clone	/* 120 */
-	.long sys_setdomainname
-	.long sys_newuname
-	.long sys_modify_ldt
-	.long sys_adjtimex
-	.long sys_mprotect	/* 125 */
-	.long sys_sigprocmask
-	.long sys_ni_syscall	/* old "create_module" */
-	.long sys_init_module
-	.long sys_delete_module
-	.long sys_ni_syscall	/* 130:	old "get_kernel_syms" */
-	.long sys_quotactl
-	.long sys_getpgid
-	.long sys_fchdir
-	.long sys_bdflush
-	.long sys_sysfs		/* 135 */
-	.long sys_personality
-	.long sys_ni_syscall	/* reserved for afs_syscall */
-	.long sys_setfsuid16
-	.long sys_setfsgid16
-	.long sys_llseek	/* 140 */
-	.long sys_getdents
-	.long sys_select
-	.long sys_flock
-	.long sys_msync
-	.long sys_readv		/* 145 */
-	.long sys_writev
-	.long sys_getsid
-	.long sys_fdatasync
-	.long sys_sysctl
-	.long sys_mlock		/* 150 */
-	.long sys_munlock
-	.long sys_mlockall
-	.long sys_munlockall
-	.long sys_sched_setparam
-	.long sys_sched_getparam   /* 155 */
-	.long sys_sched_setscheduler
-	.long sys_sched_getscheduler
-	.long sys_sched_yield
-	.long sys_sched_get_priority_max
-	.long sys_sched_get_priority_min  /* 160 */
-	.long sys_sched_rr_get_interval
-	.long sys_nanosleep
-	.long sys_mremap
-	.long sys_setresuid16
-	.long sys_getresuid16	/* 165 */
-	.long ptregs_vm86
-	.long sys_ni_syscall	/* Old sys_query_module */
-	.long sys_poll
-	.long sys_ni_syscall	/* Old nfsservctl */
-	.long sys_setresgid16	/* 170 */
-	.long sys_getresgid16
-	.long sys_prctl
-	.long ptregs_rt_sigreturn
-	.long sys_rt_sigaction
-	.long sys_rt_sigprocmask	/* 175 */
-	.long sys_rt_sigpending
-	.long sys_rt_sigtimedwait
-	.long sys_rt_sigqueueinfo
-	.long sys_rt_sigsuspend
-	.long sys_pread64	/* 180 */
-	.long sys_pwrite64
-	.long sys_chown16
-	.long sys_getcwd
-	.long sys_capget
-	.long sys_capset	/* 185 */
-	.long ptregs_sigaltstack
-	.long sys_sendfile
-	.long sys_ni_syscall	/* reserved for streams1 */
-	.long sys_ni_syscall	/* reserved for streams2 */
-	.long ptregs_vfork	/* 190 */
-	.long sys_getrlimit
-	.long sys_mmap_pgoff
-	.long sys_truncate64
-	.long sys_ftruncate64
-	.long sys_stat64	/* 195 */
-	.long sys_lstat64
-	.long sys_fstat64
-	.long sys_lchown
-	.long sys_getuid
-	.long sys_getgid	/* 200 */
-	.long sys_geteuid
-	.long sys_getegid
-	.long sys_setreuid
-	.long sys_setregid
-	.long sys_getgroups	/* 205 */
-	.long sys_setgroups
-	.long sys_fchown
-	.long sys_setresuid
-	.long sys_getresuid
-	.long sys_setresgid	/* 210 */
-	.long sys_getresgid
-	.long sys_chown
-	.long sys_setuid
-	.long sys_setgid
-	.long sys_setfsuid	/* 215 */
-	.long sys_setfsgid
-	.long sys_pivot_root
-	.long sys_mincore
-	.long sys_madvise
-	.long sys_getdents64	/* 220 */
-	.long sys_fcntl64
-	.long sys_ni_syscall	/* reserved for TUX */
-	.long sys_ni_syscall
-	.long sys_gettid
-	.long sys_readahead	/* 225 */
-	.long sys_setxattr
-	.long sys_lsetxattr
-	.long sys_fsetxattr
-	.long sys_getxattr
-	.long sys_lgetxattr	/* 230 */
-	.long sys_fgetxattr
-	.long sys_listxattr
-	.long sys_llistxattr
-	.long sys_flistxattr
-	.long sys_removexattr	/* 235 */
-	.long sys_lremovexattr
-	.long sys_fremovexattr
-	.long sys_tkill
-	.long sys_sendfile64
-	.long sys_futex		/* 240 */
-	.long sys_sched_setaffinity
-	.long sys_sched_getaffinity
-	.long sys_set_thread_area
-	.long sys_get_thread_area
-	.long sys_io_setup	/* 245 */
-	.long sys_io_destroy
-	.long sys_io_getevents
-	.long sys_io_submit
-	.long sys_io_cancel
-	.long sys_fadvise64	/* 250 */
-	.long sys_ni_syscall
-	.long sys_exit_group
-	.long sys_lookup_dcookie
-	.long sys_epoll_create
-	.long sys_epoll_ctl	/* 255 */
-	.long sys_epoll_wait
- 	.long sys_remap_file_pages
- 	.long sys_set_tid_address
- 	.long sys_timer_create
- 	.long sys_timer_settime		/* 260 */
- 	.long sys_timer_gettime
- 	.long sys_timer_getoverrun
- 	.long sys_timer_delete
- 	.long sys_clock_settime
- 	.long sys_clock_gettime		/* 265 */
- 	.long sys_clock_getres
- 	.long sys_clock_nanosleep
-	.long sys_statfs64
-	.long sys_fstatfs64
-	.long sys_tgkill	/* 270 */
-	.long sys_utimes
- 	.long sys_fadvise64_64
-	.long sys_ni_syscall	/* sys_vserver */
-	.long sys_mbind
-	.long sys_get_mempolicy
-	.long sys_set_mempolicy
-	.long sys_mq_open
-	.long sys_mq_unlink
-	.long sys_mq_timedsend
-	.long sys_mq_timedreceive	/* 280 */
-	.long sys_mq_notify
-	.long sys_mq_getsetattr
-	.long sys_kexec_load
-	.long sys_waitid
-	.long sys_ni_syscall		/* 285 */ /* available */
-	.long sys_add_key
-	.long sys_request_key
-	.long sys_keyctl
-	.long sys_ioprio_set
-	.long sys_ioprio_get		/* 290 */
-	.long sys_inotify_init
-	.long sys_inotify_add_watch
-	.long sys_inotify_rm_watch
-	.long sys_migrate_pages
-	.long sys_openat		/* 295 */
-	.long sys_mkdirat
-	.long sys_mknodat
-	.long sys_fchownat
-	.long sys_futimesat
-	.long sys_fstatat64		/* 300 */
-	.long sys_unlinkat
-	.long sys_renameat
-	.long sys_linkat
-	.long sys_symlinkat
-	.long sys_readlinkat		/* 305 */
-	.long sys_fchmodat
-	.long sys_faccessat
-	.long sys_pselect6
-	.long sys_ppoll
-	.long sys_unshare		/* 310 */
-	.long sys_set_robust_list
-	.long sys_get_robust_list
-	.long sys_splice
-	.long sys_sync_file_range
-	.long sys_tee			/* 315 */
-	.long sys_vmsplice
-	.long sys_move_pages
-	.long sys_getcpu
-	.long sys_epoll_pwait
-	.long sys_utimensat		/* 320 */
-	.long sys_signalfd
-	.long sys_timerfd_create
-	.long sys_eventfd
-	.long sys_fallocate
-	.long sys_timerfd_settime	/* 325 */
-	.long sys_timerfd_gettime
-	.long sys_signalfd4
-	.long sys_eventfd2
-	.long sys_epoll_create1
-	.long sys_dup3			/* 330 */
-	.long sys_pipe2
-	.long sys_inotify_init1
-	.long sys_preadv
-	.long sys_pwritev
-	.long sys_rt_tgsigqueueinfo	/* 335 */
-	.long sys_perf_event_open
-	.long sys_recvmmsg
-	.long sys_fanotify_init
-	.long sys_fanotify_mark
-	.long sys_prlimit64		/* 340 */
-	.long sys_name_to_handle_at
-	.long sys_open_by_handle_at
-	.long sys_clock_adjtime
-	.long sys_syncfs
-	.long sys_sendmmsg		/* 345 */
-	.long sys_setns
-	.long sys_process_vm_readv
-	.long sys_process_vm_writev
-- 
cgit v1.1


From f14525f9e033f344996905744f41680ea2b877ce Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 18 Nov 2011 16:03:27 -0800
Subject: x86: Simplify syscallhdr.sh

Simplify syscallhdr.sh by letting grep sort out the ABIs that we want,
rather than relying on manual list matching.  This is safe since the
ABI strings already have to consist only of characters which are valid in C
macro names.

Suggested-by: Matt Helsley <matthltc@us.ibm.com>
Link: http://lkml.kernel.org/r/20111118221558.GA6408@count0.beaverton.ibm.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/syscalls/syscallhdr.sh | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh
index 0d473ff..b3c5930 100644
--- a/arch/x86/syscalls/syscallhdr.sh
+++ b/arch/x86/syscalls/syscallhdr.sh
@@ -2,33 +2,20 @@
 
 in="$1"
 out="$2"
-my_abis=`echo "$3" | tr ',' ' '`
+my_abis=`echo "($3)" | tr ',' '|'`
 prefix="$4"
 offset="$5"
 
 fileguard=_ASM_X86_`basename "$out" | sed \
     -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \
     -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'`
-
-in_list () {
-    local x
-    for x in $1; do
-	if [ x"$x" = x"$2" ]; then
-	    return 0
-	fi
-    done
-    return 1
-}
-
-grep '^[0-9]' "$in" | sort -n | (
+grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
     echo "#ifndef ${fileguard}"
     echo "#define ${fileguard} 1"
     echo ""
 
     while read nr abi name entry ; do
-	if in_list "$my_abis" "$abi"; then
-	    echo "#define __NR_${prefix}${name}" $((nr+offset))
-        fi
+	echo "#define __NR_${prefix}${name}" $((nr+offset))
     done
 
     echo ""
-- 
cgit v1.1


From 61f1e7e20874e8f11dab69b6a4bf7616badd4fe8 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 18 Nov 2011 16:25:07 -0800
Subject: x86, syscall: Re-fix typo in comment

Fix the same typo as was fixed in:

b7641d2c x86-64, syscall: Adjust comment spacing and remove typo

... for the new versions of this file (32-bit and IA32 compat).

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Link: http://lkml.kernel.org/r/1321569446-20433-4-git-send-email-hpa@linux.intel.com
---
 arch/x86/ia32/syscall_ia32.c | 2 +-
 arch/x86/kernel/syscall_32.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
index d04d3db..4754ba0 100644
--- a/arch/x86/ia32/syscall_ia32.c
+++ b/arch/x86/ia32/syscall_ia32.c
@@ -17,7 +17,7 @@ extern void compat_ni_syscall(void);
 
 const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
 	/*
-	 * Smells like a like a compiler bug -- it doesn't work
+	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
 	 */
 	[0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
index b37a573..147fcd4 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/kernel/syscall_32.c
@@ -17,7 +17,7 @@ extern asmlinkage void sys_ni_syscall(void);
 
 const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 	/*
-	 * Smells like a like a compiler bug -- it doesn't work
+	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
 	 */
 	[0 ... __NR_syscall_max] = &sys_ni_syscall,
-- 
cgit v1.1


From 3f86886c72fb68088162c7e08cc7f85282f1860c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Fri, 18 Nov 2011 17:01:19 -0800
Subject: x86, syscall: Allow syscall offset to be symbolic

Allow the specified syscall offset to be symbolic, e.g. a macro.  For
offset system calls, this if nothing else makes the generated code
easier to read.

Suggested-by: H. J. Lu <hjl.tools@gmail.com>
Link: http://lkml.kernel.org/r/1321569446-20433-7-git-send-email-hpa@linux.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/syscalls/syscallhdr.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh
index b3c5930..31fd5f1 100644
--- a/arch/x86/syscalls/syscallhdr.sh
+++ b/arch/x86/syscalls/syscallhdr.sh
@@ -15,7 +15,11 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
     echo ""
 
     while read nr abi name entry ; do
-	echo "#define __NR_${prefix}${name}" $((nr+offset))
+	if [ -z "$offset" ]; then
+	    echo "#define __NR_${prefix}${name} $nr"
+	else
+	    echo "#define __NR_${prefix}${name} ($offset + $nr)"
+        fi
     done
 
     echo ""
-- 
cgit v1.1


From 937c30d7f560210b0163035edd42b2aef78fed9e Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Wed, 9 Nov 2011 16:26:25 +0200
Subject: crypto: serpent - add 8-way parallel x86_64/SSE2 assembler
 implementation

Patch adds x86_64/SSE2 assembler implementation of serpent cipher. Assembler
functions crypt data in eigth block chunks (two 4 block chunk SSE2 operations
in parallel to improve performance on out-of-order CPUs). Glue code is based
on one from AES-NI implementation, so requests from irq context are redirected
to cryptd.

v2:
 - add missing include of linux/module.h
   (appearently crypto.h used to include module.h, which changed for 3.2 by
    commit 7c926402a7e8c9b279968fd94efec8700ba3859e)

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios):

AMD Phenom II 1055T (fam:16, model:10):

size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     1.03x   1.01x   1.03x   1.05x   1.00x   0.99x
64B     1.00x   1.01x   1.02x   1.04x   1.02x   1.01x
256B    2.34x   2.41x   0.99x   2.43x   2.39x   2.40x
1024B   2.51x   2.57x   1.00x   2.59x   2.56x   2.56x
8192B   2.50x   2.54x   1.00x   2.55x   2.57x   2.57x

Intel Celeron T1600 (fam:6, model:15, step:13):

size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.97x   0.97x   1.01x   1.01x   1.01x   1.02x
64B     1.00x   1.00x   1.00x   1.02x   1.01x   1.01x
256B    3.41x   3.35x   1.00x   3.39x   3.42x   3.44x
1024B   3.75x   3.72x   0.99x   3.74x   3.75x   3.75x
8192B   3.70x   3.68x   0.99x   3.68x   3.69x   3.69x

Full output:
 http://koti.mbnet.fi/axh/kernel/crypto/phenom-ii-1055t/serpent-generic.txt
 http://koti.mbnet.fi/axh/kernel/crypto/phenom-ii-1055t/serpent-sse2.txt
 http://koti.mbnet.fi/axh/kernel/crypto/celeron-t1600/serpent-generic.txt
 http://koti.mbnet.fi/axh/kernel/crypto/celeron-t1600/serpent-sse2.txt

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile                     |   2 +
 arch/x86/crypto/serpent-sse2-x86_64-asm_64.S | 761 +++++++++++++++++++++++++++
 arch/x86/crypto/serpent_sse2_glue.c          | 719 +++++++++++++++++++++++++
 arch/x86/include/asm/serpent.h               |  32 ++
 4 files changed, 1514 insertions(+)
 create mode 100644 arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
 create mode 100644 arch/x86/crypto/serpent_sse2_glue.c
 create mode 100644 arch/x86/include/asm/serpent.h

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3537d4b..12ebdbd 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
@@ -26,6 +27,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
+serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
new file mode 100644
index 0000000..7f24a15
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -0,0 +1,761 @@
+/*
+ * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on crypto/serpent.c by
+ *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
+ *                2003 Herbert Valerio Riedel <hvr@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+.file "serpent-sse2-x86_64-asm_64.S"
+.text
+
+#define CTX %rdi
+
+/**********************************************************************
+  8-way SSE2 serpent
+ **********************************************************************/
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+#define RE1 %xmm4
+
+#define RA2 %xmm5
+#define RB2 %xmm6
+#define RC2 %xmm7
+#define RD2 %xmm8
+#define RE2 %xmm9
+
+#define RNOT %xmm10
+
+#define RK0 %xmm11
+#define RK1 %xmm12
+#define RK2 %xmm13
+#define RK3 %xmm14
+
+#define S0_1(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	por x0,			x3; \
+	pxor x4,		x0; \
+	pxor x2,		x4; \
+	pxor RNOT,		x4; \
+	pxor x1,		x3; \
+	pand x0,		x1; \
+	pxor x4,		x1; \
+	pxor x0,		x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+	pxor x3,		x0; \
+	por x0,			x4; \
+	pxor x2,		x0; \
+	pand x1,		x2; \
+	pxor x2,		x3; \
+	pxor RNOT,		x1; \
+	pxor x4,		x2; \
+	pxor x2,		x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	pxor x0,		x1; \
+	pxor x3,		x0; \
+	pxor RNOT,		x3; \
+	pand x1,		x4; \
+	por x1,			x0; \
+	pxor x2,		x3; \
+	pxor x3,		x0; \
+	pxor x3,		x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+	pxor x4,		x3; \
+	por x4,			x1; \
+	pxor x2,		x4; \
+	pand x0,		x2; \
+	pxor x1,		x2; \
+	por x0,			x1; \
+	pxor RNOT,		x0; \
+	pxor x2,		x0; \
+	pxor x1,		x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+	pxor RNOT,		x3; \
+	pxor x0,		x1; \
+	movdqa x0,		x4; \
+	pand x2,		x0; \
+	pxor x3,		x0; \
+	por x4,			x3; \
+	pxor x1,		x2; \
+	pxor x1,		x3; \
+	pand x0,		x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+	pxor x2,		x0; \
+	pand x3,		x2; \
+	por x1,			x3; \
+	pxor RNOT,		x0; \
+	pxor x0,		x3; \
+	pxor x0,		x4; \
+	pxor x2,		x0; \
+	por x2,			x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	pxor x3,		x1; \
+	por x0,			x3; \
+	pand x0,		x4; \
+	pxor x2,		x0; \
+	pxor x1,		x2; \
+	pand x3,		x1; \
+	pxor x3,		x2; \
+	por x4,			x0; \
+	pxor x3,		x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+	pxor x0,		x1; \
+	pand x3,		x0; \
+	pand x4,		x3; \
+	pxor x2,		x3; \
+	por x1,			x4; \
+	pand x1,		x2; \
+	pxor x3,		x4; \
+	pxor x3,		x0; \
+	pxor x2,		x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	pand x0,		x3; \
+	pxor x4,		x0; \
+	pxor x2,		x3; \
+	por x4,			x2; \
+	pxor x1,		x0; \
+	pxor x3,		x4; \
+	por x0,			x2; \
+	pxor x1,		x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+	pand x0,		x1; \
+	pxor x4,		x1; \
+	pand x2,		x4; \
+	pxor x3,		x2; \
+	pxor x0,		x4; \
+	por x1,			x3; \
+	pxor RNOT,		x1; \
+	pxor x0,		x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	por x0,			x1; \
+	pxor x1,		x2; \
+	pxor RNOT,		x3; \
+	pxor x0,		x4; \
+	pxor x2,		x0; \
+	pand x4,		x1; \
+	por x3,			x4; \
+	pxor x0,		x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+	pand x3,		x0; \
+	pxor x3,		x1; \
+	pxor x2,		x3; \
+	pxor x1,		x0; \
+	pand x4,		x2; \
+	pxor x2,		x1; \
+	pand x0,		x2; \
+	pxor x2,		x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	pxor x0,		x3; \
+	pxor x2,		x1; \
+	pxor x0,		x2; \
+	pand x3,		x0; \
+	por x3,			x1; \
+	pxor RNOT,		x4; \
+	pxor x1,		x0; \
+	pxor x2,		x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+	pxor x4,		x3; \
+	pxor x0,		x4; \
+	pand x0,		x2; \
+	pxor x1,		x4; \
+	pxor x3,		x2; \
+	pand x1,		x3; \
+	pxor x0,		x3; \
+	pxor x2,		x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+	pxor RNOT,		x1; \
+	movdqa x1,		x4; \
+	pxor RNOT,		x0; \
+	pand x2,		x1; \
+	pxor x3,		x1; \
+	por x4,			x3; \
+	pxor x2,		x4; \
+	pxor x3,		x2; \
+	pxor x0,		x3; \
+	por x1,			x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+	pand x0,		x2; \
+	pxor x4,		x0; \
+	pxor x3,		x4; \
+	pand x0,		x3; \
+	pxor x1,		x4; \
+	pxor x4,		x2; \
+	pxor x1,		x3; \
+	por x0,			x4; \
+	pxor x1,		x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	pxor x0,		x1; \
+	por x1,			x3; \
+	pxor x1,		x4; \
+	pxor RNOT,		x0; \
+	pxor x3,		x2; \
+	pxor x0,		x3; \
+	pand x1,		x0; \
+	pxor x2,		x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+	pand x3,		x2; \
+	pxor x4,		x3; \
+	pxor x3,		x2; \
+	pxor x3,		x1; \
+	pand x0,		x3; \
+	pxor x0,		x1; \
+	pxor x2,		x0; \
+	pxor x3,		x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+	pxor x3,		x1; \
+	movdqa x0,		x4; \
+	pxor x2,		x0; \
+	pxor RNOT,		x2; \
+	por x1,			x4; \
+	pxor x3,		x4; \
+	pand x1,		x3; \
+	pxor x2,		x1; \
+	pand x4,		x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+	pxor x1,		x4; \
+	por x3,			x1; \
+	pxor x0,		x3; \
+	pxor x0,		x2; \
+	por x4,			x0; \
+	pxor x4,		x2; \
+	pxor x0,		x1; \
+	pxor x1,		x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+	pxor x1,		x2; \
+	movdqa x3,		x4; \
+	pxor RNOT,		x3; \
+	por x2,			x3; \
+	pxor x4,		x2; \
+	pxor x0,		x4; \
+	pxor x1,		x3; \
+	por x2,			x1; \
+	pxor x0,		x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+	pxor x4,		x1; \
+	por x3,			x4; \
+	pxor x3,		x2; \
+	pxor x2,		x4; \
+	pand x1,		x2; \
+	pxor x3,		x2; \
+	pxor x4,		x3; \
+	pxor x0,		x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+	pxor x1,		x2; \
+	movdqa x1,		x4; \
+	pand x2,		x1; \
+	pxor x0,		x1; \
+	por x4,			x0; \
+	pxor x3,		x4; \
+	pxor x3,		x0; \
+	por x1,			x3; \
+	pxor x2,		x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+	pxor x3,		x1; \
+	pxor x2,		x0; \
+	pxor x3,		x2; \
+	pand x1,		x3; \
+	pxor x0,		x1; \
+	pand x2,		x0; \
+	pxor x3,		x4; \
+	pxor x0,		x3; \
+	pxor x1,		x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+	pxor x3,		x2; \
+	movdqa x0,		x4; \
+	pand x1,		x0; \
+	pxor x2,		x0; \
+	por x3,			x2; \
+	pxor RNOT,		x4; \
+	pxor x0,		x1; \
+	pxor x2,		x0; \
+	pand x4,		x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+	pxor x0,		x2; \
+	por x4,			x0; \
+	pxor x3,		x0; \
+	pand x2,		x3; \
+	pxor x3,		x4; \
+	pxor x1,		x3; \
+	pand x0,		x1; \
+	pxor x1,		x4; \
+	pxor x3,		x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	por x2,			x1; \
+	pxor x4,		x2; \
+	pxor x3,		x1; \
+	pand x4,		x3; \
+	pxor x3,		x2; \
+	por x0,			x3; \
+	pxor RNOT,		x0; \
+	pxor x2,		x3; \
+	por x0,			x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+	pxor x1,		x4; \
+	pxor x4,		x2; \
+	pand x0,		x4; \
+	pxor x1,		x0; \
+	pxor x3,		x1; \
+	pand x2,		x0; \
+	pxor x3,		x2; \
+	pxor x2,		x0; \
+	pxor x4,		x2; \
+	pxor x3,		x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+	pxor x2,		x0; \
+	movdqa x0,		x4; \
+	pand x3,		x0; \
+	pxor x3,		x2; \
+	pxor x2,		x0; \
+	pxor x1,		x3; \
+	por x4,			x2; \
+	pxor x3,		x2; \
+	pand x0,		x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+	pxor RNOT,		x0; \
+	pxor x1,		x3; \
+	pand x2,		x1; \
+	pxor x0,		x4; \
+	pxor x4,		x3; \
+	pxor x2,		x4; \
+	pxor x1,		x0; \
+	pxor x0,		x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	pand x0,		x3; \
+	pxor x2,		x0; \
+	por x4,			x2; \
+	pxor x1,		x4; \
+	pxor RNOT,		x0; \
+	por x3,			x1; \
+	pxor x0,		x4; \
+	pand x2,		x0; \
+	pxor x1,		x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+	pand x2,		x1; \
+	pxor x2,		x3; \
+	pxor x3,		x4; \
+	pand x3,		x2; \
+	por x0,			x3; \
+	pxor x4,		x1; \
+	pxor x4,		x3; \
+	pand x0,		x4; \
+	pxor x2,		x4;
+
+#define get_key(i, j, t) \
+	movd (4*(i)+(j))*4(CTX), t; \
+	pshufd $0, t, t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, RK0); \
+	get_key(i, 1, RK1); \
+	get_key(i, 2, RK2); \
+	get_key(i, 3, RK3); \
+	pxor RK0,		x0 ## 1; \
+	pxor RK1,		x1 ## 1; \
+	pxor RK2,		x2 ## 1; \
+	pxor RK3,		x3 ## 1; \
+		pxor RK0,		x0 ## 2; \
+		pxor RK1,		x1 ## 2; \
+		pxor RK2,		x2 ## 2; \
+		pxor RK3,		x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+	movdqa x0 ## 1,		x4 ## 1; \
+	pslld $13,		x0 ## 1; \
+	psrld $(32 - 13),	x4 ## 1; \
+	por x4 ## 1,		x0 ## 1; \
+	pxor x0 ## 1,		x1 ## 1; \
+	movdqa x2 ## 1,		x4 ## 1; \
+	pslld $3,		x2 ## 1; \
+	psrld $(32 - 3),	x4 ## 1; \
+	por x4 ## 1,		x2 ## 1; \
+	pxor x2 ## 1,		x1 ## 1; \
+		movdqa x0 ## 2,		x4 ## 2; \
+		pslld $13,		x0 ## 2; \
+		psrld $(32 - 13),	x4 ## 2; \
+		por x4 ## 2,		x0 ## 2; \
+		pxor x0 ## 2,		x1 ## 2; \
+		movdqa x2 ## 2,		x4 ## 2; \
+		pslld $3,		x2 ## 2; \
+		psrld $(32 - 3),	x4 ## 2; \
+		por x4 ## 2,		x2 ## 2; \
+		pxor x2 ## 2,		x1 ## 2; \
+	movdqa x1 ## 1,		x4 ## 1; \
+	pslld $1,		x1 ## 1; \
+	psrld $(32 - 1),	x4 ## 1; \
+	por x4 ## 1,		x1 ## 1; \
+	movdqa x0 ## 1,		x4 ## 1; \
+	pslld $3,		x4 ## 1; \
+	pxor x2 ## 1,		x3 ## 1; \
+	pxor x4 ## 1,		x3 ## 1; \
+	movdqa x3 ## 1,		x4 ## 1; \
+	get_key(i, 1, RK1); \
+		movdqa x1 ## 2,		x4 ## 2; \
+		pslld $1,		x1 ## 2; \
+		psrld $(32 - 1),	x4 ## 2; \
+		por x4 ## 2,		x1 ## 2; \
+		movdqa x0 ## 2,		x4 ## 2; \
+		pslld $3,		x4 ## 2; \
+		pxor x2 ## 2,		x3 ## 2; \
+		pxor x4 ## 2,		x3 ## 2; \
+		movdqa x3 ## 2,		x4 ## 2; \
+		get_key(i, 3, RK3); \
+	pslld $7,		x3 ## 1; \
+	psrld $(32 - 7),	x4 ## 1; \
+	por x4 ## 1,		x3 ## 1; \
+	movdqa x1 ## 1,		x4 ## 1; \
+	pslld $7,		x4 ## 1; \
+	pxor x1 ## 1,		x0 ## 1; \
+	pxor x3 ## 1,		x0 ## 1; \
+	pxor x3 ## 1,		x2 ## 1; \
+	pxor x4 ## 1,		x2 ## 1; \
+	get_key(i, 0, RK0); \
+		pslld $7,		x3 ## 2; \
+		psrld $(32 - 7),	x4 ## 2; \
+		por x4 ## 2,		x3 ## 2; \
+		movdqa x1 ## 2,		x4 ## 2; \
+		pslld $7,		x4 ## 2; \
+		pxor x1 ## 2,		x0 ## 2; \
+		pxor x3 ## 2,		x0 ## 2; \
+		pxor x3 ## 2,		x2 ## 2; \
+		pxor x4 ## 2,		x2 ## 2; \
+		get_key(i, 2, RK2); \
+	pxor RK1,		x1 ## 1; \
+	pxor RK3,		x3 ## 1; \
+	movdqa x0 ## 1,		x4 ## 1; \
+	pslld $5,		x0 ## 1; \
+	psrld $(32 - 5),	x4 ## 1; \
+	por x4 ## 1,		x0 ## 1; \
+	movdqa x2 ## 1,		x4 ## 1; \
+	pslld $22,		x2 ## 1; \
+	psrld $(32 - 22),	x4 ## 1; \
+	por x4 ## 1,		x2 ## 1; \
+	pxor RK0,		x0 ## 1; \
+	pxor RK2,		x2 ## 1; \
+		pxor RK1,		x1 ## 2; \
+		pxor RK3,		x3 ## 2; \
+		movdqa x0 ## 2,		x4 ## 2; \
+		pslld $5,		x0 ## 2; \
+		psrld $(32 - 5),	x4 ## 2; \
+		por x4 ## 2,		x0 ## 2; \
+		movdqa x2 ## 2,		x4 ## 2; \
+		pslld $22,		x2 ## 2; \
+		psrld $(32 - 22),	x4 ## 2; \
+		por x4 ## 2,		x2 ## 2; \
+		pxor RK0,		x0 ## 2; \
+		pxor RK2,		x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+	pxor RK0,		x0 ## 1; \
+	pxor RK2,		x2 ## 1; \
+	movdqa x0 ## 1,		x4 ## 1; \
+	psrld $5,		x0 ## 1; \
+	pslld $(32 - 5),	x4 ## 1; \
+	por x4 ## 1,		x0 ## 1; \
+	pxor RK3,		x3 ## 1; \
+	pxor RK1,		x1 ## 1; \
+	movdqa x2 ## 1,		x4 ## 1; \
+	psrld $22,		x2 ## 1; \
+	pslld $(32 - 22),	x4 ## 1; \
+	por x4 ## 1,		x2 ## 1; \
+	pxor x3 ## 1,		x2 ## 1; \
+		pxor RK0,		x0 ## 2; \
+		pxor RK2,		x2 ## 2; \
+		movdqa x0 ## 2,		x4 ## 2; \
+		psrld $5,		x0 ## 2; \
+		pslld $(32 - 5),	x4 ## 2; \
+		por x4 ## 2,		x0 ## 2; \
+		pxor RK3,		x3 ## 2; \
+		pxor RK1,		x1 ## 2; \
+		movdqa x2 ## 2,		x4 ## 2; \
+		psrld $22,		x2 ## 2; \
+		pslld $(32 - 22),	x4 ## 2; \
+		por x4 ## 2,		x2 ## 2; \
+		pxor x3 ## 2,		x2 ## 2; \
+	pxor x3 ## 1,		x0 ## 1; \
+	movdqa x1 ## 1,		x4 ## 1; \
+	pslld $7,		x4 ## 1; \
+	pxor x1 ## 1,		x0 ## 1; \
+	pxor x4 ## 1,		x2 ## 1; \
+	movdqa x1 ## 1,		x4 ## 1; \
+	psrld $1,		x1 ## 1; \
+	pslld $(32 - 1),	x4 ## 1; \
+	por x4 ## 1,		x1 ## 1; \
+		pxor x3 ## 2,		x0 ## 2; \
+		movdqa x1 ## 2,		x4 ## 2; \
+		pslld $7,		x4 ## 2; \
+		pxor x1 ## 2,		x0 ## 2; \
+		pxor x4 ## 2,		x2 ## 2; \
+		movdqa x1 ## 2,		x4 ## 2; \
+		psrld $1,		x1 ## 2; \
+		pslld $(32 - 1),	x4 ## 2; \
+		por x4 ## 2,		x1 ## 2; \
+	movdqa x3 ## 1,		x4 ## 1; \
+	psrld $7,		x3 ## 1; \
+	pslld $(32 - 7),	x4 ## 1; \
+	por x4 ## 1,		x3 ## 1; \
+	pxor x0 ## 1,		x1 ## 1; \
+	movdqa x0 ## 1,		x4 ## 1; \
+	pslld $3,		x4 ## 1; \
+	pxor x4 ## 1,		x3 ## 1; \
+	movdqa x0 ## 1,		x4 ## 1; \
+		movdqa x3 ## 2,		x4 ## 2; \
+		psrld $7,		x3 ## 2; \
+		pslld $(32 - 7),	x4 ## 2; \
+		por x4 ## 2,		x3 ## 2; \
+		pxor x0 ## 2,		x1 ## 2; \
+		movdqa x0 ## 2,		x4 ## 2; \
+		pslld $3,		x4 ## 2; \
+		pxor x4 ## 2,		x3 ## 2; \
+		movdqa x0 ## 2,		x4 ## 2; \
+	psrld $13,		x0 ## 1; \
+	pslld $(32 - 13),	x4 ## 1; \
+	por x4 ## 1,		x0 ## 1; \
+	pxor x2 ## 1,		x1 ## 1; \
+	pxor x2 ## 1,		x3 ## 1; \
+	movdqa x2 ## 1,		x4 ## 1; \
+	psrld $3,		x2 ## 1; \
+	pslld $(32 - 3),	x4 ## 1; \
+	por x4 ## 1,		x2 ## 1; \
+		psrld $13,		x0 ## 2; \
+		pslld $(32 - 13),	x4 ## 2; \
+		por x4 ## 2,		x0 ## 2; \
+		pxor x2 ## 2,		x1 ## 2; \
+		pxor x2 ## 2,		x3 ## 2; \
+		movdqa x2 ## 2,		x4 ## 2; \
+		psrld $3,		x2 ## 2; \
+		pslld $(32 - 3),	x4 ## 2; \
+		por x4 ## 2,		x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, RK0); \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 2, RK2); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	get_key(i, 3, RK3); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 1, RK1); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+	movdqa x2,		t3; \
+	movdqa x0,		t1; \
+	unpcklps x3,		t3; \
+	movdqa x0,		t2; \
+	unpcklps x1,		t1; \
+	unpckhps x1,		t2; \
+	movdqa t3,		x1; \
+	unpckhps x3,		x2; \
+	movdqa t1,		x0; \
+	movhlps t1,		x1; \
+	movdqa t2,		t1; \
+	movlhps t3,		x0; \
+	movlhps x2,		t1; \
+	movhlps t2,		x2; \
+	movdqa x2,		x3; \
+	movdqa t1,		x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+	movdqu (0*4*4)(in),	x0; \
+	movdqu (1*4*4)(in),	x1; \
+	movdqu (2*4*4)(in),	x2; \
+	movdqu (3*4*4)(in),	x3; \
+	\
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	movdqu x0,		(0*4*4)(out); \
+	movdqu x1,		(1*4*4)(out); \
+	movdqu x2,		(2*4*4)(out); \
+	movdqu x3,		(3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	movdqu (0*4*4)(out),	t0; \
+	pxor t0,		x0; \
+	movdqu x0,		(0*4*4)(out); \
+	movdqu (1*4*4)(out),	t0; \
+	pxor t0,		x1; \
+	movdqu x1,		(1*4*4)(out); \
+	movdqu (2*4*4)(out),	t0; \
+	pxor t0,		x2; \
+	movdqu x2,		(2*4*4)(out); \
+	movdqu (3*4*4)(out),	t0; \
+	pxor t0,		x3; \
+	movdqu x3,		(3*4*4)(out);
+
+.align 8
+.global __serpent_enc_blk_8way
+.type   __serpent_enc_blk_8way,@function;
+
+__serpent_enc_blk_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: bool, if true: xor output
+	 */
+
+	pcmpeqd RNOT, RNOT;
+
+	leaq (4*4*4)(%rdx), %rax;
+	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+						 K2(RA, RB, RC, RD, RE, 0);
+	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
+	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
+	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
+	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
+	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
+	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
+	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
+	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
+	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
+	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
+	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
+	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
+	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
+	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
+	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
+	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
+	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
+	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
+	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
+	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
+	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
+	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
+	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
+	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
+	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
+	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
+	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
+	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
+	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
+	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
+	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
+	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);
+
+	leaq (4*4*4)(%rsi), %rax;
+
+	testb %cl, %cl;
+	jnz __enc_xor8;
+
+	write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	ret;
+
+__enc_xor8:
+	xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	ret;
+
+.align 8
+.global serpent_dec_blk_8way
+.type   serpent_dec_blk_8way,@function;
+
+serpent_dec_blk_8way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	pcmpeqd RNOT, RNOT;
+
+	leaq (4*4*4)(%rdx), %rax;
+	read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+						 K2(RA, RB, RC, RD, RE, 32);
+	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
+	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
+	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
+	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
+	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
+	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
+	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
+	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
+	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
+	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
+	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
+	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
+	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
+	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
+	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
+	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
+	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
+	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
+	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
+	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
+	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
+	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
+	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
+	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
+	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
+	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
+	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
+	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
+	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
+	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
+	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
+	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);
+
+	leaq (4*4*4)(%rsi), %rax;
+	write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+	write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+	ret;
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
new file mode 100644
index 0000000..947cf57
--- /dev/null
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -0,0 +1,719 @@
+/*
+ * Glue Code for SSE2 assembler versions of Serpent Cipher
+ *
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Glue code based on aesni-intel_glue.c by:
+ *  Copyright (C) 2008, Intel Corp.
+ *    Author: Huang Ying <ying.huang@intel.com>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/serpent.h>
+#include <crypto/cryptd.h>
+#include <crypto/b128ops.h>
+#include <crypto/ctr.h>
+#include <asm/i387.h>
+#include <asm/serpent.h>
+#include <crypto/scatterwalk.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+
+struct async_serpent_ctx {
+	struct cryptd_ablkcipher *cryptd_tfm;
+};
+
+static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	if (fpu_enabled)
+		return true;
+
+	/* SSE2 is only used when chunk to be processed is large enough, so
+	 * do not enable FPU until it is necessary.
+	 */
+	if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS)
+		return false;
+
+	kernel_fpu_begin();
+	return true;
+}
+
+static inline void serpent_fpu_end(bool fpu_enabled)
+{
+	if (fpu_enabled)
+		kernel_fpu_end();
+}
+
+static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+		     bool enc)
+{
+	bool fpu_enabled = false;
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	unsigned int nbytes;
+	int err;
+
+	err = blkcipher_walk_virt(desc, walk);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk->nbytes)) {
+		u8 *wsrc = walk->src.virt.addr;
+		u8 *wdst = walk->dst.virt.addr;
+
+		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+
+		/* Process multi-block batch */
+		if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+			do {
+				if (enc)
+					serpent_enc_blk_xway(ctx, wdst, wsrc);
+				else
+					serpent_dec_blk_xway(ctx, wdst, wsrc);
+
+				wsrc += bsize * SERPENT_PARALLEL_BLOCKS;
+				wdst += bsize * SERPENT_PARALLEL_BLOCKS;
+				nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+			} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+
+		/* Handle leftovers */
+		do {
+			if (enc)
+				__serpent_encrypt(ctx, wdst, wsrc);
+			else
+				__serpent_decrypt(ctx, wdst, wsrc);
+
+			wsrc += bsize;
+			wdst += bsize;
+			nbytes -= bsize;
+		} while (nbytes >= bsize);
+
+done:
+		err = blkcipher_walk_done(desc, walk, nbytes);
+	}
+
+	serpent_fpu_end(fpu_enabled);
+	return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_crypt(desc, &walk, true);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_crypt(desc, &walk, false);
+}
+
+static struct crypto_alg blk_ecb_alg = {
+	.cra_name		= "__ecb-serpent-sse2",
+	.cra_driver_name	= "__driver-ecb-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_ecb_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+};
+
+static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 *iv = (u128 *)walk->iv;
+
+	do {
+		u128_xor(dst, src, iv);
+		__serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst);
+		iv = dst;
+
+		src += 1;
+		dst += 1;
+		nbytes -= bsize;
+	} while (nbytes >= bsize);
+
+	u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
+	return nbytes;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while ((nbytes = walk.nbytes)) {
+		nbytes = __cbc_encrypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	return err;
+}
+
+static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
+	u128 last_iv;
+	int i;
+
+	/* Start of the last block. */
+	src += nbytes / bsize - 1;
+	dst += nbytes / bsize - 1;
+
+	last_iv = *src;
+
+	/* Process multi-block batch */
+	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+		do {
+			nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1);
+			src -= SERPENT_PARALLEL_BLOCKS - 1;
+			dst -= SERPENT_PARALLEL_BLOCKS - 1;
+
+			for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
+				ivs[i] = src[i];
+
+			serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
+
+			for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
+				u128_xor(dst + (i + 1), dst + (i + 1), ivs + i);
+
+			nbytes -= bsize;
+			if (nbytes < bsize)
+				goto done;
+
+			u128_xor(dst, dst, src - 1);
+			src -= 1;
+			dst -= 1;
+		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Handle leftovers */
+	for (;;) {
+		__serpent_decrypt(ctx, (u8 *)dst, (u8 *)src);
+
+		nbytes -= bsize;
+		if (nbytes < bsize)
+			break;
+
+		u128_xor(dst, dst, src - 1);
+		src -= 1;
+		dst -= 1;
+	}
+
+done:
+	u128_xor(dst, dst, (u128 *)walk->iv);
+	*(u128 *)walk->iv = last_iv;
+
+	return nbytes;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk.nbytes)) {
+		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+		nbytes = __cbc_decrypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	serpent_fpu_end(fpu_enabled);
+	return err;
+}
+
+static struct crypto_alg blk_cbc_alg = {
+	.cra_name		= "__cbc-serpent-sse2",
+	.cra_driver_name	= "__driver-cbc-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_cbc_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+};
+
+static inline void u128_to_be128(be128 *dst, const u128 *src)
+{
+	dst->a = cpu_to_be64(src->a);
+	dst->b = cpu_to_be64(src->b);
+}
+
+static inline void be128_to_u128(u128 *dst, const be128 *src)
+{
+	dst->a = be64_to_cpu(src->a);
+	dst->b = be64_to_cpu(src->b);
+}
+
+static inline void u128_inc(u128 *i)
+{
+	i->b++;
+	if (!i->b)
+		i->a++;
+}
+
+static void ctr_crypt_final(struct blkcipher_desc *desc,
+			    struct blkcipher_walk *walk)
+{
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	u8 *ctrblk = walk->iv;
+	u8 keystream[SERPENT_BLOCK_SIZE];
+	u8 *src = walk->src.virt.addr;
+	u8 *dst = walk->dst.virt.addr;
+	unsigned int nbytes = walk->nbytes;
+
+	__serpent_encrypt(ctx, keystream, ctrblk);
+	crypto_xor(keystream, src, nbytes);
+	memcpy(dst, keystream, nbytes);
+
+	crypto_inc(ctrblk, SERPENT_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+				struct blkcipher_walk *walk)
+{
+	struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	u128 ctrblk;
+	be128 ctrblocks[SERPENT_PARALLEL_BLOCKS];
+	int i;
+
+	be128_to_u128(&ctrblk, (be128 *)walk->iv);
+
+	/* Process multi-block batch */
+	if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+		do {
+			/* create ctrblks for parallel encrypt */
+			for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
+				if (dst != src)
+					dst[i] = src[i];
+
+				u128_to_be128(&ctrblocks[i], &ctrblk);
+				u128_inc(&ctrblk);
+			}
+
+			serpent_enc_blk_xway_xor(ctx, (u8 *)dst,
+						 (u8 *)ctrblocks);
+
+			src += SERPENT_PARALLEL_BLOCKS;
+			dst += SERPENT_PARALLEL_BLOCKS;
+			nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+		} while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Handle leftovers */
+	do {
+		if (dst != src)
+			*dst = *src;
+
+		u128_to_be128(&ctrblocks[0], &ctrblk);
+		u128_inc(&ctrblk);
+
+		__serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
+		u128_xor(dst, dst, (u128 *)ctrblocks);
+
+		src += 1;
+		dst += 1;
+		nbytes -= bsize;
+	} while (nbytes >= bsize);
+
+done:
+	u128_to_be128((be128 *)walk->iv, &ctrblk);
+	return nbytes;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) {
+		fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+		nbytes = __ctr_crypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	serpent_fpu_end(fpu_enabled);
+
+	if (walk.nbytes) {
+		ctr_crypt_final(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+
+	return err;
+}
+
+static struct crypto_alg blk_ctr_alg = {
+	.cra_name		= "__ctr-serpent-sse2",
+	.cra_driver_name	= "__driver-ctr-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_ctr_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+};
+
+static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
+			unsigned int key_len)
+{
+	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
+	int err;
+
+	crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+	crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
+				    & CRYPTO_TFM_REQ_MASK);
+	err = crypto_ablkcipher_setkey(child, key, key_len);
+	crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
+				    & CRYPTO_TFM_RES_MASK);
+	return err;
+}
+
+static int __ablk_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	struct blkcipher_desc desc;
+
+	desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+	desc.info = req->info;
+	desc.flags = 0;
+
+	return crypto_blkcipher_crt(desc.tfm)->encrypt(
+		&desc, req->dst, req->src, req->nbytes);
+}
+
+static int ablk_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+	if (!irq_fpu_usable()) {
+		struct ablkcipher_request *cryptd_req =
+			ablkcipher_request_ctx(req);
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+		return crypto_ablkcipher_encrypt(cryptd_req);
+	} else {
+		return __ablk_encrypt(req);
+	}
+}
+
+static int ablk_decrypt(struct ablkcipher_request *req)
+{
+	struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+	struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+	if (!irq_fpu_usable()) {
+		struct ablkcipher_request *cryptd_req =
+			ablkcipher_request_ctx(req);
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+		return crypto_ablkcipher_decrypt(cryptd_req);
+	} else {
+		struct blkcipher_desc desc;
+
+		desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+		desc.info = req->info;
+		desc.flags = 0;
+
+		return crypto_blkcipher_crt(desc.tfm)->decrypt(
+			&desc, req->dst, req->src, req->nbytes);
+	}
+}
+
+static void ablk_exit(struct crypto_tfm *tfm)
+{
+	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cryptd_free_ablkcipher(ctx->cryptd_tfm);
+}
+
+static void ablk_init_common(struct crypto_tfm *tfm,
+			     struct cryptd_ablkcipher *cryptd_tfm)
+{
+	struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->cryptd_tfm = cryptd_tfm;
+	tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
+		crypto_ablkcipher_reqsize(&cryptd_tfm->base);
+}
+
+static int ablk_ecb_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_ecb_alg = {
+	.cra_name		= "ecb(serpent)",
+	.cra_driver_name	= "ecb-serpent-sse2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
+	.cra_init		= ablk_ecb_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+
+static int ablk_cbc_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_cbc_alg = {
+	.cra_name		= "cbc(serpent)",
+	.cra_driver_name	= "cbc-serpent-sse2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
+	.cra_init		= ablk_cbc_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+
+static int ablk_ctr_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_ctr_alg = {
+	.cra_name		= "ctr(serpent)",
+	.cra_driver_name	= "ctr-serpent-sse2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
+	.cra_init		= ablk_ctr_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+};
+
+static int __init serpent_sse2_init(void)
+{
+	int err;
+
+	if (!cpu_has_xmm2) {
+		printk(KERN_INFO "SSE2 instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	err = crypto_register_alg(&blk_ecb_alg);
+	if (err)
+		goto blk_ecb_err;
+	err = crypto_register_alg(&blk_cbc_alg);
+	if (err)
+		goto blk_cbc_err;
+	err = crypto_register_alg(&blk_ctr_alg);
+	if (err)
+		goto blk_ctr_err;
+	err = crypto_register_alg(&ablk_ecb_alg);
+	if (err)
+		goto ablk_ecb_err;
+	err = crypto_register_alg(&ablk_cbc_alg);
+	if (err)
+		goto ablk_cbc_err;
+	err = crypto_register_alg(&ablk_ctr_alg);
+	if (err)
+		goto ablk_ctr_err;
+	return err;
+
+ablk_ctr_err:
+	crypto_unregister_alg(&ablk_cbc_alg);
+ablk_cbc_err:
+	crypto_unregister_alg(&ablk_ecb_alg);
+ablk_ecb_err:
+	crypto_unregister_alg(&blk_ctr_alg);
+blk_ctr_err:
+	crypto_unregister_alg(&blk_cbc_alg);
+blk_cbc_err:
+	crypto_unregister_alg(&blk_ecb_alg);
+blk_ecb_err:
+	return err;
+}
+
+static void __exit serpent_sse2_exit(void)
+{
+	crypto_unregister_alg(&ablk_ctr_alg);
+	crypto_unregister_alg(&ablk_cbc_alg);
+	crypto_unregister_alg(&ablk_ecb_alg);
+	crypto_unregister_alg(&blk_ctr_alg);
+	crypto_unregister_alg(&blk_cbc_alg);
+	crypto_unregister_alg(&blk_ecb_alg);
+}
+
+module_init(serpent_sse2_init);
+module_exit(serpent_sse2_exit);
+
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("serpent");
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h
new file mode 100644
index 0000000..b7fd3b5
--- /dev/null
+++ b/arch/x86/include/asm/serpent.h
@@ -0,0 +1,32 @@
+#ifndef ASM_X86_SERPENT_H
+#define ASM_X86_SERPENT_H
+
+#include <linux/crypto.h>
+#include <crypto/serpent.h>
+
+#define SERPENT_PARALLEL_BLOCKS 8
+
+asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
+				       const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst,
+				     const u8 *src);
+
+static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+				   const u8 *src)
+{
+	__serpent_enc_blk_8way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
+				       const u8 *src)
+{
+	__serpent_enc_blk_8way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+				   const u8 *src)
+{
+	serpent_dec_blk_8way(ctx, dst, src);
+}
+
+#endif
-- 
cgit v1.1


From 251496dbfc1be38bc43b49651f3d33c02faccc47 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Wed, 9 Nov 2011 16:26:31 +0200
Subject: crypto: serpent - add 4-way parallel i586/SSE2 assembler
 implementation

Patch adds i586/SSE2 assembler implementation of serpent cipher. Assembler
functions crypt data in four block chunks.

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios):

Intel Atom N270:

size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16      0.95x   1.12x   1.02x   1.07x   0.97x   0.98x
64      1.73x   1.82x   1.08x   1.82x   1.72x   1.73x
256     2.08x   2.00x   1.04x   2.07x   1.99x   2.01x
1024    2.28x   2.18x   1.05x   2.23x   2.17x   2.20x
8192    2.28x   2.13x   1.05x   2.23x   2.18x   2.20x

Full output:
 http://koti.mbnet.fi/axh/kernel/crypto/atom-n270/serpent-generic.txt
 http://koti.mbnet.fi/axh/kernel/crypto/atom-n270/serpent-sse2.txt

Userspace test results:

Encryption/decryption of sse2-i586 vs generic on Intel Atom N270:
 encrypt: 2.35x
 decrypt: 2.54x

Encryption/decryption of sse2-i586 vs generic on AMD Phenom II:
 encrypt: 1.82x
 decrypt: 2.51x

Encryption/decryption of sse2-i586 vs generic on Intel Xeon E7330:
 encrypt: 2.99x
 decrypt: 3.48x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/Makefile                   |   2 +
 arch/x86/crypto/serpent-sse2-i586-asm_32.S | 638 +++++++++++++++++++++++++++++
 arch/x86/include/asm/serpent.h             |  31 ++
 3 files changed, 671 insertions(+)
 create mode 100644 arch/x86/crypto/serpent-sse2-i586-asm_32.S

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 12ebdbd..2b0b963 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -5,6 +5,7 @@
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
 obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
@@ -21,6 +22,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
 salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
+serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
new file mode 100644
index 0000000..4e37677
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -0,0 +1,638 @@
+/*
+ * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on crypto/serpent.c by
+ *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
+ *                2003 Herbert Valerio Riedel <hvr@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ */
+
+.file "serpent-sse2-i586-asm_32.S"
+.text
+
+#define arg_ctx 4
+#define arg_dst 8
+#define arg_src 12
+#define arg_xor 16
+
+/**********************************************************************
+  4-way SSE2 serpent
+ **********************************************************************/
+#define CTX %edx
+
+#define RA %xmm0
+#define RB %xmm1
+#define RC %xmm2
+#define RD %xmm3
+#define RE %xmm4
+
+#define RT0 %xmm5
+#define RT1 %xmm6
+
+#define RNOT %xmm7
+
+#define get_key(i, j, t) \
+	movd (4*(i)+(j))*4(CTX), t; \
+	pshufd $0, t, t;
+
+#define K(x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, x4); \
+	get_key(i, 1, RT0); \
+	get_key(i, 2, RT1); \
+	pxor x4,		x0; \
+	pxor RT0,		x1; \
+	pxor RT1,		x2; \
+	get_key(i, 3, x4); \
+	pxor x4,		x3;
+
+#define LK(x0, x1, x2, x3, x4, i) \
+	movdqa x0,		x4; \
+	pslld $13,		x0; \
+	psrld $(32 - 13),	x4; \
+	por x4,			x0; \
+	pxor x0,		x1; \
+	movdqa x2,		x4; \
+	pslld $3,		x2; \
+	psrld $(32 - 3),	x4; \
+	por x4,			x2; \
+	pxor x2,		x1; \
+	movdqa x1,		x4; \
+	pslld $1,		x1; \
+	psrld $(32 - 1),	x4; \
+	por x4,			x1; \
+	movdqa x0,		x4; \
+	pslld $3,		x4; \
+	pxor x2,		x3; \
+	pxor x4,		x3; \
+	movdqa x3,		x4; \
+	pslld $7,		x3; \
+	psrld $(32 - 7),	x4; \
+	por x4,			x3; \
+	movdqa x1,		x4; \
+	pslld $7,		x4; \
+	pxor x1,		x0; \
+	pxor x3,		x0; \
+	pxor x3,		x2; \
+	pxor x4,		x2; \
+	movdqa x0,		x4; \
+	get_key(i, 1, RT0); \
+	pxor RT0,		x1; \
+	get_key(i, 3, RT0); \
+	pxor RT0,		x3; \
+	pslld $5,		x0; \
+	psrld $(32 - 5),	x4; \
+	por x4,			x0; \
+	movdqa x2,		x4; \
+	pslld $22,		x2; \
+	psrld $(32 - 22),	x4; \
+	por x4,			x2; \
+	get_key(i, 0, RT0); \
+	pxor RT0,		x0; \
+	get_key(i, 2, RT0); \
+	pxor RT0,		x2;
+
+#define KL(x0, x1, x2, x3, x4, i) \
+	K(x0, x1, x2, x3, x4, i); \
+	movdqa x0,		x4; \
+	psrld $5,		x0; \
+	pslld $(32 - 5),	x4; \
+	por x4,			x0; \
+	movdqa x2,		x4; \
+	psrld $22,		x2; \
+	pslld $(32 - 22),	x4; \
+	por x4,			x2; \
+	pxor x3,		x2; \
+	pxor x3,		x0; \
+	movdqa x1,		x4; \
+	pslld $7,		x4; \
+	pxor x1,		x0; \
+	pxor x4,		x2; \
+	movdqa x1,		x4; \
+	psrld $1,		x1; \
+	pslld $(32 - 1),	x4; \
+	por x4,			x1; \
+	movdqa x3,		x4; \
+	psrld $7,		x3; \
+	pslld $(32 - 7),	x4; \
+	por x4,			x3; \
+	pxor x0,		x1; \
+	movdqa x0,		x4; \
+	pslld $3,		x4; \
+	pxor x4,		x3; \
+	movdqa x0,		x4; \
+	psrld $13,		x0; \
+	pslld $(32 - 13),	x4; \
+	por x4,			x0; \
+	pxor x2,		x1; \
+	pxor x2,		x3; \
+	movdqa x2,		x4; \
+	psrld $3,		x2; \
+	pslld $(32 - 3),	x4; \
+	por x4,			x2;
+
+#define S0(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	por x0,			x3; \
+	pxor x4,		x0; \
+	pxor x2,		x4; \
+	pxor RNOT,		x4; \
+	pxor x1,		x3; \
+	pand x0,		x1; \
+	pxor x4,		x1; \
+	pxor x0,		x2; \
+	pxor x3,		x0; \
+	por x0,			x4; \
+	pxor x2,		x0; \
+	pand x1,		x2; \
+	pxor x2,		x3; \
+	pxor RNOT,		x1; \
+	pxor x4,		x2; \
+	pxor x2,		x1;
+
+#define S1(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	pxor x0,		x1; \
+	pxor x3,		x0; \
+	pxor RNOT,		x3; \
+	pand x1,		x4; \
+	por x1,			x0; \
+	pxor x2,		x3; \
+	pxor x3,		x0; \
+	pxor x3,		x1; \
+	pxor x4,		x3; \
+	por x4,			x1; \
+	pxor x2,		x4; \
+	pand x0,		x2; \
+	pxor x1,		x2; \
+	por x0,			x1; \
+	pxor RNOT,		x0; \
+	pxor x2,		x0; \
+	pxor x1,		x4;
+
+#define S2(x0, x1, x2, x3, x4) \
+	pxor RNOT,		x3; \
+	pxor x0,		x1; \
+	movdqa x0,		x4; \
+	pand x2,		x0; \
+	pxor x3,		x0; \
+	por x4,			x3; \
+	pxor x1,		x2; \
+	pxor x1,		x3; \
+	pand x0,		x1; \
+	pxor x2,		x0; \
+	pand x3,		x2; \
+	por x1,			x3; \
+	pxor RNOT,		x0; \
+	pxor x0,		x3; \
+	pxor x0,		x4; \
+	pxor x2,		x0; \
+	por x2,			x1;
+
+#define S3(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	pxor x3,		x1; \
+	por x0,			x3; \
+	pand x0,		x4; \
+	pxor x2,		x0; \
+	pxor x1,		x2; \
+	pand x3,		x1; \
+	pxor x3,		x2; \
+	por x4,			x0; \
+	pxor x3,		x4; \
+	pxor x0,		x1; \
+	pand x3,		x0; \
+	pand x4,		x3; \
+	pxor x2,		x3; \
+	por x1,			x4; \
+	pand x1,		x2; \
+	pxor x3,		x4; \
+	pxor x3,		x0; \
+	pxor x2,		x3;
+
+#define S4(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	pand x0,		x3; \
+	pxor x4,		x0; \
+	pxor x2,		x3; \
+	por x4,			x2; \
+	pxor x1,		x0; \
+	pxor x3,		x4; \
+	por x0,			x2; \
+	pxor x1,		x2; \
+	pand x0,		x1; \
+	pxor x4,		x1; \
+	pand x2,		x4; \
+	pxor x3,		x2; \
+	pxor x0,		x4; \
+	por x1,			x3; \
+	pxor RNOT,		x1; \
+	pxor x0,		x3;
+
+#define S5(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	por x0,			x1; \
+	pxor x1,		x2; \
+	pxor RNOT,		x3; \
+	pxor x0,		x4; \
+	pxor x2,		x0; \
+	pand x4,		x1; \
+	por x3,			x4; \
+	pxor x0,		x4; \
+	pand x3,		x0; \
+	pxor x3,		x1; \
+	pxor x2,		x3; \
+	pxor x1,		x0; \
+	pand x4,		x2; \
+	pxor x2,		x1; \
+	pand x0,		x2; \
+	pxor x2,		x3;
+
+#define S6(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	pxor x0,		x3; \
+	pxor x2,		x1; \
+	pxor x0,		x2; \
+	pand x3,		x0; \
+	por x3,			x1; \
+	pxor RNOT,		x4; \
+	pxor x1,		x0; \
+	pxor x2,		x1; \
+	pxor x4,		x3; \
+	pxor x0,		x4; \
+	pand x0,		x2; \
+	pxor x1,		x4; \
+	pxor x3,		x2; \
+	pand x1,		x3; \
+	pxor x0,		x3; \
+	pxor x2,		x1;
+
+#define S7(x0, x1, x2, x3, x4) \
+	pxor RNOT,		x1; \
+	movdqa x1,		x4; \
+	pxor RNOT,		x0; \
+	pand x2,		x1; \
+	pxor x3,		x1; \
+	por x4,			x3; \
+	pxor x2,		x4; \
+	pxor x3,		x2; \
+	pxor x0,		x3; \
+	por x1,			x0; \
+	pand x0,		x2; \
+	pxor x4,		x0; \
+	pxor x3,		x4; \
+	pand x0,		x3; \
+	pxor x1,		x4; \
+	pxor x4,		x2; \
+	pxor x1,		x3; \
+	por x0,			x4; \
+	pxor x1,		x4;
+
+#define SI0(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	pxor x0,		x1; \
+	por x1,			x3; \
+	pxor x1,		x4; \
+	pxor RNOT,		x0; \
+	pxor x3,		x2; \
+	pxor x0,		x3; \
+	pand x1,		x0; \
+	pxor x2,		x0; \
+	pand x3,		x2; \
+	pxor x4,		x3; \
+	pxor x3,		x2; \
+	pxor x3,		x1; \
+	pand x0,		x3; \
+	pxor x0,		x1; \
+	pxor x2,		x0; \
+	pxor x3,		x4;
+
+#define SI1(x0, x1, x2, x3, x4) \
+	pxor x3,		x1; \
+	movdqa x0,		x4; \
+	pxor x2,		x0; \
+	pxor RNOT,		x2; \
+	por x1,			x4; \
+	pxor x3,		x4; \
+	pand x1,		x3; \
+	pxor x2,		x1; \
+	pand x4,		x2; \
+	pxor x1,		x4; \
+	por x3,			x1; \
+	pxor x0,		x3; \
+	pxor x0,		x2; \
+	por x4,			x0; \
+	pxor x4,		x2; \
+	pxor x0,		x1; \
+	pxor x1,		x4;
+
+#define SI2(x0, x1, x2, x3, x4) \
+	pxor x1,		x2; \
+	movdqa x3,		x4; \
+	pxor RNOT,		x3; \
+	por x2,			x3; \
+	pxor x4,		x2; \
+	pxor x0,		x4; \
+	pxor x1,		x3; \
+	por x2,			x1; \
+	pxor x0,		x2; \
+	pxor x4,		x1; \
+	por x3,			x4; \
+	pxor x3,		x2; \
+	pxor x2,		x4; \
+	pand x1,		x2; \
+	pxor x3,		x2; \
+	pxor x4,		x3; \
+	pxor x0,		x4;
+
+#define SI3(x0, x1, x2, x3, x4) \
+	pxor x1,		x2; \
+	movdqa x1,		x4; \
+	pand x2,		x1; \
+	pxor x0,		x1; \
+	por x4,			x0; \
+	pxor x3,		x4; \
+	pxor x3,		x0; \
+	por x1,			x3; \
+	pxor x2,		x1; \
+	pxor x3,		x1; \
+	pxor x2,		x0; \
+	pxor x3,		x2; \
+	pand x1,		x3; \
+	pxor x0,		x1; \
+	pand x2,		x0; \
+	pxor x3,		x4; \
+	pxor x0,		x3; \
+	pxor x1,		x0;
+
+#define SI4(x0, x1, x2, x3, x4) \
+	pxor x3,		x2; \
+	movdqa x0,		x4; \
+	pand x1,		x0; \
+	pxor x2,		x0; \
+	por x3,			x2; \
+	pxor RNOT,		x4; \
+	pxor x0,		x1; \
+	pxor x2,		x0; \
+	pand x4,		x2; \
+	pxor x0,		x2; \
+	por x4,			x0; \
+	pxor x3,		x0; \
+	pand x2,		x3; \
+	pxor x3,		x4; \
+	pxor x1,		x3; \
+	pand x0,		x1; \
+	pxor x1,		x4; \
+	pxor x3,		x0;
+
+#define SI5(x0, x1, x2, x3, x4) \
+	movdqa x1,		x4; \
+	por x2,			x1; \
+	pxor x4,		x2; \
+	pxor x3,		x1; \
+	pand x4,		x3; \
+	pxor x3,		x2; \
+	por x0,			x3; \
+	pxor RNOT,		x0; \
+	pxor x2,		x3; \
+	por x0,			x2; \
+	pxor x1,		x4; \
+	pxor x4,		x2; \
+	pand x0,		x4; \
+	pxor x1,		x0; \
+	pxor x3,		x1; \
+	pand x2,		x0; \
+	pxor x3,		x2; \
+	pxor x2,		x0; \
+	pxor x4,		x2; \
+	pxor x3,		x4;
+
+#define SI6(x0, x1, x2, x3, x4) \
+	pxor x2,		x0; \
+	movdqa x0,		x4; \
+	pand x3,		x0; \
+	pxor x3,		x2; \
+	pxor x2,		x0; \
+	pxor x1,		x3; \
+	por x4,			x2; \
+	pxor x3,		x2; \
+	pand x0,		x3; \
+	pxor RNOT,		x0; \
+	pxor x1,		x3; \
+	pand x2,		x1; \
+	pxor x0,		x4; \
+	pxor x4,		x3; \
+	pxor x2,		x4; \
+	pxor x1,		x0; \
+	pxor x0,		x2;
+
+#define SI7(x0, x1, x2, x3, x4) \
+	movdqa x3,		x4; \
+	pand x0,		x3; \
+	pxor x2,		x0; \
+	por x4,			x2; \
+	pxor x1,		x4; \
+	pxor RNOT,		x0; \
+	por x3,			x1; \
+	pxor x0,		x4; \
+	pand x2,		x0; \
+	pxor x1,		x0; \
+	pand x2,		x1; \
+	pxor x2,		x3; \
+	pxor x3,		x4; \
+	pand x3,		x2; \
+	por x0,			x3; \
+	pxor x4,		x1; \
+	pxor x4,		x3; \
+	pand x0,		x4; \
+	pxor x2,		x4;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+	movdqa x2,		t3; \
+	movdqa x0,		t1; \
+	unpcklps x3,		t3; \
+	movdqa x0,		t2; \
+	unpcklps x1,		t1; \
+	unpckhps x1,		t2; \
+	movdqa t3,		x1; \
+	unpckhps x3,		x2; \
+	movdqa t1,		x0; \
+	movhlps t1,		x1; \
+	movdqa t2,		t1; \
+	movlhps t3,		x0; \
+	movlhps x2,		t1; \
+	movhlps t2,		x2; \
+	movdqa x2,		x3; \
+	movdqa t1,		x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+	movdqu (0*4*4)(in),	x0; \
+	movdqu (1*4*4)(in),	x1; \
+	movdqu (2*4*4)(in),	x2; \
+	movdqu (3*4*4)(in),	x3; \
+	\
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	movdqu x0, (0*4*4)(out); \
+	movdqu x1, (1*4*4)(out); \
+	movdqu x2, (2*4*4)(out); \
+	movdqu x3, (3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	\
+	movdqu (0*4*4)(out),	t0; \
+	pxor t0,		x0; \
+	movdqu x0,		(0*4*4)(out); \
+	movdqu (1*4*4)(out),	t0; \
+	pxor t0,		x1; \
+	movdqu x1,		(1*4*4)(out); \
+	movdqu (2*4*4)(out),	t0; \
+	pxor t0,		x2; \
+	movdqu x2,		(2*4*4)(out); \
+	movdqu (3*4*4)(out),	t0; \
+	pxor t0,		x3; \
+	movdqu x3,		(3*4*4)(out);
+
+.align 8
+.global __serpent_enc_blk_4way
+.type   __serpent_enc_blk_4way,@function;
+
+__serpent_enc_blk_4way:
+	/* input:
+	 *	arg_ctx(%esp): ctx, CTX
+	 *	arg_dst(%esp): dst
+	 *	arg_src(%esp): src
+	 *	arg_xor(%esp): bool, if true: xor output
+	 */
+
+	pcmpeqd RNOT, RNOT;
+
+	movl arg_ctx(%esp), CTX;
+
+	movl arg_src(%esp), %eax;
+	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+					 K(RA, RB, RC, RD, RE, 0);
+	S0(RA, RB, RC, RD, RE);		LK(RC, RB, RD, RA, RE, 1);
+	S1(RC, RB, RD, RA, RE);		LK(RE, RD, RA, RC, RB, 2);
+	S2(RE, RD, RA, RC, RB);		LK(RB, RD, RE, RC, RA, 3);
+	S3(RB, RD, RE, RC, RA);		LK(RC, RA, RD, RB, RE, 4);
+	S4(RC, RA, RD, RB, RE);		LK(RA, RD, RB, RE, RC, 5);
+	S5(RA, RD, RB, RE, RC);		LK(RC, RA, RD, RE, RB, 6);
+	S6(RC, RA, RD, RE, RB);		LK(RD, RB, RA, RE, RC, 7);
+	S7(RD, RB, RA, RE, RC);		LK(RC, RA, RE, RD, RB, 8);
+	S0(RC, RA, RE, RD, RB);		LK(RE, RA, RD, RC, RB, 9);
+	S1(RE, RA, RD, RC, RB);		LK(RB, RD, RC, RE, RA, 10);
+	S2(RB, RD, RC, RE, RA);		LK(RA, RD, RB, RE, RC, 11);
+	S3(RA, RD, RB, RE, RC);		LK(RE, RC, RD, RA, RB, 12);
+	S4(RE, RC, RD, RA, RB);		LK(RC, RD, RA, RB, RE, 13);
+	S5(RC, RD, RA, RB, RE);		LK(RE, RC, RD, RB, RA, 14);
+	S6(RE, RC, RD, RB, RA);		LK(RD, RA, RC, RB, RE, 15);
+	S7(RD, RA, RC, RB, RE);		LK(RE, RC, RB, RD, RA, 16);
+	S0(RE, RC, RB, RD, RA);		LK(RB, RC, RD, RE, RA, 17);
+	S1(RB, RC, RD, RE, RA);		LK(RA, RD, RE, RB, RC, 18);
+	S2(RA, RD, RE, RB, RC);		LK(RC, RD, RA, RB, RE, 19);
+	S3(RC, RD, RA, RB, RE);		LK(RB, RE, RD, RC, RA, 20);
+	S4(RB, RE, RD, RC, RA);		LK(RE, RD, RC, RA, RB, 21);
+	S5(RE, RD, RC, RA, RB);		LK(RB, RE, RD, RA, RC, 22);
+	S6(RB, RE, RD, RA, RC);		LK(RD, RC, RE, RA, RB, 23);
+	S7(RD, RC, RE, RA, RB);		LK(RB, RE, RA, RD, RC, 24);
+	S0(RB, RE, RA, RD, RC);		LK(RA, RE, RD, RB, RC, 25);
+	S1(RA, RE, RD, RB, RC);		LK(RC, RD, RB, RA, RE, 26);
+	S2(RC, RD, RB, RA, RE);		LK(RE, RD, RC, RA, RB, 27);
+	S3(RE, RD, RC, RA, RB);		LK(RA, RB, RD, RE, RC, 28);
+	S4(RA, RB, RD, RE, RC);		LK(RB, RD, RE, RC, RA, 29);
+	S5(RB, RD, RE, RC, RA);		LK(RA, RB, RD, RC, RE, 30);
+	S6(RA, RB, RD, RC, RE);		LK(RD, RE, RB, RC, RA, 31);
+	S7(RD, RE, RB, RC, RA);		 K(RA, RB, RC, RD, RE, 32);
+
+	movl arg_dst(%esp), %eax;
+
+	cmpb $0, arg_xor(%esp);
+	jnz __enc_xor4;
+
+	write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+	ret;
+
+__enc_xor4:
+	xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+	ret;
+
+.align 8
+.global serpent_dec_blk_4way
+.type   serpent_dec_blk_4way,@function;
+
+serpent_dec_blk_4way:
+	/* input:
+	 *	arg_ctx(%esp): ctx, CTX
+	 *	arg_dst(%esp): dst
+	 *	arg_src(%esp): src
+	 */
+
+	pcmpeqd RNOT, RNOT;
+
+	movl arg_ctx(%esp), CTX;
+
+	movl arg_src(%esp), %eax;
+	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+					 K(RA, RB, RC, RD, RE, 32);
+	SI7(RA, RB, RC, RD, RE);	KL(RB, RD, RA, RE, RC, 31);
+	SI6(RB, RD, RA, RE, RC);	KL(RA, RC, RE, RB, RD, 30);
+	SI5(RA, RC, RE, RB, RD);	KL(RC, RD, RA, RE, RB, 29);
+	SI4(RC, RD, RA, RE, RB);	KL(RC, RA, RB, RE, RD, 28);
+	SI3(RC, RA, RB, RE, RD);	KL(RB, RC, RD, RE, RA, 27);
+	SI2(RB, RC, RD, RE, RA);	KL(RC, RA, RE, RD, RB, 26);
+	SI1(RC, RA, RE, RD, RB);	KL(RB, RA, RE, RD, RC, 25);
+	SI0(RB, RA, RE, RD, RC);	KL(RE, RC, RA, RB, RD, 24);
+	SI7(RE, RC, RA, RB, RD);	KL(RC, RB, RE, RD, RA, 23);
+	SI6(RC, RB, RE, RD, RA);	KL(RE, RA, RD, RC, RB, 22);
+	SI5(RE, RA, RD, RC, RB);	KL(RA, RB, RE, RD, RC, 21);
+	SI4(RA, RB, RE, RD, RC);	KL(RA, RE, RC, RD, RB, 20);
+	SI3(RA, RE, RC, RD, RB);	KL(RC, RA, RB, RD, RE, 19);
+	SI2(RC, RA, RB, RD, RE);	KL(RA, RE, RD, RB, RC, 18);
+	SI1(RA, RE, RD, RB, RC);	KL(RC, RE, RD, RB, RA, 17);
+	SI0(RC, RE, RD, RB, RA);	KL(RD, RA, RE, RC, RB, 16);
+	SI7(RD, RA, RE, RC, RB);	KL(RA, RC, RD, RB, RE, 15);
+	SI6(RA, RC, RD, RB, RE);	KL(RD, RE, RB, RA, RC, 14);
+	SI5(RD, RE, RB, RA, RC);	KL(RE, RC, RD, RB, RA, 13);
+	SI4(RE, RC, RD, RB, RA);	KL(RE, RD, RA, RB, RC, 12);
+	SI3(RE, RD, RA, RB, RC);	KL(RA, RE, RC, RB, RD, 11);
+	SI2(RA, RE, RC, RB, RD);	KL(RE, RD, RB, RC, RA, 10);
+	SI1(RE, RD, RB, RC, RA);	KL(RA, RD, RB, RC, RE, 9);
+	SI0(RA, RD, RB, RC, RE);	KL(RB, RE, RD, RA, RC, 8);
+	SI7(RB, RE, RD, RA, RC);	KL(RE, RA, RB, RC, RD, 7);
+	SI6(RE, RA, RB, RC, RD);	KL(RB, RD, RC, RE, RA, 6);
+	SI5(RB, RD, RC, RE, RA);	KL(RD, RA, RB, RC, RE, 5);
+	SI4(RD, RA, RB, RC, RE);	KL(RD, RB, RE, RC, RA, 4);
+	SI3(RD, RB, RE, RC, RA);	KL(RE, RD, RA, RC, RB, 3);
+	SI2(RE, RD, RA, RC, RB);	KL(RD, RB, RC, RA, RE, 2);
+	SI1(RD, RB, RC, RA, RE);	KL(RE, RB, RC, RA, RD, 1);
+	SI0(RE, RB, RC, RA, RD);	 K(RC, RD, RB, RE, RA, 0);
+
+	movl arg_dst(%esp), %eax;
+	write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
+
+	ret;
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h
index b7fd3b5..d3ef63f 100644
--- a/arch/x86/include/asm/serpent.h
+++ b/arch/x86/include/asm/serpent.h
@@ -4,6 +4,35 @@
 #include <linux/crypto.h>
 #include <crypto/serpent.h>
 
+#ifdef CONFIG_X86_32
+
+#define SERPENT_PARALLEL_BLOCKS 4
+
+asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst,
+				       const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst,
+				     const u8 *src);
+
+static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__serpent_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
+					    const u8 *src)
+{
+	__serpent_enc_blk_4way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	serpent_dec_blk_4way(ctx, dst, src);
+}
+
+#else
+
 #define SERPENT_PARALLEL_BLOCKS 8
 
 asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
@@ -30,3 +59,5 @@ static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
 }
 
 #endif
+
+#endif
-- 
cgit v1.1


From 18482053f92b099663bd36a10e8f6bd2c8544669 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Wed, 9 Nov 2011 16:26:36 +0200
Subject: crypto: serpent-sse2 - add lrw support

Patch adds LRW support for serpent-sse2 by using lrw_crypt(). Patch has been
tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios):

Benchmark results with tcrypt:

Intel Celeron T1600 (x86_64) (fam:6, model:15, step:13):
size    lrw-enc lrw-dec
16B     1.00x   0.96x
64B     1.01x   1.01x
256B    3.01x   2.97x
1024B   3.39x   3.33x
8192B   3.35x   3.33x

AMD Phenom II 1055T (x86_64) (fam:16, model:10):
size    lrw-enc lrw-dec
16B     0.98x   1.03x
64B     1.01x   1.04x
256B    2.10x   2.14x
1024B   2.28x   2.33x
8192B   2.30x   2.33x

Intel Atom N270 (i586):
size    lrw-enc lrw-dec
16B     0.97x   0.97x
64B     1.47x   1.50x
256B    1.72x   1.69x
1024B   1.88x   1.81x
8192B   1.84x   1.79x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_sse2_glue.c | 211 ++++++++++++++++++++++++++++++++++++
 1 file changed, 211 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 947cf57..db318e5 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -39,12 +39,17 @@
 #include <crypto/cryptd.h>
 #include <crypto/b128ops.h>
 #include <crypto/ctr.h>
+#include <crypto/lrw.h>
 #include <asm/i387.h>
 #include <asm/serpent.h>
 #include <crypto/scatterwalk.h>
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
 
+#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
+#define HAS_LRW
+#endif
+
 struct async_serpent_ctx {
 	struct cryptd_ablkcipher *cryptd_tfm;
 };
@@ -460,6 +465,152 @@ static struct crypto_alg blk_ctr_alg = {
 	},
 };
 
+#ifdef HAS_LRW
+
+struct crypt_priv {
+	struct serpent_ctx *ctx;
+	bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+		return;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		__serpent_encrypt(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+		serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+		return;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
+}
+
+struct serpent_lrw_ctx {
+	struct lrw_table_ctx lrw_table;
+	struct serpent_ctx serpent_ctx;
+};
+
+static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+	int err;
+
+	err = __serpent_setkey(&ctx->serpent_ctx, key, keylen -
+							SERPENT_BLOCK_SIZE);
+	if (err)
+		return err;
+
+	return lrw_init_table(&ctx->lrw_table, key + keylen -
+						SERPENT_BLOCK_SIZE);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->serpent_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->serpent_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static void lrw_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	lrw_free_table(&ctx->lrw_table);
+}
+
+static struct crypto_alg blk_lrw_alg = {
+	.cra_name		= "__lrw-serpent-sse2",
+	.cra_driver_name	= "__driver-lrw-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_lrw_alg.cra_list),
+	.cra_exit		= lrw_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= lrw_serpent_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+};
+
+#endif
+
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
 {
@@ -658,6 +809,48 @@ static struct crypto_alg ablk_ctr_alg = {
 	},
 };
 
+#ifdef HAS_LRW
+
+static int ablk_lrw_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-lrw-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_lrw_alg = {
+	.cra_name		= "lrw(serpent)",
+	.cra_driver_name	= "lrw-serpent-sse2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
+	.cra_init		= ablk_lrw_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+
+#endif
+
 static int __init serpent_sse2_init(void)
 {
 	int err;
@@ -685,8 +878,22 @@ static int __init serpent_sse2_init(void)
 	err = crypto_register_alg(&ablk_ctr_alg);
 	if (err)
 		goto ablk_ctr_err;
+#ifdef HAS_LRW
+	err = crypto_register_alg(&blk_lrw_alg);
+	if (err)
+		goto blk_lrw_err;
+	err = crypto_register_alg(&ablk_lrw_alg);
+	if (err)
+		goto ablk_lrw_err;
+#endif
 	return err;
 
+#ifdef HAS_LRW
+ablk_lrw_err:
+	crypto_unregister_alg(&blk_lrw_alg);
+blk_lrw_err:
+	crypto_unregister_alg(&ablk_ctr_alg);
+#endif
 ablk_ctr_err:
 	crypto_unregister_alg(&ablk_cbc_alg);
 ablk_cbc_err:
@@ -703,6 +910,10 @@ blk_ecb_err:
 
 static void __exit serpent_sse2_exit(void)
 {
+#ifdef HAS_LRW
+	crypto_unregister_alg(&ablk_lrw_alg);
+	crypto_unregister_alg(&blk_lrw_alg);
+#endif
 	crypto_unregister_alg(&ablk_ctr_alg);
 	crypto_unregister_alg(&ablk_cbc_alg);
 	crypto_unregister_alg(&ablk_ecb_alg);
-- 
cgit v1.1


From 5962f8b66dd040ad89d55b58967ea2dec607f4d3 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Wed, 9 Nov 2011 16:26:41 +0200
Subject: crypto: serpent-sse2 - add xts support

Patch adds XTS support for serpent-sse2 by using xts_crypt(). Patch has been
tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios):

Intel Celeron T1600 (x86_64) (fam:6, model:15, step:13):
size    xts-enc xts-dec
16B     0.98x   1.00x
64B     1.00x   1.01x
256B    2.78x   2.75x
1024B   3.30x   3.26x
8192B   3.39x   3.30x

AMD Phenom II 1055T (x86_64) (fam:16, model:10):
size    xts-enc xts-dec
16B     1.05x   1.02x
64B     1.04x   1.03x
256B    2.10x   2.05x
1024B   2.34x   2.35x
8192B   2.34x   2.40x

Intel Atom N270 (i586):
size    xts-enc xts-dec
16B     0.95x   0.96x
64B     1.53x   1.50x
256B    1.72x   1.75x
1024B   1.88x   1.87x
8192B   1.86x   1.83x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_sse2_glue.c | 180 +++++++++++++++++++++++++++++++++++-
 1 file changed, 178 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index db318e5..2dffc5a 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -40,6 +40,7 @@
 #include <crypto/b128ops.h>
 #include <crypto/ctr.h>
 #include <crypto/lrw.h>
+#include <crypto/xts.h>
 #include <asm/i387.h>
 #include <asm/serpent.h>
 #include <crypto/scatterwalk.h>
@@ -50,6 +51,10 @@
 #define HAS_LRW
 #endif
 
+#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
+#define HAS_XTS
+#endif
+
 struct async_serpent_ctx {
 	struct cryptd_ablkcipher *cryptd_tfm;
 };
@@ -465,7 +470,7 @@ static struct crypto_alg blk_ctr_alg = {
 	},
 };
 
-#ifdef HAS_LRW
+#if defined(HAS_LRW) || defined(HAS_XTS)
 
 struct crypt_priv {
 	struct serpent_ctx *ctx;
@@ -506,6 +511,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
 }
 
+#endif
+
+#ifdef HAS_LRW
+
 struct serpent_lrw_ctx {
 	struct lrw_table_ctx lrw_table;
 	struct serpent_ctx serpent_ctx;
@@ -611,6 +620,114 @@ static struct crypto_alg blk_lrw_alg = {
 
 #endif
 
+#ifdef HAS_XTS
+
+struct serpent_xts_ctx {
+	struct serpent_ctx tweak_ctx;
+	struct serpent_ctx crypt_ctx;
+};
+
+static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen)
+{
+	struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+	u32 *flags = &tfm->crt_flags;
+	int err;
+
+	/* key consists of keys of equal size concatenated, therefore
+	 * the length must be even
+	 */
+	if (keylen % 2) {
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	/* first half of xts-key is for crypt */
+	err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
+	if (err)
+		return err;
+
+	/* second half of xts-key is for tweak */
+	return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->crypt_ctx,
+		.fpu_enabled = false,
+	};
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	ret = xts_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->crypt_ctx,
+		.fpu_enabled = false,
+	};
+	struct xts_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.tweak_ctx = &ctx->tweak_ctx,
+		.tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	ret = xts_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static struct crypto_alg blk_xts_alg = {
+	.cra_name		= "__xts-serpent-sse2",
+	.cra_driver_name	= "__driver-xts-serpent-sse2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_xts_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= xts_serpent_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+};
+
+#endif
+
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
 {
@@ -851,6 +968,46 @@ static struct crypto_alg ablk_lrw_alg = {
 
 #endif
 
+#ifdef HAS_XTS
+
+static int ablk_xts_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-xts-serpent-sse2", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_xts_alg = {
+	.cra_name		= "xts(serpent)",
+	.cra_driver_name	= "xts-serpent-sse2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_xts_alg.cra_list),
+	.cra_init		= ablk_xts_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+};
+
+#endif
+
 static int __init serpent_sse2_init(void)
 {
 	int err;
@@ -886,14 +1043,29 @@ static int __init serpent_sse2_init(void)
 	if (err)
 		goto ablk_lrw_err;
 #endif
+#ifdef HAS_XTS
+	err = crypto_register_alg(&blk_xts_alg);
+	if (err)
+		goto blk_xts_err;
+	err = crypto_register_alg(&ablk_xts_alg);
+	if (err)
+		goto ablk_xts_err;
+#endif
 	return err;
 
+#ifdef HAS_XTS
+	crypto_unregister_alg(&ablk_xts_alg);
+ablk_xts_err:
+	crypto_unregister_alg(&blk_xts_alg);
+blk_xts_err:
+#endif
 #ifdef HAS_LRW
+	crypto_unregister_alg(&ablk_lrw_alg);
 ablk_lrw_err:
 	crypto_unregister_alg(&blk_lrw_alg);
 blk_lrw_err:
-	crypto_unregister_alg(&ablk_ctr_alg);
 #endif
+	crypto_unregister_alg(&ablk_ctr_alg);
 ablk_ctr_err:
 	crypto_unregister_alg(&ablk_cbc_alg);
 ablk_cbc_err:
@@ -910,6 +1082,10 @@ blk_ecb_err:
 
 static void __exit serpent_sse2_exit(void)
 {
+#ifdef HAS_XTS
+	crypto_unregister_alg(&ablk_xts_alg);
+	crypto_unregister_alg(&blk_xts_alg);
+#endif
 #ifdef HAS_LRW
 	crypto_unregister_alg(&ablk_lrw_alg);
 	crypto_unregister_alg(&blk_lrw_alg);
-- 
cgit v1.1


From d35643385628d44a5933a0755b01478eb4df5c65 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Wed, 9 Nov 2011 19:44:12 +0200
Subject: crypto: serpent-sse2 - clear CRYPTO_TFM_REQ_MAY_SLEEP in lrw and xts
 modes

LRW/XTS patches for serpent-sse2 forgot to add this. CRYPTO_TFM_REQ_MAY_SLEEP
should be cleared as sleeping between kernel_fpu_begin()/kernel_fpu_end() is
not allowed.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_sse2_glue.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 2dffc5a..2f5c304 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -554,6 +554,7 @@ static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	};
 	int ret;
 
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 	ret = lrw_crypt(desc, dst, src, nbytes, &req);
 	serpent_fpu_end(crypt_ctx.fpu_enabled);
 
@@ -579,6 +580,7 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	};
 	int ret;
 
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 	ret = lrw_crypt(desc, dst, src, nbytes, &req);
 	serpent_fpu_end(crypt_ctx.fpu_enabled);
 
@@ -671,6 +673,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	};
 	int ret;
 
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 	ret = xts_crypt(desc, dst, src, nbytes, &req);
 	serpent_fpu_end(crypt_ctx.fpu_enabled);
 
@@ -697,6 +700,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	};
 	int ret;
 
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 	ret = xts_crypt(desc, dst, src, nbytes, &req);
 	serpent_fpu_end(crypt_ctx.fpu_enabled);
 
-- 
cgit v1.1


From d88e4cb67197d007fb778d62fe17360e970d5bfa Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 21 Nov 2011 12:32:25 -0800
Subject: freezer: remove now unused TIF_FREEZE

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: linux-arch@vger.kernel.org
---
 arch/x86/include/asm/thread_info.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c1..32125af 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -90,7 +90,6 @@ struct thread_info {
 #define TIF_MEMDIE		20	/* is terminating due to OOM killer */
 #define TIF_DEBUG		21	/* uses debug registers */
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
-#define TIF_FREEZE		23	/* is freezing for suspend */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
 #define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
@@ -112,7 +111,6 @@ struct thread_info {
 #define _TIF_FORK		(1 << TIF_FORK)
 #define _TIF_DEBUG		(1 << TIF_DEBUG)
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
-#define _TIF_FREEZE		(1 << TIF_FREEZE)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
 #define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
-- 
cgit v1.1


From 80df46494846e857399618c54df30ce294dc1edd Mon Sep 17 00:00:00 2001
From: Maxim Uvarov <maxim.uvarov@oracle.com>
Date: Fri, 14 Oct 2011 15:36:51 -0700
Subject: xen: Make XEN_MAX_DOMAIN_MEMORY have more sensible defaults

Which is that 128GB is not going to happen with 32-bit PV DomU.
Lets use something more realistic. Also update the 64-bit to 500GB
which is the max a PV guest can do.

Signed-off-by: Maxim Uvarov <maxim.uvarov@oracle.com>
[v1: Updated 128GB->500GB for 64-bit]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/Kconfig | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 26c731a..fdce49c 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -29,7 +29,8 @@ config XEN_PVHVM
 
 config XEN_MAX_DOMAIN_MEMORY
        int
-       default 128
+       default 500 if X86_64
+       default 64 if X86_32
        depends on XEN
        help
          This only affects the sizing of some bss arrays, the unused
@@ -48,3 +49,4 @@ config XEN_DEBUG_FS
 	help
 	  Enable statistics output and various tuning options in debugfs.
 	  Enabling this option may incur a significant performance overhead.
+
-- 
cgit v1.1


From 0f9f5a9588468cddeccc9146b86798492c7cd4f5 Mon Sep 17 00:00:00 2001
From: Annie Li <annie.li@oracle.com>
Date: Tue, 22 Nov 2011 09:58:06 +0800
Subject: xen/granttable: Introducing grant table V2 stucture

This patch introduces new structures of grant table V2, grant table V2 is an
extension from V1. Grant table is shared between guest and Xen, and Xen is
responsible to do corresponding work for grant operations, such as: figure
out guest's grant table version, perform different actions based on
different grant table version, etc. Although full-page structure of V2
is different from V1, it play the same role as V1.

Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Annie Li <annie.li@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/grant-table.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 6bbfd7a..c6ab2e7 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -64,10 +64,10 @@ static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
 
 int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 			   unsigned long max_nr_gframes,
-			   struct grant_entry **__shared)
+			   void **__shared)
 {
 	int rc;
-	struct grant_entry *shared = *__shared;
+	void *shared = *__shared;
 
 	if (shared == NULL) {
 		struct vm_struct *area =
@@ -83,8 +83,7 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 	return rc;
 }
 
-void arch_gnttab_unmap_shared(struct grant_entry *shared,
-			      unsigned long nr_gframes)
+void arch_gnttab_unmap_shared(void *shared, unsigned long nr_gframes)
 {
 	apply_to_page_range(&init_mm, (unsigned long)shared,
 			    PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
-- 
cgit v1.1


From 85ff6acb075a484780b3d763fdf41596d8fc0970 Mon Sep 17 00:00:00 2001
From: Annie Li <annie.li@oracle.com>
Date: Tue, 22 Nov 2011 09:59:21 +0800
Subject: xen/granttable: Grant tables V2 implementation

Receiver-side copying of packets is based on this implementation, it gives
better performance and better CPU accounting. It totally supports three types:
full-page, sub-page and transitive grants.

However this patch does not cover sub-page and transitive grants, it mainly
focus on Full-page part and implements grant table V2 interfaces corresponding
to what already exists in grant table V1, such as: grant table V2
initialization, mapping, releasing and exported interfaces.

Each guest can only supports one type of grant table type, every entry in grant
table should be the same version. It is necessary to set V1 or V2 version before
initializing the grant table.

Grant table exported interfaces of V2 are same with those of V1, Xen is
responsible to judge what grant table version guests are using in every grant
operation.

V2 fulfills the same role of V1, and it is totally backwards compitable with V1.
If dom0 support grant table V2, the guests runing on it can run with either V1
or V2.

Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Annie Li <annie.li@oracle.com>
[v1: Modified alloc_vm_area call (new parameters), indentation, and cleanpatch
     warnings]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/grant-table.c | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index c6ab2e7..65160a8 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -54,6 +54,20 @@ static int map_pte_fn(pte_t *pte, struct page *pmd_page,
 	return 0;
 }
 
+/*
+ * This function is used to map shared frames to store grant status. It is
+ * different from map_pte_fn above, the frames type here is uint64_t.
+ */
+static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
+			     unsigned long addr, void *data)
+{
+	uint64_t **frames = (uint64_t **)data;
+
+	set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
+	(*frames)++;
+	return 0;
+}
+
 static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
 			unsigned long addr, void *data)
 {
@@ -83,7 +97,30 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
 	return rc;
 }
 
-void arch_gnttab_unmap_shared(void *shared, unsigned long nr_gframes)
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+			   unsigned long max_nr_gframes,
+			   grant_status_t **__shared)
+{
+	int rc;
+	grant_status_t *shared = *__shared;
+
+	if (shared == NULL) {
+		/* No need to pass in PTE as we are going to do it
+		 * in apply_to_page_range anyhow. */
+		struct vm_struct *area =
+			alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
+		BUG_ON(area == NULL);
+		shared = area->addr;
+		*__shared = shared;
+	}
+
+	rc = apply_to_page_range(&init_mm, (unsigned long)shared,
+				 PAGE_SIZE * nr_gframes,
+				 map_pte_fn_status, &frames);
+	return rc;
+}
+
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
 {
 	apply_to_page_range(&init_mm, (unsigned long)shared,
 			    PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
-- 
cgit v1.1


From 282e5aaba2a0cdfde4d2c2e34bc7438cd6f7a00f Mon Sep 17 00:00:00 2001
From: Paul Bolle <pebolle@tiscali.nl>
Date: Thu, 17 Nov 2011 11:41:31 +0100
Subject: x86: Kconfig: drop unknown symbol 'APM_MODULE'

There's no Kconfig symbol APM_MODULE, so the check for it will always
fail. There's no need to append _MODULE to tristate symbols anyhow,
because the config tools will do the right thing automagically.

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6a47bb2..771a185 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1718,7 +1718,7 @@ source "drivers/sfi/Kconfig"
 
 config X86_APM_BOOT
 	def_bool y
-	depends on APM || APM_MODULE
+	depends on APM
 
 menuconfig APM
 	tristate "APM (Advanced Power Management) BIOS support"
-- 
cgit v1.1


From 4673ca8eb3690832e76371371955a8b02e1f59d4 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 24 Nov 2011 14:54:28 +0200
Subject: lib: move GENERIC_IOMAP to lib/Kconfig

define GENERIC_IOMAP in a central location
instead of all architectures. This will be helpful
for the follow-up patch which makes it select
other configs. Code is also a bit shorter this way.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 arch/x86/Kconfig | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cb9a104..08af645 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -75,6 +75,7 @@ config X86
 	select HAVE_BPF_JIT if (X86_64 && NET)
 	select CLKEVT_I8253
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
+	select GENERIC_IOMAP
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
@@ -140,9 +141,6 @@ config NEED_SG_DMA_LENGTH
 config GENERIC_ISA_DMA
 	def_bool ISA_DMA_API
 
-config GENERIC_IOMAP
-	def_bool y
-
 config GENERIC_BUG
 	def_bool y
 	depends on BUG
-- 
cgit v1.1


From 37fe6a42b3433b79a159ceb06a94cd1ef00e279d Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Date: Tue, 29 Nov 2011 15:08:29 +0900
Subject: x86: Check stack overflow in detail

Currently, only kernel stack is checked for the overflow, which
is not sufficient for systems that need a high reliability. To
enhance it, it is required to check the IRQ and exception
stacks, as well.

This patch checks all the stack types and will cause messages of
stacks in detail when free stack space drops below a certain
limit except user stack.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Randy Dunlap <rdunlap@xenotime.net>
Link: http://lkml.kernel.org/r/20111129060829.11076.51733.stgit@ltc219.sdl.hitachi.co.jp
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
 arch/x86/Kconfig.debug   |  7 +++++--
 arch/x86/kernel/irq_64.c | 29 +++++++++++++++++++++++------
 2 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bf56e17..4caec12 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -63,8 +63,11 @@ config DEBUG_STACKOVERFLOW
 	bool "Check for stack overflows"
 	depends on DEBUG_KERNEL
 	---help---
-	  This option will cause messages to be printed if free stack space
-	  drops below a certain limit.
+	  Say Y here if you want to check the overflows of kernel, IRQ
+	  and exception stacks. This option will cause messages of the
+	  stacks in detail when free stack space drops below a certain
+	  limit.
+	  If in doubt, say "N".
 
 config X86_PTDUMP
 	bool "Export kernel pagetable layout to userspace via debugfs"
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 69bca46..928a7e9 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -36,18 +36,35 @@ EXPORT_PER_CPU_SYMBOL(irq_regs);
 static inline void stack_overflow_check(struct pt_regs *regs)
 {
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
+	struct orig_ist *oist;
+	u64 irq_stack_top, irq_stack_bottom;
+	u64 estack_top, estack_bottom;
 	u64 curbase = (u64)task_stack_page(current);
 
 	if (user_mode_vm(regs))
 		return;
 
-	WARN_ONCE(regs->sp >= curbase &&
-		  regs->sp <= curbase + THREAD_SIZE &&
-		  regs->sp <  curbase + sizeof(struct thread_info) +
-					sizeof(struct pt_regs) + 128,
+	if (regs->sp >= curbase &&
+	    regs->sp <= curbase + THREAD_SIZE &&
+	    regs->sp >= curbase + sizeof(struct thread_info) +
+				  sizeof(struct pt_regs) + 128)
+		return;
+
+	irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack);
+	irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr);
+	if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
+		return;
+
+	oist = &__get_cpu_var(orig_ist);
+	estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ;
+	estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
+	if (regs->sp >= estack_top && regs->sp <= estack_bottom)
+		return;
 
-		  "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
-			current->comm, curbase, regs->sp);
+	WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
+		current->comm, curbase, regs->sp,
+		irq_stack_top, irq_stack_bottom,
+		estack_top, estack_bottom);
 #endif
 }
 
-- 
cgit v1.1


From 55af77969fbd7a841838220ea2287432e0da8ae5 Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Date: Tue, 29 Nov 2011 15:08:36 +0900
Subject: x86: Panic on detection of stack overflow

Currently, messages are just output on the detection of stack
overflow, which is not sufficient for systems that need a
high reliability. This is because in general the overflow may
corrupt data, and the additional corruption may occur due to
reading them unless systems stop.

This patch adds the sysctl parameter
kernel.panic_on_stackoverflow and causes a panic when detecting
the overflows of kernel, IRQ and exception stacks except user
stack according to the parameter. It is disabled by default.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq_32.c | 2 ++
 arch/x86/kernel/irq_64.c | 5 +++++
 2 files changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 7209070..e16e99eb 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -43,6 +43,8 @@ static void print_stack_overflow(void)
 {
 	printk(KERN_WARNING "low stack detected by irq handler\n");
 	dump_stack();
+	if (sysctl_panic_on_stackoverflow)
+		panic("low stack detected by irq handler - check messages\n");
 }
 
 #else
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 928a7e9..42552b0 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
 DEFINE_PER_CPU(struct pt_regs *, irq_regs);
 EXPORT_PER_CPU_SYMBOL(irq_regs);
 
+int sysctl_panic_on_stackoverflow;
+
 /*
  * Probabilistic stack overflow check:
  *
@@ -65,6 +67,9 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 		current->comm, curbase, regs->sp,
 		irq_stack_top, irq_stack_bottom,
 		estack_top, estack_bottom);
+
+	if (sysctl_panic_on_stackoverflow)
+		panic("low stack detected by irq handler - check messages\n");
 #endif
 }
 
-- 
cgit v1.1


From 467e6b7a7c0eb792ebaf322ddb7363742b4ead40 Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Date: Tue, 29 Nov 2011 15:08:45 +0900
Subject: x86: Clean up the range of stack overflow checking

The overflow checking of kernel stack checks if the stack
pointer points to the available kernel stack range, which is
derived from the original overflow checking.

It is clear that curbase address is always less than low
boundary of available kernel stack. So, this patch removes the
first condition that checks if the pointer is higher than
curbase.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Randy Dunlap <rdunlap@xenotime.net>
Link: http://lkml.kernel.org/r/20111129060845.11076.40916.stgit@ltc219.sdl.hitachi.co.jp
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
---
 arch/x86/kernel/irq_64.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 42552b0..54e2b2b 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -46,10 +46,9 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 	if (user_mode_vm(regs))
 		return;
 
-	if (regs->sp >= curbase &&
-	    regs->sp <= curbase + THREAD_SIZE &&
-	    regs->sp >= curbase + sizeof(struct thread_info) +
-				  sizeof(struct pt_regs) + 128)
+	if (regs->sp >= curbase + sizeof(struct thread_info) +
+				  sizeof(struct pt_regs) + 128 &&
+	    regs->sp <= curbase + THREAD_SIZE)
 		return;
 
 	irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack);
-- 
cgit v1.1


From 3603a2512f9e69dc87914ba922eb4a0812b21cd6 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Thu, 13 Oct 2011 15:14:25 -0400
Subject: x86, reboot: Use NMI instead of REBOOT_VECTOR to stop cpus

A recent discussion started talking about the locking on the
pstore fs and how it relates to the kmsg infrastructure.  We
noticed it was possible for userspace to r/w to the pstore fs
(grabbing the locks in the process) and block the panic path
from r/w to the same fs.

The reason was the cpu with the lock could be doing work while
the crashing cpu is panic'ing.  Busting those spinlocks might
cause those cpus to step on each other's data.  Fine, fair
enough.

It was suggested it would be nice to serialize the panic path
(ie stop the other cpus) and have only one cpu running.  This
would allow us to bust the spinlocks and not worry about another
cpu stepping on the data.

Of course, smp_send_stop() does this in the panic case.
kmsg_dump() would have to be moved to be called after it.  Easy
enough.

The only problem is on x86 the smp_send_stop() function calls
the REBOOT_VECTOR.  Any cpu with irqs disabled (which pstore and
its backend ERST would do), block this IPI and thus do not stop.
 This makes it difficult to reliably log data to the pstore fs.

The patch below switches from the REBOOT_VECTOR to NMI (and
mimics what kdump does).  Switching to NMI allows us to deliver
the IPI when irqs are disabled, increasing the reliability of
this function.

However, Andi carefully noted that on some machines this
approach does not work because of broken BIOSes or whatever.

To help accomodate this, the next couple of patches will run a
selftest and provide a knob to disable.

V2:
  uses atomic ops to serialize the cpu that shuts everyone down
V3:
  comment cleanup

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robert Richter <robert.richter@amd.com>
Cc: seiji.aguchi@hds.com
Cc: vgoyal@redhat.com
Cc: mjg@redhat.com
Cc: tony.luck@intel.com
Cc: gong.chen@intel.com
Cc: satoru.moriya@hds.com
Cc: avi@redhat.com
Cc: Andi Kleen <andi@firstfloor.org>
Link: http://lkml.kernel.org/r/1318533267-18880-2-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 16204dc..e72b175 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -29,6 +29,7 @@
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 #include <asm/apic.h>
+#include <asm/nmi.h>
 /*
  *	Some notes on x86 processor bugs affecting SMP operation:
  *
@@ -148,6 +149,60 @@ void native_send_call_func_ipi(const struct cpumask *mask)
 	free_cpumask_var(allbutself);
 }
 
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+
+static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
+{
+	/* We are registered on stopping cpu too, avoid spurious NMI */
+	if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
+		return NMI_HANDLED;
+
+	stop_this_cpu(NULL);
+
+	return NMI_HANDLED;
+}
+
+static void native_nmi_stop_other_cpus(int wait)
+{
+	unsigned long flags;
+	unsigned long timeout;
+
+	if (reboot_force)
+		return;
+
+	/*
+	 * Use an own vector here because smp_call_function
+	 * does lots of things not suitable in a panic situation.
+	 */
+	if (num_online_cpus() > 1) {
+		/* did someone beat us here? */
+		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id() != -1))
+			return;
+
+		if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+					 NMI_FLAG_FIRST, "smp_stop"))
+			/* Note: we ignore failures here */
+			return;
+
+		/* sync above data before sending NMI */
+		wmb();
+
+		apic->send_IPI_allbutself(NMI_VECTOR);
+
+		/*
+		 * Don't wait longer than a second if the caller
+		 * didn't ask us to wait.
+		 */
+		timeout = USEC_PER_SEC;
+		while (num_online_cpus() > 1 && (wait || timeout--))
+			udelay(1);
+	}
+
+	local_irq_save(flags);
+	disable_local_APIC();
+	local_irq_restore(flags);
+}
+
 /*
  * this function calls the 'stop' function on all other CPUs in the system.
  */
@@ -160,7 +215,7 @@ asmlinkage void smp_reboot_interrupt(void)
 	irq_exit();
 }
 
-static void native_stop_other_cpus(int wait)
+static void native_irq_stop_other_cpus(int wait)
 {
 	unsigned long flags;
 	unsigned long timeout;
@@ -230,7 +285,7 @@ struct smp_ops smp_ops = {
 	.smp_prepare_cpus	= native_smp_prepare_cpus,
 	.smp_cpus_done		= native_smp_cpus_done,
 
-	.stop_other_cpus	= native_stop_other_cpus,
+	.stop_other_cpus	= native_nmi_stop_other_cpus,
 	.smp_send_reschedule	= native_smp_send_reschedule,
 
 	.cpu_up			= native_cpu_up,
-- 
cgit v1.1


From 99e8b9ca90d688c3ac7d3a141b701c9694a93925 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Thu, 13 Oct 2011 15:14:26 -0400
Subject: x86, NMI: Add NMI IPI selftest

The previous patch modified the stop cpus path to use NMI
instead of IRQ as the way to communicate to the other cpus to
shutdown.  There were some concerns that various machines may
have problems with using an NMI IPI.

This patch creates a selftest to check if NMI is working at
boot. The idea is to help catch any issues before the machine
panics and we learn the hard way.

Loosely based on the locking-selftest.c file, this separate file
runs a couple of simple tests and reports the results.  The
output looks like:

...
Brought up 4 CPUs
----------------
| NMI testsuite:
--------------------
  remote IPI:  ok  |
   local IPI:  ok  |
--------------------
Good, all   2 testcases passed! |
---------------------------------
Total of 4 processors activated (21330.61 BogoMIPS).
...

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robert Richter <robert.richter@amd.com>
Cc: seiji.aguchi@hds.com
Cc: vgoyal@redhat.com
Cc: mjg@redhat.com
Cc: tony.luck@intel.com
Cc: gong.chen@intel.com
Cc: satoru.moriya@hds.com
Cc: avi@redhat.com
Cc: Andi Kleen <andi@firstfloor.org>
Link: http://lkml.kernel.org/r/1318533267-18880-3-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.debug         |  12 +++
 arch/x86/include/asm/smp.h     |   6 ++
 arch/x86/kernel/Makefile       |   1 +
 arch/x86/kernel/nmi_selftest.c | 179 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/smpboot.c      |   1 +
 5 files changed, 199 insertions(+)
 create mode 100644 arch/x86/kernel/nmi_selftest.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 4caec12..97da3c1 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -287,4 +287,16 @@ config DEBUG_STRICT_USER_COPY_CHECKS
 
 	  If unsure, or if you run an older (pre 4.4) gcc, say N.
 
+config DEBUG_NMI_SELFTEST
+	bool "NMI Selftest"
+	depends on DEBUG_KERNEL
+	---help---
+	  Enabling this option turns on a quick NMI selftest to verify
+	  that the NMI behaves correctly.
+
+	  This might help diagnose strange hangs that rely on NMI to
+	  function properly.
+
+	  If unsure, say N.
+
 endmenu
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 73b11bc..0434c40 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -225,5 +225,11 @@ extern int hard_smp_processor_id(void);
 
 #endif /* CONFIG_X86_LOCAL_APIC */
 
+#ifdef CONFIG_DEBUG_NMI_SELFTEST
+extern void nmi_selftest(void);
+#else
+#define nmi_selftest() do { } while (0)
+#endif
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8baca3c..02b2f05 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -80,6 +80,7 @@ obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
 obj-$(CONFIG_AMD_NB)		+= amd_nb.o
 obj-$(CONFIG_DEBUG_RODATA_TEST)	+= test_rodata.o
 obj-$(CONFIG_DEBUG_NX_TEST)	+= test_nx.o
+obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
 
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o
 obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
new file mode 100644
index 0000000..572adb6
--- /dev/null
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -0,0 +1,179 @@
+/*
+ * arch/x86/kernel/nmi-selftest.c
+ *
+ * Testsuite for NMI: IPIs
+ *
+ * Started by Don Zickus:
+ * (using lib/locking-selftest.c as a guide)
+ *
+ *   Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com>
+ */
+
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+#define SUCCESS		0
+#define FAILURE		1
+#define TIMEOUT		2
+
+static int nmi_fail;
+
+/* check to see if NMI IPIs work on this machine */
+static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly;
+
+static int testcase_total;
+static int testcase_successes;
+static int expected_testcase_failures;
+static int unexpected_testcase_failures;
+static int unexpected_testcase_unknowns;
+
+static int nmi_unk_cb(unsigned int val, struct pt_regs *regs)
+{
+	unexpected_testcase_unknowns++;
+	return NMI_HANDLED;
+}
+
+static void init_nmi_testsuite(void)
+{
+	/* trap all the unknown NMIs we may generate */
+	register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
+}
+
+static void cleanup_nmi_testsuite(void)
+{
+	unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
+}
+
+static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
+{
+        int cpu = raw_smp_processor_id();
+
+        if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask)))
+                return NMI_HANDLED;
+
+        return NMI_DONE;
+}
+
+static void test_nmi_ipi(struct cpumask *mask)
+{
+	unsigned long timeout;
+
+	if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
+				 NMI_FLAG_FIRST, "nmi_selftest")) {
+		nmi_fail = FAILURE;
+		return;
+	}
+
+	/* sync above data before sending NMI */
+	wmb();
+
+	apic->send_IPI_mask(mask, NMI_VECTOR);
+
+	/* Don't wait longer than a second */
+	timeout = USEC_PER_SEC;
+	while (!cpumask_empty(mask) && timeout--)
+	        udelay(1);
+
+	/* What happens if we timeout, do we still unregister?? */
+	unregister_nmi_handler(NMI_LOCAL, "nmi_selftest");
+
+	if (!timeout)
+		nmi_fail = TIMEOUT;
+	return;
+}
+
+static void remote_ipi(void)
+{
+	cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+	test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void local_ipi(void)
+{
+	cpumask_clear(to_cpumask(nmi_ipi_mask));
+	cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+	test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void reset_nmi(void)
+{
+	nmi_fail = 0;
+}
+
+static void dotest(void (*testcase_fn)(void), int expected)
+{
+	testcase_fn();
+	/*
+	 * Filter out expected failures:
+	 */
+	if (nmi_fail != expected) {
+		unexpected_testcase_failures++;
+
+		if (nmi_fail == FAILURE)
+			printk("FAILED |");
+		else if (nmi_fail == TIMEOUT)
+			printk("TIMEOUT|");
+		else
+			printk("ERROR  |");
+		dump_stack();
+	} else {
+		testcase_successes++;
+		printk("  ok  |");
+	}
+	testcase_total++;
+
+	reset_nmi();
+}
+
+static inline void print_testname(const char *testname)
+{
+	printk("%12s:", testname);
+}
+
+void nmi_selftest(void)
+{
+	init_nmi_testsuite();
+
+        /*
+	 * Run the testsuite:
+	 */
+	printk("----------------\n");
+	printk("| NMI testsuite:\n");
+	printk("--------------------\n");
+
+	print_testname("remote IPI");
+	dotest(remote_ipi, SUCCESS);
+	printk("\n");
+	print_testname("local IPI");
+	dotest(local_ipi, SUCCESS);
+	printk("\n");
+
+	cleanup_nmi_testsuite();
+
+	if (unexpected_testcase_failures) {
+		printk("--------------------\n");
+		printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
+			unexpected_testcase_failures, testcase_total);
+		printk("-----------------------------------------------------------------\n");
+	} else if (expected_testcase_failures && testcase_successes) {
+		printk("--------------------\n");
+		printk("%3d out of %3d testcases failed, as expected. |\n",
+			expected_testcase_failures, testcase_total);
+		printk("----------------------------------------------------\n");
+	} else if (expected_testcase_failures && !testcase_successes) {
+		printk("--------------------\n");
+		printk("All %3d testcases failed, as expected. |\n",
+			expected_testcase_failures);
+		printk("----------------------------------------\n");
+	} else {
+		printk("--------------------\n");
+		printk("Good, all %3d testcases passed! |\n",
+			testcase_successes);
+		printk("---------------------------------\n");
+	}
+}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9f548cb..1927781 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1142,6 +1142,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 {
 	pr_debug("Boot done.\n");
 
+	nmi_selftest();
 	impress_friends();
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
-- 
cgit v1.1


From bda62633983f9db49ce0b1a9235b3709c1cda5f0 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Thu, 13 Oct 2011 15:14:27 -0400
Subject: x86, NMI: Add knob to disable using NMI IPIs to stop cpus

Some machines may exhibit problems using the NMI to stop other
cpus. This knob just allows one to revert back to the original
behaviour to help diagnose the problem.

V2:
  make function static

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robert Richter <robert.richter@amd.com>
Cc: seiji.aguchi@hds.com
Cc: vgoyal@redhat.com
Cc: mjg@redhat.com
Cc: tony.luck@intel.com
Cc: gong.chen@intel.com
Cc: satoru.moriya@hds.com
Cc: avi@redhat.com
Cc: Andi Kleen <andi@firstfloor.org>
Link: http://lkml.kernel.org/r/1318533267-18880-4-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smp.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index e72b175..113acda 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -249,6 +249,11 @@ static void native_irq_stop_other_cpus(int wait)
 	local_irq_restore(flags);
 }
 
+static void native_smp_disable_nmi_ipi(void)
+{
+	smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
+}
+
 /*
  * Reschedule call back.
  */
@@ -280,6 +285,14 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
+static int __init nonmi_ipi_setup(char *str)
+{
+        native_smp_disable_nmi_ipi();
+        return 1;
+}
+
+__setup("nonmi_ipi", nonmi_ipi_setup);
+
 struct smp_ops smp_ops = {
 	.smp_prepare_boot_cpu	= native_smp_prepare_boot_cpu,
 	.smp_prepare_cpus	= native_smp_prepare_cpus,
-- 
cgit v1.1


From 53b5650273fea486ac8ac6c1d1e9a6cd17aa31ca Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 5 Dec 2011 12:25:44 +0100
Subject: x86: Fix the 32-bit stackoverflow-debug build

The panic_on_stackoverflow variable needs to be avilable
on the 32-bit side as well ...

Cc: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq_32.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index e16e99eb..40fc861 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -28,6 +28,9 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs);
 EXPORT_PER_CPU_SYMBOL(irq_regs);
 
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
+
+int sysctl_panic_on_stackoverflow __read_mostly;
+
 /* Debugging check for stack overflow: is there less than 1KB free? */
 static int check_stack_overflow(void)
 {
-- 
cgit v1.1


From 54b0264ec8c6e90f0413ad30e2f91c65e7844613 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Wed, 16 Nov 2011 18:17:40 +0000
Subject: x86/sfi: Kill the IRQ as id hack

Nothing should now need it so take it out

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/mrst/mrst.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index b1489a0..6a21f60 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -802,8 +802,7 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *entry)
 	if (mrst_has_msic())
 		return;
 
-	/* ID as IRQ is a hack that will go away */
-	pdev = platform_device_alloc(entry->name, entry->irq);
+	pdev = platform_device_alloc(entry->name, 0);
 	if (pdev == NULL) {
 		pr_err("out of memory for SFI platform device '%s'.\n",
 			entry->name);
-- 
cgit v1.1


From 1ea7c6737c8f68453f55c894b3d07d7f48fcbef8 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 10 Nov 2011 13:29:14 +0000
Subject: x86/config: Revamp configuration for MID devices

This follows on from the patch applied in 3.2rc1 which creates
an INTEL_MID configuration. We can now add the entry for
Medfield specific code. After this is merged the final patch
will be submitted which moves the rest of the device Kconfig
dependancies to MRST/MEDFIELD/INTEL_MID as appropriate.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                | 17 +++++++++++++++++
 arch/x86/Kconfig.debug          |  6 +++---
 arch/x86/kernel/early_printk.c  |  2 +-
 arch/x86/platform/mrst/Makefile |  2 +-
 4 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cb9a104..9e7a361 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -419,6 +419,23 @@ config X86_MRST
 	  nor standard legacy replacement devices/features. e.g. Moorestown does
 	  not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
 
+config X86_MDFLD
+       bool "Medfield MID platform"
+	depends on PCI
+	depends on PCI_GOANY
+	depends on X86_IO_APIC
+	select APB_TIMER
+	select I2C
+	select SPI
+	select INTEL_SCU_IPC
+	select X86_PLATFORM_DEVICES
+	---help---
+	  Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
+	  Internet Device(MID) platform. 
+	  Unlike standard x86 PCs, Medfield does not have many legacy devices
+	  nor standard legacy replacement devices/features. e.g. Medfield does
+	  not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
+
 endif
 
 config X86_RDC321X
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bf56e17..28c3c73 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,9 +43,9 @@ config EARLY_PRINTK
 	  with klogd/syslogd or the X server. You should normally N here,
 	  unless you want to debug such a crash.
 
-config EARLY_PRINTK_MRST
-	bool "Early printk for MRST platform support"
-	depends on EARLY_PRINTK && X86_MRST
+config EARLY_PRINTK_INTEL_MID
+	bool "Early printk for Intel MID platform support"
+	depends on EARLY_PRINTK && X86_INTEL_MID
 
 config EARLY_PRINTK_DBGP
 	bool "Early printk via EHCI debug port"
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index cd28a35..7a53da0 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -240,7 +240,7 @@ static int __init setup_early_printk(char *buf)
 		if (!strncmp(buf, "xen", 3))
 			early_console_register(&xenboot_console, keep);
 #endif
-#ifdef CONFIG_EARLY_PRINTK_MRST
+#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
 		if (!strncmp(buf, "mrst", 4)) {
 			mrst_early_console_init();
 			early_console_register(&early_mrst_console, keep);
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index 1ea3877..ddeec73 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_X86_MRST)		+= mrst.o
 obj-$(CONFIG_X86_MRST)		+= vrtc.o
-obj-$(CONFIG_EARLY_PRINTK_MRST)	+= early_printk_mrst.o
+obj-$(CONFIG_EARLY_PRINTK_INTEL_MID)	+= early_printk_mrst.o
 obj-$(CONFIG_X86_MRST)		+= pmu.o
-- 
cgit v1.1


From 9af0c7a6fa860698d080481f24a342ba74b68982 Mon Sep 17 00:00:00 2001
From: Ludwig Nussel <ludwig.nussel@suse.de>
Date: Tue, 15 Nov 2011 14:46:46 -0800
Subject: x86: Fix mmap random address range

On x86_32 casting the unsigned int result of get_random_int() to
long may result in a negative value.  On x86_32 the range of
mmap_rnd() therefore was -255 to 255.  The 32bit mode on x86_64
used 0 to 255 as intended.

The bug was introduced by 675a081 ("x86: unify mmap_{32|64}.c")
in January 2008.

Signed-off-by: Ludwig Nussel <ludwig.nussel@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: harvey.harrison@gmail.com
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/201111152246.pAFMklOB028527@wpaz5.hot.corp.google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/mmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 4b5ba85..845df68 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -75,9 +75,9 @@ static unsigned long mmap_rnd(void)
 	*/
 	if (current->flags & PF_RANDOMIZE) {
 		if (mmap_is_ia32())
-			rnd = (long)get_random_int() % (1<<8);
+			rnd = get_random_int() % (1<<8);
 		else
-			rnd = (long)(get_random_int() % (1<<28));
+			rnd = get_random_int() % (1<<28);
 	}
 	return rnd << PAGE_SHIFT;
 }
-- 
cgit v1.1


From d1bbdd669298b7ca08284ddb29153dfc039dd89d Mon Sep 17 00:00:00 2001
From: Mike Ditto <mditto@google.com>
Date: Tue, 15 Nov 2011 14:46:50 -0800
Subject: arch/x86/kernel/e820.c: Eliminate bubble sort from
 sanitize_e820_map()

Replace the bubble sort in sanitize_e820_map() with a call to
the generic kernel sort function to avoid pathological
performance with large maps.

On large (thousands of entries) E820 maps, the previous code
took minutes to run; with this change it's now milliseconds.

Signed-off-by: Mike Ditto <mditto@google.com>
Cc: sassmann@kpanic.de
Cc: yuenn@google.com
Cc: Stefan Assmann <sassmann@kpanic.de>
Cc: Nancy Yuen <yuenn@google.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 59 ++++++++++++++++++++------------------------------
 1 file changed, 24 insertions(+), 35 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 303a0e4..f655f80 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -19,6 +19,7 @@
 #include <linux/acpi.h>
 #include <linux/firmware-map.h>
 #include <linux/memblock.h>
+#include <linux/sort.h>
 
 #include <asm/e820.h>
 #include <asm/proto.h>
@@ -227,22 +228,38 @@ void __init e820_print_map(char *who)
  *	   ____________________33__
  *	   ______________________4_
  */
+struct change_member {
+	struct e820entry *pbios; /* pointer to original bios entry */
+	unsigned long long addr; /* address for this change point */
+};
+
+static int __init cpcompare(const void *a, const void *b)
+{
+	struct change_member * const *app = a, * const *bpp = b;
+	const struct change_member *ap = *app, *bp = *bpp;
+
+	/*
+	 * Inputs are pointers to two elements of change_point[].  If their
+	 * addresses are unequal, their difference dominates.  If the addresses
+	 * are equal, then consider one that represents the end of its region
+	 * to be greater than one that does not.
+	 */
+	if (ap->addr != bp->addr)
+		return ap->addr > bp->addr ? 1 : -1;
+
+	return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr);
+}
 
 int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 			     u32 *pnr_map)
 {
-	struct change_member {
-		struct e820entry *pbios; /* pointer to original bios entry */
-		unsigned long long addr; /* address for this change point */
-	};
 	static struct change_member change_point_list[2*E820_X_MAX] __initdata;
 	static struct change_member *change_point[2*E820_X_MAX] __initdata;
 	static struct e820entry *overlap_list[E820_X_MAX] __initdata;
 	static struct e820entry new_bios[E820_X_MAX] __initdata;
-	struct change_member *change_tmp;
 	unsigned long current_type, last_type;
 	unsigned long long last_addr;
-	int chgidx, still_changing;
+	int chgidx;
 	int overlap_entries;
 	int new_bios_entry;
 	int old_nr, new_nr, chg_nr;
@@ -279,35 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 	chg_nr = chgidx;
 
 	/* sort change-point list by memory addresses (low -> high) */
-	still_changing = 1;
-	while (still_changing)	{
-		still_changing = 0;
-		for (i = 1; i < chg_nr; i++)  {
-			unsigned long long curaddr, lastaddr;
-			unsigned long long curpbaddr, lastpbaddr;
-
-			curaddr = change_point[i]->addr;
-			lastaddr = change_point[i - 1]->addr;
-			curpbaddr = change_point[i]->pbios->addr;
-			lastpbaddr = change_point[i - 1]->pbios->addr;
-
-			/*
-			 * swap entries, when:
-			 *
-			 * curaddr > lastaddr or
-			 * curaddr == lastaddr and curaddr == curpbaddr and
-			 * lastaddr != lastpbaddr
-			 */
-			if (curaddr < lastaddr ||
-			    (curaddr == lastaddr && curaddr == curpbaddr &&
-			     lastaddr != lastpbaddr)) {
-				change_tmp = change_point[i];
-				change_point[i] = change_point[i-1];
-				change_point[i-1] = change_tmp;
-				still_changing = 1;
-			}
-		}
-	}
+	sort(change_point, chg_nr, sizeof *change_point, cpcompare, 0);
 
 	/* create a new bios memory map, removing overlaps */
 	overlap_entries = 0;	 /* number of entries in the overlap table */
-- 
cgit v1.1


From 706d9a9c8b5758390036b9980a2b12d809599777 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Tue, 15 Nov 2011 14:48:56 -0800
Subject: arch/x86/kernel/e820.c: quiet sparse noise about plain integer as
 NULL pointer

The last parameter to sort() is a pointer to the function used
to swap items.  This parameter should be NULL, not 0, when not
used.  This quiets the following sparse warning:

 warning: Using plain integer as NULL pointer

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: hartleys@visionengravers.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index f655f80..d6bd853 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -296,7 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 	chg_nr = chgidx;
 
 	/* sort change-point list by memory addresses (low -> high) */
-	sort(change_point, chg_nr, sizeof *change_point, cpcompare, 0);
+	sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);
 
 	/* create a new bios memory map, removing overlaps */
 	overlap_entries = 0;	 /* number of entries in the overlap table */
-- 
cgit v1.1


From b565201cf75210614903ef2ae5917b4379681647 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Tue, 15 Nov 2011 15:33:56 -0800
Subject: x86: Reduce clock calibration time during slave cpu startup

Reduce the startup time for slave cpus.

Adds hooks for an arch-specific function for clock calibration.
These hooks are used on x86.  If a newly started cpu has the
same phys_proc_id as a core already active, uses the TSC for the
delay loop and has a CONSTANT_TSC, use the already-calculated
value of loops_per_jiffy.

This patch reduces the time required to start slave cpus on a
4096 cpu system from: 465 sec OLD 62 sec NEW

This reduces boot time on a 4096p system by almost 7 minutes.
Nice...

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: John Stultz <john.stultz@linaro.org>
[fix CONFIG_SMP=n build]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 16 +++++++++++-----
 arch/x86/kernel/tsc.c     | 20 ++++++++++++++++++++
 2 files changed, 31 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9f548cb..00eef55 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -207,23 +207,29 @@ static void __cpuinit smp_callin(void)
 	 * Need to setup vector mappings before we enable interrupts.
 	 */
 	setup_vector_irq(smp_processor_id());
+
+	/*
+	 * Save our processor parameters. Note: this information
+	 * is needed for clock calibration.
+	 */
+	smp_store_cpu_info(cpuid);
+
 	/*
 	 * Get our bogomips.
+	 * Update loops_per_jiffy in cpu_data. Previous call to
+	 * smp_store_cpu_info() stored a value that is close but not as
+	 * accurate as the value just calculated.
 	 *
 	 * Need to enable IRQs because it can take longer and then
 	 * the NMI watchdog might kill us.
 	 */
 	local_irq_enable();
 	calibrate_delay();
+	cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
 	local_irq_disable();
 	pr_debug("Stack at about %p\n", &cpuid);
 
 	/*
-	 * Save our processor parameters
-	 */
-	smp_store_cpu_info(cpuid);
-
-	/*
 	 * This must be done before setting cpu_online_mask
 	 * or calling notify_cpu_starting.
 	 */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index db48336..490fb33 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -995,3 +995,23 @@ void __init tsc_init(void)
 	check_system_tsc_reliable();
 }
 
+#ifdef CONFIG_SMP
+/*
+ * If we have a constant TSC and are using the TSC for the delay loop,
+ * we can skip clock calibration if another cpu in the same socket has already
+ * been calibrated. This assumes that CONSTANT_TSC applies to all
+ * cpus in the socket - this should be a safe assumption.
+ */
+unsigned long __cpuinit calibrate_delay_is_known(void)
+{
+	int i, cpu = smp_processor_id();
+
+	if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
+		return 0;
+
+	for_each_online_cpu(i)
+		if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id)
+			return cpu_data(i).loops_per_jiffy;
+	return 0;
+}
+#endif
-- 
cgit v1.1


From f9b15df466ba923a5832c9121ad8327ccf5483ef Mon Sep 17 00:00:00 2001
From: Alessandro Rubini <rubini@gnudd.com>
Date: Sat, 29 Oct 2011 00:48:42 +0200
Subject: x86/Kconfig: Cyclone-timer depends on x86-summit

CONFIG_X86_CYCLONE_TIMER depends on CONFIG_X86_32_NON_STANDARD,
which forces drivers/clocksource/cyclone.c to be compiled. The
file doesn't do anything unless enabled by
arch/x86/kernel/apic/summit_32.c

Make CONFIG_X86_CYCLONE_TIMER depend by X86_SUMMIT instead, to
avoid unnecessary code in other non-standard systems.

Signed-off-by: Alessandro Rubini <rubini@gnudd.com>
Cc: john stultz <johnstul@us.ibm.com>
Link: http://lkml.kernel.org/r/20111028224842.GA7582@mail.gnudd.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9e7a361..faf39a0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -633,7 +633,7 @@ config X86_SUMMIT_NUMA
 
 config X86_CYCLONE_TIMER
 	def_bool y
-	depends on X86_32_NON_STANDARD
+	depends on X86_SUMMIT
 
 source "arch/x86/Kconfig.cpu"
 
-- 
cgit v1.1


From 45db1c6176c8171d9ae6fa6d82e07d115a5950ca Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Mon, 5 Dec 2011 16:08:49 -0800
Subject: x86, um: Use the same style generated syscall tables as native

Now when the native kernel uses a single style of generated system
call table, follow suite for UML and implement the same style, all in
C.  This requires __NR_syscall_max and NR_syscalls to be generated; on
native this is done in asm-headers.h but that file is common to all
UML architectures; therefore put it in user-headers.h instead which
already have accommodations for architecture-specific values.

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/um/Makefile            |  3 ++-
 arch/x86/um/sys_call_table_32.S | 26 -------------------
 arch/x86/um/sys_call_table_32.c | 55 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/um/sys_call_table_64.c | 31 +++++++++--------------
 arch/x86/um/user-offsets.c      | 15 +++++++++++
 5 files changed, 84 insertions(+), 46 deletions(-)
 delete mode 100644 arch/x86/um/sys_call_table_32.S
 create mode 100644 arch/x86/um/sys_call_table_32.c

(limited to 'arch/x86')

diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 8fb5840..5d065b2 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -37,7 +37,8 @@ subarch-$(CONFIG_MODULES) += ../kernel/module.o
 USER_OBJS := bugs_$(BITS).o ptrace_user.o fault.o
 
 extra-y += user-offsets.s
-$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS)
+$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) \
+	-Iarch/x86/include/generated
 
 UNPROFILE_OBJS := stub_segv.o
 CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING)
diff --git a/arch/x86/um/sys_call_table_32.S b/arch/x86/um/sys_call_table_32.S
deleted file mode 100644
index a7ca80d..0000000
--- a/arch/x86/um/sys_call_table_32.S
+++ /dev/null
@@ -1,26 +0,0 @@
-#include <linux/linkage.h>
-/* Steal i386 syscall table for our purposes, but with some slight changes.*/
-
-#define sys_iopl sys_ni_syscall
-#define sys_ioperm sys_ni_syscall
-
-#define sys_vm86old sys_ni_syscall
-#define sys_vm86 sys_ni_syscall
-
-#define old_mmap sys_old_mmap
-
-#define ptregs_fork sys_fork
-#define ptregs_execve sys_execve
-#define ptregs_iopl sys_iopl
-#define ptregs_vm86old sys_vm86old
-#define ptregs_clone sys_clone
-#define ptregs_vm86 sys_vm86
-#define ptregs_sigaltstack sys_sigaltstack
-#define ptregs_vfork sys_vfork
-
-.section .rodata,"a"
-
-#include "../kernel/syscall_table_32.S"
-
-ENTRY(syscall_table_size)
-.long .-sys_call_table
diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
new file mode 100644
index 0000000..b897fca
--- /dev/null
+++ b/arch/x86/um/sys_call_table_32.c
@@ -0,0 +1,55 @@
+/*
+ * System call table for UML/i386, copied from arch/x86/kernel/syscall_*.c
+ * with some changes for UML.
+ */
+
+#include <linux/linkage.h>
+#include <linux/sys.h>
+#include <linux/cache.h>
+#include <generated/user_constants.h>
+
+#define __NO_STUBS
+
+/*
+ * Below you can see, in terms of #define's, the differences between the x86-64
+ * and the UML syscall table.
+ */
+
+/* Not going to be implemented by UML, since we have no hardware. */
+#define stub_iopl sys_ni_syscall
+#define sys_ioperm sys_ni_syscall
+
+#define sys_vm86old sys_ni_syscall
+#define sys_vm86 sys_ni_syscall
+
+#define old_mmap sys_old_mmap
+
+#define ptregs_fork sys_fork
+#define ptregs_execve sys_execve
+#define ptregs_iopl sys_iopl
+#define ptregs_vm86old sys_vm86old
+#define ptregs_clone sys_clone
+#define ptregs_vm86 sys_vm86
+#define ptregs_sigaltstack sys_sigaltstack
+#define ptregs_vfork sys_vfork
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_32.h>
+
+#undef __SYSCALL_I386
+#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym,
+
+typedef void (*sys_call_ptr_t)(void);
+
+extern void sys_ni_syscall(void);
+
+sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
+	/*
+	 * Smells like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
+	[0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_32.h>
+};
+
+int syscall_table_size = sizeof(sys_call_table);
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 99522f7..797a639 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -1,11 +1,12 @@
 /*
- * System call table for UML/x86-64, copied from arch/x86_64/kernel/syscall.c
+ * System call table for UML/x86-64, copied from arch/x86/kernel/syscall_*.c
  * with some changes for UML.
  */
 
 #include <linux/linkage.h>
 #include <linux/sys.h>
 #include <linux/cache.h>
+#include <generated/user_constants.h>
 
 #define __NO_STUBS
 
@@ -34,31 +35,23 @@
 #define stub_sigaltstack sys_sigaltstack
 #define stub_rt_sigreturn sys_rt_sigreturn
 
-#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_UNISTD_64_H
-#include "../../x86/include/asm/unistd_64.h"
+#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
+#include <asm/syscalls_64.h>
 
-#undef __SYSCALL
-#define __SYSCALL(nr, sym) [ nr ] = sym,
-#undef _ASM_X86_UNISTD_64_H
+#undef __SYSCALL_64
+#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym,
 
 typedef void (*sys_call_ptr_t)(void);
 
 extern void sys_ni_syscall(void);
 
-/*
- * We used to have a trick here which made sure that holes in the
- * x86_64 table were filled in with sys_ni_syscall, but a comment in
- * unistd_64.h says that holes aren't allowed, so the trick was
- * removed.
- * The trick looked like this
- *	[0 ... UM_NR_syscall_max] = &sys_ni_syscall
- * before including unistd_64.h - the later initializations overwrote
- * the sys_ni_syscall filler.
- */
-
 sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
-#include <asm/unistd_64.h>
+	/*
+	 * Smells like a compiler bug -- it doesn't work
+	 * when the & below is removed.
+	 */
+	[0 ... __NR_syscall_max] = &sys_ni_syscall,
+#include <asm/syscalls_64.h>
 };
 
 int syscall_table_size = sizeof(sys_call_table);
diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c
index ca49be8..5edf4f4 100644
--- a/arch/x86/um/user-offsets.c
+++ b/arch/x86/um/user-offsets.c
@@ -8,6 +8,18 @@
 #include <asm/ptrace.h>
 #include <asm/types.h>
 
+#ifdef __i386__
+#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_32.h>
+};
+#else
+#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+static char syscalls[] = {
+#include <asm/syscalls_64.h>
+};
+#endif
+
 #define DEFINE(sym, val) \
 	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
 
@@ -77,4 +89,7 @@ void foo(void)
 	DEFINE(UM_PROT_READ, PROT_READ);
 	DEFINE(UM_PROT_WRITE, PROT_WRITE);
 	DEFINE(UM_PROT_EXEC, PROT_EXEC);
+
+	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+	DEFINE(NR_syscalls, sizeof(syscalls));
 }
-- 
cgit v1.1


From a074335a370eca6d72f2ec890e4ae22923a2aea4 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 5 Dec 2011 22:48:49 -0800
Subject: x86, um: Mark system call tables readonly

Mark the system call tables readonly, as they already are on native,
and the 32-bit UM version was in the previous assembly version.  The
32-bit version lost it due to copy and paste from the 64-bit version,
which was missing the const.

Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Link: http://lkml.kernel.org/r/tip-45db1c6176c8171d9ae6fa6d82e07d115a5950ca@git.kernel.org
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/um/sys_call_table_32.c | 2 +-
 arch/x86/um/sys_call_table_64.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index b897fca..0606aa3 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -43,7 +43,7 @@ typedef void (*sys_call_ptr_t)(void);
 
 extern void sys_ni_syscall(void);
 
-sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
+const sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
 	/*
 	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 797a639..fe626c3 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -45,7 +45,7 @@ typedef void (*sys_call_ptr_t)(void);
 
 extern void sys_ni_syscall(void);
 
-sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
+const sys_call_ptr_t sys_call_table[] __cacheline_aligned = {
 	/*
 	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
-- 
cgit v1.1


From 855c743a27bb58a9a521bdc485ef5acfdb69badc Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Tue, 6 Dec 2011 09:08:34 +0100
Subject: x86/mm: Initialize high mem before free_all_bootmem()

Patch fixes a boot crash with pagealloc debugging enabled:

  Initializing HighMem for node 0 (000377fe:0003fff0)
  BUG: unable to handle kernel paging request at f6fefe80
  IP: [<c1621ab5>] find_range_array+0x5e/0x69
  [...]
  Call Trace:
   [<c1622064>] __get_free_all_memory_range+0x39/0xb4
   [<c1620dd0>] add_highpages_with_active_regions+0x18/0x9b
   [<c1621a2e>] set_highmem_pages_init+0x70/0x90
   [<c162122b>] mem_init+0x50/0x21b
   [<c16155bd>] start_kernel+0x1bf/0x31c
   [<c1615065>] i386_start_kernel+0x65/0x67

The crash happens when memblock wants to allocate big area for
temporary "struct range" array and reuses pages from top of low
memory, which were already passed to the buddy allocator.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: linux-mm@kvack.org
Cc: Mel Gorman <mgorman@suse.de>
Link: http://lkml.kernel.org/r/20111206080833.GB3105@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 3bebaed..a2fecb1 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -744,6 +744,17 @@ void __init mem_init(void)
 #ifdef CONFIG_FLATMEM
 	BUG_ON(!mem_map);
 #endif
+	/*
+	 * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
+	 * be done before free_all_bootmem(). Memblock use free low memory for
+	 * temporary data (see find_range_array()) and for this purpose can use
+	 * pages that was already passed to the buddy allocator, hence marked as
+	 * not accessible in the page tables when compiled with
+	 * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
+	 * important here.
+	 */
+	set_highmem_pages_init();
+
 	/* this will put all low memory onto the freelists */
 	totalram_pages += free_all_bootmem();
 
@@ -755,8 +766,6 @@ void __init mem_init(void)
 		if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
 			reservedpages++;
 
-	set_highmem_pages_init();
-
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
-- 
cgit v1.1


From 54c29c635ae91f5d75ced7bffeaa77ba37ca02bb Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Tue, 29 Nov 2011 17:05:11 +0100
Subject: mm, x86: Remove debug_pagealloc_enabled

When (no)bootmem finish operation, it pass pages to buddy
allocator. Since debug_pagealloc_enabled is not set, we will do
not protect pages, what is not what we want with
CONFIG_DEBUG_PAGEALLOC=y.

To fix remove debug_pagealloc_enabled. That variable was
introduced by commit 12d6f21e "x86: do not PSE on
CONFIG_DEBUG_PAGEALLOC=y" to get more CPA (change page
attribude) code testing. But currently we have CONFIG_CPA_DEBUG,
which test CPA.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/1322582711-14571-1-git-send-email-sgruszka@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pageattr.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index f9e5267..5031eef 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1334,12 +1334,6 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 	}
 
 	/*
-	 * If page allocator is not up yet then do not call c_p_a():
-	 */
-	if (!debug_pagealloc_enabled)
-		return;
-
-	/*
 	 * The return value is ignored as the calls cannot fail.
 	 * Large pages for identity mappings are not used at boot time
 	 * and hence no memory allocations during large page split.
-- 
cgit v1.1


From 565cbc3e934f221369a656b4469a044aa4c3f2a8 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Tue, 6 Dec 2011 13:08:59 -0500
Subject: x86, NMI: NMI-selftest should handle the UP case properly

If no remote cpus are online, then just quietly skip the remote
IPI test for now.

Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: andi@firstfloor.org
Cc: torvalds@linux-foundation.org
Cc: peterz@infradead.org
Cc: robert.richter@amd.com
Link: http://lkml.kernel.org/r/20111206180859.GR1669@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi_selftest.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 572adb6..1e42a23 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -90,7 +90,8 @@ static void remote_ipi(void)
 {
 	cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
-	test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+	if (!cpumask_empty(nmi_ipi_mask))
+		test_nmi_ipi(to_cpumask(nmi_ipi_mask));
 }
 
 static void local_ipi(void)
-- 
cgit v1.1


From d2db6610219cbcadceea6c43ee03d89068b7d759 Mon Sep 17 00:00:00 2001
From: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Date: Wed, 7 Dec 2011 17:29:10 +0900
Subject: x86: Add stack top margin for stack overflow checking

It seems that a margin for stack overflow checking is added to
top of a kernel stack but is not added to IRQ and exception
stacks in stack_overflow_check(). Therefore, the overflows of
IRQ and exception stacks are always detected only after they
actually occurred and data corruption might occur due to them.

This patch adds the margin to top of IRQ and exception stacks
as well as a kernel stack to enhance reliability.

Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20111207082910.9847.3359.stgit@ltc219.sdl.hitachi.co.jp
[ removed the #undef - we typically don't do that for uncommon names ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq_64.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 54e2b2b..d04d3ecd 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -38,6 +38,7 @@ int sysctl_panic_on_stackoverflow;
 static inline void stack_overflow_check(struct pt_regs *regs)
 {
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
+#define STACK_TOP_MARGIN	128
 	struct orig_ist *oist;
 	u64 irq_stack_top, irq_stack_bottom;
 	u64 estack_top, estack_bottom;
@@ -47,17 +48,18 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 		return;
 
 	if (regs->sp >= curbase + sizeof(struct thread_info) +
-				  sizeof(struct pt_regs) + 128 &&
+				  sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
 	    regs->sp <= curbase + THREAD_SIZE)
 		return;
 
-	irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack);
+	irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) +
+			STACK_TOP_MARGIN;
 	irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr);
 	if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
 		return;
 
 	oist = &__get_cpu_var(orig_ist);
-	estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ;
+	estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
 	estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
 	if (regs->sp >= estack_top && regs->sp <= estack_bottom)
 		return;
-- 
cgit v1.1


From 4f941c57fe7e04e38c2401d53516bfd16038c9ab Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Wed, 7 Dec 2011 16:06:30 -0500
Subject: x86, NMI: NMI selftest depends on the local apic

The selftest doesn't work with out a local apic for now.

Reported-by: Randy Durlap <rdunlap@xenotime.net>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Link: http://lkml.kernel.org/r/20111207210630.GI1669@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 97da3c1..aa4158f 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -289,7 +289,7 @@ config DEBUG_STRICT_USER_COPY_CHECKS
 
 config DEBUG_NMI_SELFTEST
 	bool "NMI Selftest"
-	depends on DEBUG_KERNEL
+	depends on DEBUG_KERNEL && X86_LOCAL_APIC
 	---help---
 	  Enabling this option turns on a quick NMI selftest to verify
 	  that the NMI behaves correctly.
-- 
cgit v1.1


From 3d6240b53e34e372be007e08f7066e7625910675 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 7 Dec 2011 14:06:12 +0300
Subject: x86, NMI: Add to_cpumask() to silence compile warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Gcc complains if we don't cast this to a struct cpumask pointer.
arch/x86/kernel/nmi_selftest.c:93:2:

	warning: passing argument 1 of ‘cpumask_empty’ from
	incompatible pointer type [enabled by default]

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Don Zickus <dzickus@redhat.com>
Link: http://lkml.kernel.org/r/20111207110612.GA3437@mwanda
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/nmi_selftest.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
index 1e42a23..0d01a8e 100644
--- a/arch/x86/kernel/nmi_selftest.c
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -90,7 +90,7 @@ static void remote_ipi(void)
 {
 	cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
-	if (!cpumask_empty(nmi_ipi_mask))
+	if (!cpumask_empty(to_cpumask(nmi_ipi_mask)))
 		test_nmi_ipi(to_cpumask(nmi_ipi_mask));
 }
 
-- 
cgit v1.1


From 54eed6cb16ec315565aaaf8e34252ca253a68b7b Mon Sep 17 00:00:00 2001
From: Petr Holasek <pholasek@redhat.com>
Date: Thu, 8 Dec 2011 13:16:41 +0100
Subject: x86/numa: Add constraints check for nid parameters

This patch adds constraint checks to the numa_set_distance()
function.

When the check triggers (this should not happen normally) it
emits a warning and avoids a store to a negative index in
numa_distance[] array - i.e. avoids memory corruption.

Negative ids can be passed when the pxm-to-nids mapping is not
properly filled while parsing the SRAT.

Signed-off-by: Petr Holasek <pholasek@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Anton Arapov <anton@redhat.com>
Link: http://lkml.kernel.org/r/20111208121640.GA2229@dhcp-27-244.brq.redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/numa.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index fbeaaf4..cdc0054 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -430,8 +430,9 @@ static int __init numa_alloc_distance(void)
  * calls are ignored until the distance table is reset with
  * numa_reset_distance().
  *
- * If @from or @to is higher than the highest known node at the time of
- * table creation or @distance doesn't make sense, the call is ignored.
+ * If @from or @to is higher than the highest known node or lower than zero
+ * at the time of table creation or @distance doesn't make sense, the call
+ * is ignored.
  * This is to allow simplification of specific NUMA config implementations.
  */
 void __init numa_set_distance(int from, int to, int distance)
@@ -439,8 +440,9 @@ void __init numa_set_distance(int from, int to, int distance)
 	if (!numa_distance && numa_alloc_distance() < 0)
 		return;
 
-	if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
-		printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
+	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
+			from < 0 || to < 0) {
+		pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
 			    from, to, distance);
 		return;
 	}
-- 
cgit v1.1


From 819a693b5a503788a7af54a3d95c4857780a7230 Mon Sep 17 00:00:00 2001
From: Wang YanQing <udknight@gmail.com>
Date: Thu, 8 Dec 2011 12:00:27 +0800
Subject: typo fixes: aera -> area, exntension -> extension

One printk and one comment typo fix.

Signed-off-by: Wang YanQing <udknight@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/pci/pcbios.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index db0e9a5..da8fe05 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -44,7 +44,7 @@ static inline void set_bios_x(void)
 	pcibios_enabled = 1;
 	set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
 	if (__supported_pte_mask & _PAGE_NX)
-		printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. Use pci=nobios if you want it NX.\n");
+		printk(KERN_INFO "PCI : PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n");
 }
 
 /*
-- 
cgit v1.1


From 47db9e7c808a45b1f86971f25eca5e38fa95ab86 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 9 Dec 2011 11:13:59 -0800
Subject: x86, um: Fix typo in 32-bit system call modifications

We override sys_iopl(), not stub_iopl(); the latter is a 64-bitism
that doesn't apply to i386 in the first place.

Reported-by: Richard Weinberger <richard@nod.at>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/um/sys_call_table_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c
index 0606aa3..416bd40 100644
--- a/arch/x86/um/sys_call_table_32.c
+++ b/arch/x86/um/sys_call_table_32.c
@@ -16,7 +16,7 @@
  */
 
 /* Not going to be implemented by UML, since we have no hardware. */
-#define stub_iopl sys_ni_syscall
+#define sys_iopl sys_ni_syscall
 #define sys_ioperm sys_ni_syscall
 
 #define sys_vm86old sys_ni_syscall
-- 
cgit v1.1


From 8af21e7e71d1ac56d9b66fb787a14fd66af7f5f7 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Sat, 27 Aug 2011 09:35:45 +0100
Subject: x86: Add missing bzImage fields to struct setup_header

commit 37ba7ab5e33c ("x86, boot: make kernel_alignment adjustable; new
bzImage fields") introduced some new fields into the bzImage header
but struct setup_header was not updated accordingly. Add the missing
'pref_address' and 'init_size' fields.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1318848017-12301-1-git-send-email-matt@console-pimps.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/bootparam.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index e020d88..2f90c51 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -64,6 +64,8 @@ struct setup_header {
 	__u32	payload_offset;
 	__u32	payload_length;
 	__u64	setup_data;
+	__u64	pref_address;
+	__u32	init_size;
 } __attribute__((packed));
 
 struct sys_desc_table {
-- 
cgit v1.1


From f7d7d01be53cb47e0ae212c4e968aa28b82d2138 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Tue, 15 Nov 2011 12:56:14 +0000
Subject: x86: Don't use magic strings for EFI loader signature

Introduce a symbol, EFI_LOADER_SIGNATURE instead of using the magic
strings, which also helps to reduce the amount of ifdeffery.

Cc: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1318848017-12301-1-git-send-email-matt@console-pimps.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/efi.h | 4 ++++
 arch/x86/kernel/setup.c    | 7 +------
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index b8d8bfc..26d8c18 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -3,6 +3,8 @@
 
 #ifdef CONFIG_X86_32
 
+#define EFI_LOADER_SIGNATURE	"EL32"
+
 extern unsigned long asmlinkage efi_call_phys(void *, ...);
 
 #define efi_call_phys0(f)		efi_call_phys(f)
@@ -35,6 +37,8 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
 
 #else /* !CONFIG_X86_32 */
 
+#define EFI_LOADER_SIGNATURE	"EL64"
+
 extern u64 efi_call0(void *fp);
 extern u64 efi_call1(void *fp, u64 arg1);
 extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9a9e40f..4d5243c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -752,12 +752,7 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #ifdef CONFIG_EFI
 	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-#ifdef CONFIG_X86_32
-		     "EL32",
-#else
-		     "EL64",
-#endif
-	 4)) {
+		     EFI_LOADER_SIGNATURE, 4)) {
 		efi_enabled = 1;
 		efi_memblock_x86_reserve_range();
 	}
-- 
cgit v1.1


From 291f36325f9f252bd76ef5f603995f37e453fc60 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Mon, 12 Dec 2011 21:27:52 +0000
Subject: x86, efi: EFI boot stub support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is currently a large divide between kernel development and the
development of EFI boot loaders. The idea behind this patch is to give
the kernel developers full control over the EFI boot process. As
H. Peter Anvin put it,

"The 'kernel carries its own stub' approach been very successful in
dealing with BIOS, and would make a lot of sense to me for EFI as
well."

This patch introduces an EFI boot stub that allows an x86 bzImage to
be loaded and executed by EFI firmware. The bzImage appears to the
firmware as an EFI application. Luckily there are enough free bits
within the bzImage header so that it can masquerade as an EFI
application, thereby coercing the EFI firmware into loading it and
jumping to its entry point. The beauty of this masquerading approach
is that both BIOS and EFI boot loaders can still load and run the same
bzImage, thereby allowing a single kernel image to work in any boot
environment.

The EFI boot stub supports multiple initrds, but they must exist on
the same partition as the bzImage. Command-line arguments for the
kernel can be appended after the bzImage name when run from the EFI
shell, e.g.

Shell> bzImage console=ttyS0 root=/dev/sdb initrd=initrd.img

v7:
 - Fix checkpatch warnings.

v6:

 - Try to allocate initrd memory just below hdr->inird_addr_max.

v5:

 - load_options_size is UTF-16, which needs dividing by 2 to convert
   to the corresponding ASCII size.

v4:

 - Don't read more than image->load_options_size

v3:

 - Fix following warnings when compiling CONFIG_EFI_STUB=n

   arch/x86/boot/tools/build.c: In function ‘main’:
   arch/x86/boot/tools/build.c:138:24: warning: unused variable ‘pe_header’
   arch/x86/boot/tools/build.c:138:15: warning: unused variable ‘file_sz’

 - As reported by Matthew Garrett, some Apple machines have GOPs that
   don't have hardware attached. We need to weed these out by
   searching for ones that handle the PCIIO protocol.

 - Don't allocate memory if no initrds are on cmdline
 - Don't trust image->load_options_size

Maarten Lankhorst noted:
 - Don't strip first argument when booted from efibootmgr
 - Don't allocate too much memory for cmdline
 - Don't update cmdline_size, the kernel considers it read-only
 - Don't accept '\n' for initrd names

v2:

 - File alignment was too large, was 8192 should be 512. Reported by
   Maarten Lankhorst on LKML.
 - Added UGA support for graphics
 - Use VIDEO_TYPE_EFI instead of hard-coded number.
 - Move linelength assignment until after we've assigned depth
 - Dynamically fill out AddressOfEntryPoint in tools/build.c
 - Don't use magic number for GDT/TSS stuff. Requested by Andi Kleen
 - The bzImage may need to be relocated as it may have been loaded at
   a high address address by the firmware. This was required to get my
   macbook booting because the firmware loaded it at 0x7cxxxxxx, which
   triggers this error in decompress_kernel(),

	if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
		error("Destination address too large");

Cc: Mike Waychison <mikew@google.com>
Cc: Matthew Garrett <mjg@redhat.com>
Tested-by: Henrik Rydberg <rydberg@euromail.se>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/1321383097.2657.9.camel@mfleming-mobl1.ger.corp.intel.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/Kconfig                       |    7 +
 arch/x86/boot/compressed/Makefile      |   10 +-
 arch/x86/boot/compressed/eboot.c       | 1014 ++++++++++++++++++++++++++++++++
 arch/x86/boot/compressed/eboot.h       |   60 ++
 arch/x86/boot/compressed/efi_stub_32.S |   86 +++
 arch/x86/boot/compressed/efi_stub_64.S |    1 +
 arch/x86/boot/compressed/head_32.S     |   22 +
 arch/x86/boot/compressed/head_64.S     |   20 +
 arch/x86/boot/compressed/string.c      |    9 +
 arch/x86/boot/header.S                 |  158 +++++
 arch/x86/boot/string.c                 |   35 ++
 arch/x86/boot/tools/build.c            |   39 ++
 arch/x86/kernel/asm-offsets.c          |    2 +
 13 files changed, 1462 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/boot/compressed/eboot.c
 create mode 100644 arch/x86/boot/compressed/eboot.h
 create mode 100644 arch/x86/boot/compressed/efi_stub_32.S
 create mode 100644 arch/x86/boot/compressed/efi_stub_64.S

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index efb4294..d71b656 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1478,6 +1478,13 @@ config EFI
 	  resultant kernel should continue to boot on existing non-EFI
 	  platforms.
 
+config EFI_STUB
+       bool "EFI stub support"
+       depends on EFI
+       ---help---
+          This kernel feature allows a bzImage to be loaded directly
+	  by EFI firmware without the use of a bootloader.
+
 config SECCOMP
 	def_bool y
 	prompt "Enable seccomp to safely compute untrusted bytecode"
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 09664ef..b123b9a 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -23,7 +23,15 @@ LDFLAGS_vmlinux := -T
 
 hostprogs-y	:= mkpiggy
 
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE
+VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
+	$(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
+	$(obj)/piggy.o
+
+ifeq ($(CONFIG_EFI_STUB), y)
+	VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o
+endif
+
+$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE
 	$(call if_changed,ld)
 	@:
 
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
new file mode 100644
index 0000000..4055e63
--- /dev/null
+++ b/arch/x86/boot/compressed/eboot.c
@@ -0,0 +1,1014 @@
+/* -----------------------------------------------------------------------
+ *
+ *   Copyright 2011 Intel Corporation; author Matt Fleming
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+#include <linux/efi.h>
+#include <asm/efi.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+
+#include "eboot.h"
+
+static efi_system_table_t *sys_table;
+
+static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size,
+			      unsigned long *desc_size)
+{
+	efi_memory_desc_t *m = NULL;
+	efi_status_t status;
+	unsigned long key;
+	u32 desc_version;
+
+	*map_size = sizeof(*m) * 32;
+again:
+	/*
+	 * Add an additional efi_memory_desc_t because we're doing an
+	 * allocation which may be in a new descriptor region.
+	 */
+	*map_size += sizeof(*m);
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, *map_size, (void **)&m);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	status = efi_call_phys5(sys_table->boottime->get_memory_map, map_size,
+				m, &key, desc_size, &desc_version);
+	if (status == EFI_BUFFER_TOO_SMALL) {
+		efi_call_phys1(sys_table->boottime->free_pool, m);
+		goto again;
+	}
+
+	if (status != EFI_SUCCESS)
+		efi_call_phys1(sys_table->boottime->free_pool, m);
+
+fail:
+	*map = m;
+	return status;
+}
+
+/*
+ * Allocate at the highest possible address that is not above 'max'.
+ */
+static efi_status_t high_alloc(unsigned long size, unsigned long align,
+			      unsigned long *addr, unsigned long max)
+{
+	unsigned long map_size, desc_size;
+	efi_memory_desc_t *map;
+	efi_status_t status;
+	unsigned long nr_pages;
+	u64 max_addr = 0;
+	int i;
+
+	status = __get_map(&map, &map_size, &desc_size);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+again:
+	for (i = 0; i < map_size / desc_size; i++) {
+		efi_memory_desc_t *desc;
+		unsigned long m = (unsigned long)map;
+		u64 start, end;
+
+		desc = (efi_memory_desc_t *)(m + (i * desc_size));
+		if (desc->type != EFI_CONVENTIONAL_MEMORY)
+			continue;
+
+		if (desc->num_pages < nr_pages)
+			continue;
+
+		start = desc->phys_addr;
+		end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
+
+		if ((start + size) > end || (start + size) > max)
+			continue;
+
+		if (end - size > max)
+			end = max;
+
+		if (round_down(end - size, align) < start)
+			continue;
+
+		start = round_down(end - size, align);
+
+		/*
+		 * Don't allocate at 0x0. It will confuse code that
+		 * checks pointers against NULL.
+		 */
+		if (start == 0x0)
+			continue;
+
+		if (start > max_addr)
+			max_addr = start;
+	}
+
+	if (!max_addr)
+		status = EFI_NOT_FOUND;
+	else {
+		status = efi_call_phys4(sys_table->boottime->allocate_pages,
+					EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+					nr_pages, &max_addr);
+		if (status != EFI_SUCCESS) {
+			max = max_addr;
+			max_addr = 0;
+			goto again;
+		}
+
+		*addr = max_addr;
+	}
+
+free_pool:
+	efi_call_phys1(sys_table->boottime->free_pool, map);
+
+fail:
+	return status;
+}
+
+/*
+ * Allocate at the lowest possible address.
+ */
+static efi_status_t low_alloc(unsigned long size, unsigned long align,
+			      unsigned long *addr)
+{
+	unsigned long map_size, desc_size;
+	efi_memory_desc_t *map;
+	efi_status_t status;
+	unsigned long nr_pages;
+	int i;
+
+	status = __get_map(&map, &map_size, &desc_size);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+	for (i = 0; i < map_size / desc_size; i++) {
+		efi_memory_desc_t *desc;
+		unsigned long m = (unsigned long)map;
+		u64 start, end;
+
+		desc = (efi_memory_desc_t *)(m + (i * desc_size));
+
+		if (desc->type != EFI_CONVENTIONAL_MEMORY)
+			continue;
+
+		if (desc->num_pages < nr_pages)
+			continue;
+
+		start = desc->phys_addr;
+		end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
+
+		/*
+		 * Don't allocate at 0x0. It will confuse code that
+		 * checks pointers against NULL. Skip the first 8
+		 * bytes so we start at a nice even number.
+		 */
+		if (start == 0x0)
+			start += 8;
+
+		start = round_up(start, align);
+		if ((start + size) > end)
+			continue;
+
+		status = efi_call_phys4(sys_table->boottime->allocate_pages,
+					EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+					nr_pages, &start);
+		if (status == EFI_SUCCESS) {
+			*addr = start;
+			break;
+		}
+	}
+
+	if (i == map_size / desc_size)
+		status = EFI_NOT_FOUND;
+
+free_pool:
+	efi_call_phys1(sys_table->boottime->free_pool, map);
+fail:
+	return status;
+}
+
+static void low_free(unsigned long size, unsigned long addr)
+{
+	unsigned long nr_pages;
+
+	nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+	efi_call_phys2(sys_table->boottime->free_pages, addr, size);
+}
+
+static void find_bits(unsigned long mask, u8 *pos, u8 *size)
+{
+	u8 first, len;
+
+	first = 0;
+	len = 0;
+
+	if (mask) {
+		while (!(mask & 0x1)) {
+			mask = mask >> 1;
+			first++;
+		}
+
+		while (mask & 0x1) {
+			mask = mask >> 1;
+			len++;
+		}
+	}
+
+	*pos = first;
+	*size = len;
+}
+
+/*
+ * See if we have Graphics Output Protocol
+ */
+static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,
+			      unsigned long size)
+{
+	struct efi_graphics_output_protocol *gop, *first_gop;
+	struct efi_pixel_bitmask pixel_info;
+	unsigned long nr_gops;
+	efi_status_t status;
+	void **gop_handle;
+	u16 width, height;
+	u32 fb_base, fb_size;
+	u32 pixels_per_scan_line;
+	int pixel_format;
+	int i;
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, size, &gop_handle);
+	if (status != EFI_SUCCESS)
+		return status;
+
+	status = efi_call_phys5(sys_table->boottime->locate_handle,
+				EFI_LOCATE_BY_PROTOCOL, proto,
+				NULL, &size, gop_handle);
+	if (status != EFI_SUCCESS)
+		goto free_handle;
+
+	first_gop = NULL;
+
+	nr_gops = size / sizeof(void *);
+	for (i = 0; i < nr_gops; i++) {
+		struct efi_graphics_output_mode_info *info;
+		efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
+		void *pciio;
+		void *h = gop_handle[i];
+
+		status = efi_call_phys3(sys_table->boottime->handle_protocol,
+					h, proto, &gop);
+		if (status != EFI_SUCCESS)
+			continue;
+
+		efi_call_phys3(sys_table->boottime->handle_protocol,
+			       h, &pciio_proto, &pciio);
+
+		status = efi_call_phys4(gop->query_mode, gop,
+					gop->mode->mode, &size, &info);
+		if (status == EFI_SUCCESS && (!first_gop || pciio)) {
+			/*
+			 * Apple provide GOPs that are not backed by
+			 * real hardware (they're used to handle
+			 * multiple displays). The workaround is to
+			 * search for a GOP implementing the PCIIO
+			 * protocol, and if one isn't found, to just
+			 * fallback to the first GOP.
+			 */
+			width = info->horizontal_resolution;
+			height = info->vertical_resolution;
+			fb_base = gop->mode->frame_buffer_base;
+			fb_size = gop->mode->frame_buffer_size;
+			pixel_format = info->pixel_format;
+			pixel_info = info->pixel_information;
+			pixels_per_scan_line = info->pixels_per_scan_line;
+
+			/*
+			 * Once we've found a GOP supporting PCIIO,
+			 * don't bother looking any further.
+			 */
+			if (pciio)
+				break;
+
+			first_gop = gop;
+		}
+	}
+
+	/* Did we find any GOPs? */
+	if (!first_gop)
+		goto free_handle;
+
+	/* EFI framebuffer */
+	si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+	si->lfb_width = width;
+	si->lfb_height = height;
+	si->lfb_base = fb_base;
+	si->lfb_size = fb_size;
+	si->pages = 1;
+
+	if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) {
+		si->lfb_depth = 32;
+		si->lfb_linelength = pixels_per_scan_line * 4;
+		si->red_size = 8;
+		si->red_pos = 0;
+		si->green_size = 8;
+		si->green_pos = 8;
+		si->blue_size = 8;
+		si->blue_pos = 16;
+		si->rsvd_size = 8;
+		si->rsvd_pos = 24;
+	} else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) {
+		si->lfb_depth = 32;
+		si->lfb_linelength = pixels_per_scan_line * 4;
+		si->red_size = 8;
+		si->red_pos = 16;
+		si->green_size = 8;
+		si->green_pos = 8;
+		si->blue_size = 8;
+		si->blue_pos = 0;
+		si->rsvd_size = 8;
+		si->rsvd_pos = 24;
+	} else if (pixel_format == PIXEL_BIT_MASK) {
+		find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size);
+		find_bits(pixel_info.green_mask, &si->green_pos,
+			  &si->green_size);
+		find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size);
+		find_bits(pixel_info.reserved_mask, &si->rsvd_pos,
+			  &si->rsvd_size);
+		si->lfb_depth = si->red_size + si->green_size +
+			si->blue_size + si->rsvd_size;
+		si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8;
+	} else {
+		si->lfb_depth = 4;
+		si->lfb_linelength = si->lfb_width / 2;
+		si->red_size = 0;
+		si->red_pos = 0;
+		si->green_size = 0;
+		si->green_pos = 0;
+		si->blue_size = 0;
+		si->blue_pos = 0;
+		si->rsvd_size = 0;
+		si->rsvd_pos = 0;
+	}
+
+free_handle:
+	efi_call_phys1(sys_table->boottime->free_pool, gop_handle);
+	return status;
+}
+
+/*
+ * See if we have Universal Graphics Adapter (UGA) protocol
+ */
+static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
+			      unsigned long size)
+{
+	struct efi_uga_draw_protocol *uga, *first_uga;
+	unsigned long nr_ugas;
+	efi_status_t status;
+	u32 width, height;
+	void **uga_handle = NULL;
+	int i;
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, size, &uga_handle);
+	if (status != EFI_SUCCESS)
+		return status;
+
+	status = efi_call_phys5(sys_table->boottime->locate_handle,
+				EFI_LOCATE_BY_PROTOCOL, uga_proto,
+				NULL, &size, uga_handle);
+	if (status != EFI_SUCCESS)
+		goto free_handle;
+
+	first_uga = NULL;
+
+	nr_ugas = size / sizeof(void *);
+	for (i = 0; i < nr_ugas; i++) {
+		efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
+		void *handle = uga_handle[i];
+		u32 w, h, depth, refresh;
+		void *pciio;
+
+		status = efi_call_phys3(sys_table->boottime->handle_protocol,
+					handle, uga_proto, &uga);
+		if (status != EFI_SUCCESS)
+			continue;
+
+		efi_call_phys3(sys_table->boottime->handle_protocol,
+			       handle, &pciio_proto, &pciio);
+
+		status = efi_call_phys5(uga->get_mode, uga, &w, &h,
+					&depth, &refresh);
+		if (status == EFI_SUCCESS && (!first_uga || pciio)) {
+			width = w;
+			height = h;
+
+			/*
+			 * Once we've found a UGA supporting PCIIO,
+			 * don't bother looking any further.
+			 */
+			if (pciio)
+				break;
+
+			first_uga = uga;
+		}
+	}
+
+	if (!first_uga)
+		goto free_handle;
+
+	/* EFI framebuffer */
+	si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+	si->lfb_depth = 32;
+	si->lfb_width = width;
+	si->lfb_height = height;
+
+	si->red_size = 8;
+	si->red_pos = 16;
+	si->green_size = 8;
+	si->green_pos = 8;
+	si->blue_size = 8;
+	si->blue_pos = 0;
+	si->rsvd_size = 8;
+	si->rsvd_pos = 24;
+
+
+free_handle:
+	efi_call_phys1(sys_table->boottime->free_pool, uga_handle);
+	return status;
+}
+
+void setup_graphics(struct boot_params *boot_params)
+{
+	efi_guid_t graphics_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID;
+	struct screen_info *si;
+	efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
+	efi_status_t status;
+	unsigned long size;
+	void **gop_handle = NULL;
+	void **uga_handle = NULL;
+
+	si = &boot_params->screen_info;
+	memset(si, 0, sizeof(*si));
+
+	size = 0;
+	status = efi_call_phys5(sys_table->boottime->locate_handle,
+				EFI_LOCATE_BY_PROTOCOL, &graphics_proto,
+				NULL, &size, gop_handle);
+	if (status == EFI_BUFFER_TOO_SMALL)
+		status = setup_gop(si, &graphics_proto, size);
+
+	if (status != EFI_SUCCESS) {
+		size = 0;
+		status = efi_call_phys5(sys_table->boottime->locate_handle,
+					EFI_LOCATE_BY_PROTOCOL, &uga_proto,
+					NULL, &size, uga_handle);
+		if (status == EFI_BUFFER_TOO_SMALL)
+			setup_uga(si, &uga_proto, size);
+	}
+}
+
+struct initrd {
+	efi_file_handle_t *handle;
+	u64 size;
+};
+
+/*
+ * Check the cmdline for a LILO-style initrd= arguments.
+ *
+ * We only support loading an initrd from the same filesystem as the
+ * kernel image.
+ */
+static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
+				    struct setup_header *hdr)
+{
+	struct initrd *initrds;
+	unsigned long initrd_addr;
+	efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
+	u64 initrd_total;
+	efi_file_io_interface_t *io;
+	efi_file_handle_t *fh;
+	efi_status_t status;
+	int nr_initrds;
+	char *str;
+	int i, j, k;
+
+	initrd_addr = 0;
+	initrd_total = 0;
+
+	str = (char *)(unsigned long)hdr->cmd_line_ptr;
+
+	j = 0;			/* See close_handles */
+
+	if (!str || !*str)
+		return EFI_SUCCESS;
+
+	for (nr_initrds = 0; *str; nr_initrds++) {
+		str = strstr(str, "initrd=");
+		if (!str)
+			break;
+
+		str += 7;
+
+		/* Skip any leading slashes */
+		while (*str == '/' || *str == '\\')
+			str++;
+
+		while (*str && *str != ' ' && *str != '\n')
+			str++;
+	}
+
+	if (!nr_initrds)
+		return EFI_SUCCESS;
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA,
+				nr_initrds * sizeof(*initrds),
+				&initrds);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	str = (char *)(unsigned long)hdr->cmd_line_ptr;
+	for (i = 0; i < nr_initrds; i++) {
+		struct initrd *initrd;
+		efi_file_handle_t *h;
+		efi_file_info_t *info;
+		efi_char16_t filename[256];
+		unsigned long info_sz;
+		efi_guid_t info_guid = EFI_FILE_INFO_ID;
+		efi_char16_t *p;
+		u64 file_sz;
+
+		str = strstr(str, "initrd=");
+		if (!str)
+			break;
+
+		str += 7;
+
+		initrd = &initrds[i];
+		p = filename;
+
+		/* Skip any leading slashes */
+		while (*str == '/' || *str == '\\')
+			str++;
+
+		while (*str && *str != ' ' && *str != '\n') {
+			if (p >= filename + sizeof(filename))
+				break;
+
+			*p++ = *str++;
+		}
+
+		*p = '\0';
+
+		/* Only open the volume once. */
+		if (!i) {
+			efi_boot_services_t *boottime;
+
+			boottime = sys_table->boottime;
+
+			status = efi_call_phys3(boottime->handle_protocol,
+					image->device_handle, &fs_proto, &io);
+			if (status != EFI_SUCCESS)
+				goto free_initrds;
+
+			status = efi_call_phys2(io->open_volume, io, &fh);
+			if (status != EFI_SUCCESS)
+				goto free_initrds;
+		}
+
+		status = efi_call_phys5(fh->open, fh, &h, filename,
+					EFI_FILE_MODE_READ, (u64)0);
+		if (status != EFI_SUCCESS)
+			goto close_handles;
+
+		initrd->handle = h;
+
+		info_sz = 0;
+		status = efi_call_phys4(h->get_info, h, &info_guid,
+					&info_sz, NULL);
+		if (status != EFI_BUFFER_TOO_SMALL)
+			goto close_handles;
+
+grow:
+		status = efi_call_phys3(sys_table->boottime->allocate_pool,
+					EFI_LOADER_DATA, info_sz, &info);
+		if (status != EFI_SUCCESS)
+			goto close_handles;
+
+		status = efi_call_phys4(h->get_info, h, &info_guid,
+					&info_sz, info);
+		if (status == EFI_BUFFER_TOO_SMALL) {
+			efi_call_phys1(sys_table->boottime->free_pool, info);
+			goto grow;
+		}
+
+		file_sz = info->file_size;
+		efi_call_phys1(sys_table->boottime->free_pool, info);
+
+		if (status != EFI_SUCCESS)
+			goto close_handles;
+
+		initrd->size = file_sz;
+		initrd_total += file_sz;
+	}
+
+	if (initrd_total) {
+		unsigned long addr;
+
+		/*
+		 * Multiple initrd's need to be at consecutive
+		 * addresses in memory, so allocate enough memory for
+		 * all the initrd's.
+		 */
+		status = high_alloc(initrd_total, 0x1000,
+				   &initrd_addr, hdr->initrd_addr_max);
+		if (status != EFI_SUCCESS)
+			goto close_handles;
+
+		/* We've run out of free low memory. */
+		if (initrd_addr > hdr->initrd_addr_max) {
+			status = EFI_INVALID_PARAMETER;
+			goto free_initrd_total;
+		}
+
+		addr = initrd_addr;
+		for (j = 0; j < nr_initrds; j++) {
+			u64 size;
+
+			size = initrds[j].size;
+			status = efi_call_phys3(fh->read, initrds[j].handle,
+						&size, addr);
+			if (status != EFI_SUCCESS)
+				goto free_initrd_total;
+
+			efi_call_phys1(fh->close, initrds[j].handle);
+
+			addr += size;
+		}
+
+	}
+
+	efi_call_phys1(sys_table->boottime->free_pool, initrds);
+
+	hdr->ramdisk_image = initrd_addr;
+	hdr->ramdisk_size = initrd_total;
+
+	return status;
+
+free_initrd_total:
+	low_free(initrd_total, initrd_addr);
+
+close_handles:
+	for (k = j; k < nr_initrds; k++)
+		efi_call_phys1(fh->close, initrds[k].handle);
+free_initrds:
+	efi_call_phys1(sys_table->boottime->free_pool, initrds);
+fail:
+	hdr->ramdisk_image = 0;
+	hdr->ramdisk_size = 0;
+
+	return status;
+}
+
+/*
+ * Because the x86 boot code expects to be passed a boot_params we
+ * need to create one ourselves (usually the bootloader would create
+ * one for us).
+ */
+static efi_status_t make_boot_params(struct boot_params *boot_params,
+				     efi_loaded_image_t *image,
+				     void *handle)
+{
+	struct efi_info *efi = &boot_params->efi_info;
+	struct apm_bios_info *bi = &boot_params->apm_bios_info;
+	struct sys_desc_table *sdt = &boot_params->sys_desc_table;
+	struct e820entry *e820_map = &boot_params->e820_map[0];
+	struct e820entry *prev = NULL;
+	struct setup_header *hdr = &boot_params->hdr;
+	unsigned long size, key, desc_size, _size;
+	efi_memory_desc_t *mem_map;
+	void *options = image->load_options;
+	u32 load_options_size = image->load_options_size / 2; /* ASCII */
+	int options_size = 0;
+	efi_status_t status;
+	__u32 desc_version;
+	unsigned long cmdline;
+	u8 nr_entries;
+	u16 *s2;
+	u8 *s1;
+	int i;
+
+	hdr->type_of_loader = 0x21;
+
+	/* Convert unicode cmdline to ascii */
+	cmdline = 0;
+	s2 = (u16 *)options;
+
+	if (s2) {
+		while (*s2 && *s2 != '\n' && options_size < load_options_size) {
+			s2++;
+			options_size++;
+		}
+
+		if (options_size) {
+			if (options_size > hdr->cmdline_size)
+				options_size = hdr->cmdline_size;
+
+			options_size++;	/* NUL termination */
+
+			status = low_alloc(options_size, 1, &cmdline);
+			if (status != EFI_SUCCESS)
+				goto fail;
+
+			s1 = (u8 *)(unsigned long)cmdline;
+			s2 = (u16 *)options;
+
+			for (i = 0; i < options_size - 1; i++)
+				*s1++ = *s2++;
+
+			*s1 = '\0';
+		}
+	}
+
+	hdr->cmd_line_ptr = cmdline;
+
+	hdr->ramdisk_image = 0;
+	hdr->ramdisk_size = 0;
+
+	status = handle_ramdisks(image, hdr);
+	if (status != EFI_SUCCESS)
+		goto free_cmdline;
+
+	setup_graphics(boot_params);
+
+	/* Clear APM BIOS info */
+	memset(bi, 0, sizeof(*bi));
+
+	memset(sdt, 0, sizeof(*sdt));
+
+	memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));
+
+	size = sizeof(*mem_map) * 32;
+
+again:
+	size += sizeof(*mem_map);
+	_size = size;
+	status = low_alloc(size, 1, (unsigned long *)&mem_map);
+	if (status != EFI_SUCCESS)
+		goto free_cmdline;
+
+	status = efi_call_phys5(sys_table->boottime->get_memory_map, &size,
+				mem_map, &key, &desc_size, &desc_version);
+	if (status == EFI_BUFFER_TOO_SMALL) {
+		low_free(_size, (unsigned long)mem_map);
+		goto again;
+	}
+
+	if (status != EFI_SUCCESS)
+		goto free_mem_map;
+
+	efi->efi_systab = (unsigned long)sys_table;
+	efi->efi_memdesc_size = desc_size;
+	efi->efi_memdesc_version = desc_version;
+	efi->efi_memmap = (unsigned long)mem_map;
+	efi->efi_memmap_size = size;
+
+#ifdef CONFIG_X86_64
+	efi->efi_systab_hi = (unsigned long)sys_table >> 32;
+	efi->efi_memmap_hi = (unsigned long)mem_map >> 32;
+#endif
+
+	/* Might as well exit boot services now */
+	status = efi_call_phys2(sys_table->boottime->exit_boot_services,
+				handle, key);
+	if (status != EFI_SUCCESS)
+		goto free_mem_map;
+
+	/* Historic? */
+	boot_params->alt_mem_k = 32 * 1024;
+
+	/*
+	 * Convert the EFI memory map to E820.
+	 */
+	nr_entries = 0;
+	for (i = 0; i < size / desc_size; i++) {
+		efi_memory_desc_t *d;
+		unsigned int e820_type = 0;
+		unsigned long m = (unsigned long)mem_map;
+
+		d = (efi_memory_desc_t *)(m + (i * desc_size));
+		switch (d->type) {
+		case EFI_RESERVED_TYPE:
+		case EFI_RUNTIME_SERVICES_CODE:
+		case EFI_RUNTIME_SERVICES_DATA:
+		case EFI_MEMORY_MAPPED_IO:
+		case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
+		case EFI_PAL_CODE:
+			e820_type = E820_RESERVED;
+			break;
+
+		case EFI_UNUSABLE_MEMORY:
+			e820_type = E820_UNUSABLE;
+			break;
+
+		case EFI_ACPI_RECLAIM_MEMORY:
+			e820_type = E820_ACPI;
+			break;
+
+		case EFI_LOADER_CODE:
+		case EFI_LOADER_DATA:
+		case EFI_BOOT_SERVICES_CODE:
+		case EFI_BOOT_SERVICES_DATA:
+		case EFI_CONVENTIONAL_MEMORY:
+			e820_type = E820_RAM;
+			break;
+
+		case EFI_ACPI_MEMORY_NVS:
+			e820_type = E820_NVS;
+			break;
+
+		default:
+			continue;
+		}
+
+		/* Merge adjacent mappings */
+		if (prev && prev->type == e820_type &&
+		    (prev->addr + prev->size) == d->phys_addr)
+			prev->size += d->num_pages << 12;
+		else {
+			e820_map->addr = d->phys_addr;
+			e820_map->size = d->num_pages << 12;
+			e820_map->type = e820_type;
+			prev = e820_map++;
+			nr_entries++;
+		}
+	}
+
+	boot_params->e820_entries = nr_entries;
+
+	return EFI_SUCCESS;
+
+free_mem_map:
+	low_free(_size, (unsigned long)mem_map);
+free_cmdline:
+	if (options_size)
+		low_free(options_size, hdr->cmd_line_ptr);
+fail:
+	return status;
+}
+
+/*
+ * On success we return a pointer to a boot_params structure, and NULL
+ * on failure.
+ */
+struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
+{
+	struct boot_params *boot_params;
+	unsigned long start, nr_pages;
+	struct desc_ptr *gdt, *idt;
+	efi_loaded_image_t *image;
+	struct setup_header *hdr;
+	efi_status_t status;
+	efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
+	struct desc_struct *desc;
+
+	sys_table = _table;
+
+	/* Check if we were booted by the EFI firmware */
+	if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
+		goto fail;
+
+	status = efi_call_phys3(sys_table->boottime->handle_protocol,
+				handle, &proto, (void *)&image);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	memset(boot_params, 0x0, 0x4000);
+
+	/* Copy first two sectors to boot_params */
+	memcpy(boot_params, image->image_base, 1024);
+
+	hdr = &boot_params->hdr;
+
+	/*
+	 * The EFI firmware loader could have placed the kernel image
+	 * anywhere in memory, but the kernel has various restrictions
+	 * on the max physical address it can run at. Attempt to move
+	 * the kernel to boot_params.pref_address, or as low as
+	 * possible.
+	 */
+	start = hdr->pref_address;
+	nr_pages = round_up(hdr->init_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+
+	status = efi_call_phys4(sys_table->boottime->allocate_pages,
+				EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+				nr_pages, &start);
+	if (status != EFI_SUCCESS) {
+		status = low_alloc(hdr->init_size, hdr->kernel_alignment,
+				   &start);
+		if (status != EFI_SUCCESS)
+			goto fail;
+	}
+
+	hdr->code32_start = (__u32)start;
+	hdr->pref_address = (__u64)(unsigned long)image->image_base;
+
+	memcpy((void *)start, image->image_base, image->image_size);
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, sizeof(*gdt),
+				(void **)&gdt);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	gdt->size = 0x800;
+	status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	status = efi_call_phys3(sys_table->boottime->allocate_pool,
+				EFI_LOADER_DATA, sizeof(*idt),
+				(void **)&idt);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	idt->size = 0;
+	idt->address = 0;
+
+	status = make_boot_params(boot_params, image, handle);
+	if (status != EFI_SUCCESS)
+		goto fail;
+
+	memset((char *)gdt->address, 0x0, gdt->size);
+	desc = (struct desc_struct *)gdt->address;
+
+	/* The first GDT is a dummy and the second is unused. */
+	desc += 2;
+
+	desc->limit0 = 0xffff;
+	desc->base0 = 0x0000;
+	desc->base1 = 0x0000;
+	desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
+	desc->s = DESC_TYPE_CODE_DATA;
+	desc->dpl = 0;
+	desc->p = 1;
+	desc->limit = 0xf;
+	desc->avl = 0;
+	desc->l = 0;
+	desc->d = SEG_OP_SIZE_32BIT;
+	desc->g = SEG_GRANULARITY_4KB;
+	desc->base2 = 0x00;
+
+	desc++;
+	desc->limit0 = 0xffff;
+	desc->base0 = 0x0000;
+	desc->base1 = 0x0000;
+	desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE;
+	desc->s = DESC_TYPE_CODE_DATA;
+	desc->dpl = 0;
+	desc->p = 1;
+	desc->limit = 0xf;
+	desc->avl = 0;
+	desc->l = 0;
+	desc->d = SEG_OP_SIZE_32BIT;
+	desc->g = SEG_GRANULARITY_4KB;
+	desc->base2 = 0x00;
+
+#ifdef CONFIG_X86_64
+	/* Task segment value */
+	desc++;
+	desc->limit0 = 0x0000;
+	desc->base0 = 0x0000;
+	desc->base1 = 0x0000;
+	desc->type = SEG_TYPE_TSS;
+	desc->s = 0;
+	desc->dpl = 0;
+	desc->p = 1;
+	desc->limit = 0x0;
+	desc->avl = 0;
+	desc->l = 0;
+	desc->d = 0;
+	desc->g = SEG_GRANULARITY_4KB;
+	desc->base2 = 0x00;
+#endif /* CONFIG_X86_64 */
+
+	asm volatile ("lidt %0" : : "m" (*idt));
+	asm volatile ("lgdt %0" : : "m" (*gdt));
+
+	asm volatile("cli");
+
+	return boot_params;
+fail:
+	return NULL;
+}
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
new file mode 100644
index 0000000..f66d023
--- /dev/null
+++ b/arch/x86/boot/compressed/eboot.h
@@ -0,0 +1,60 @@
+#ifndef BOOT_COMPRESSED_EBOOT_H
+#define BOOT_COMPRESSED_EBOOT_H
+
+#define SEG_TYPE_DATA		(0 << 3)
+#define SEG_TYPE_READ_WRITE	(1 << 1)
+#define SEG_TYPE_CODE		(1 << 3)
+#define SEG_TYPE_EXEC_READ	(1 << 1)
+#define SEG_TYPE_TSS		((1 << 3) | (1 << 0))
+#define SEG_OP_SIZE_32BIT	(1 << 0)
+#define SEG_GRANULARITY_4KB	(1 << 0)
+
+#define DESC_TYPE_CODE_DATA	(1 << 0)
+
+#define EFI_PAGE_SIZE		(1UL << EFI_PAGE_SHIFT)
+
+#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR		0
+#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR		1
+#define PIXEL_BIT_MASK					2
+#define PIXEL_BLT_ONLY					3
+#define PIXEL_FORMAT_MAX				4
+
+struct efi_pixel_bitmask {
+	u32 red_mask;
+	u32 green_mask;
+	u32 blue_mask;
+	u32 reserved_mask;
+};
+
+struct efi_graphics_output_mode_info {
+	u32 version;
+	u32 horizontal_resolution;
+	u32 vertical_resolution;
+	int pixel_format;
+	struct efi_pixel_bitmask pixel_information;
+	u32 pixels_per_scan_line;
+} __packed;
+
+struct efi_graphics_output_protocol_mode {
+	u32 max_mode;
+	u32 mode;
+	unsigned long info;
+	unsigned long size_of_info;
+	u64 frame_buffer_base;
+	unsigned long frame_buffer_size;
+} __packed;
+
+struct efi_graphics_output_protocol {
+	void *query_mode;
+	unsigned long set_mode;
+	unsigned long blt;
+	struct efi_graphics_output_protocol_mode *mode;
+};
+
+struct efi_uga_draw_protocol {
+	void *get_mode;
+	void *set_mode;
+	void *blt;
+};
+
+#endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S
new file mode 100644
index 0000000..a53440e
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_stub_32.S
@@ -0,0 +1,86 @@
+/*
+ * EFI call stub for IA32.
+ *
+ * This stub allows us to make EFI calls in physical mode with interrupts
+ * turned off. Note that this implementation is different from the one in
+ * arch/x86/platform/efi/efi_stub_32.S because we're _already_ in physical
+ * mode at this point.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+
+/*
+ * efi_call_phys(void *, ...) is a function with variable parameters.
+ * All the callers of this function assure that all the parameters are 4-bytes.
+ */
+
+/*
+ * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
+ * So we'd better save all of them at the beginning of this function and restore
+ * at the end no matter how many we use, because we can not assure EFI runtime
+ * service functions will comply with gcc calling convention, too.
+ */
+
+.text
+ENTRY(efi_call_phys)
+	/*
+	 * 0. The function can only be called in Linux kernel. So CS has been
+	 * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
+	 * the values of these registers are the same. And, the corresponding
+	 * GDT entries are identical. So I will do nothing about segment reg
+	 * and GDT, but change GDT base register in prelog and epilog.
+	 */
+
+	/*
+	 * 1. Because we haven't been relocated by this point we need to
+	 * use relative addressing.
+	 */
+	call	1f
+1:	popl	%edx
+	subl	$1b, %edx
+
+	/*
+	 * 2. Now on the top of stack is the return
+	 * address in the caller of efi_call_phys(), then parameter 1,
+	 * parameter 2, ..., param n. To make things easy, we save the return
+	 * address of efi_call_phys in a global variable.
+	 */
+	popl	%ecx
+	movl	%ecx, saved_return_addr(%edx)
+	/* get the function pointer into ECX*/
+	popl	%ecx
+	movl	%ecx, efi_rt_function_ptr(%edx)
+
+	/*
+	 * 3. Call the physical function.
+	 */
+	call	*%ecx
+
+	/*
+	 * 4. Balance the stack. And because EAX contain the return value,
+	 * we'd better not clobber it. We need to calculate our address
+	 * again because %ecx and %edx are not preserved across EFI function
+	 * calls.
+	 */
+	call	1f
+1:	popl	%edx
+	subl	$1b, %edx
+
+	movl	efi_rt_function_ptr(%edx), %ecx
+	pushl	%ecx
+
+	/*
+	 * 10. Push the saved return address onto the stack and return.
+	 */
+	movl	saved_return_addr(%edx), %ecx
+	pushl	%ecx
+	ret
+ENDPROC(efi_call_phys)
+.previous
+
+.data
+saved_return_addr:
+	.long 0
+efi_rt_function_ptr:
+	.long 0
diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S
new file mode 100644
index 0000000..cedc60d
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_stub_64.S
@@ -0,0 +1 @@
+#include "../../platform/efi/efi_stub_64.S"
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 67a655a..a055993 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -32,6 +32,28 @@
 
 	__HEAD
 ENTRY(startup_32)
+#ifdef CONFIG_EFI_STUB
+	/*
+	 * We don't need the return address, so set up the stack so
+	 * efi_main() can find its arugments.
+	 */
+	add	$0x4, %esp
+
+	call	efi_main
+	cmpl	$0, %eax
+	je	preferred_addr
+	movl	%eax, %esi
+	call	1f
+1:
+	popl	%eax
+	subl	$1b, %eax
+	subl	BP_pref_address(%esi), %eax
+	add	BP_code32_start(%esi), %eax
+	leal	preferred_addr(%eax), %eax
+	jmp	*%eax
+
+preferred_addr:
+#endif
 	cld
 	/*
 	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 35af09d..558d76c 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -199,6 +199,26 @@ ENTRY(startup_64)
 	 * an identity mapped page table being provied that maps our
 	 * entire text+data+bss and hopefully all of memory.
 	 */
+#ifdef CONFIG_EFI_STUB
+	pushq	%rsi
+	mov	%rcx, %rdi
+	mov	%rdx, %rsi
+	call	efi_main
+	popq	%rsi
+	cmpq	$0,%rax
+	je	preferred_addr
+	movq	%rax,%rsi
+	call	1f
+1:
+	popq	%rax
+	subq	$1b, %rax
+	subq	BP_pref_address(%rsi), %rax
+	add	BP_code32_start(%esi), %eax
+	leaq	preferred_addr(%rax), %rax
+	jmp	*%rax
+
+preferred_addr:
+#endif
 
 	/* Setup data segments. */
 	xorl	%eax, %eax
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index 19b3e69..ffb9c5c 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -1,2 +1,11 @@
 #include "misc.h"
+
+int memcmp(const void *s1, const void *s2, size_t len)
+{
+	u8 diff;
+	asm("repe; cmpsb; setnz %0"
+	    : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
+	return diff;
+}
+
 #include "../string.c"
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index bdb4d45..f1bbeeb 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -45,6 +45,11 @@ SYSSEG		= 0x1000		/* historical load address >> 4 */
 
 	.global bootsect_start
 bootsect_start:
+#ifdef CONFIG_EFI_STUB
+	# "MZ", MS-DOS header
+	.byte 0x4d
+	.byte 0x5a
+#endif
 
 	# Normalize the start address
 	ljmp	$BOOTSEG, $start2
@@ -79,6 +84,14 @@ bs_die:
 	# invoke the BIOS reset code...
 	ljmp	$0xf000,$0xfff0
 
+#ifdef CONFIG_EFI_STUB
+	.org	0x3c
+	#
+	# Offset to the PE header.
+	#
+	.long	pe_header
+#endif /* CONFIG_EFI_STUB */
+
 	.section ".bsdata", "a"
 bugger_off_msg:
 	.ascii	"Direct booting from floppy is no longer supported.\r\n"
@@ -87,6 +100,141 @@ bugger_off_msg:
 	.ascii	"Remove disk and press any key to reboot . . .\r\n"
 	.byte	0
 
+#ifdef CONFIG_EFI_STUB
+pe_header:
+	.ascii	"PE"
+	.word 	0
+
+coff_header:
+#ifdef CONFIG_X86_32
+	.word	0x14c				# i386
+#else
+	.word	0x8664				# x86-64
+#endif
+	.word	2				# nr_sections
+	.long	0 				# TimeDateStamp
+	.long	0				# PointerToSymbolTable
+	.long	1				# NumberOfSymbols
+	.word	section_table - optional_header	# SizeOfOptionalHeader
+#ifdef CONFIG_X86_32
+	.word	0x306				# Characteristics.
+						# IMAGE_FILE_32BIT_MACHINE |
+						# IMAGE_FILE_DEBUG_STRIPPED |
+						# IMAGE_FILE_EXECUTABLE_IMAGE |
+						# IMAGE_FILE_LINE_NUMS_STRIPPED
+#else
+	.word	0x206				# Characteristics
+						# IMAGE_FILE_DEBUG_STRIPPED |
+						# IMAGE_FILE_EXECUTABLE_IMAGE |
+						# IMAGE_FILE_LINE_NUMS_STRIPPED
+#endif
+
+optional_header:
+#ifdef CONFIG_X86_32
+	.word	0x10b				# PE32 format
+#else
+	.word	0x20b 				# PE32+ format
+#endif
+	.byte	0x02				# MajorLinkerVersion
+	.byte	0x14				# MinorLinkerVersion
+
+	# Filled in by build.c
+	.long	0				# SizeOfCode
+
+	.long	0				# SizeOfInitializedData
+	.long	0				# SizeOfUninitializedData
+
+	# Filled in by build.c
+	.long	0x0000				# AddressOfEntryPoint
+
+	.long	0x0000				# BaseOfCode
+#ifdef CONFIG_X86_32
+	.long	0				# data
+#endif
+
+extra_header_fields:
+#ifdef CONFIG_X86_32
+	.long	0				# ImageBase
+#else
+	.quad	0				# ImageBase
+#endif
+	.long	0x1000				# SectionAlignment
+	.long	0x200				# FileAlignment
+	.word	0				# MajorOperatingSystemVersion
+	.word	0				# MinorOperatingSystemVersion
+	.word	0				# MajorImageVersion
+	.word	0				# MinorImageVersion
+	.word	0				# MajorSubsystemVersion
+	.word	0				# MinorSubsystemVersion
+	.long	0				# Win32VersionValue
+
+	#
+	# The size of the bzImage is written in tools/build.c
+	#
+	.long	0				# SizeOfImage
+
+	.long	0x200				# SizeOfHeaders
+	.long	0				# CheckSum
+	.word	0xa				# Subsystem (EFI application)
+	.word	0				# DllCharacteristics
+#ifdef CONFIG_X86_32
+	.long	0				# SizeOfStackReserve
+	.long	0				# SizeOfStackCommit
+	.long	0				# SizeOfHeapReserve
+	.long	0				# SizeOfHeapCommit
+#else
+	.quad	0				# SizeOfStackReserve
+	.quad	0				# SizeOfStackCommit
+	.quad	0				# SizeOfHeapReserve
+	.quad	0				# SizeOfHeapCommit
+#endif
+	.long	0				# LoaderFlags
+	.long	0x1				# NumberOfRvaAndSizes
+
+	.quad	0				# ExportTable
+	.quad	0				# ImportTable
+	.quad	0				# ResourceTable
+	.quad	0				# ExceptionTable
+	.quad	0				# CertificationTable
+	.quad	0				# BaseRelocationTable
+
+	# Section table
+section_table:
+	.ascii	".text"
+	.byte	0
+	.byte	0
+	.byte	0
+	.long	0
+	.long	0x0				# startup_{32,64}
+	.long	0				# Size of initialized data
+						# on disk
+	.long	0x0				# startup_{32,64}
+	.long	0				# PointerToRelocations
+	.long	0				# PointerToLineNumbers
+	.word	0				# NumberOfRelocations
+	.word	0				# NumberOfLineNumbers
+	.long	0x60500020			# Characteristics (section flags)
+
+	#
+	# The EFI application loader requires a relocation section
+	# because EFI applications are relocatable and not having
+	# this section seems to confuse it. But since we don't need
+	# the loader to fixup any relocs for us just fill it with a
+	# single dummy reloc.
+	#
+	.ascii	".reloc"
+	.byte	0
+	.byte	0
+	.long	reloc_end - reloc_start
+	.long	reloc_start
+	.long	reloc_end - reloc_start		# SizeOfRawData
+	.long	reloc_start			# PointerToRawData
+	.long	0				# PointerToRelocations
+	.long	0				# PointerToLineNumbers
+	.word	0				# NumberOfRelocations
+	.word	0				# NumberOfLineNumbers
+	.long	0x42100040			# Characteristics (section flags)
+#endif /* CONFIG_EFI_STUB */
 
 	# Kernel attributes; used by setup.  This is part 1 of the
 	# header, from the old boot sector.
@@ -318,3 +466,13 @@ die:
 setup_corrupt:
 	.byte	7
 	.string	"No setup signature found...\n"
+
+	.data
+dummy:	.long	0
+
+	.section .reloc
+reloc_start:
+	.long	dummy - reloc_start
+	.long	10
+	.word	0
+reloc_end:
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 3cbc405..574dedf 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -111,3 +111,38 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas
 
 	return result;
 }
+
+/**
+ * strlen - Find the length of a string
+ * @s: The string to be sized
+ */
+size_t strlen(const char *s)
+{
+	const char *sc;
+
+	for (sc = s; *sc != '\0'; ++sc)
+		/* nothing */;
+	return sc - s;
+}
+
+/**
+ * strstr - Find the first substring in a %NUL terminated string
+ * @s1: The string to be searched
+ * @s2: The string to search for
+ */
+char *strstr(const char *s1, const char *s2)
+{
+	size_t l1, l2;
+
+	l2 = strlen(s2);
+	if (!l2)
+		return (char *)s1;
+	l1 = strlen(s1);
+	while (l1 >= l2) {
+		l1--;
+		if (!memcmp(s1, s2, l2))
+			return (char *)s1;
+		s1++;
+	}
+	return NULL;
+}
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index fdc60a0..4e9bd6b 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -135,6 +135,9 @@ static void usage(void)
 
 int main(int argc, char ** argv)
 {
+#ifdef CONFIG_EFI_STUB
+	unsigned int file_sz, pe_header;
+#endif
 	unsigned int i, sz, setup_sectors;
 	int c;
 	u32 sys_size;
@@ -194,6 +197,42 @@ int main(int argc, char ** argv)
 	buf[0x1f6] = sys_size >> 16;
 	buf[0x1f7] = sys_size >> 24;
 
+#ifdef CONFIG_EFI_STUB
+	file_sz = sz + i + ((sys_size * 16) - sz);
+
+	pe_header = *(unsigned int *)&buf[0x3c];
+
+	/* Size of code */
+	*(unsigned int *)&buf[pe_header + 0x1c] = file_sz;
+
+	/* Size of image */
+	*(unsigned int *)&buf[pe_header + 0x50] = file_sz;
+
+#ifdef CONFIG_X86_32
+	/* Address of entry point */
+	*(unsigned int *)&buf[pe_header + 0x28] = i;
+
+	/* .text size */
+	*(unsigned int *)&buf[pe_header + 0xb0] = file_sz;
+
+	/* .text size of initialised data */
+	*(unsigned int *)&buf[pe_header + 0xb8] = file_sz;
+#else
+	/*
+	 * Address of entry point. startup_32 is at the beginning and
+	 * the 64-bit entry point (startup_64) is always 512 bytes
+	 * after.
+	 */
+	*(unsigned int *)&buf[pe_header + 0x28] = i + 512;
+
+	/* .text size */
+	*(unsigned int *)&buf[pe_header + 0xc0] = file_sz;
+
+	/* .text size of initialised data */
+	*(unsigned int *)&buf[pe_header + 0xc8] = file_sz;
+#endif /* CONFIG_X86_32 */
+#endif /* CONFIG_EFI_STUB */
+
 	crc = partial_crc32(buf, i, crc);
 	if (fwrite(buf, 1, i, stdout) != i)
 		die("Writing setup failed");
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 4f13faf..68de2dc 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -67,4 +67,6 @@ void common(void) {
 	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
 	OFFSET(BP_version, boot_params, hdr.version);
 	OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
+	OFFSET(BP_pref_address, boot_params, hdr.pref_address);
+	OFFSET(BP_code32_start, boot_params, hdr.code32_start);
 }
-- 
cgit v1.1


From 3e8f9451d3db669d7c0d8b330d4f5770149d90d5 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 15 Dec 2011 22:19:41 +0000
Subject: x86: Fix INTEL_MID silly

Doh.. pass the brown paper bags - preferably filled with mince
pies..

This fixes occasional build failures.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/n/tip-r0oc1knlvzuqr69artaeq8s8@git.kernel.org
[ extended the changelog a bit ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index faf39a0..2b54a2f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -429,6 +429,7 @@ config X86_MDFLD
 	select SPI
 	select INTEL_SCU_IPC
 	select X86_PLATFORM_DEVICES
+	select X86_INTEL_MID
 	---help---
 	  Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
 	  Internet Device(MID) platform. 
-- 
cgit v1.1


From 2d2da60fb40a80cc59383121ccf763e0e0e8a42a Mon Sep 17 00:00:00 2001
From: Maarten Lankhorst <m.b.lankhorst@gmail.com>
Date: Fri, 16 Dec 2011 13:30:58 +0100
Subject: x86, efi: Break up large initrd reads

The efi boot stub tries to read the entire initrd in 1 go, however
some efi implementations hang if too much if asked to read too much
data at the same time. After some experimentation I found out that my
asrock p67 board will hang if asked to read chunks of 4MiB, so use a
safe value.

elilo reads in chunks of 16KiB, but since that requires many read
calls I use a value of 1 MiB.  hpa suggested adding individual
blacklists for when systems are found where this value causes a crash.

Signed-off-by: Maarten Lankhorst <m.b.lankhorst@gmail.com>
Link: http://lkml.kernel.org/r/4EEB3A02.3090201@gmail.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/compressed/eboot.c | 20 ++++++++++++++------
 arch/x86/boot/compressed/eboot.h |  1 +
 2 files changed, 15 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 4055e63..fec216f 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -643,14 +643,22 @@ grow:
 			u64 size;
 
 			size = initrds[j].size;
-			status = efi_call_phys3(fh->read, initrds[j].handle,
-						&size, addr);
-			if (status != EFI_SUCCESS)
-				goto free_initrd_total;
+			while (size) {
+				u64 chunksize;
+				if (size > EFI_READ_CHUNK_SIZE)
+					chunksize = EFI_READ_CHUNK_SIZE;
+				else
+					chunksize = size;
+				status = efi_call_phys3(fh->read,
+							initrds[j].handle,
+							&chunksize, addr);
+				if (status != EFI_SUCCESS)
+					goto free_initrd_total;
+				addr += chunksize;
+				size -= chunksize;
+			}
 
 			efi_call_phys1(fh->close, initrds[j].handle);
-
-			addr += size;
 		}
 
 	}
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index f66d023..3925166 100644
--- a/arch/x86/boot/compressed/eboot.h
+++ b/arch/x86/boot/compressed/eboot.h
@@ -12,6 +12,7 @@
 #define DESC_TYPE_CODE_DATA	(1 << 0)
 
 #define EFI_PAGE_SIZE		(1UL << EFI_PAGE_SHIFT)
+#define EFI_READ_CHUNK_SIZE	(1024 * 1024)
 
 #define PIXEL_RGB_RESERVED_8BIT_PER_COLOR		0
 #define PIXEL_BGR_RESERVED_8BIT_PER_COLOR		1
-- 
cgit v1.1


From a0c3832a578c84d4a93c61e22cb09c99fa9447ea Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 17 Dec 2011 21:57:25 +0000
Subject: x86/apb: Fix configuration constraints

The APB timer requires SFI, SCU and MID support

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/r/20111217215719.3743.93550.stgit@bob.linux.org.uk
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2b54a2f..ca4ee76 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -665,6 +665,7 @@ config APB_TIMER
        def_bool y if MRST
        prompt "Langwell APB Timer Support" if X86_MRST
        select DW_APB_TIMER
+       depends on X86_INTEL_MID && SFI
        help
          APB timer is the replacement for 8254, HPET on X86 MID platforms.
          The APBT provides a stable time base on SMP
-- 
cgit v1.1


From 933b9463a0ef75da681747b2dac06c1754465372 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Sat, 17 Dec 2011 17:43:40 +0000
Subject: x86/intel config: Revamp configuration to allow for Moorestown and
 Medfield

This sets all up the other bits that need to be INTEL_MID
specific rather than Moorestown specific.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/r/20111217174318.7207.91543.stgit@bob.linux.org.uk
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                | 4 ++--
 arch/x86/include/asm/fixmap.h   | 2 +-
 arch/x86/include/asm/setup.h    | 2 +-
 arch/x86/pci/Makefile           | 2 +-
 arch/x86/platform/mrst/Makefile | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ca4ee76..c3c9343 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -662,8 +662,8 @@ config HPET_EMULATE_RTC
 	depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
 
 config APB_TIMER
-       def_bool y if MRST
-       prompt "Langwell APB Timer Support" if X86_MRST
+       def_bool y if X86_INTEL_MID
+       prompt "Intel MID APB Timer Support" if X86_INTEL_MID
        select DW_APB_TIMER
        depends on X86_INTEL_MID && SFI
        help
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 460c74e..4da3c0c 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -117,7 +117,7 @@ enum fixed_addresses {
 #endif
 	FIX_TEXT_POKE1,	/* reserve 2 pages for text_poke() */
 	FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
-#ifdef	CONFIG_X86_MRST
+#ifdef	CONFIG_X86_INTEL_MID
 	FIX_LNW_VRTC,
 #endif
 	__end_of_permanent_fixed_addresses,
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 9756551..d0f19f9 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -47,7 +47,7 @@ extern void reserve_standard_io_resources(void);
 extern void i386_reserve_resources(void);
 extern void setup_default_timer_irq(void);
 
-#ifdef CONFIG_X86_MRST
+#ifdef CONFIG_X86_INTEL_MID
 extern void x86_mrst_early_setup(void);
 #else
 static inline void x86_mrst_early_setup(void) { }
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 6b8759f..75b06f3 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -15,7 +15,7 @@ obj-$(CONFIG_X86_VISWS)		+= visws.o
 
 obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o
 
-obj-$(CONFIG_X86_MRST)		+= mrst.o
+obj-$(CONFIG_X86_INTEL_MID)	+= mrst.o
 
 obj-y				+= common.o early.o
 obj-y				+= amd_bus.o bus_numa.o
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index ddeec73..7baed51 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_X86_MRST)		+= mrst.o
-obj-$(CONFIG_X86_MRST)		+= vrtc.o
+obj-$(CONFIG_X86_INTEL_MID)	+= mrst.o
+obj-$(CONFIG_X86_INTEL_MID)	+= vrtc.o
 obj-$(CONFIG_EARLY_PRINTK_INTEL_MID)	+= early_printk_mrst.o
 obj-$(CONFIG_X86_MRST)		+= pmu.o
-- 
cgit v1.1


From d79a8869d8a4b565b12a88faeff834b09a36368c Mon Sep 17 00:00:00 2001
From: Michael Demeter <michael.demeter@intel.com>
Date: Thu, 15 Dec 2011 22:31:23 +0000
Subject: x86/mrst: Add additional debug prints for pb_keys

Added additional debug output that we always seem to add
during power ons to validate firmware operation.

Signed-off-by: Michael Demeter <michael.demeter@intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/r/20111215223116.10166.50803.stgit@bob.linux.org.uk
[ fixed line breaks, formatting and commit title. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/platform/mrst/mrst.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 6a21f60..b6a33d2 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -983,6 +983,7 @@ static int __init pb_keys_init(void)
 	num = sizeof(gpio_button) / sizeof(struct gpio_keys_button);
 	for (i = 0; i < num; i++) {
 		gb[i].gpio = get_gpio_by_name(gb[i].desc);
+		pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, gb[i].gpio);
 		if (gb[i].gpio == -1)
 			continue;
 
-- 
cgit v1.1


From 88715b9ade718564fd8b1318735826370481366b Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Tue, 13 Dec 2011 12:53:07 +0200
Subject: crypto: twofish-x86_64-3way - remove unneeded LRW/XTS #ifdefs

Since LRW & XTS are selected by twofish-x86_64-3way, we don't need these
#ifdefs anymore.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish_glue_3way.c | 32 --------------------------------
 1 file changed, 32 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 954f59e..7fee8c1 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -35,14 +35,6 @@
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
 
-#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
-#define HAS_LRW
-#endif
-
-#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
-#define HAS_XTS
-#endif
-
 /* regular block cipher functions from twofish_x86_64 module */
 asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
 				const u8 *src);
@@ -442,8 +434,6 @@ static struct crypto_alg blk_ctr_alg = {
 	},
 };
 
-#if defined(HAS_LRW) || defined(HAS_XTS)
-
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
 	const unsigned int bsize = TF_BLOCK_SIZE;
@@ -474,10 +464,6 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 		twofish_dec_blk(ctx, srcdst, srcdst);
 }
 
-#endif
-
-#ifdef HAS_LRW
-
 struct twofish_lrw_ctx {
 	struct lrw_table_ctx lrw_table;
 	struct twofish_ctx twofish_ctx;
@@ -562,10 +548,6 @@ static struct crypto_alg blk_lrw_alg = {
 	},
 };
 
-#endif
-
-#ifdef HAS_XTS
-
 struct twofish_xts_ctx {
 	struct twofish_ctx tweak_ctx;
 	struct twofish_ctx crypt_ctx;
@@ -655,8 +637,6 @@ static struct crypto_alg blk_xts_alg = {
 	},
 };
 
-#endif
-
 int __init init(void)
 {
 	int err;
@@ -670,27 +650,19 @@ int __init init(void)
 	err = crypto_register_alg(&blk_ctr_alg);
 	if (err)
 		goto ctr_err;
-#ifdef HAS_LRW
 	err = crypto_register_alg(&blk_lrw_alg);
 	if (err)
 		goto blk_lrw_err;
-#endif
-#ifdef HAS_XTS
 	err = crypto_register_alg(&blk_xts_alg);
 	if (err)
 		goto blk_xts_err;
-#endif
 
 	return 0;
 
-#ifdef HAS_XTS
 	crypto_unregister_alg(&blk_xts_alg);
 blk_xts_err:
-#endif
-#ifdef HAS_LRW
 	crypto_unregister_alg(&blk_lrw_alg);
 blk_lrw_err:
-#endif
 	crypto_unregister_alg(&blk_ctr_alg);
 ctr_err:
 	crypto_unregister_alg(&blk_cbc_alg);
@@ -702,12 +674,8 @@ ecb_err:
 
 void __exit fini(void)
 {
-#ifdef HAS_XTS
 	crypto_unregister_alg(&blk_xts_alg);
-#endif
-#ifdef HAS_LRW
 	crypto_unregister_alg(&blk_lrw_alg);
-#endif
 	crypto_unregister_alg(&blk_ctr_alg);
 	crypto_unregister_alg(&blk_cbc_alg);
 	crypto_unregister_alg(&blk_ecb_alg);
-- 
cgit v1.1


From 7ba8babf84fa4e9b648e247223043785f596dd23 Mon Sep 17 00:00:00 2001
From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Date: Tue, 13 Dec 2011 12:53:17 +0200
Subject: crypto: serpent-sse2 - remove unneeded LRW/XTS #ifdefs

Since LRW & XTS are selected by serpent-sse2, we don't need these #ifdefs
anymore.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_sse2_glue.c | 40 -------------------------------------
 1 file changed, 40 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 2f5c304..7955a9b 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -47,14 +47,6 @@
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
 
-#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE)
-#define HAS_LRW
-#endif
-
-#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE)
-#define HAS_XTS
-#endif
-
 struct async_serpent_ctx {
 	struct cryptd_ablkcipher *cryptd_tfm;
 };
@@ -470,8 +462,6 @@ static struct crypto_alg blk_ctr_alg = {
 	},
 };
 
-#if defined(HAS_LRW) || defined(HAS_XTS)
-
 struct crypt_priv {
 	struct serpent_ctx *ctx;
 	bool fpu_enabled;
@@ -511,10 +501,6 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
 }
 
-#endif
-
-#ifdef HAS_LRW
-
 struct serpent_lrw_ctx {
 	struct lrw_table_ctx lrw_table;
 	struct serpent_ctx serpent_ctx;
@@ -620,10 +606,6 @@ static struct crypto_alg blk_lrw_alg = {
 	},
 };
 
-#endif
-
-#ifdef HAS_XTS
-
 struct serpent_xts_ctx {
 	struct serpent_ctx tweak_ctx;
 	struct serpent_ctx crypt_ctx;
@@ -730,8 +712,6 @@ static struct crypto_alg blk_xts_alg = {
 	},
 };
 
-#endif
-
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
 {
@@ -930,8 +910,6 @@ static struct crypto_alg ablk_ctr_alg = {
 	},
 };
 
-#ifdef HAS_LRW
-
 static int ablk_lrw_init(struct crypto_tfm *tfm)
 {
 	struct cryptd_ablkcipher *cryptd_tfm;
@@ -970,10 +948,6 @@ static struct crypto_alg ablk_lrw_alg = {
 	},
 };
 
-#endif
-
-#ifdef HAS_XTS
-
 static int ablk_xts_init(struct crypto_tfm *tfm)
 {
 	struct cryptd_ablkcipher *cryptd_tfm;
@@ -1010,8 +984,6 @@ static struct crypto_alg ablk_xts_alg = {
 	},
 };
 
-#endif
-
 static int __init serpent_sse2_init(void)
 {
 	int err;
@@ -1039,36 +1011,28 @@ static int __init serpent_sse2_init(void)
 	err = crypto_register_alg(&ablk_ctr_alg);
 	if (err)
 		goto ablk_ctr_err;
-#ifdef HAS_LRW
 	err = crypto_register_alg(&blk_lrw_alg);
 	if (err)
 		goto blk_lrw_err;
 	err = crypto_register_alg(&ablk_lrw_alg);
 	if (err)
 		goto ablk_lrw_err;
-#endif
-#ifdef HAS_XTS
 	err = crypto_register_alg(&blk_xts_alg);
 	if (err)
 		goto blk_xts_err;
 	err = crypto_register_alg(&ablk_xts_alg);
 	if (err)
 		goto ablk_xts_err;
-#endif
 	return err;
 
-#ifdef HAS_XTS
 	crypto_unregister_alg(&ablk_xts_alg);
 ablk_xts_err:
 	crypto_unregister_alg(&blk_xts_alg);
 blk_xts_err:
-#endif
-#ifdef HAS_LRW
 	crypto_unregister_alg(&ablk_lrw_alg);
 ablk_lrw_err:
 	crypto_unregister_alg(&blk_lrw_alg);
 blk_lrw_err:
-#endif
 	crypto_unregister_alg(&ablk_ctr_alg);
 ablk_ctr_err:
 	crypto_unregister_alg(&ablk_cbc_alg);
@@ -1086,14 +1050,10 @@ blk_ecb_err:
 
 static void __exit serpent_sse2_exit(void)
 {
-#ifdef HAS_XTS
 	crypto_unregister_alg(&ablk_xts_alg);
 	crypto_unregister_alg(&blk_xts_alg);
-#endif
-#ifdef HAS_LRW
 	crypto_unregister_alg(&ablk_lrw_alg);
 	crypto_unregister_alg(&blk_lrw_alg);
-#endif
 	crypto_unregister_alg(&ablk_ctr_alg);
 	crypto_unregister_alg(&ablk_cbc_alg);
 	crypto_unregister_alg(&ablk_ecb_alg);
-- 
cgit v1.1


From 549c89b98c4530b278dde1a3f68ce5ebbb1e6304 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 29 Nov 2011 12:44:55 -0800
Subject: x86: Do not schedule while still in NMI context

The NMI handler uses the paranoid_exit routine that checks the
NEED_RESCHED flag, and if it is set and the return is for userspace,
then interrupts are enabled, the stack is swapped to the thread's stack,
and schedule is called. The problem with this is that we are still in an
NMI context until an iret is executed. This means that any new NMIs are
now starved until an interrupt or exception occurs and does the iret.

As NMIs can not be masked and can interrupt any location, they are
treated as a special case. NEED_RESCHED should not be set in an NMI
handler. The interruption by the NMI should not disturb the work flow
for scheduling. Any IPI sent to a processor after sending the
NEED_RESCHED would have to wait for the NMI anyway, and after the IPI
finishes the schedule would be called as required.

There is no reason to do anything special leaving an NMI. Remove the
call to paranoid_exit and do a simple return. This not only fixes the
bug of starved NMIs, but it also cleans up the code.

Link: http://lkml.kernel.org/r/CA+55aFzgM55hXTs4griX5e9=v_O+=ue+7Rj0PTD=M7hFYpyULQ@mail.gmail.com

Acked-by: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "H. Peter Anvin" <hpa@linux.intel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S | 32 --------------------------------
 1 file changed, 32 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index faf8d5e..3819ea9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1489,46 +1489,14 @@ ENTRY(nmi)
 	movq %rsp,%rdi
 	movq $-1,%rsi
 	call do_nmi
-#ifdef CONFIG_TRACE_IRQFLAGS
-	/* paranoidexit; without TRACE_IRQS_OFF */
-	/* ebx:	no swapgs flag */
-	DISABLE_INTERRUPTS(CLBR_NONE)
 	testl %ebx,%ebx				/* swapgs needed? */
 	jnz nmi_restore
-	testl $3,CS(%rsp)
-	jnz nmi_userspace
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 	RESTORE_ALL 8
 	jmp irq_return
-nmi_userspace:
-	GET_THREAD_INFO(%rcx)
-	movl TI_flags(%rcx),%ebx
-	andl $_TIF_WORK_MASK,%ebx
-	jz nmi_swapgs
-	movq %rsp,%rdi			/* &pt_regs */
-	call sync_regs
-	movq %rax,%rsp			/* switch stack for scheduling */
-	testl $_TIF_NEED_RESCHED,%ebx
-	jnz nmi_schedule
-	movl %ebx,%edx			/* arg3: thread flags */
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	xorl %esi,%esi 			/* arg2: oldset */
-	movq %rsp,%rdi 			/* arg1: &pt_regs */
-	call do_notify_resume
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	jmp nmi_userspace
-nmi_schedule:
-	ENABLE_INTERRUPTS(CLBR_ANY)
-	call schedule
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	jmp nmi_userspace
 	CFI_ENDPROC
-#else
-	jmp paranoid_exit
-	CFI_ENDPROC
-#endif
 END(nmi)
 
 ENTRY(ignore_sysret)
-- 
cgit v1.1


From 1fd466efc88c48f50e5ee29f4dbb4e210a889172 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 8 Dec 2011 12:32:27 -0500
Subject: x86: Document the NMI handler about not using paranoid_exit

Linus cleaned up the NMI handler but it still needs some comments to
explain why it uses save_paranoid but not paranoid_exit. Just to keep
others from adding that in the future, document why it's not used.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3819ea9..d1d5434 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1480,9 +1480,16 @@ END(error_exit)
 ENTRY(nmi)
 	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	pushq_cfi $-1
+	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+	/*
+	 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+	 * as we should not be calling schedule in NMI context.
+	 * Even with normal interrupts enabled. An NMI should not be
+	 * setting NEED_RESCHED or anything that normal interrupts and
+	 * exceptions might do.
+	 */
 	call save_paranoid
 	DEFAULT_FRAME 0
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
-- 
cgit v1.1


From 3f3c8b8c4b2a34776c3470142a7c8baafcda6eb0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 8 Dec 2011 12:36:23 -0500
Subject: x86: Add workaround to NMI iret woes

In x86, when an NMI goes off, the CPU goes into an NMI context that
prevents other NMIs to trigger on that CPU. If an NMI is suppose to
trigger, it has to wait till the previous NMI leaves NMI context.
At that time, the next NMI can trigger (note, only one more NMI will
trigger, as only one can be latched at a time).

The way x86 gets out of NMI context is by calling iret. The problem
with this is that this causes problems if the NMI handle either
triggers an exception, or a breakpoint. Both the exception and the
breakpoint handlers will finish with an iret. If this happens while
in NMI context, the CPU will leave NMI context and a new NMI may come
in. As NMI handlers are not made to be re-entrant, this can cause
havoc with the system, not to mention, the nested NMI will write
all over the previous NMI's stack.

Linus Torvalds proposed the following workaround to this problem:

https://lkml.org/lkml/2010/7/14/264

"In fact, I wonder if we couldn't just do a software NMI disable
instead? Hav ea per-cpu variable (in the _core_ percpu areas that get
allocated statically) that points to the NMI stack frame, and just
make the NMI code itself do something like

 NMI entry:
 - load percpu NMI stack frame pointer
 - if non-zero we know we're nested, and should ignore this NMI:
    - we're returning to kernel mode, so return immediately by using
"popf/ret", which also keeps NMI's disabled in the hardware until the
"real" NMI iret happens.
    - before the popf/iret, use the NMI stack pointer to make the NMI
return stack be invalid and cause a fault
  - set the NMI stack pointer to the current stack pointer

 NMI exit (not the above "immediate exit because we nested"):
   clear the percpu NMI stack pointer
   Just do the iret.

Now, the thing is, now the "iret" is atomic. If we had a nested NMI,
we'll take a fault, and that re-does our "delayed" NMI - and NMI's
will stay masked.

And if we didn't have a nested NMI, that iret will now unmask NMI's,
and everything is happy."

I first tried to follow this advice but as I started implementing this
code, a few gotchas showed up.

One, is accessing per-cpu variables in the NMI handler.

The problem is that per-cpu variables use the %gs register to get the
variable for the given CPU. But as the NMI may happen in userspace,
we must first perform a SWAPGS to get to it. The NMI handler already
does this later in the code, but its too late as we have saved off
all the registers and we don't want to do that for a disabled NMI.

Peter Zijlstra suggested to keep all variables on the stack. This
simplifies things greatly and it has the added benefit of cache locality.

Two, faulting on the iret.

I really wanted to make this work, but it was becoming very hacky, and
I never got it to be stable. The iret already had a fault handler for
userspace faulting with bad segment registers, and getting NMI to trigger
a fault and detect it was very tricky. But for strange reasons, the system
would usually take a double fault and crash. I never figured out why
and decided to go with a simple "jmp" approach. The new approach I took
also simplified things.

Finally, the last problem with Linus's approach was to have the nested
NMI handler do a ret instead of an iret to give the first NMI NMI-context
again.

The problem is that ret is much more limited than an iret. I couldn't figure
out how to get the stack back where it belonged. I could have copied the
current stack, pushed the return onto it, but my fear here is that there
may be some place that writes data below the stack pointer. I know that
is not something code should depend on, but I don't want to chance it.
I may add this feature later, but for now, an NMI handler that loses NMI
context will not get it back.

Here's what is done:

When an NMI comes in, the HW pushes the interrupt stack frame onto the
per cpu NMI stack that is selected by the IST.

A special location on the NMI stack holds a variable that is set when
the first NMI handler runs. If this variable is set then we know that
this is a nested NMI and we process the nested NMI code.

There is still a race when this variable is cleared and an NMI comes
in just before the first NMI does the return. For this case, if the
variable is cleared, we also check if the interrupted stack is the
NMI stack. If it is, then we process the nested NMI code.

Why the two tests and not just test the interrupted stack?

If the first NMI hits a breakpoint and loses NMI context, and then it
hits another breakpoint and while processing that breakpoint we get a
nested NMI. When processing a breakpoint, the stack changes to the
breakpoint stack. If another NMI comes in here we can't rely on the
interrupted stack to be the NMI stack.

If the variable is not set and the interrupted task's stack is not the
NMI stack, then we know this is the first NMI and we can process things
normally. But in order to do so, we need to do a few things first.

1) Set the stack variable that tells us that we are in an NMI handler

2) Make two copies of the interrupt stack frame.
   One copy is used to return on iret
   The other is used to restore the first one if we have a nested NMI.

This is what the stack will look like:

	  +-------------------------+
	  | original SS             |
	  | original Return RSP     |
	  | original RFLAGS         |
	  | original CS             |
	  | original RIP            |
	  +-------------------------+
	  | temp storage for rdx    |
	  +-------------------------+
	  | NMI executing variable  |
	  +-------------------------+
	  | Saved SS                |
	  | Saved Return RSP        |
	  | Saved RFLAGS            |
	  | Saved CS                |
	  | Saved RIP               |
	  +-------------------------+
	  | copied SS               |
	  | copied Return RSP       |
	  | copied RFLAGS           |
	  | copied CS               |
	  | copied RIP              |
	  +-------------------------+
	  | pt_regs                 |
	  +-------------------------+

The original stack frame contains what the HW put in when we entered
the NMI.

We store %rdx as a temp variable to use. Both the original HW stack
frame and this %rdx storage will be clobbered by nested NMIs so we
can not rely on them later in the first NMI handler.

The next item is the special stack variable that is set when we execute
the rest of the NMI handler.

Then we have two copies of the interrupt stack. The second copy is
modified by any nested NMIs to let the first NMI know that we triggered
a second NMI (latched) and that we should repeat the NMI handler.

If the first NMI hits an exception or breakpoint that takes it out of
NMI context, if a second NMI comes in before the first one finishes,
it will update the copied interrupt stack to point to a fix up location
to trigger another NMI.

When the first NMI calls iret, it will instead jump to the fix up
location. This fix up location will copy the saved interrupt stack back
to the copy and execute the nmi handler again.

Note, the nested NMI knows enough to check if it preempted a previous
NMI handler while it is in the fixup location. If it has, it will not
modify the copied interrupt stack and will just leave as if nothing
happened. As the NMI handle is about to execute again, there's no reason
to latch now.

To test all this, I forced the NMI handler to call iret and take itself
out of NMI context. I also added assemble code to write to the serial to
make sure that it hits the nested path as well as the fix up path.
Everything seems to be working fine.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/entry_64.S | 177 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index d1d5434..b62aa29 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1475,11 +1475,166 @@ ENTRY(error_exit)
 	CFI_ENDPROC
 END(error_exit)
 
+/*
+ * Test if a given stack is an NMI stack or not.
+ */
+	.macro test_in_nmi reg stack nmi_ret normal_ret
+	cmpq %\reg, \stack
+	ja \normal_ret
+	subq $EXCEPTION_STKSZ, %\reg
+	cmpq %\reg, \stack
+	jb \normal_ret
+	jmp \nmi_ret
+	.endm
 
 	/* runs on exception stack */
 ENTRY(nmi)
 	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
+	/*
+	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
+	 * the iretq it performs will take us out of NMI context.
+	 * This means that we can have nested NMIs where the next
+	 * NMI is using the top of the stack of the previous NMI. We
+	 * can't let it execute because the nested NMI will corrupt the
+	 * stack of the previous NMI. NMI handlers are not re-entrant
+	 * anyway.
+	 *
+	 * To handle this case we do the following:
+	 *  Check the a special location on the stack that contains
+	 *  a variable that is set when NMIs are executing.
+	 *  The interrupted task's stack is also checked to see if it
+	 *  is an NMI stack.
+	 *  If the variable is not set and the stack is not the NMI
+	 *  stack then:
+	 *    o Set the special variable on the stack
+	 *    o Copy the interrupt frame into a "saved" location on the stack
+	 *    o Copy the interrupt frame into a "copy" location on the stack
+	 *    o Continue processing the NMI
+	 *  If the variable is set or the previous stack is the NMI stack:
+	 *    o Modify the "copy" location to jump to the repeate_nmi
+	 *    o return back to the first NMI
+	 *
+	 * Now on exit of the first NMI, we first clear the stack variable
+	 * The NMI stack will tell any nested NMIs at that point that it is
+	 * nested. Then we pop the stack normally with iret, and if there was
+	 * a nested NMI that updated the copy interrupt stack frame, a
+	 * jump will be made to the repeat_nmi code that will handle the second
+	 * NMI.
+	 */
+
+	/* Use %rdx as out temp variable throughout */
+	pushq_cfi %rdx
+
+	/*
+	 * Check the special variable on the stack to see if NMIs are
+	 * executing.
+	 */
+	cmp $1, -8(%rsp)
+	je nested_nmi
+
+	/*
+	 * Now test if the previous stack was an NMI stack.
+	 * We need the double check. We check the NMI stack to satisfy the
+	 * race when the first NMI clears the variable before returning.
+	 * We check the variable because the first NMI could be in a
+	 * breakpoint routine using a breakpoint stack.
+	 */
+	lea 6*8(%rsp), %rdx
+	test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+
+nested_nmi:
+	/*
+	 * Do nothing if we interrupted the fixup in repeat_nmi.
+	 * It's about to repeat the NMI handler, so we are fine
+	 * with ignoring this one.
+	 */
+	movq $repeat_nmi, %rdx
+	cmpq 8(%rsp), %rdx
+	ja 1f
+	movq $end_repeat_nmi, %rdx
+	cmpq 8(%rsp), %rdx
+	ja nested_nmi_out
+
+1:
+	/* Set up the interrupted NMIs stack to jump to repeat_nmi */
+	leaq -6*8(%rsp), %rdx
+	movq %rdx, %rsp
+	CFI_ADJUST_CFA_OFFSET 6*8
+	pushq_cfi $__KERNEL_DS
+	pushq_cfi %rdx
+	pushfq_cfi
+	pushq_cfi $__KERNEL_CS
+	pushq_cfi $repeat_nmi
+
+	/* Put stack back */
+	addq $(11*8), %rsp
+	CFI_ADJUST_CFA_OFFSET -11*8
+
+nested_nmi_out:
+	popq_cfi %rdx
+
+	/* No need to check faults here */
+	INTERRUPT_RETURN
+
+first_nmi:
+	/*
+	 * Because nested NMIs will use the pushed location that we
+	 * stored in rdx, we must keep that space available.
+	 * Here's what our stack frame will look like:
+	 * +-------------------------+
+	 * | original SS             |
+	 * | original Return RSP     |
+	 * | original RFLAGS         |
+	 * | original CS             |
+	 * | original RIP            |
+	 * +-------------------------+
+	 * | temp storage for rdx    |
+	 * +-------------------------+
+	 * | NMI executing variable  |
+	 * +-------------------------+
+	 * | Saved SS                |
+	 * | Saved Return RSP        |
+	 * | Saved RFLAGS            |
+	 * | Saved CS                |
+	 * | Saved RIP               |
+	 * +-------------------------+
+	 * | copied SS               |
+	 * | copied Return RSP       |
+	 * | copied RFLAGS           |
+	 * | copied CS               |
+	 * | copied RIP              |
+	 * +-------------------------+
+	 * | pt_regs                 |
+	 * +-------------------------+
+	 *
+	 * The saved RIP is used to fix up the copied RIP that a nested
+	 * NMI may zero out. The original stack frame and the temp storage
+	 * is also used by nested NMIs and can not be trusted on exit.
+	 */
+	/* Set the NMI executing variable on the stack. */
+	pushq_cfi $1
+
+	/* Copy the stack frame to the Saved frame */
+	.rept 5
+	pushq_cfi 6*8(%rsp)
+	.endr
+
+	/* Make another copy, this one may be modified by nested NMIs */
+	.rept 5
+	pushq_cfi 4*8(%rsp)
+	.endr
+
+	/* Do not pop rdx, nested NMIs will corrupt it */
+	movq 11*8(%rsp), %rdx
+
+	/*
+	 * Everything below this point can be preempted by a nested
+	 * NMI if the first NMI took an exception. Repeated NMIs
+	 * caused by an exception and nested NMI will start here, and
+	 * can still be preempted by another NMI.
+	 */
+restart_nmi:
 	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
 	subq $ORIG_RAX-R15, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@@ -1502,10 +1657,32 @@ nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 	RESTORE_ALL 8
+	/* Clear the NMI executing stack variable */
+	movq $0, 10*8(%rsp)
 	jmp irq_return
 	CFI_ENDPROC
 END(nmi)
 
+	/*
+	 * If an NMI hit an iret because of an exception or breakpoint,
+	 * it can lose its NMI context, and a nested NMI may come in.
+	 * In that case, the nested NMI will change the preempted NMI's
+	 * stack to jump to here when it does the final iret.
+	 */
+repeat_nmi:
+	INTR_FRAME
+	/* Update the stack variable to say we are still in NMI */
+	movq $1, 5*8(%rsp)
+
+	/* copy the saved stack back to copy stack */
+	.rept 5
+	pushq_cfi 4*8(%rsp)
+	.endr
+
+	jmp restart_nmi
+	CFI_ENDPROC
+end_repeat_nmi:
+
 ENTRY(ignore_sysret)
 	CFI_STARTPROC
 	mov $-ENOSYS,%eax
-- 
cgit v1.1


From 228bdaa95fb830e08b6acd1afd4d2c55093cabfa Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 9 Dec 2011 03:02:19 -0500
Subject: x86: Keep current stack in NMI breakpoints

We want to allow NMI handlers to have breakpoints to be able to
remove stop_machine from ftrace, kprobes and jump_labels. But if
an NMI interrupts a current breakpoint, and then it triggers a
breakpoint itself, it will switch to the breakpoint stack and
corrupt the data on it for the breakpoint processing that it
interrupted.

Instead, have the NMI check if it interrupted breakpoint processing
by checking if the stack that is currently used is a breakpoint
stack. If it is, then load a special IDT that changes the IST
for the debug exception to keep the same stack in kernel context.
When the NMI is done, it puts it back.

This way, if the NMI does trigger a breakpoint, it will keep
using the same stack and not stomp on the breakpoint data for
the breakpoint it interrupted.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/include/asm/desc.h      | 12 ++++++++++++
 arch/x86/include/asm/processor.h |  6 ++++++
 arch/x86/kernel/cpu/common.c     | 22 ++++++++++++++++++++++
 arch/x86/kernel/head_64.S        |  4 ++++
 arch/x86/kernel/nmi.c            | 15 +++++++++++++++
 arch/x86/kernel/traps.c          |  6 ++++++
 6 files changed, 65 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 41935fa..e95822d 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
 
 extern struct desc_ptr idt_descr;
 extern gate_desc idt_table[];
+extern struct desc_ptr nmi_idt_descr;
+extern gate_desc nmi_idt_table[];
 
 struct gdt_page {
 	struct desc_struct gdt[GDT_ENTRIES];
@@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
 	desc->limit = (limit >> 16) & 0xf;
 }
 
+#ifdef CONFIG_X86_64
+static inline void set_nmi_gate(int gate, void *addr)
+{
+	gate_desc s;
+
+	pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
+	write_idt_entry(nmi_idt_table, gate, &s);
+}
+#endif
+
 static inline void _set_gate(int gate, unsigned type, void *addr,
 			     unsigned dpl, unsigned ist, unsigned seg)
 {
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b650435..4b39d6d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -402,6 +402,9 @@ DECLARE_PER_CPU(char *, irq_stack_ptr);
 DECLARE_PER_CPU(unsigned int, irq_count);
 extern unsigned long kernel_eflags;
 extern asmlinkage void ignore_sysret(void);
+int is_debug_stack(unsigned long addr);
+void debug_stack_set_zero(void);
+void debug_stack_reset(void);
 #else	/* X86_64 */
 #ifdef CONFIG_CC_STACKPROTECTOR
 /*
@@ -416,6 +419,9 @@ struct stack_canary {
 };
 DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
+static inline int is_debug_stack(unsigned long addr) { return 0; }
+static inline void debug_stack_set_zero(void) { }
+static inline void debug_stack_reset(void) { }
 #endif	/* X86_64 */
 
 extern unsigned int xstate_size;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index aa003b1..caa4045 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1026,6 +1026,8 @@ __setup("clearcpuid=", setup_disablecpuid);
 
 #ifdef CONFIG_X86_64
 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
+struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
+				    (unsigned long) nmi_idt_table };
 
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
 		     irq_stack_union) __aligned(PAGE_SIZE);
@@ -1090,6 +1092,24 @@ unsigned long kernel_eflags;
  */
 DEFINE_PER_CPU(struct orig_ist, orig_ist);
 
+static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+
+int is_debug_stack(unsigned long addr)
+{
+	return addr <= __get_cpu_var(debug_stack_addr) &&
+		addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ);
+}
+
+void debug_stack_set_zero(void)
+{
+	load_idt((const struct desc_ptr *)&nmi_idt_descr);
+}
+
+void debug_stack_reset(void)
+{
+	load_idt((const struct desc_ptr *)&idt_descr);
+}
+
 #else	/* CONFIG_X86_64 */
 
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
@@ -1208,6 +1228,8 @@ void __cpuinit cpu_init(void)
 			estacks += exception_stack_sizes[v];
 			oist->ist[v] = t->x86_tss.ist[v] =
 					(unsigned long)estacks;
+			if (v == DEBUG_STACK-1)
+				per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
 		}
 	}
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e11e394..40f4eb3 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -417,6 +417,10 @@ ENTRY(phys_base)
 ENTRY(idt_table)
 	.skip IDT_ENTRIES * 16
 
+	.align L1_CACHE_BYTES
+ENTRY(nmi_idt_table)
+	.skip IDT_ENTRIES * 16
+
 	__PAGE_ALIGNED_BSS
 	.align PAGE_SIZE
 ENTRY(empty_zero_page)
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index e88f37b..de8d4b3 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,6 +408,18 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 dotraplinkage notrace __kprobes void
 do_nmi(struct pt_regs *regs, long error_code)
 {
+	int update_debug_stack = 0;
+
+	/*
+	 * If we interrupted a breakpoint, it is possible that
+	 * the nmi handler will have breakpoints too. We need to
+	 * change the IDT such that breakpoints that happen here
+	 * continue to use the NMI stack.
+	 */
+	if (unlikely(is_debug_stack(regs->sp))) {
+		debug_stack_set_zero();
+		update_debug_stack = 1;
+	}
 	nmi_enter();
 
 	inc_irq_stat(__nmi_count);
@@ -416,6 +428,9 @@ do_nmi(struct pt_regs *regs, long error_code)
 		default_do_nmi(regs);
 
 	nmi_exit();
+
+	if (unlikely(update_debug_stack))
+		debug_stack_reset();
 }
 
 void stop_nmi(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a8e3eb8..a93c5ca 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -723,4 +723,10 @@ void __init trap_init(void)
 	cpu_init();
 
 	x86_init.irqs.trap_init();
+
+#ifdef CONFIG_X86_64
+	memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
+	set_nmi_gate(1, &debug);
+	set_nmi_gate(3, &int3);
+#endif
 }
-- 
cgit v1.1


From ccd49c2391773ffbf52bb80d75c4a92b16972517 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 13 Dec 2011 16:44:16 -0500
Subject: x86: Allow NMIs to hit breakpoints in i386

With i386, NMIs and breakpoints use the current stack and they
do not reset the stack pointer to a fix point that might corrupt
a previous NMI or breakpoint (as it does in x86_64). But NMIs are
still not made to be re-entrant, and need to prevent the case that
an NMI hitting a breakpoint (which does an iret), doesn't allow
another NMI to run.

The fix is to let the NMI be in 3 different states:

1) not running
2) executing
3) latched

When no NMI is executing on a given CPU, the state is "not running".
When the first NMI comes in, the state is switched to "executing".
On exit of that NMI, a cmpxchg is performed to switch the state
back to "not running" and if that fails, the NMI is restarted.

If a breakpoint is hit and does an iret, which re-enables NMIs,
and another NMI comes in before the first NMI finished, it will
detect that the state is not in the "not running" state and the
current NMI is nested. In this case, the state is switched to "latched"
to let the interrupted NMI know to restart the NMI handler, and
the nested NMI exits without doing anything.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/nmi.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 94 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index de8d4b3..47acaf3 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -405,11 +405,84 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 		unknown_nmi_error(reason, regs);
 }
 
-dotraplinkage notrace __kprobes void
-do_nmi(struct pt_regs *regs, long error_code)
-{
-	int update_debug_stack = 0;
+/*
+ * NMIs can hit breakpoints which will cause it to lose its
+ * NMI context with the CPU when the breakpoint does an iret.
+ */
+#ifdef CONFIG_X86_32
+/*
+ * For i386, NMIs use the same stack as the kernel, and we can
+ * add a workaround to the iret problem in C. Simply have 3 states
+ * the NMI can be in.
+ *
+ *  1) not running
+ *  2) executing
+ *  3) latched
+ *
+ * When no NMI is in progress, it is in the "not running" state.
+ * When an NMI comes in, it goes into the "executing" state.
+ * Normally, if another NMI is triggered, it does not interrupt
+ * the running NMI and the HW will simply latch it so that when
+ * the first NMI finishes, it will restart the second NMI.
+ * (Note, the latch is binary, thus multiple NMIs triggering,
+ *  when one is running, are ignored. Only one NMI is restarted.)
+ *
+ * If an NMI hits a breakpoint that executes an iret, another
+ * NMI can preempt it. We do not want to allow this new NMI
+ * to run, but we want to execute it when the first one finishes.
+ * We set the state to "latched", and the first NMI will perform
+ * an cmpxchg on the state, and if it doesn't successfully
+ * reset the state to "not running" it will restart the next
+ * NMI.
+ */
+enum nmi_states {
+	NMI_NOT_RUNNING,
+	NMI_EXECUTING,
+	NMI_LATCHED,
+};
+static DEFINE_PER_CPU(enum nmi_states, nmi_state);
+
+#define nmi_nesting_preprocess(regs)					\
+	do {								\
+		if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) {	\
+			__get_cpu_var(nmi_state) = NMI_LATCHED;		\
+			return;						\
+		}							\
+	nmi_restart:							\
+		__get_cpu_var(nmi_state) = NMI_EXECUTING;		\
+	} while (0)
+
+#define nmi_nesting_postprocess()					\
+	do {								\
+		if (cmpxchg(&__get_cpu_var(nmi_state),			\
+		    NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING)	\
+			goto nmi_restart;				\
+	} while (0)
+#else /* x86_64 */
+/*
+ * In x86_64 things are a bit more difficult. This has the same problem
+ * where an NMI hitting a breakpoint that calls iret will remove the
+ * NMI context, allowing a nested NMI to enter. What makes this more
+ * difficult is that both NMIs and breakpoints have their own stack.
+ * When a new NMI or breakpoint is executed, the stack is set to a fixed
+ * point. If an NMI is nested, it will have its stack set at that same
+ * fixed address that the first NMI had, and will start corrupting the
+ * stack. This is handled in entry_64.S, but the same problem exists with
+ * the breakpoint stack.
+ *
+ * If a breakpoint is being processed, and the debug stack is being used,
+ * if an NMI comes in and also hits a breakpoint, the stack pointer
+ * will be set to the same fixed address as the breakpoint that was
+ * interrupted, causing that stack to be corrupted. To handle this case,
+ * check if the stack that was interrupted is the debug stack, and if
+ * so, change the IDT so that new breakpoints will use the current stack
+ * and not switch to the fixed address. On return of the NMI, switch back
+ * to the original IDT.
+ */
+static DEFINE_PER_CPU(int, update_debug_stack);
 
+static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+{
 	/*
 	 * If we interrupted a breakpoint, it is possible that
 	 * the nmi handler will have breakpoints too. We need to
@@ -418,8 +491,22 @@ do_nmi(struct pt_regs *regs, long error_code)
 	 */
 	if (unlikely(is_debug_stack(regs->sp))) {
 		debug_stack_set_zero();
-		update_debug_stack = 1;
+		__get_cpu_var(update_debug_stack) = 1;
 	}
+}
+
+static inline void nmi_nesting_postprocess(void)
+{
+	if (unlikely(__get_cpu_var(update_debug_stack)))
+		debug_stack_reset();
+}
+#endif
+
+dotraplinkage notrace __kprobes void
+do_nmi(struct pt_regs *regs, long error_code)
+{
+	nmi_nesting_preprocess(regs);
+
 	nmi_enter();
 
 	inc_irq_stat(__nmi_count);
@@ -429,8 +516,8 @@ do_nmi(struct pt_regs *regs, long error_code)
 
 	nmi_exit();
 
-	if (unlikely(update_debug_stack))
-		debug_stack_reset();
+	/* On i386, may loop back to preprocess */
+	nmi_nesting_postprocess();
 }
 
 void stop_nmi(void)
-- 
cgit v1.1


From 42181186ad4db986fcaa40ca95c6e407e9e79372 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 16 Dec 2011 11:43:02 -0500
Subject: x86: Add counter when debug stack is used with interrupts enabled

Mathieu Desnoyers pointed out a case that can cause issues with
NMIs running on the debug stack:

  int3 -> interrupt -> NMI -> int3

Because the interrupt changes the stack, the NMI will not see that
it preempted the debug stack. Looking deeper at this case,
interrupts only happen when the int3 is from userspace or in
an a location in the exception table (fixup).

  userspace -> int3 -> interurpt -> NMI -> int3

All other int3s that happen in the kernel should be processed
without ever enabling interrupts, as the do_trap() call will
panic the kernel if it is called to process any other location
within the kernel.

Adding a counter around the sections that enable interrupts while
using the debug stack allows the NMI to also check that case.
If the NMI sees that it either interrupted a task using the debug
stack or the debug counter is non-zero, then it will have to
change the IDT table to make the int3 not change stacks (which will
corrupt the stack if it does).

Note, I had to move the debug_usage functions out of processor.h
and into debugreg.h because of the static inlined functions to
inc and dec the debug_usage counter. __get_cpu_var() requires
smp.h which includes processor.h, and would fail to build.

Link: http://lkml.kernel.org/r/1323976535.23971.112.camel@gandalf.stny.rr.com

Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Paul Turner <pjt@google.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/include/asm/debugreg.h  | 22 ++++++++++++++++++++++
 arch/x86/include/asm/processor.h |  6 ------
 arch/x86/kernel/cpu/common.c     |  6 ++++--
 arch/x86/kernel/traps.c          | 14 ++++++++++++++
 4 files changed, 40 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 078ad0c..b903d5e 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump);
 
 extern void hw_breakpoint_restore(void);
 
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(int, debug_stack_usage);
+static inline void debug_stack_usage_inc(void)
+{
+	__get_cpu_var(debug_stack_usage)++;
+}
+static inline void debug_stack_usage_dec(void)
+{
+	__get_cpu_var(debug_stack_usage)--;
+}
+int is_debug_stack(unsigned long addr);
+void debug_stack_set_zero(void);
+void debug_stack_reset(void);
+#else /* !X86_64 */
+static inline int is_debug_stack(unsigned long addr) { return 0; }
+static inline void debug_stack_set_zero(void) { }
+static inline void debug_stack_reset(void) { }
+static inline void debug_stack_usage_inc(void) { }
+static inline void debug_stack_usage_dec(void) { }
+#endif /* X86_64 */
+
+
 #endif	/* __KERNEL__ */
 
 #endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4b39d6d..b650435 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -402,9 +402,6 @@ DECLARE_PER_CPU(char *, irq_stack_ptr);
 DECLARE_PER_CPU(unsigned int, irq_count);
 extern unsigned long kernel_eflags;
 extern asmlinkage void ignore_sysret(void);
-int is_debug_stack(unsigned long addr);
-void debug_stack_set_zero(void);
-void debug_stack_reset(void);
 #else	/* X86_64 */
 #ifdef CONFIG_CC_STACKPROTECTOR
 /*
@@ -419,9 +416,6 @@ struct stack_canary {
 };
 DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
-static inline int is_debug_stack(unsigned long addr) { return 0; }
-static inline void debug_stack_set_zero(void) { }
-static inline void debug_stack_reset(void) { }
 #endif	/* X86_64 */
 
 extern unsigned int xstate_size;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index caa4045..266e464 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1093,11 +1093,13 @@ unsigned long kernel_eflags;
 DEFINE_PER_CPU(struct orig_ist, orig_ist);
 
 static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+DEFINE_PER_CPU(int, debug_stack_usage);
 
 int is_debug_stack(unsigned long addr)
 {
-	return addr <= __get_cpu_var(debug_stack_addr) &&
-		addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ);
+	return __get_cpu_var(debug_stack_usage) ||
+		(addr <= __get_cpu_var(debug_stack_addr) &&
+		 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
 }
 
 void debug_stack_set_zero(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a93c5ca..0072b38 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -316,9 +316,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 		return;
 #endif
 
+	/*
+	 * Let others (NMI) know that the debug stack is in use
+	 * as we may switch to the interrupt stack.
+	 */
+	debug_stack_usage_inc();
 	preempt_conditional_sti(regs);
 	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
 	preempt_conditional_cli(regs);
+	debug_stack_usage_dec();
 }
 
 #ifdef CONFIG_X86_64
@@ -411,6 +417,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 							SIGTRAP) == NOTIFY_STOP)
 		return;
 
+	/*
+	 * Let others (NMI) know that the debug stack is in use
+	 * as we may switch to the interrupt stack.
+	 */
+	debug_stack_usage_inc();
+
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
 
@@ -418,6 +430,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 		handle_vm86_trap((struct kernel_vm86_regs *) regs,
 				error_code, 1);
 		preempt_conditional_cli(regs);
+		debug_stack_usage_dec();
 		return;
 	}
 
@@ -437,6 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
 		send_sigtrap(tsk, regs, error_code, si_code);
 	preempt_conditional_cli(regs);
+	debug_stack_usage_dec();
 
 	return;
 }
-- 
cgit v1.1


From 8a25a2fd126c621f44f3aeaef80d51f00fc11639 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Wed, 21 Dec 2011 14:29:42 -0800
Subject: cpu: convert 'cpu' and 'machinecheck' sysdev_class to a regular
 subsystem

This moves the 'cpu sysdev_class' over to a regular 'cpu' subsystem
and converts the devices to regular devices. The sysdev drivers are
implemented as subsystem interfaces now.

After all sysdev classes are ported to regular driver core entities, the
sysdev implementation will be entirely removed from the kernel.

Userspace relies on events and generic sysfs subsystem infrastructure
from sysdev devices, which are made available with this conversion.

Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Borislav Petkov <bp@amd64.org>
Cc: Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
Cc: Len Brown <lenb@kernel.org>
Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/x86/include/asm/mce.h                |   2 +-
 arch/x86/kernel/cpu/intel_cacheinfo.c     |  25 +++---
 arch/x86/kernel/cpu/mcheck/mce-internal.h |   4 +-
 arch/x86/kernel/cpu/mcheck/mce.c          | 128 +++++++++++++++---------------
 arch/x86/kernel/cpu/mcheck/mce_amd.c      |  11 ++-
 arch/x86/kernel/cpu/mcheck/therm_throt.c  |  63 ++++++++-------
 arch/x86/kernel/microcode_core.c          |  58 +++++++-------
 7 files changed, 145 insertions(+), 146 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c9321f3..0b05fb4 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -149,7 +149,7 @@ static inline void enable_p5_mce(void) {}
 
 void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct sys_device, mce_sysdev);
+DECLARE_PER_CPU(struct device, mce_device);
 
 /*
  * Maximum banks number.
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index a3b0811..6b45e5e 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -844,8 +844,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
 
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
-
-extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
+#include <linux/cpu.h>
 
 /* pointer to kobject for cpuX/cache */
 static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
@@ -1073,9 +1072,9 @@ err_out:
 static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
 
 /* Add/Remove cache interface for CPU device */
-static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
+static int __cpuinit cache_add_dev(struct device *dev)
 {
-	unsigned int cpu = sys_dev->id;
+	unsigned int cpu = dev->id;
 	unsigned long i, j;
 	struct _index_kobject *this_object;
 	struct _cpuid4_info   *this_leaf;
@@ -1087,7 +1086,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 
 	retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
 				      &ktype_percpu_entry,
-				      &sys_dev->kobj, "%s", "cache");
+				      &dev->kobj, "%s", "cache");
 	if (retval < 0) {
 		cpuid4_cache_sysfs_exit(cpu);
 		return retval;
@@ -1124,9 +1123,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 	return 0;
 }
 
-static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
+static void __cpuinit cache_remove_dev(struct device *dev)
 {
-	unsigned int cpu = sys_dev->id;
+	unsigned int cpu = dev->id;
 	unsigned long i;
 
 	if (per_cpu(ici_cpuid4_info, cpu) == NULL)
@@ -1145,17 +1144,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *dev;
 
-	sys_dev = get_cpu_sysdev(cpu);
+	dev = get_cpu_device(cpu);
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		cache_add_dev(sys_dev);
+		cache_add_dev(dev);
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		cache_remove_dev(sys_dev);
+		cache_remove_dev(dev);
 		break;
 	}
 	return NOTIFY_OK;
@@ -1174,9 +1173,9 @@ static int __cpuinit cache_sysfs_init(void)
 
 	for_each_online_cpu(i) {
 		int err;
-		struct sys_device *sys_dev = get_cpu_sysdev(i);
+		struct device *dev = get_cpu_device(i);
 
-		err = cache_add_dev(sys_dev);
+		err = cache_add_dev(dev);
 		if (err)
 			return err;
 	}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index fefcc69..ed44c8a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,4 +1,4 @@
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <asm/mce.h>
 
 enum severity_level {
@@ -17,7 +17,7 @@ enum severity_level {
 struct mce_bank {
 	u64			ctl;			/* subevents to enable */
 	unsigned char init;				/* initialise bank? */
-	struct sysdev_attribute attr;			/* sysdev attribute */
+	struct device_attribute attr;			/* device attribute */
 	char			attrname[ATTR_LEN];	/* attribute name */
 };
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 362056a..0156c6f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -19,7 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/percpu.h>
 #include <linux/string.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/syscore_ops.h>
 #include <linux/delay.h>
 #include <linux/ctype.h>
@@ -1751,7 +1751,7 @@ static struct syscore_ops mce_syscore_ops = {
 };
 
 /*
- * mce_sysdev: Sysfs support
+ * mce_device: Sysfs support
  */
 
 static void mce_cpu_restart(void *data)
@@ -1787,27 +1787,28 @@ static void mce_enable_ce(void *all)
 		__mcheck_cpu_init_timer();
 }
 
-static struct sysdev_class mce_sysdev_class = {
+static struct bus_type mce_subsys = {
 	.name		= "machinecheck",
+	.dev_name	= "machinecheck",
 };
 
-DEFINE_PER_CPU(struct sys_device, mce_sysdev);
+DEFINE_PER_CPU(struct device, mce_device);
 
 __cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
-static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
+static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
 {
 	return container_of(attr, struct mce_bank, attr);
 }
 
-static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t show_bank(struct device *s, struct device_attribute *attr,
 			 char *buf)
 {
 	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
 }
 
-static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_bank(struct device *s, struct device_attribute *attr,
 			const char *buf, size_t size)
 {
 	u64 new;
@@ -1822,14 +1823,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
 }
 
 static ssize_t
-show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
 {
 	strcpy(buf, mce_helper);
 	strcat(buf, "\n");
 	return strlen(mce_helper) + 1;
 }
 
-static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
 				const char *buf, size_t siz)
 {
 	char *p;
@@ -1844,8 +1845,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 	return strlen(mce_helper) + !!p;
 }
 
-static ssize_t set_ignore_ce(struct sys_device *s,
-			     struct sysdev_attribute *attr,
+static ssize_t set_ignore_ce(struct device *s,
+			     struct device_attribute *attr,
 			     const char *buf, size_t size)
 {
 	u64 new;
@@ -1868,8 +1869,8 @@ static ssize_t set_ignore_ce(struct sys_device *s,
 	return size;
 }
 
-static ssize_t set_cmci_disabled(struct sys_device *s,
-				 struct sysdev_attribute *attr,
+static ssize_t set_cmci_disabled(struct device *s,
+				 struct device_attribute *attr,
 				 const char *buf, size_t size)
 {
 	u64 new;
@@ -1891,108 +1892,107 @@ static ssize_t set_cmci_disabled(struct sys_device *s,
 	return size;
 }
 
-static ssize_t store_int_with_restart(struct sys_device *s,
-				      struct sysdev_attribute *attr,
+static ssize_t store_int_with_restart(struct device *s,
+				      struct device_attribute *attr,
 				      const char *buf, size_t size)
 {
-	ssize_t ret = sysdev_store_int(s, attr, buf, size);
+	ssize_t ret = device_store_int(s, attr, buf, size);
 	mce_restart();
 	return ret;
 }
 
-static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
-static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
-static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
-static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
+static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
+static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
 
-static struct sysdev_ext_attribute attr_check_interval = {
-	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
-		     store_int_with_restart),
+static struct dev_ext_attribute dev_attr_check_interval = {
+	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
 	&check_interval
 };
 
-static struct sysdev_ext_attribute attr_ignore_ce = {
-	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
+static struct dev_ext_attribute dev_attr_ignore_ce = {
+	__ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
 	&mce_ignore_ce
 };
 
-static struct sysdev_ext_attribute attr_cmci_disabled = {
-	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
+static struct dev_ext_attribute dev_attr_cmci_disabled = {
+	__ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
 	&mce_cmci_disabled
 };
 
-static struct sysdev_attribute *mce_sysdev_attrs[] = {
-	&attr_tolerant.attr,
-	&attr_check_interval.attr,
-	&attr_trigger,
-	&attr_monarch_timeout.attr,
-	&attr_dont_log_ce.attr,
-	&attr_ignore_ce.attr,
-	&attr_cmci_disabled.attr,
+static struct device_attribute *mce_device_attrs[] = {
+	&dev_attr_tolerant.attr,
+	&dev_attr_check_interval.attr,
+	&dev_attr_trigger,
+	&dev_attr_monarch_timeout.attr,
+	&dev_attr_dont_log_ce.attr,
+	&dev_attr_ignore_ce.attr,
+	&dev_attr_cmci_disabled.attr,
 	NULL
 };
 
-static cpumask_var_t mce_sysdev_initialized;
+static cpumask_var_t mce_device_initialized;
 
-/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_sysdev_create(unsigned int cpu)
+/* Per cpu device init. All of the cpus still share the same ctrl bank: */
+static __cpuinit int mce_device_create(unsigned int cpu)
 {
-	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
+	struct device *dev = &per_cpu(mce_device, cpu);
 	int err;
 	int i, j;
 
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	memset(&sysdev->kobj, 0, sizeof(struct kobject));
-	sysdev->id  = cpu;
-	sysdev->cls = &mce_sysdev_class;
+	memset(&dev->kobj, 0, sizeof(struct kobject));
+	dev->id  = cpu;
+	dev->bus = &mce_subsys;
 
-	err = sysdev_register(sysdev);
+	err = device_register(dev);
 	if (err)
 		return err;
 
-	for (i = 0; mce_sysdev_attrs[i]; i++) {
-		err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
+	for (i = 0; mce_device_attrs[i]; i++) {
+		err = device_create_file(dev, mce_device_attrs[i]);
 		if (err)
 			goto error;
 	}
 	for (j = 0; j < banks; j++) {
-		err = sysdev_create_file(sysdev, &mce_banks[j].attr);
+		err = device_create_file(dev, &mce_banks[j].attr);
 		if (err)
 			goto error2;
 	}
-	cpumask_set_cpu(cpu, mce_sysdev_initialized);
+	cpumask_set_cpu(cpu, mce_device_initialized);
 
 	return 0;
 error2:
 	while (--j >= 0)
-		sysdev_remove_file(sysdev, &mce_banks[j].attr);
+		device_remove_file(dev, &mce_banks[j].attr);
 error:
 	while (--i >= 0)
-		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
+		device_remove_file(dev, mce_device_attrs[i]);
 
-	sysdev_unregister(sysdev);
+	device_unregister(dev);
 
 	return err;
 }
 
-static __cpuinit void mce_sysdev_remove(unsigned int cpu)
+static __cpuinit void mce_device_remove(unsigned int cpu)
 {
-	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
+	struct device *dev = &per_cpu(mce_device, cpu);
 	int i;
 
-	if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
+	if (!cpumask_test_cpu(cpu, mce_device_initialized))
 		return;
 
-	for (i = 0; mce_sysdev_attrs[i]; i++)
-		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
+	for (i = 0; mce_device_attrs[i]; i++)
+		device_remove_file(dev, mce_device_attrs[i]);
 
 	for (i = 0; i < banks; i++)
-		sysdev_remove_file(sysdev, &mce_banks[i].attr);
+		device_remove_file(dev, &mce_banks[i].attr);
 
-	sysdev_unregister(sysdev);
-	cpumask_clear_cpu(cpu, mce_sysdev_initialized);
+	device_unregister(dev);
+	cpumask_clear_cpu(cpu, mce_device_initialized);
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
@@ -2042,7 +2042,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		mce_sysdev_create(cpu);
+		mce_device_create(cpu);
 		if (threshold_cpu_callback)
 			threshold_cpu_callback(action, cpu);
 		break;
@@ -2050,7 +2050,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DEAD_FROZEN:
 		if (threshold_cpu_callback)
 			threshold_cpu_callback(action, cpu);
-		mce_sysdev_remove(cpu);
+		mce_device_remove(cpu);
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
@@ -2084,7 +2084,7 @@ static __init void mce_init_banks(void)
 
 	for (i = 0; i < banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
-		struct sysdev_attribute *a = &b->attr;
+		struct device_attribute *a = &b->attr;
 
 		sysfs_attr_init(&a->attr);
 		a->attr.name	= b->attrname;
@@ -2104,16 +2104,16 @@ static __init int mcheck_init_device(void)
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
+	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
 
 	mce_init_banks();
 
-	err = sysdev_class_register(&mce_sysdev_class);
+	err = subsys_system_register(&mce_subsys, NULL);
 	if (err)
 		return err;
 
 	for_each_online_cpu(i) {
-		err = mce_sysdev_create(i);
+		err = mce_device_create(i);
 		if (err)
 			return err;
 	}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f547421..56d2aa1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -17,7 +17,6 @@
 #include <linux/notifier.h>
 #include <linux/kobject.h>
 #include <linux/percpu.h>
-#include <linux/sysdev.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/sysfs.h>
@@ -548,7 +547,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (!b)
 			goto out;
 
-		err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
+		err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj,
 					b->kobj, name);
 		if (err)
 			goto out;
@@ -571,7 +570,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		goto out;
 	}
 
-	b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
+	b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj);
 	if (!b->kobj)
 		goto out_free;
 
@@ -591,7 +590,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (i == cpu)
 			continue;
 
-		err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
+		err = sysfs_create_link(&per_cpu(mce_device, i).kobj,
 					b->kobj, name);
 		if (err)
 			goto out;
@@ -669,7 +668,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #ifdef CONFIG_SMP
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
-		sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
+		sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
 
 		return;
@@ -681,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 		if (i == cpu)
 			continue;
 
-		sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
+		sysfs_remove_link(&per_cpu(mce_device, i).kobj, name);
 		per_cpu(threshold_banks, i)[bank] = NULL;
 	}
 
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 787e06c..59e3f6e 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -19,7 +19,6 @@
 #include <linux/kernel.h>
 #include <linux/percpu.h>
 #include <linux/export.h>
-#include <linux/sysdev.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/smp.h>
@@ -69,16 +68,16 @@ static atomic_t therm_throt_en	= ATOMIC_INIT(0);
 static u32 lvtthmr_init __read_mostly;
 
 #ifdef CONFIG_SYSFS
-#define define_therm_throt_sysdev_one_ro(_name)				\
-	static SYSDEV_ATTR(_name, 0444,					\
-			   therm_throt_sysdev_show_##_name,		\
+#define define_therm_throt_device_one_ro(_name)				\
+	static DEVICE_ATTR(_name, 0444,					\
+			   therm_throt_device_show_##_name,		\
 				   NULL)				\
 
-#define define_therm_throt_sysdev_show_func(event, name)		\
+#define define_therm_throt_device_show_func(event, name)		\
 									\
-static ssize_t therm_throt_sysdev_show_##event##_##name(		\
-			struct sys_device *dev,				\
-			struct sysdev_attribute *attr,			\
+static ssize_t therm_throt_device_show_##event##_##name(		\
+			struct device *dev,				\
+			struct device_attribute *attr,			\
 			char *buf)					\
 {									\
 	unsigned int cpu = dev->id;					\
@@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name(		\
 	return ret;							\
 }
 
-define_therm_throt_sysdev_show_func(core_throttle, count);
-define_therm_throt_sysdev_one_ro(core_throttle_count);
+define_therm_throt_device_show_func(core_throttle, count);
+define_therm_throt_device_one_ro(core_throttle_count);
 
-define_therm_throt_sysdev_show_func(core_power_limit, count);
-define_therm_throt_sysdev_one_ro(core_power_limit_count);
+define_therm_throt_device_show_func(core_power_limit, count);
+define_therm_throt_device_one_ro(core_power_limit_count);
 
-define_therm_throt_sysdev_show_func(package_throttle, count);
-define_therm_throt_sysdev_one_ro(package_throttle_count);
+define_therm_throt_device_show_func(package_throttle, count);
+define_therm_throt_device_one_ro(package_throttle_count);
 
-define_therm_throt_sysdev_show_func(package_power_limit, count);
-define_therm_throt_sysdev_one_ro(package_power_limit_count);
+define_therm_throt_device_show_func(package_power_limit, count);
+define_therm_throt_device_one_ro(package_power_limit_count);
 
 static struct attribute *thermal_throttle_attrs[] = {
-	&attr_core_throttle_count.attr,
+	&dev_attr_core_throttle_count.attr,
 	NULL
 };
 
@@ -223,36 +222,36 @@ static int thresh_event_valid(int event)
 
 #ifdef CONFIG_SYSFS
 /* Add/Remove thermal_throttle interface for CPU device: */
-static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
+static __cpuinit int thermal_throttle_add_dev(struct device *dev,
 				unsigned int cpu)
 {
 	int err;
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-	err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
+	err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
 	if (err)
 		return err;
 
 	if (cpu_has(c, X86_FEATURE_PLN))
-		err = sysfs_add_file_to_group(&sys_dev->kobj,
-					      &attr_core_power_limit_count.attr,
+		err = sysfs_add_file_to_group(&dev->kobj,
+					      &dev_attr_core_power_limit_count.attr,
 					      thermal_attr_group.name);
 	if (cpu_has(c, X86_FEATURE_PTS)) {
-		err = sysfs_add_file_to_group(&sys_dev->kobj,
-					      &attr_package_throttle_count.attr,
+		err = sysfs_add_file_to_group(&dev->kobj,
+					      &dev_attr_package_throttle_count.attr,
 					      thermal_attr_group.name);
 		if (cpu_has(c, X86_FEATURE_PLN))
-			err = sysfs_add_file_to_group(&sys_dev->kobj,
-					&attr_package_power_limit_count.attr,
+			err = sysfs_add_file_to_group(&dev->kobj,
+					&dev_attr_package_power_limit_count.attr,
 					thermal_attr_group.name);
 	}
 
 	return err;
 }
 
-static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
+static __cpuinit void thermal_throttle_remove_dev(struct device *dev)
 {
-	sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
+	sysfs_remove_group(&dev->kobj, &thermal_attr_group);
 }
 
 /* Mutex protecting device creation against CPU hotplug: */
@@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
 			      void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *dev;
 	int err = 0;
 
-	sys_dev = get_cpu_sysdev(cpu);
+	dev = get_cpu_device(cpu);
 
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
 		mutex_lock(&therm_cpu_lock);
-		err = thermal_throttle_add_dev(sys_dev, cpu);
+		err = thermal_throttle_add_dev(dev, cpu);
 		mutex_unlock(&therm_cpu_lock);
 		WARN_ON(err);
 		break;
@@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 		mutex_lock(&therm_cpu_lock);
-		thermal_throttle_remove_dev(sys_dev);
+		thermal_throttle_remove_dev(dev);
 		mutex_unlock(&therm_cpu_lock);
 		break;
 	}
@@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void)
 #endif
 	/* connect live CPUs to sysfs */
 	for_each_online_cpu(cpu) {
-		err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu);
+		err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
 		WARN_ON(err);
 	}
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index f2d2a66..cf88f2a 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu)
 	return err;
 }
 
-static ssize_t reload_store(struct sys_device *dev,
-			    struct sysdev_attribute *attr,
+static ssize_t reload_store(struct device *dev,
+			    struct device_attribute *attr,
 			    const char *buf, size_t size)
 {
 	unsigned long val;
@@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev,
 	return ret;
 }
 
-static ssize_t version_show(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t version_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
 
 	return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
 }
 
-static ssize_t pf_show(struct sys_device *dev,
-			struct sysdev_attribute *attr, char *buf)
+static ssize_t pf_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
 
 	return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
 }
 
-static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
-static SYSDEV_ATTR(version, 0400, version_show, NULL);
-static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
+static DEVICE_ATTR(reload, 0200, NULL, reload_store);
+static DEVICE_ATTR(version, 0400, version_show, NULL);
+static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
 
 static struct attribute *mc_default_attrs[] = {
-	&attr_reload.attr,
-	&attr_version.attr,
-	&attr_processor_flags.attr,
+	&dev_attr_reload.attr,
+	&dev_attr_version.attr,
+	&dev_attr_processor_flags.attr,
 	NULL
 };
 
@@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu)
 	return ustate;
 }
 
-static int mc_sysdev_add(struct sys_device *sys_dev)
+static int mc_device_add(struct device *dev, struct subsys_interface *sif)
 {
-	int err, cpu = sys_dev->id;
+	int err, cpu = dev->id;
 
 	if (!cpu_online(cpu))
 		return 0;
 
 	pr_debug("CPU%d added\n", cpu);
 
-	err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
+	err = sysfs_create_group(&dev->kobj, &mc_attr_group);
 	if (err)
 		return err;
 
 	if (microcode_init_cpu(cpu) == UCODE_ERROR) {
-		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+		sysfs_remove_group(&dev->kobj, &mc_attr_group);
 		return -EINVAL;
 	}
 
 	return err;
 }
 
-static int mc_sysdev_remove(struct sys_device *sys_dev)
+static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
 {
-	int cpu = sys_dev->id;
+	int cpu = dev->id;
 
 	if (!cpu_online(cpu))
 		return 0;
 
 	pr_debug("CPU%d removed\n", cpu);
 	microcode_fini_cpu(cpu);
-	sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+	sysfs_remove_group(&dev->kobj, &mc_attr_group);
 	return 0;
 }
 
-static struct sysdev_driver mc_sysdev_driver = {
-	.add			= mc_sysdev_add,
-	.remove			= mc_sysdev_remove,
+static struct subsys_interface mc_cpu_interface = {
+	.name			= "microcode",
+	.subsys			= &cpu_subsys,
+	.add_dev		= mc_device_add,
+	.remove_dev		= mc_device_remove,
 };
 
 /**
@@ -464,9 +466,9 @@ static __cpuinit int
 mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
+	struct device *dev;
 
-	sys_dev = get_cpu_sysdev(cpu);
+	dev = get_cpu_device(cpu);
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
@@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 		pr_debug("CPU%d added\n", cpu);
-		if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
+		if (sysfs_create_group(&dev->kobj, &mc_attr_group))
 			pr_err("Failed to create group for CPU%d\n", cpu);
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		/* Suspend is in progress, only remove the interface */
-		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+		sysfs_remove_group(&dev->kobj, &mc_attr_group);
 		pr_debug("CPU%d removed\n", cpu);
 		break;
 
@@ -527,7 +529,7 @@ static int __init microcode_init(void)
 	get_online_cpus();
 	mutex_lock(&microcode_mutex);
 
-	error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+	error = subsys_interface_register(&mc_cpu_interface);
 
 	mutex_unlock(&microcode_mutex);
 	put_online_cpus();
@@ -561,7 +563,7 @@ static void __exit microcode_exit(void)
 	get_online_cpus();
 	mutex_lock(&microcode_mutex);
 
-	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+	subsys_interface_unregister(&mc_cpu_interface);
 
 	mutex_unlock(&microcode_mutex);
 	put_online_cpus();
-- 
cgit v1.1


From edbaa603eb801655e80808a9cf3d3b622e8ac66b Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Wed, 21 Dec 2011 16:26:03 -0800
Subject: driver-core: remove sysdev.h usage.

The sysdev.h file should not be needed by any in-kernel code, so remove
the .h file from these random files that seem to still want to include
it.

The sysdev code will be going away soon, so this include needs to be
removed no matter what.

Cc: Jiandong Zheng <jdzheng@broadcom.com>
Cc: Scott Branden <sbranden@broadcom.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Kukjin Kim <kgene.kim@samsung.com>
Cc: David Brown <davidb@codeaurora.org>
Cc: Daniel Walker <dwalker@fifo99.com>
Cc: Bryan Huntsman <bryanh@codeaurora.org>
Cc: Ben Dooks <ben-linux@fluff.org>
Cc: Wan ZongShun <mcuos.com@gmail.com>
Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: "Venkatesh Pallipadi
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: Richard Purdie <rpurdie@rpsys.net>
Cc: Matthew Garrett <mjg@redhat.com>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
---
 arch/x86/kernel/hpet.c          | 1 -
 arch/x86/kernel/irqinit.c       | 2 +-
 arch/x86/platform/uv/uv_sysfs.c | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index b946a9e..98094a9 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -2,7 +2,6 @@
 #include <linux/clockchips.h>
 #include <linux/interrupt.h>
 #include <linux/export.h>
-#include <linux/sysdev.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/i8253.h>
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index b3300e6..313fb5c 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -9,7 +9,7 @@
 #include <linux/kprobes.h>
 #include <linux/init.h>
 #include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <linux/bitops.h>
 #include <linux/acpi.h>
 #include <linux/io.h>
diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c
index 309c70f..5d4ba30 100644
--- a/arch/x86/platform/uv/uv_sysfs.c
+++ b/arch/x86/platform/uv/uv_sysfs.c
@@ -19,7 +19,7 @@
  *  Copyright (c) Russ Anderson
  */
 
-#include <linux/sysdev.h>
+#include <linux/device.h>
 #include <asm/uv/bios.h>
 #include <asm/uv/uv.h>
 
-- 
cgit v1.1


From 933393f58fef9963eac61db8093689544e29a600 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Thu, 22 Dec 2011 11:58:51 -0600
Subject: percpu: Remove irqsafe_cpu_xxx variants

We simply say that regular this_cpu use must be safe regardless of
preemption and interrupt state.  That has no material change for x86
and s390 implementations of this_cpu operations.  However, arches that
do not provide their own implementation for this_cpu operations will
now get code generated that disables interrupts instead of preemption.

-tj: This is part of on-going percpu API cleanup.  For detailed
     discussion of the subject, please refer to the following thread.

     http://thread.gmane.org/gmane.linux.kernel/1222078

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
LKML-Reference: <alpine.DEB.2.00.1112221154380.11787@router.home>
---
 arch/x86/include/asm/percpu.h | 28 ----------------------------
 1 file changed, 28 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 3470c9d..562ccb5 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -414,22 +414,6 @@ do {									\
 #define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
 #define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
 
-#define irqsafe_cpu_add_1(pcp, val)	percpu_add_op((pcp), val)
-#define irqsafe_cpu_add_2(pcp, val)	percpu_add_op((pcp), val)
-#define irqsafe_cpu_add_4(pcp, val)	percpu_add_op((pcp), val)
-#define irqsafe_cpu_and_1(pcp, val)	percpu_to_op("and", (pcp), val)
-#define irqsafe_cpu_and_2(pcp, val)	percpu_to_op("and", (pcp), val)
-#define irqsafe_cpu_and_4(pcp, val)	percpu_to_op("and", (pcp), val)
-#define irqsafe_cpu_or_1(pcp, val)	percpu_to_op("or", (pcp), val)
-#define irqsafe_cpu_or_2(pcp, val)	percpu_to_op("or", (pcp), val)
-#define irqsafe_cpu_or_4(pcp, val)	percpu_to_op("or", (pcp), val)
-#define irqsafe_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
-#define irqsafe_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
-#define irqsafe_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
-#define irqsafe_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
-
 #ifndef CONFIG_M386
 #define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
 #define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
@@ -445,9 +429,6 @@ do {									\
 #define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 #define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
-#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
-#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
-#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 #endif /* !CONFIG_M386 */
 
 #ifdef CONFIG_X86_CMPXCHG64
@@ -467,7 +448,6 @@ do {									\
 
 #define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)		percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
 #define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)		percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
-#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)	percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
 #endif /* CONFIG_X86_CMPXCHG64 */
 
 /*
@@ -495,13 +475,6 @@ do {									\
 #define this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
-#define irqsafe_cpu_add_8(pcp, val)	percpu_add_op((pcp), val)
-#define irqsafe_cpu_and_8(pcp, val)	percpu_to_op("and", (pcp), val)
-#define irqsafe_cpu_or_8(pcp, val)	percpu_to_op("or", (pcp), val)
-#define irqsafe_cpu_xor_8(pcp, val)	percpu_to_op("xor", (pcp), val)
-#define irqsafe_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
-
 /*
  * Pretty complex macro to generate cmpxchg16 instruction.  The instruction
  * is not supported on early AMD64 processors so we must be able to emulate
@@ -532,7 +505,6 @@ do {									\
 
 #define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)		percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
 #define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)		percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
-#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)	percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
 
 #endif
 
-- 
cgit v1.1


From d6185f20a0efbf175e12831d0de330e4f21725aa Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Thu, 22 Sep 2011 13:52:56 +0300
Subject: KVM: nVMX: Add KVM_REQ_IMMEDIATE_EXIT

This patch adds a new vcpu->requests bit, KVM_REQ_IMMEDIATE_EXIT.
This bit requests that when next entering the guest, we should run it only
for as little as possible, and exit again.

We use this new option in nested VMX: When L1 launches L2, but L0 wishes L1
to continue running so it can inject an event to it, we unfortunately cannot
just pretend to have run L2 for a little while - We must really launch L2,
otherwise certain one-off vmcs12 parameters (namely, L1 injection into L2)
will be lost. So the existing code runs L2 in this case.
But L2 could potentially run for a long time until it exits, and the
injection into L1 will be delayed. The new KVM_REQ_IMMEDIATE_EXIT allows us
to request that L2 will be entered, as necessary, but will exit as soon as
possible after entry.

Our implementation of this request uses smp_send_reschedule() to send a
self-IPI, with interrupts disabled. The interrupts remain disabled until the
guest is entered, and then, after the entry is complete (often including
processing an injection and jumping to the relevant handler), the physical
interrupt is noticed and causes an exit.

On recent Intel processors, we could have achieved the same goal by using
MTF instead of a self-IPI. Another technique worth considering in the future
is to use VM_EXIT_ACK_INTR_ON_EXIT and a highest-priority vector IPI - to
slightly improve performance by avoiding the useless interrupt handler
which ends up being called when smp_send_reschedule() is used.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 11 +++++++----
 arch/x86/kvm/x86.c |  7 ++++++-
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 579a0b5..d75d914 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3945,12 +3945,15 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
-	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
-		/* We can get here when nested_run_pending caused
-		 * vmx_interrupt_allowed() to return false. In this case, do
-		 * nothing - the interrupt will be injected later.
+	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+		/*
+		 * We get here if vmx_interrupt_allowed() said we can't
+		 * inject to L1 now because L2 must run. Ask L2 to exit
+		 * right after entry, so we can inject to L1 more promptly.
 		 */
+		kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
 		return;
+	}
 
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4c938da..e24edbc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5648,6 +5648,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	int r;
 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
 		vcpu->run->request_interrupt_window;
+	bool req_immediate_exit = 0;
 
 	if (vcpu->requests) {
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -5687,7 +5688,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			record_steal_time(vcpu);
 		if (kvm_check_request(KVM_REQ_NMI, vcpu))
 			process_nmi(vcpu);
-
+		req_immediate_exit =
+			kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
 	}
 
 	r = kvm_mmu_reload(vcpu);
@@ -5738,6 +5740,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
+	if (req_immediate_exit)
+		smp_send_reschedule(vcpu->cpu);
+
 	kvm_guest_enter();
 
 	if (unlikely(vcpu->arch.switch_db_regs)) {
-- 
cgit v1.1


From 51cfe38ea50aa631f58ed8c340ed6f0143c325a8 Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@il.ibm.com>
Date: Thu, 22 Sep 2011 13:53:26 +0300
Subject: KVM: nVMX: Fix warning-causing idt-vectoring-info behavior

When L0 wishes to inject an interrupt while L2 is running, it emulates an exit
to L1 with EXIT_REASON_EXTERNAL_INTERRUPT. This was explained in the original
nVMX patch 23, titled "Correct handling of interrupt injection".

Unfortunately, it is possible (though rare) that at this point there is valid
idt_vectoring_info in vmcs02. For example, L1 injected some interrupt to L2,
and when L2 tried to run this interrupt's handler, it got a page fault - so
it returns the original interrupt vector in idt_vectoring_info. The problem
is that if this is the case, we cannot exit to L1 with EXTERNAL_INTERRUPT
like we wished to, because the VMX spec guarantees that idt_vectoring_info
and exit_reason_external_interrupt can never happen together. This is not
just specified in the spec - a KVM L1 actually prints a kernel warning
"unexpected, valid vectoring info" if we violate this guarantee, and some
users noticed these warnings in L1's logs.

In order to better emulate a processor, which would never return the external
interrupt and the idt-vectoring-info together, we need to separate the two
injection steps: First, complete L1's injection into L2 (i.e., enter L2,
injecting to it the idt-vectoring-info); Second, after entry into L2 succeeds
and it exits back to L0, exit to L1 with the EXIT_REASON_EXTERNAL_INTERRUPT.
Most of this is already in the code - the only change we need is to remain
in L2 (and not exit to L1) in this case.

Note that the previous patch ensures (by using KVM_REQ_IMMEDIATE_EXIT) that
although we do enter L2 first, it will exit immediately after processing its
injection, allowing us to promptly inject to L1.

Note how we test vmcs12->idt_vectoring_info_field; This isn't really the
vmcs12 value (we haven't exited to L1 yet, so vmcs12 hasn't been updated),
but rather the place we save, at the end of vmx_vcpu_run, the vmcs02 value
of this field. This was explained in patch 25 ("Correct handling of idt
vectoring info") of the original nVMX patch series.

Thanks to Dave Allan and to Federico Simoncelli for reporting this bug,
to Abel Gordon for helping me figure out the solution, and to Avi Kivity
for helping to improve it.

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d75d914..6e28d58 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4080,11 +4080,12 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
 	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
-		struct vmcs12 *vmcs12;
-		if (to_vmx(vcpu)->nested.nested_run_pending)
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+		if (to_vmx(vcpu)->nested.nested_run_pending ||
+		    (vmcs12->idt_vectoring_info_field &
+		     VECTORING_INFO_VALID_MASK))
 			return 0;
 		nested_vmx_vmexit(vcpu);
-		vmcs12 = get_vmcs12(vcpu);
 		vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
 		vmcs12->vm_exit_intr_info = 0;
 		/* fall through to normal code, but now in L1, not L2 */
-- 
cgit v1.1


From f759e2b4c728cee82e4bc1132d0e41177b79a0b1 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 22 Sep 2011 16:53:17 +0800
Subject: KVM: MMU: avoid pte_list_desc running out in kvm_mmu_pte_write

kvm_mmu_pte_write is unsafe since we need to alloc pte_list_desc in the
function when spte is prefetched, unfortunately, we can not know how many
spte need to be prefetched on this path, that means we can use out of the
free  pte_list_desc object in the cache, and BUG_ON() is triggered, also some
path does not fill the cache, such as INS instruction emulated that does not
trigger page fault

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f1b36cf..232c5a3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -593,6 +593,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 	return 0;
 }
 
+static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
+{
+	return cache->nobjs;
+}
+
 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
 				  struct kmem_cache *cache)
 {
@@ -970,6 +975,14 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 	return &linfo->rmap_pde;
 }
 
+static bool rmap_can_add(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu_memory_cache *cache;
+
+	cache = &vcpu->arch.mmu_pte_list_desc_cache;
+	return mmu_memory_cache_free_objects(cache);
+}
+
 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 {
 	struct kvm_mmu_page *sp;
@@ -3586,6 +3599,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		break;
 	}
 
+	/*
+	 * No need to care whether allocation memory is successful
+	 * or not since pte prefetch is skiped if it does not have
+	 * enough objects in the cache.
+	 */
+	mmu_topup_memory_caches(vcpu);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
 		gentry = 0;
@@ -3656,7 +3675,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			mmu_page_zap_pte(vcpu->kvm, sp, spte);
 			if (gentry &&
 			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
-			      & mask.word))
+			      & mask.word) && rmap_can_add(vcpu))
 				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
 			if (!remote_flush && need_remote_flush(entry, *spte))
 				remote_flush = true;
@@ -3717,10 +3736,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 		goto out;
 	}
 
-	r = mmu_topup_memory_caches(vcpu);
-	if (r)
-		goto out;
-
 	er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
 
 	switch (er) {
-- 
cgit v1.1


From d5ae7ce835cc89556dc18e2070e754f026402efa Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 22 Sep 2011 16:53:46 +0800
Subject: KVM: x86: tag the instructions which are used to write page table

The idea is from Avi:
| tag instructions that are typically used to modify the page tables, and
| drop shadow if any other instruction is used.
| The list would include, I'd guess, and, or, bts, btc, mov, xchg, cmpxchg,
| and cmpxchg8b.

This patch is used to tag the instructions and in the later path, shadow page
is dropped if it is written by other instructions

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f1e3be18..a10950a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -125,8 +125,9 @@
 #define Lock        (1<<26) /* lock prefix is allowed for the instruction */
 #define Priv        (1<<27) /* instruction generates #GP if current CPL != 0 */
 #define No64	    (1<<28)
+#define PageTable   (1 << 29)   /* instruction used to write page table */
 /* Source 2 operand type */
-#define Src2Shift   (29)
+#define Src2Shift   (30)
 #define Src2None    (OpNone << Src2Shift)
 #define Src2CL      (OpCL << Src2Shift)
 #define Src2ImmByte (OpImmByte << Src2Shift)
@@ -3033,10 +3034,10 @@ static struct opcode group7_rm7[] = {
 
 static struct opcode group1[] = {
 	I(Lock, em_add),
-	I(Lock, em_or),
+	I(Lock | PageTable, em_or),
 	I(Lock, em_adc),
 	I(Lock, em_sbb),
-	I(Lock, em_and),
+	I(Lock | PageTable, em_and),
 	I(Lock, em_sub),
 	I(Lock, em_xor),
 	I(0, em_cmp),
@@ -3096,18 +3097,21 @@ static struct group_dual group7 = { {
 
 static struct opcode group8[] = {
 	N, N, N, N,
-	D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
-	D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
+	D(DstMem | SrcImmByte | ModRM),
+	D(DstMem | SrcImmByte | ModRM | Lock | PageTable),
+	D(DstMem | SrcImmByte | ModRM | Lock),
+	D(DstMem | SrcImmByte | ModRM | Lock | PageTable),
 };
 
 static struct group_dual group9 = { {
-	N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
+	N, D(DstMem64 | ModRM | Lock | PageTable), N, N, N, N, N, N,
 }, {
 	N, N, N, N, N, N, N, N,
 } };
 
 static struct opcode group11[] = {
-	I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
+	I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov),
+	X7(D(Undefined)),
 };
 
 static struct gprefix pfx_0f_6f_0f_7f = {
@@ -3120,7 +3124,7 @@ static struct opcode opcode_table[256] = {
 	I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
 	I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
 	/* 0x08 - 0x0F */
-	I6ALU(Lock, em_or),
+	I6ALU(Lock | PageTable, em_or),
 	I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
 	N,
 	/* 0x10 - 0x17 */
@@ -3132,7 +3136,7 @@ static struct opcode opcode_table[256] = {
 	I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
 	I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
 	/* 0x20 - 0x27 */
-	I6ALU(Lock, em_and), N, N,
+	I6ALU(Lock | PageTable, em_and), N, N,
 	/* 0x28 - 0x2F */
 	I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
 	/* 0x30 - 0x37 */
@@ -3165,11 +3169,11 @@ static struct opcode opcode_table[256] = {
 	G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
 	G(DstMem | SrcImmByte | ModRM | Group, group1),
 	I2bv(DstMem | SrcReg | ModRM, em_test),
-	I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg),
+	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
 	/* 0x88 - 0x8F */
-	I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
+	I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
 	I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
-	I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg),
+	I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg),
 	D(ModRM | SrcMem | NoAccess | DstReg),
 	I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
 	G(0, group1A),
@@ -3182,7 +3186,7 @@ static struct opcode opcode_table[256] = {
 	II(ImplicitOps | Stack, em_popf, popf), N, N,
 	/* 0xA0 - 0xA7 */
 	I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
-	I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
+	I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
 	I2bv(SrcSI | DstDI | Mov | String, em_mov),
 	I2bv(SrcSI | DstDI | String, em_cmp),
 	/* 0xA8 - 0xAF */
@@ -3280,12 +3284,13 @@ static struct opcode twobyte_table[256] = {
 	D(DstMem | SrcReg | Src2CL | ModRM), N, N,
 	/* 0xA8 - 0xAF */
 	I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
-	DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	DI(ImplicitOps, rsm),
+	D(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable),
 	D(DstMem | SrcReg | Src2ImmByte | ModRM),
 	D(DstMem | SrcReg | Src2CL | ModRM),
 	D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
 	/* 0xB0 - 0xB7 */
-	D2bv(DstMem | SrcReg | ModRM | Lock),
+	D2bv(DstMem | SrcReg | ModRM | Lock | PageTable),
 	I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
 	D(DstMem | SrcReg | ModRM | BitOp | Lock),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
@@ -3293,7 +3298,7 @@ static struct opcode twobyte_table[256] = {
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xB8 - 0xBF */
 	N, N,
-	G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable),
 	D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
-- 
cgit v1.1


From 1cb3f3ae5a3855ba430430706da4201ace1d6ec4 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 22 Sep 2011 17:02:48 +0800
Subject: KVM: x86: retry non-page-table writing instructions

If the emulation is caused by #PF and it is non-page_table writing instruction,
it means the VM-EXIT is caused by shadow page protected, we can zap the shadow
page and retry this instruction directly

The idea is from Avi

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  1 +
 arch/x86/include/asm/kvm_host.h    |  5 ++++
 arch/x86/kvm/emulate.c             |  5 ++++
 arch/x86/kvm/mmu.c                 | 25 +++++++++++++++-----
 arch/x86/kvm/x86.c                 | 47 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 77 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index a026507..9a4acf4 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -364,6 +364,7 @@ enum x86_intercept {
 #endif
 
 int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
 #define EMULATION_FAILED -1
 #define EMULATION_OK 0
 #define EMULATION_RESTART 1
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b4973f4..4ceefa9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -444,6 +444,9 @@ struct kvm_vcpu_arch {
 
 	cpumask_var_t wbinvd_dirty_mask;
 
+	unsigned long last_retry_eip;
+	unsigned long last_retry_addr;
+
 	struct {
 		bool halted;
 		gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
@@ -692,6 +695,7 @@ enum emulation_result {
 #define EMULTYPE_NO_DECODE	    (1 << 0)
 #define EMULTYPE_TRAP_UD	    (1 << 1)
 #define EMULTYPE_SKIP		    (1 << 2)
+#define EMULTYPE_RETRY		    (1 << 3)
 int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
 			    int emulation_type, void *insn, int insn_len);
 
@@ -756,6 +760,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes,
 		       bool guest_initiated);
+int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a10950a..8547958 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3702,6 +3702,11 @@ done:
 	return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
 }
 
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt)
+{
+	return ctxt->d & PageTable;
+}
+
 static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 {
 	/* The second termination condition only applies for REPE
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 232c5a3..7a22eb8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1998,7 +1998,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
 }
 
-static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node;
@@ -2007,7 +2007,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 
 	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
 	r = 0;
-
+	spin_lock(&kvm->mmu_lock);
 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
 		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
 			 sp->role.word);
@@ -2015,8 +2015,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 	}
 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+	spin_unlock(&kvm->mmu_lock);
+
 	return r;
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 {
@@ -3698,9 +3701,8 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 
 	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
 
-	spin_lock(&vcpu->kvm->mmu_lock);
 	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-	spin_unlock(&vcpu->kvm->mmu_lock);
+
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
@@ -3721,10 +3723,18 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
+static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
+{
+	if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
+		return vcpu_match_mmio_gpa(vcpu, addr);
+
+	return vcpu_match_mmio_gva(vcpu, addr);
+}
+
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 		       void *insn, int insn_len)
 {
-	int r;
+	int r, emulation_type = EMULTYPE_RETRY;
 	enum emulation_result er;
 
 	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
@@ -3736,7 +3746,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 		goto out;
 	}
 
-	er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
+	if (is_mmio_page_fault(vcpu, cr2))
+		emulation_type = 0;
+
+	er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
 
 	switch (er) {
 	case EMULATE_DONE:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e24edbc..7ba1ab7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4836,6 +4836,50 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 	return false;
 }
 
+static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
+			      unsigned long cr2,  int emulation_type)
+{
+	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+	unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
+
+	last_retry_eip = vcpu->arch.last_retry_eip;
+	last_retry_addr = vcpu->arch.last_retry_addr;
+
+	/*
+	 * If the emulation is caused by #PF and it is non-page_table
+	 * writing instruction, it means the VM-EXIT is caused by shadow
+	 * page protected, we can zap the shadow page and retry this
+	 * instruction directly.
+	 *
+	 * Note: if the guest uses a non-page-table modifying instruction
+	 * on the PDE that points to the instruction, then we will unmap
+	 * the instruction and go to an infinite loop. So, we cache the
+	 * last retried eip and the last fault address, if we meet the eip
+	 * and the address again, we can break out of the potential infinite
+	 * loop.
+	 */
+	vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
+
+	if (!(emulation_type & EMULTYPE_RETRY))
+		return false;
+
+	if (x86_page_table_writing_insn(ctxt))
+		return false;
+
+	if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
+		return false;
+
+	vcpu->arch.last_retry_eip = ctxt->eip;
+	vcpu->arch.last_retry_addr = cr2;
+
+	if (!vcpu->arch.mmu.direct_map)
+		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
+
+	kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+
+	return true;
+}
+
 int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 			    unsigned long cr2,
 			    int emulation_type,
@@ -4877,6 +4921,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 		return EMULATE_DONE;
 	}
 
+	if (retry_instruction(ctxt, cr2, emulation_type))
+		return EMULATE_DONE;
+
 	/* this is needed for vmware backdoor interface to work since it
 	   changes registers values  during IO operation */
 	if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
-- 
cgit v1.1


From 6f6fbe98c3a9f3e9d69cd354a0459989e594e707 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 22 Sep 2011 16:55:10 +0800
Subject: KVM: x86: cleanup port-in/port-out emulated

Remove the same code between emulator_pio_in_emulated and
emulator_pio_out_emulated

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 59 ++++++++++++++++++++++++------------------------------
 1 file changed, 26 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7ba1ab7..a215448 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4349,32 +4349,24 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 	return r;
 }
 
-
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
-				    int size, unsigned short port, void *val,
-				    unsigned int count)
+static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
+			       unsigned short port, void *val,
+			       unsigned int count, bool in)
 {
-	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
-	if (vcpu->arch.pio.count)
-		goto data_avail;
-
-	trace_kvm_pio(0, port, size, count);
+	trace_kvm_pio(!in, port, size, count);
 
 	vcpu->arch.pio.port = port;
-	vcpu->arch.pio.in = 1;
+	vcpu->arch.pio.in = in;
 	vcpu->arch.pio.count  = count;
 	vcpu->arch.pio.size = size;
 
 	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
-	data_avail:
-		memcpy(val, vcpu->arch.pio_data, size * count);
 		vcpu->arch.pio.count = 0;
 		return 1;
 	}
 
 	vcpu->run->exit_reason = KVM_EXIT_IO;
-	vcpu->run->io.direction = KVM_EXIT_IO_IN;
+	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
 	vcpu->run->io.size = size;
 	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
 	vcpu->run->io.count = count;
@@ -4383,36 +4375,37 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 	return 0;
 }
 
-static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
-				     int size, unsigned short port,
-				     const void *val, unsigned int count)
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+				    int size, unsigned short port, void *val,
+				    unsigned int count)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+	int ret;
 
-	trace_kvm_pio(1, port, size, count);
-
-	vcpu->arch.pio.port = port;
-	vcpu->arch.pio.in = 0;
-	vcpu->arch.pio.count = count;
-	vcpu->arch.pio.size = size;
-
-	memcpy(vcpu->arch.pio_data, val, size * count);
+	if (vcpu->arch.pio.count)
+		goto data_avail;
 
-	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+	ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
+	if (ret) {
+data_avail:
+		memcpy(val, vcpu->arch.pio_data, size * count);
 		vcpu->arch.pio.count = 0;
 		return 1;
 	}
 
-	vcpu->run->exit_reason = KVM_EXIT_IO;
-	vcpu->run->io.direction = KVM_EXIT_IO_OUT;
-	vcpu->run->io.size = size;
-	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-	vcpu->run->io.count = count;
-	vcpu->run->io.port = port;
-
 	return 0;
 }
 
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
+				     int size, unsigned short port,
+				     const void *val, unsigned int count)
+{
+	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+	memcpy(vcpu->arch.pio_data, val, size * count);
+	return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
+}
+
 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 {
 	return kvm_x86_ops->get_segment_base(vcpu, seg);
-- 
cgit v1.1


From d01f8d5e02cc79998e3160f7ad545f77891b00e5 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 22 Sep 2011 16:55:36 +0800
Subject: KVM: MMU: do not mark accessed bit on pte write path

In current code, the accessed bit is always set when page fault occurred,
do not need to set it on pte write path

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/mmu.c              | 22 +---------------------
 2 files changed, 1 insertion(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4ceefa9..f8ab0d7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -356,7 +356,6 @@ struct kvm_vcpu_arch {
 	gfn_t last_pt_write_gfn;
 	int   last_pt_write_count;
 	u64  *last_pte_updated;
-	gfn_t last_pte_gfn;
 
 	struct fpu guest_fpu;
 	u64 xcr0;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7a22eb8..b432a71 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2207,11 +2207,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (set_mmio_spte(sptep, gfn, pfn, pte_access))
 		return 0;
 
-	/*
-	 * We don't set the accessed bit, since we sometimes want to see
-	 * whether the guest actually used the pte (in order to detect
-	 * demand paging).
-	 */
 	spte = PT_PRESENT_MASK;
 	if (!speculative)
 		spte |= shadow_accessed_mask;
@@ -2362,10 +2357,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		}
 	}
 	kvm_release_pfn_clean(pfn);
-	if (speculative) {
+	if (speculative)
 		vcpu->arch.last_pte_updated = sptep;
-		vcpu->arch.last_pte_gfn = gfn;
-	}
 }
 
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -3533,18 +3526,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
 	return !!(spte && (*spte & shadow_accessed_mask));
 }
 
-static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-	u64 *spte = vcpu->arch.last_pte_updated;
-
-	if (spte
-	    && vcpu->arch.last_pte_gfn == gfn
-	    && shadow_accessed_mask
-	    && !(*spte & shadow_accessed_mask)
-	    && is_shadow_present_pte(*spte))
-		set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
-}
-
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes,
 		       bool guest_initiated)
@@ -3615,7 +3596,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	++vcpu->kvm->stat.mmu_pte_write;
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 	if (guest_initiated) {
-		kvm_mmu_access_page(vcpu, gfn);
 		if (gfn == vcpu->arch.last_pt_write_gfn
 		    && !last_updated_pte_accessed(vcpu)) {
 			++vcpu->arch.last_pt_write_count;
-- 
cgit v1.1


From 505aef8f30a95f7e4abf2c07e54ded1521587ba0 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 22 Sep 2011 16:56:06 +0800
Subject: KVM: MMU: cleanup FNAME(invlpg)

Directly Use mmu_page_zap_pte to zap spte in FNAME(invlpg), also remove the
same code between FNAME(invlpg) and FNAME(sync_page)

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 16 ++++++++++------
 arch/x86/kvm/paging_tmpl.h | 44 +++++++++++++++++---------------------------
 2 files changed, 27 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b432a71..d15f908 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1809,7 +1809,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	}
 }
 
-static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 			     u64 *spte)
 {
 	u64 pte;
@@ -1817,17 +1817,21 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 
 	pte = *spte;
 	if (is_shadow_present_pte(pte)) {
-		if (is_last_spte(pte, sp->role.level))
+		if (is_last_spte(pte, sp->role.level)) {
 			drop_spte(kvm, spte);
-		else {
+			if (is_large_pte(pte))
+				--kvm->stat.lpages;
+		} else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, spte);
 		}
-	} else if (is_mmio_spte(pte))
+		return true;
+	}
+
+	if (is_mmio_spte(pte))
 		mmu_spte_clear_no_track(spte);
 
-	if (is_large_pte(pte))
-		--kvm->stat.lpages;
+	return false;
 }
 
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9299410..d8d3906 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -656,6 +656,18 @@ out_unlock:
 	return 0;
 }
 
+static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
+{
+	int offset = 0;
+
+	WARN_ON(sp->role.level != 1);
+
+	if (PTTYPE == 32)
+		offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+}
+
 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
 	struct kvm_shadow_walk_iterator iterator;
@@ -663,7 +675,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 	gpa_t pte_gpa = -1;
 	int level;
 	u64 *sptep;
-	int need_flush = 0;
 
 	vcpu_clear_mmio_info(vcpu, gva);
 
@@ -675,36 +686,20 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 
 		sp = page_header(__pa(sptep));
 		if (is_last_spte(*sptep, level)) {
-			int offset, shift;
-
 			if (!sp->unsync)
 				break;
 
-			shift = PAGE_SHIFT -
-				  (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
-			offset = sp->role.quadrant << shift;
-
-			pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
+			pte_gpa = FNAME(get_level1_sp_gpa)(sp);
 			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
-			if (is_shadow_present_pte(*sptep)) {
-				if (is_large_pte(*sptep))
-					--vcpu->kvm->stat.lpages;
-				drop_spte(vcpu->kvm, sptep);
-				need_flush = 1;
-			} else if (is_mmio_spte(*sptep))
-				mmu_spte_clear_no_track(sptep);
-
-			break;
+			if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
+				kvm_flush_remote_tlbs(vcpu->kvm);
 		}
 
 		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
 			break;
 	}
 
-	if (need_flush)
-		kvm_flush_remote_tlbs(vcpu->kvm);
-
 	atomic_inc(&vcpu->kvm->arch.invlpg_counter);
 
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -769,19 +764,14 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
  */
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-	int i, offset, nr_present;
+	int i, nr_present = 0;
 	bool host_writable;
 	gpa_t first_pte_gpa;
 
-	offset = nr_present = 0;
-
 	/* direct kvm_mmu_page can not be unsync. */
 	BUG_ON(sp->role.direct);
 
-	if (PTTYPE == 32)
-		offset = sp->role.quadrant << PT64_LEVEL_BITS;
-
-	first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
 
 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
 		unsigned pte_access;
-- 
cgit v1.1


From f57f2ef58f6703e6df70ed52a198920cb3e8edba Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 22 Sep 2011 16:56:39 +0800
Subject: KVM: MMU: fast prefetch spte on invlpg path

Fast prefetch spte for the unsync shadow page on invlpg path

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  4 +---
 arch/x86/kvm/mmu.c              | 38 +++++++++++++++-----------------------
 arch/x86/kvm/paging_tmpl.h      | 30 ++++++++++++++++++------------
 arch/x86/kvm/x86.c              |  4 ++--
 4 files changed, 36 insertions(+), 40 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f8ab0d7..3c9ea26 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -461,7 +461,6 @@ struct kvm_arch {
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_max_mmu_pages;
 	unsigned int indirect_shadow_pages;
-	atomic_t invlpg_counter;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
 	 * Hash table of struct kvm_mmu_page.
@@ -757,8 +756,7 @@ int fx_init(struct kvm_vcpu *vcpu);
 
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes,
-		       bool guest_initiated);
+		       const u8 *new, int bytes);
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d15f908..c01137f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3531,8 +3531,7 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
 }
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes,
-		       bool guest_initiated)
+		       const u8 *new, int bytes)
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	union kvm_mmu_page_role mask = { .word = 0 };
@@ -3541,7 +3540,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	LIST_HEAD(invalid_list);
 	u64 entry, gentry, *spte;
 	unsigned pte_size, page_offset, misaligned, quadrant, offset;
-	int level, npte, invlpg_counter, r, flooded = 0;
+	int level, npte, r, flooded = 0;
 	bool remote_flush, local_flush, zap_page;
 
 	/*
@@ -3556,19 +3555,16 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 
-	invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
-
 	/*
 	 * Assume that the pte write on a page table of the same type
 	 * as the current vcpu paging mode since we update the sptes only
 	 * when they have the same mode.
 	 */
-	if ((is_pae(vcpu) && bytes == 4) || !new) {
+	if (is_pae(vcpu) && bytes == 4) {
 		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
-		if (is_pae(vcpu)) {
-			gpa &= ~(gpa_t)7;
-			bytes = 8;
-		}
+		gpa &= ~(gpa_t)7;
+		bytes = 8;
+
 		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
 		if (r)
 			gentry = 0;
@@ -3594,22 +3590,18 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	 */
 	mmu_topup_memory_caches(vcpu);
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
-		gentry = 0;
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
-	if (guest_initiated) {
-		if (gfn == vcpu->arch.last_pt_write_gfn
-		    && !last_updated_pte_accessed(vcpu)) {
-			++vcpu->arch.last_pt_write_count;
-			if (vcpu->arch.last_pt_write_count >= 3)
-				flooded = 1;
-		} else {
-			vcpu->arch.last_pt_write_gfn = gfn;
-			vcpu->arch.last_pt_write_count = 1;
-			vcpu->arch.last_pte_updated = NULL;
-		}
+	if (gfn == vcpu->arch.last_pt_write_gfn
+	    && !last_updated_pte_accessed(vcpu)) {
+		++vcpu->arch.last_pt_write_count;
+		if (vcpu->arch.last_pt_write_count >= 3)
+			flooded = 1;
+	} else {
+		vcpu->arch.last_pt_write_gfn = gfn;
+		vcpu->arch.last_pt_write_count = 1;
+		vcpu->arch.last_pte_updated = NULL;
 	}
 
 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d8d3906..9efb860 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -672,20 +672,27 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
-	gpa_t pte_gpa = -1;
 	int level;
 	u64 *sptep;
 
 	vcpu_clear_mmio_info(vcpu, gva);
 
-	spin_lock(&vcpu->kvm->mmu_lock);
+	/*
+	 * No need to check return value here, rmap_can_add() can
+	 * help us to skip pte prefetch later.
+	 */
+	mmu_topup_memory_caches(vcpu);
 
+	spin_lock(&vcpu->kvm->mmu_lock);
 	for_each_shadow_entry(vcpu, gva, iterator) {
 		level = iterator.level;
 		sptep = iterator.sptep;
 
 		sp = page_header(__pa(sptep));
 		if (is_last_spte(*sptep, level)) {
+			pt_element_t gpte;
+			gpa_t pte_gpa;
+
 			if (!sp->unsync)
 				break;
 
@@ -694,22 +701,21 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 
 			if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
 				kvm_flush_remote_tlbs(vcpu->kvm);
+
+			if (!rmap_can_add(vcpu))
+				break;
+
+			if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
+						  sizeof(pt_element_t)))
+				break;
+
+			FNAME(update_pte)(vcpu, sp, sptep, &gpte);
 		}
 
 		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
 			break;
 	}
-
-	atomic_inc(&vcpu->kvm->arch.invlpg_counter);
-
 	spin_unlock(&vcpu->kvm->mmu_lock);
-
-	if (pte_gpa == -1)
-		return;
-
-	if (mmu_topup_memory_caches(vcpu))
-		return;
-	kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a215448..9c980ce 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4087,7 +4087,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
 	if (ret < 0)
 		return 0;
-	kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
 	return 1;
 }
 
@@ -4324,7 +4324,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
 	if (!exchanged)
 		return X86EMUL_CMPXCHG_FAILED;
 
-	kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
+	kvm_mmu_pte_write(vcpu, gpa, new, bytes);
 
 	return X86EMUL_CONTINUE;
 
-- 
cgit v1.1


From f8734352c6f9c4f3d85f0c97b7731b7f925c62fd Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 22 Sep 2011 16:56:58 +0800
Subject: KVM: MMU: remove unnecessary kvm_mmu_free_some_pages

In kvm_mmu_pte_write, we do not need to alloc shadow page, so calling
kvm_mmu_free_some_pages is really unnecessary

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c01137f..7e57938 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3590,7 +3590,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	 */
 	mmu_topup_memory_caches(vcpu);
 	spin_lock(&vcpu->kvm->mmu_lock);
-	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 	if (gfn == vcpu->arch.last_pt_write_gfn
-- 
cgit v1.1


From 889e5cbced6c191bb7e25c1b30b43e59a12561f9 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 22 Sep 2011 16:57:23 +0800
Subject: KVM: MMU: split kvm_mmu_pte_write function

kvm_mmu_pte_write is too long, we split it for better readable

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 194 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 119 insertions(+), 75 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7e57938..986aea5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3530,48 +3530,28 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
 	return !!(spte && (*spte & shadow_accessed_mask));
 }
 
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes)
+static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
+				    const u8 *new, int *bytes)
 {
-	gfn_t gfn = gpa >> PAGE_SHIFT;
-	union kvm_mmu_page_role mask = { .word = 0 };
-	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
-	LIST_HEAD(invalid_list);
-	u64 entry, gentry, *spte;
-	unsigned pte_size, page_offset, misaligned, quadrant, offset;
-	int level, npte, r, flooded = 0;
-	bool remote_flush, local_flush, zap_page;
-
-	/*
-	 * If we don't have indirect shadow pages, it means no page is
-	 * write-protected, so we can exit simply.
-	 */
-	if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
-		return;
-
-	zap_page = remote_flush = local_flush = false;
-	offset = offset_in_page(gpa);
-
-	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
+	u64 gentry;
+	int r;
 
 	/*
 	 * Assume that the pte write on a page table of the same type
 	 * as the current vcpu paging mode since we update the sptes only
 	 * when they have the same mode.
 	 */
-	if (is_pae(vcpu) && bytes == 4) {
+	if (is_pae(vcpu) && *bytes == 4) {
 		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
-		gpa &= ~(gpa_t)7;
-		bytes = 8;
-
-		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
+		*gpa &= ~(gpa_t)7;
+		*bytes = 8;
+		r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
 		if (r)
 			gentry = 0;
 		new = (const u8 *)&gentry;
 	}
 
-	switch (bytes) {
+	switch (*bytes) {
 	case 4:
 		gentry = *(const u32 *)new;
 		break;
@@ -3583,71 +3563,135 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		break;
 	}
 
-	/*
-	 * No need to care whether allocation memory is successful
-	 * or not since pte prefetch is skiped if it does not have
-	 * enough objects in the cache.
-	 */
-	mmu_topup_memory_caches(vcpu);
-	spin_lock(&vcpu->kvm->mmu_lock);
-	++vcpu->kvm->stat.mmu_pte_write;
-	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
+	return gentry;
+}
+
+/*
+ * If we're seeing too many writes to a page, it may no longer be a page table,
+ * or we may be forking, in which case it is better to unmap the page.
+ */
+static bool detect_write_flooding(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	bool flooded = false;
+
 	if (gfn == vcpu->arch.last_pt_write_gfn
 	    && !last_updated_pte_accessed(vcpu)) {
 		++vcpu->arch.last_pt_write_count;
 		if (vcpu->arch.last_pt_write_count >= 3)
-			flooded = 1;
+			flooded = true;
 	} else {
 		vcpu->arch.last_pt_write_gfn = gfn;
 		vcpu->arch.last_pt_write_count = 1;
 		vcpu->arch.last_pte_updated = NULL;
 	}
 
+	return flooded;
+}
+
+/*
+ * Misaligned accesses are too much trouble to fix up; also, they usually
+ * indicate a page is not used as a page table.
+ */
+static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
+				    int bytes)
+{
+	unsigned offset, pte_size, misaligned;
+
+	pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+		 gpa, bytes, sp->role.word);
+
+	offset = offset_in_page(gpa);
+	pte_size = sp->role.cr4_pae ? 8 : 4;
+	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+	misaligned |= bytes < 4;
+
+	return misaligned;
+}
+
+static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
+{
+	unsigned page_offset, quadrant;
+	u64 *spte;
+	int level;
+
+	page_offset = offset_in_page(gpa);
+	level = sp->role.level;
+	*nspte = 1;
+	if (!sp->role.cr4_pae) {
+		page_offset <<= 1;	/* 32->64 */
+		/*
+		 * A 32-bit pde maps 4MB while the shadow pdes map
+		 * only 2MB.  So we need to double the offset again
+		 * and zap two pdes instead of one.
+		 */
+		if (level == PT32_ROOT_LEVEL) {
+			page_offset &= ~7; /* kill rounding error */
+			page_offset <<= 1;
+			*nspte = 2;
+		}
+		quadrant = page_offset >> PAGE_SHIFT;
+		page_offset &= ~PAGE_MASK;
+		if (quadrant != sp->role.quadrant)
+			return NULL;
+	}
+
+	spte = &sp->spt[page_offset / sizeof(*spte)];
+	return spte;
+}
+
+void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+		       const u8 *new, int bytes)
+{
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+	union kvm_mmu_page_role mask = { .word = 0 };
+	struct kvm_mmu_page *sp;
+	struct hlist_node *node;
+	LIST_HEAD(invalid_list);
+	u64 entry, gentry, *spte;
+	int npte;
+	bool remote_flush, local_flush, zap_page, flooded, misaligned;
+
+	/*
+	 * If we don't have indirect shadow pages, it means no page is
+	 * write-protected, so we can exit simply.
+	 */
+	if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+		return;
+
+	zap_page = remote_flush = local_flush = false;
+
+	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
+
+	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes);
+
+	/*
+	 * No need to care whether allocation memory is successful
+	 * or not since pte prefetch is skiped if it does not have
+	 * enough objects in the cache.
+	 */
+	mmu_topup_memory_caches(vcpu);
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+	++vcpu->kvm->stat.mmu_pte_write;
+	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
+
+	flooded = detect_write_flooding(vcpu, gfn);
 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
-		pte_size = sp->role.cr4_pae ? 8 : 4;
-		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
-		misaligned |= bytes < 4;
+		misaligned = detect_write_misaligned(sp, gpa, bytes);
+
 		if (misaligned || flooded) {
-			/*
-			 * Misaligned accesses are too much trouble to fix
-			 * up; also, they usually indicate a page is not used
-			 * as a page table.
-			 *
-			 * If we're seeing too many writes to a page,
-			 * it may no longer be a page table, or we may be
-			 * forking, in which case it is better to unmap the
-			 * page.
-			 */
-			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
-				 gpa, bytes, sp->role.word);
 			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 						     &invalid_list);
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
-		page_offset = offset;
-		level = sp->role.level;
-		npte = 1;
-		if (!sp->role.cr4_pae) {
-			page_offset <<= 1;	/* 32->64 */
-			/*
-			 * A 32-bit pde maps 4MB while the shadow pdes map
-			 * only 2MB.  So we need to double the offset again
-			 * and zap two pdes instead of one.
-			 */
-			if (level == PT32_ROOT_LEVEL) {
-				page_offset &= ~7; /* kill rounding error */
-				page_offset <<= 1;
-				npte = 2;
-			}
-			quadrant = page_offset >> PAGE_SHIFT;
-			page_offset &= ~PAGE_MASK;
-			if (quadrant != sp->role.quadrant)
-				continue;
-		}
+
+		spte = get_written_sptes(sp, gpa, &npte);
+		if (!spte)
+			continue;
+
 		local_flush = true;
-		spte = &sp->spt[page_offset / sizeof(*spte)];
 		while (npte--) {
 			entry = *spte;
 			mmu_page_zap_pte(vcpu->kvm, sp, spte);
-- 
cgit v1.1


From 5d9ca30e96f567b67a36727aa4ebb34911a2b84a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 22 Sep 2011 16:57:55 +0800
Subject: KVM: MMU: fix detecting misaligned accessed

Sometimes, we only modify the last one byte of a pte to update status bit,
for example, clear_bit is used to clear r/w bit in linux kernel and 'andb'
instruction is used in this function, in this case, kvm_mmu_pte_write will
treat it as misaligned access, and the shadow page table is zapped

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 986aea5..ca6f72a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3602,6 +3602,14 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
 
 	offset = offset_in_page(gpa);
 	pte_size = sp->role.cr4_pae ? 8 : 4;
+
+	/*
+	 * Sometimes, the OS only writes the last one bytes to update status
+	 * bits, for example, in linux, andb instruction is used in clear_bit().
+	 */
+	if (!(offset & (pte_size - 1)) && bytes == 1)
+		return false;
+
 	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
 	misaligned |= bytes < 4;
 
-- 
cgit v1.1


From a30f47cb150dd8d109923eeb65fe73e8b3e09046 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 22 Sep 2011 16:58:36 +0800
Subject: KVM: MMU: improve write flooding detected

Detecting write-flooding does not work well, when we handle page written, if
the last speculative spte is not accessed, we treat the page is
write-flooding, however, we can speculative spte on many path, such as pte
prefetch, page synced, that means the last speculative spte may be not point
to the written page and the written page can be accessed via other sptes, so
depends on the Accessed bit of the last speculative spte is not enough

Instead of detected page accessed, we can detect whether the spte is accessed
after it is written, if the spte is not accessed but it is written frequently,
we treat is not a page table or it not used for a long time

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  6 ++--
 arch/x86/kvm/mmu.c              | 62 +++++++++++++++++------------------------
 arch/x86/kvm/paging_tmpl.h      | 12 ++++----
 3 files changed, 32 insertions(+), 48 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3c9ea26..c1f19de 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -239,6 +239,8 @@ struct kvm_mmu_page {
 	int clear_spte_count;
 #endif
 
+	int write_flooding_count;
+
 	struct rcu_head rcu;
 };
 
@@ -353,10 +355,6 @@ struct kvm_vcpu_arch {
 	struct kvm_mmu_memory_cache mmu_page_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
-	gfn_t last_pt_write_gfn;
-	int   last_pt_write_count;
-	u64  *last_pte_updated;
-
 	struct fpu guest_fpu;
 	u64 xcr0;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ca6f72a..e9534ce 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1653,6 +1653,18 @@ static void init_shadow_page_table(struct kvm_mmu_page *sp)
 		sp->spt[i] = 0ull;
 }
 
+static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
+{
+	sp->write_flooding_count = 0;
+}
+
+static void clear_sp_write_flooding_count(u64 *spte)
+{
+	struct kvm_mmu_page *sp =  page_header(__pa(spte));
+
+	__clear_sp_write_flooding_count(sp);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
@@ -1696,6 +1708,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		} else if (sp->unsync)
 			kvm_mmu_mark_parents_unsync(sp);
 
+		__clear_sp_write_flooding_count(sp);
 		trace_kvm_mmu_get_page(sp, false);
 		return sp;
 	}
@@ -1848,15 +1861,6 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
 	mmu_page_remove_parent_pte(sp, parent_pte);
 }
 
-static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
-{
-	int i;
-	struct kvm_vcpu *vcpu;
-
-	kvm_for_each_vcpu(i, vcpu, kvm)
-		vcpu->arch.last_pte_updated = NULL;
-}
-
 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	u64 *parent_pte;
@@ -1916,7 +1920,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	}
 
 	sp->role.invalid = 1;
-	kvm_mmu_reset_last_pte_updated(kvm);
 	return ret;
 }
 
@@ -2361,8 +2364,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		}
 	}
 	kvm_release_pfn_clean(pfn);
-	if (speculative)
-		vcpu->arch.last_pte_updated = sptep;
 }
 
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -3523,13 +3524,6 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
 		kvm_mmu_flush_tlb(vcpu);
 }
 
-static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
-{
-	u64 *spte = vcpu->arch.last_pte_updated;
-
-	return !!(spte && (*spte & shadow_accessed_mask));
-}
-
 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
 				    const u8 *new, int *bytes)
 {
@@ -3570,22 +3564,16 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
  * If we're seeing too many writes to a page, it may no longer be a page table,
  * or we may be forking, in which case it is better to unmap the page.
  */
-static bool detect_write_flooding(struct kvm_vcpu *vcpu, gfn_t gfn)
+static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte)
 {
-	bool flooded = false;
-
-	if (gfn == vcpu->arch.last_pt_write_gfn
-	    && !last_updated_pte_accessed(vcpu)) {
-		++vcpu->arch.last_pt_write_count;
-		if (vcpu->arch.last_pt_write_count >= 3)
-			flooded = true;
-	} else {
-		vcpu->arch.last_pt_write_gfn = gfn;
-		vcpu->arch.last_pt_write_count = 1;
-		vcpu->arch.last_pte_updated = NULL;
-	}
+	/*
+	 * Skip write-flooding detected for the sp whose level is 1, because
+	 * it can become unsync, then the guest page is not write-protected.
+	 */
+	if (sp->role.level == 1)
+		return false;
 
-	return flooded;
+	return ++sp->write_flooding_count >= 3;
 }
 
 /*
@@ -3657,7 +3645,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	LIST_HEAD(invalid_list);
 	u64 entry, gentry, *spte;
 	int npte;
-	bool remote_flush, local_flush, zap_page, flooded, misaligned;
+	bool remote_flush, local_flush, zap_page;
 
 	/*
 	 * If we don't have indirect shadow pages, it means no page is
@@ -3683,12 +3671,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	++vcpu->kvm->stat.mmu_pte_write;
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 
-	flooded = detect_write_flooding(vcpu, gfn);
 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
-		misaligned = detect_write_misaligned(sp, gpa, bytes);
+		spte = get_written_sptes(sp, gpa, &npte);
 
-		if (misaligned || flooded) {
+		if (detect_write_misaligned(sp, gpa, bytes) ||
+		      detect_write_flooding(sp, spte)) {
 			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 						     &invalid_list);
 			++vcpu->kvm->stat.mmu_flooded;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9efb860..52e9d58 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -497,6 +497,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	     shadow_walk_next(&it)) {
 		gfn_t table_gfn;
 
+		clear_sp_write_flooding_count(it.sptep);
 		drop_large_spte(vcpu, it.sptep);
 
 		sp = NULL;
@@ -522,6 +523,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	     shadow_walk_next(&it)) {
 		gfn_t direct_gfn;
 
+		clear_sp_write_flooding_count(it.sptep);
 		validate_direct_spte(vcpu, it.sptep, direct_access);
 
 		drop_large_spte(vcpu, it.sptep);
@@ -536,6 +538,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		link_shadow_page(it.sptep, sp);
 	}
 
+	clear_sp_write_flooding_count(it.sptep);
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
 		     user_fault, write_fault, emulate, it.level,
 		     gw->gfn, pfn, prefault, map_writable);
@@ -599,11 +602,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	 */
 	if (!r) {
 		pgprintk("%s: guest page fault\n", __func__);
-		if (!prefault) {
+		if (!prefault)
 			inject_page_fault(vcpu, &walker.fault);
-			/* reset fork detector */
-			vcpu->arch.last_pt_write_count = 0;
-		}
+
 		return 0;
 	}
 
@@ -641,9 +642,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
 		 sptep, *sptep, emulate);
 
-	if (!emulate)
-		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
-
 	++vcpu->stat.pf_fixed;
 	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 	spin_unlock(&vcpu->kvm->mmu_lock);
-- 
cgit v1.1


From 3f2e5260f5a17d37be3e3c83aca2f335b9bf3893 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 14 Sep 2011 09:58:32 +0200
Subject: KVM: x86: Simplify kvm timer handler

The vcpu reference of a kvm_timer can't become NULL while the timer is
valid, so drop this redundant test. This also makes it pointless to
carry a separate __kvm_timer_fn, fold it into kvm_timer_fn.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/timer.c | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index ae432ea..6b85cc6 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -18,9 +18,10 @@
 #include <linux/atomic.h>
 #include "kvm_timer.h"
 
-static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
 {
-	int restart_timer = 0;
+	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+	struct kvm_vcpu *vcpu = ktimer->vcpu;
 	wait_queue_head_t *q = &vcpu->wq;
 
 	/*
@@ -40,26 +41,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
 
 	if (ktimer->t_ops->is_periodic(ktimer)) {
 		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
-		restart_timer = 1;
-	}
-
-	return restart_timer;
-}
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
-{
-	int restart_timer;
-	struct kvm_vcpu *vcpu;
-	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
-
-	vcpu = ktimer->vcpu;
-	if (!vcpu)
-		return HRTIMER_NORESTART;
-
-	restart_timer = __kvm_timer_fn(vcpu, ktimer);
-	if (restart_timer)
 		return HRTIMER_RESTART;
-	else
+	} else
 		return HRTIMER_NORESTART;
 }
-
-- 
cgit v1.1


From 5202397df819d3c5a3f201bd4af6b86542115fb6 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Tue, 1 Nov 2011 17:28:47 -0700
Subject: KVM guest: remove KVM guest pv mmu support

This has not been used for some years now.  It's time to remove it.

Signed-off-by: Chris Wright <chrisw@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kernel/kvm.c | 181 --------------------------------------------------
 1 file changed, 181 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a9c2116..f0c6fd6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,8 +39,6 @@
 #include <asm/desc.h>
 #include <asm/tlbflush.h>
 
-#define MMU_QUEUE_SIZE 1024
-
 static int kvmapf = 1;
 
 static int parse_no_kvmapf(char *arg)
@@ -60,21 +58,10 @@ static int parse_no_stealacc(char *arg)
 
 early_param("no-steal-acc", parse_no_stealacc);
 
-struct kvm_para_state {
-	u8 mmu_queue[MMU_QUEUE_SIZE];
-	int mmu_queue_len;
-};
-
-static DEFINE_PER_CPU(struct kvm_para_state, para_state);
 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
 static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
 static int has_steal_clock = 0;
 
-static struct kvm_para_state *kvm_para_state(void)
-{
-	return &per_cpu(para_state, raw_smp_processor_id());
-}
-
 /*
  * No need for any "IO delay" on KVM
  */
@@ -271,151 +258,6 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
 	}
 }
 
-static void kvm_mmu_op(void *buffer, unsigned len)
-{
-	int r;
-	unsigned long a1, a2;
-
-	do {
-		a1 = __pa(buffer);
-		a2 = 0;   /* on i386 __pa() always returns <4G */
-		r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
-		buffer += r;
-		len -= r;
-	} while (len);
-}
-
-static void mmu_queue_flush(struct kvm_para_state *state)
-{
-	if (state->mmu_queue_len) {
-		kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
-		state->mmu_queue_len = 0;
-	}
-}
-
-static void kvm_deferred_mmu_op(void *buffer, int len)
-{
-	struct kvm_para_state *state = kvm_para_state();
-
-	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
-		kvm_mmu_op(buffer, len);
-		return;
-	}
-	if (state->mmu_queue_len + len > sizeof state->mmu_queue)
-		mmu_queue_flush(state);
-	memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
-	state->mmu_queue_len += len;
-}
-
-static void kvm_mmu_write(void *dest, u64 val)
-{
-	__u64 pte_phys;
-	struct kvm_mmu_op_write_pte wpte;
-
-#ifdef CONFIG_HIGHPTE
-	struct page *page;
-	unsigned long dst = (unsigned long) dest;
-
-	page = kmap_atomic_to_page(dest);
-	pte_phys = page_to_pfn(page);
-	pte_phys <<= PAGE_SHIFT;
-	pte_phys += (dst & ~(PAGE_MASK));
-#else
-	pte_phys = (unsigned long)__pa(dest);
-#endif
-	wpte.header.op = KVM_MMU_OP_WRITE_PTE;
-	wpte.pte_val = val;
-	wpte.pte_phys = pte_phys;
-
-	kvm_deferred_mmu_op(&wpte, sizeof wpte);
-}
-
-/*
- * We only need to hook operations that are MMU writes.  We hook these so that
- * we can use lazy MMU mode to batch these operations.  We could probably
- * improve the performance of the host code if we used some of the information
- * here to simplify processing of batched writes.
- */
-static void kvm_set_pte(pte_t *ptep, pte_t pte)
-{
-	kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
-			   pte_t *ptep, pte_t pte)
-{
-	kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
-{
-	kvm_mmu_write(pmdp, pmd_val(pmd));
-}
-
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
-static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
-	kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_pte_clear(struct mm_struct *mm,
-			  unsigned long addr, pte_t *ptep)
-{
-	kvm_mmu_write(ptep, 0);
-}
-
-static void kvm_pmd_clear(pmd_t *pmdp)
-{
-	kvm_mmu_write(pmdp, 0);
-}
-#endif
-
-static void kvm_set_pud(pud_t *pudp, pud_t pud)
-{
-	kvm_mmu_write(pudp, pud_val(pud));
-}
-
-#if PAGETABLE_LEVELS == 4
-static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
-{
-	kvm_mmu_write(pgdp, pgd_val(pgd));
-}
-#endif
-#endif /* PAGETABLE_LEVELS >= 3 */
-
-static void kvm_flush_tlb(void)
-{
-	struct kvm_mmu_op_flush_tlb ftlb = {
-		.header.op = KVM_MMU_OP_FLUSH_TLB,
-	};
-
-	kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
-}
-
-static void kvm_release_pt(unsigned long pfn)
-{
-	struct kvm_mmu_op_release_pt rpt = {
-		.header.op = KVM_MMU_OP_RELEASE_PT,
-		.pt_phys = (u64)pfn << PAGE_SHIFT,
-	};
-
-	kvm_mmu_op(&rpt, sizeof rpt);
-}
-
-static void kvm_enter_lazy_mmu(void)
-{
-	paravirt_enter_lazy_mmu();
-}
-
-static void kvm_leave_lazy_mmu(void)
-{
-	struct kvm_para_state *state = kvm_para_state();
-
-	mmu_queue_flush(state);
-	paravirt_leave_lazy_mmu();
-}
-
 static void __init paravirt_ops_setup(void)
 {
 	pv_info.name = "KVM";
@@ -424,29 +266,6 @@ static void __init paravirt_ops_setup(void)
 	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
 		pv_cpu_ops.io_delay = kvm_io_delay;
 
-	if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
-		pv_mmu_ops.set_pte = kvm_set_pte;
-		pv_mmu_ops.set_pte_at = kvm_set_pte_at;
-		pv_mmu_ops.set_pmd = kvm_set_pmd;
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
-		pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
-		pv_mmu_ops.pte_clear = kvm_pte_clear;
-		pv_mmu_ops.pmd_clear = kvm_pmd_clear;
-#endif
-		pv_mmu_ops.set_pud = kvm_set_pud;
-#if PAGETABLE_LEVELS == 4
-		pv_mmu_ops.set_pgd = kvm_set_pgd;
-#endif
-#endif
-		pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
-		pv_mmu_ops.release_pte = kvm_release_pt;
-		pv_mmu_ops.release_pmd = kvm_release_pt;
-		pv_mmu_ops.release_pud = kvm_release_pt;
-
-		pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
-		pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
-	}
 #ifdef CONFIG_X86_IO_APIC
 	no_timer_check = 1;
 #endif
-- 
cgit v1.1


From fb92045843a8cd99c7b843d9b567a680a3854ba1 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Tue, 1 Nov 2011 17:31:18 -0700
Subject: KVM: MMU: remove KVM host pv mmu support

The host side pv mmu support has been marked for feature removal in
January 2011.  It's not in use, is slower than shadow or hardware
assisted paging, and a maintenance burden.  It's November 2011, time to
remove it.

Signed-off-by: Chris Wright <chrisw@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  13 ----
 arch/x86/kvm/mmu.c              | 135 ----------------------------------------
 arch/x86/kvm/x86.c              |  12 ----
 3 files changed, 160 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c1f19de..6d83264 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -244,13 +244,6 @@ struct kvm_mmu_page {
 	struct rcu_head rcu;
 };
 
-struct kvm_pv_mmu_op_buffer {
-	void *ptr;
-	unsigned len;
-	unsigned processed;
-	char buf[512] __aligned(sizeof(long));
-};
-
 struct kvm_pio_request {
 	unsigned long count;
 	int in;
@@ -347,10 +340,6 @@ struct kvm_vcpu_arch {
 	 */
 	struct kvm_mmu *walk_mmu;
 
-	/* only needed in kvm_pv_mmu_op() path, but it's hot so
-	 * put it here to avoid allocation */
-	struct kvm_pv_mmu_op_buffer mmu_op_buffer;
-
 	struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
 	struct kvm_mmu_memory_cache mmu_page_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
@@ -667,8 +656,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
 
 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 			  const void *val, int bytes);
-int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
-		  gpa_t addr, unsigned long *ret);
 u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 extern bool tdp_enabled;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e9534ce..a9b3a32 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2028,20 +2028,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
-static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
-{
-	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
-	LIST_HEAD(invalid_list);
-
-	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
-		pgprintk("%s: zap %llx %x\n",
-			 __func__, gfn, sp->role.word);
-		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
-	}
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-}
-
 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 {
 	int slot = memslot_id(kvm, gfn);
@@ -4004,127 +3990,6 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 	return nr_mmu_pages;
 }
 
-static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
-				unsigned len)
-{
-	if (len > buffer->len)
-		return NULL;
-	return buffer->ptr;
-}
-
-static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
-				unsigned len)
-{
-	void *ret;
-
-	ret = pv_mmu_peek_buffer(buffer, len);
-	if (!ret)
-		return ret;
-	buffer->ptr += len;
-	buffer->len -= len;
-	buffer->processed += len;
-	return ret;
-}
-
-static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
-			     gpa_t addr, gpa_t value)
-{
-	int bytes = 8;
-	int r;
-
-	if (!is_long_mode(vcpu) && !is_pae(vcpu))
-		bytes = 4;
-
-	r = mmu_topup_memory_caches(vcpu);
-	if (r)
-		return r;
-
-	if (!emulator_write_phys(vcpu, addr, &value, bytes))
-		return -EFAULT;
-
-	return 1;
-}
-
-static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
-	(void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
-	return 1;
-}
-
-static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
-{
-	spin_lock(&vcpu->kvm->mmu_lock);
-	mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
-	spin_unlock(&vcpu->kvm->mmu_lock);
-	return 1;
-}
-
-static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
-			     struct kvm_pv_mmu_op_buffer *buffer)
-{
-	struct kvm_mmu_op_header *header;
-
-	header = pv_mmu_peek_buffer(buffer, sizeof *header);
-	if (!header)
-		return 0;
-	switch (header->op) {
-	case KVM_MMU_OP_WRITE_PTE: {
-		struct kvm_mmu_op_write_pte *wpte;
-
-		wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
-		if (!wpte)
-			return 0;
-		return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
-					wpte->pte_val);
-	}
-	case KVM_MMU_OP_FLUSH_TLB: {
-		struct kvm_mmu_op_flush_tlb *ftlb;
-
-		ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
-		if (!ftlb)
-			return 0;
-		return kvm_pv_mmu_flush_tlb(vcpu);
-	}
-	case KVM_MMU_OP_RELEASE_PT: {
-		struct kvm_mmu_op_release_pt *rpt;
-
-		rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
-		if (!rpt)
-			return 0;
-		return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
-	}
-	default: return 0;
-	}
-}
-
-int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
-		  gpa_t addr, unsigned long *ret)
-{
-	int r;
-	struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
-
-	buffer->ptr = buffer->buf;
-	buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
-	buffer->processed = 0;
-
-	r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
-	if (r)
-		goto out;
-
-	while (buffer->len) {
-		r = kvm_pv_mmu_op_one(vcpu, buffer);
-		if (r < 0)
-			goto out;
-		if (r == 0)
-			break;
-	}
-
-	r = 1;
-out:
-	*ret = buffer->processed;
-	return r;
-}
-
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 {
 	struct kvm_shadow_walk_iterator iterator;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9c980ce..a3b25a5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5273,15 +5273,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
-static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
-			   unsigned long a1)
-{
-	if (is_long_mode(vcpu))
-		return a0;
-	else
-		return a0 | ((gpa_t)a1 << 32);
-}
-
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 {
 	u64 param, ingpa, outgpa, ret;
@@ -5377,9 +5368,6 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	case KVM_HC_VAPIC_POLL_IRQ:
 		ret = 0;
 		break;
-	case KVM_HC_MMU_OP:
-		r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
-		break;
 	default:
 		ret = -KVM_ENOSYS;
 		break;
-- 
cgit v1.1


From d6eebf8b80316ea61718dc115cd6a20c16195327 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Mon, 14 Nov 2011 18:21:34 +0900
Subject: KVM: MMU: Clean up BUG_ON() conditions in rmap_write_protect()

Remove redundant checks and use is_large_pte() macro.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a9b3a32..973f254 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1027,7 +1027,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
-		BUG_ON(!spte);
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 		if (is_writable_pte(*spte)) {
@@ -1043,9 +1042,8 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 		rmapp = gfn_to_rmap(kvm, gfn, i);
 		spte = rmap_next(kvm, rmapp, NULL);
 		while (spte) {
-			BUG_ON(!spte);
 			BUG_ON(!(*spte & PT_PRESENT_MASK));
-			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+			BUG_ON(!is_large_pte(*spte));
 			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 			if (is_writable_pte(*spte)) {
 				drop_spte(kvm, spte);
-- 
cgit v1.1


From 9b9b1492364758de82c19c36f07baa9ae162c7e5 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Mon, 14 Nov 2011 18:22:28 +0900
Subject: KVM: MMU: Split gfn_to_rmap() into two functions

rmap_write_protect() calls gfn_to_rmap() for each level with gfn fixed.
This results in calling gfn_to_memslot() repeatedly with that gfn.

This patch introduces __gfn_to_rmap() which takes the slot as an
argument to avoid this.

This is also needed for the following dirty logging optimization.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 973f254..fa71085 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -958,23 +958,29 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 	}
 }
 
-/*
- * Take gfn and return the reverse mapping to it.
- */
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
+static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level,
+				    struct kvm_memory_slot *slot)
 {
-	struct kvm_memory_slot *slot;
 	struct kvm_lpage_info *linfo;
 
-	slot = gfn_to_memslot(kvm, gfn);
 	if (likely(level == PT_PAGE_TABLE_LEVEL))
 		return &slot->rmap[gfn - slot->base_gfn];
 
 	linfo = lpage_info_slot(gfn, slot, level);
-
 	return &linfo->rmap_pde;
 }
 
+/*
+ * Take gfn and return the reverse mapping to it.
+ */
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
+{
+	struct kvm_memory_slot *slot;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	return __gfn_to_rmap(kvm, gfn, level, slot);
+}
+
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_memory_cache *cache;
@@ -1019,12 +1025,14 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 
 static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
+	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
 	u64 *spte;
 	int i, write_protected = 0;
 
-	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
+	slot = gfn_to_memslot(kvm, gfn);
 
+	rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
@@ -1039,7 +1047,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 	/* check for huge page mappings */
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		rmapp = gfn_to_rmap(kvm, gfn, i);
+		rmapp = __gfn_to_rmap(kvm, gfn, i, slot);
 		spte = rmap_next(kvm, rmapp, NULL);
 		while (spte) {
 			BUG_ON(!(*spte & PT_PRESENT_MASK));
-- 
cgit v1.1


From 7850ac5420803996e2960d15b924021f28e0dffc Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Mon, 14 Nov 2011 18:23:34 +0900
Subject: KVM: Count the number of dirty pages for dirty logging

Needed for the next patch which uses this number to decide how to write
protect a slot.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a3b25a5..220c83b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3466,10 +3466,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 				      struct kvm_dirty_log *log)
 {
-	int r, i;
+	int r;
 	struct kvm_memory_slot *memslot;
 	unsigned long n;
-	unsigned long is_dirty = 0;
 
 	mutex_lock(&kvm->slots_lock);
 
@@ -3484,11 +3483,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 
 	n = kvm_dirty_bitmap_bytes(memslot);
 
-	for (i = 0; !is_dirty && i < n/sizeof(long); i++)
-		is_dirty = memslot->dirty_bitmap[i];
-
 	/* If nothing is dirty, don't bother messing with page tables. */
-	if (is_dirty) {
+	if (memslot->nr_dirty_pages) {
 		struct kvm_memslots *slots, *old_slots;
 		unsigned long *dirty_bitmap;
 
@@ -3503,6 +3499,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 			goto out;
 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
+		slots->memslots[log->slot].nr_dirty_pages = 0;
 		slots->generation++;
 
 		old_slots = kvm->memslots;
-- 
cgit v1.1


From 95d4c16ce78cb6b7549a09159c409d52ddd18dae Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Mon, 14 Nov 2011 18:24:50 +0900
Subject: KVM: Optimize dirty logging by rmap_write_protect()

Currently, write protecting a slot needs to walk all the shadow pages
and checks ones which have a pte mapping a page in it.

The walk is overly heavy when dirty pages in that slot are not so many
and checking the shadow pages would result in unwanted cache pollution.

To mitigate this problem, we use rmap_write_protect() and check only
the sptes which can be reached from gfns marked in the dirty bitmap
when the number of dirty pages are less than that of shadow pages.

This criterion is reasonable in its meaning and worked well in our test:
write protection became some times faster than before when the ratio of
dirty pages are low and was not worse even when the ratio was near the
criterion.

Note that the locking for this write protection becomes fine grained.
The reason why this is safe is descripted in the comments.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu.c              | 14 +++++++---
 arch/x86/kvm/x86.c              | 58 ++++++++++++++++++++++++++++++++++++-----
 3 files changed, 63 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6d83264..69b6525 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -648,6 +648,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
+			       struct kvm_memory_slot *slot);
 void kvm_mmu_zap_all(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fa71085..aecdea2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1023,15 +1023,13 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }
 
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
+			       struct kvm_memory_slot *slot)
 {
-	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
 	u64 *spte;
 	int i, write_protected = 0;
 
-	slot = gfn_to_memslot(kvm, gfn);
-
 	rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
@@ -1066,6 +1064,14 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 	return write_protected;
 }
 
+static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+{
+	struct kvm_memory_slot *slot;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	return kvm_mmu_rmap_write_protect(kvm, gfn, slot);
+}
+
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			   unsigned long data)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 220c83b..af546b7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3460,6 +3460,50 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 	return 0;
 }
 
+/**
+ * write_protect_slot - write protect a slot for dirty logging
+ * @kvm: the kvm instance
+ * @memslot: the slot we protect
+ * @dirty_bitmap: the bitmap indicating which pages are dirty
+ * @nr_dirty_pages: the number of dirty pages
+ *
+ * We have two ways to find all sptes to protect:
+ * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and
+ *    checks ones that have a spte mapping a page in the slot.
+ * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap.
+ *
+ * Generally speaking, if there are not so many dirty pages compared to the
+ * number of shadow pages, we should use the latter.
+ *
+ * Note that letting others write into a page marked dirty in the old bitmap
+ * by using the remaining tlb entry is not a problem.  That page will become
+ * write protected again when we flush the tlb and then be reported dirty to
+ * the user space by copying the old bitmap.
+ */
+static void write_protect_slot(struct kvm *kvm,
+			       struct kvm_memory_slot *memslot,
+			       unsigned long *dirty_bitmap,
+			       unsigned long nr_dirty_pages)
+{
+	/* Not many dirty pages compared to # of shadow pages. */
+	if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
+		unsigned long gfn_offset;
+
+		for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
+			unsigned long gfn = memslot->base_gfn + gfn_offset;
+
+			spin_lock(&kvm->mmu_lock);
+			kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
+			spin_unlock(&kvm->mmu_lock);
+		}
+		kvm_flush_remote_tlbs(kvm);
+	} else {
+		spin_lock(&kvm->mmu_lock);
+		kvm_mmu_slot_remove_write_access(kvm, memslot->id);
+		spin_unlock(&kvm->mmu_lock);
+	}
+}
+
 /*
  * Get (and clear) the dirty memory log for a memory slot.
  */
@@ -3468,7 +3512,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 {
 	int r;
 	struct kvm_memory_slot *memslot;
-	unsigned long n;
+	unsigned long n, nr_dirty_pages;
 
 	mutex_lock(&kvm->slots_lock);
 
@@ -3482,9 +3526,10 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		goto out;
 
 	n = kvm_dirty_bitmap_bytes(memslot);
+	nr_dirty_pages = memslot->nr_dirty_pages;
 
 	/* If nothing is dirty, don't bother messing with page tables. */
-	if (memslot->nr_dirty_pages) {
+	if (nr_dirty_pages) {
 		struct kvm_memslots *slots, *old_slots;
 		unsigned long *dirty_bitmap;
 
@@ -3498,8 +3543,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		if (!slots)
 			goto out;
 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
-		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
-		slots->memslots[log->slot].nr_dirty_pages = 0;
+		memslot = &slots->memslots[log->slot];
+		memslot->dirty_bitmap = dirty_bitmap;
+		memslot->nr_dirty_pages = 0;
 		slots->generation++;
 
 		old_slots = kvm->memslots;
@@ -3508,9 +3554,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
 		kfree(old_slots);
 
-		spin_lock(&kvm->mmu_lock);
-		kvm_mmu_slot_remove_write_access(kvm, log->slot);
-		spin_unlock(&kvm->mmu_lock);
+		write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages);
 
 		r = -EFAULT;
 		if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
-- 
cgit v1.1


From 46199f33c29533e7ad2a7d2128dc30175d1d4157 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 17 Nov 2011 10:56:09 +0200
Subject: KVM: VMX: remove unneeded vmx_load_host_state() calls.

vmx_load_host_state() does not handle msrs switching (except
MSR_KERNEL_GS_BASE) since commit 26bb0981b3f. Remove call to it
where it is no longer make sense.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6e28d58..ba24022 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1747,7 +1747,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 	int save_nmsrs, index;
 	unsigned long *msr_bitmap;
 
-	vmx_load_host_state(vmx);
 	save_nmsrs = 0;
 #ifdef CONFIG_X86_64
 	if (is_long_mode(&vmx->vcpu)) {
@@ -2142,12 +2141,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 			return 1;
 		/* Otherwise falls through */
 	default:
-		vmx_load_host_state(to_vmx(vcpu));
 		if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
 			return 0;
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
-			vmx_load_host_state(to_vmx(vcpu));
 			data = msr->data;
 			break;
 		}
@@ -2171,7 +2168,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 
 	switch (msr_index) {
 	case MSR_EFER:
-		vmx_load_host_state(vmx);
 		ret = kvm_set_msr_common(vcpu, msr_index, data);
 		break;
 #ifdef CONFIG_X86_64
@@ -2220,7 +2216,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 			break;
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
-			vmx_load_host_state(vmx);
 			msr->data = data;
 			break;
 		}
-- 
cgit v1.1


From d7841a4b1b6e8509881e1ec21c024c82ccf565a6 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 22 Nov 2011 15:16:54 +0900
Subject: KVM: x86 emulator: Use opcode::execute for IN/OUT

IN : E4, E5, EC, ED
OUT: E6, E7, EE, EF

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 54 ++++++++++++++++++++++++++------------------------
 1 file changed, 28 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8547958..8ba4ea8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2776,6 +2776,24 @@ static int em_jcxz(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_in(struct x86_emulate_ctxt *ctxt)
+{
+	if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
+			     &ctxt->dst.val))
+		return X86EMUL_IO_NEEDED;
+
+	return X86EMUL_CONTINUE;
+}
+
+static int em_out(struct x86_emulate_ctxt *ctxt)
+{
+	ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
+				    &ctxt->src.val, 1);
+	/* Disable writeback. */
+	ctxt->dst.type = OP_NONE;
+	return X86EMUL_CONTINUE;
+}
+
 static int em_cli(struct x86_emulate_ctxt *ctxt)
 {
 	if (emulator_bad_iopl(ctxt))
@@ -3004,6 +3022,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 #define D2bv(_f)      D((_f) | ByteOp), D(_f)
 #define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
 #define I2bv(_f, _e)  I((_f) | ByteOp, _e), I(_f, _e)
+#define I2bvIP(_f, _e, _i, _p) \
+	IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
 
 #define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e),		\
 		I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e),	\
@@ -3217,13 +3237,13 @@ static struct opcode opcode_table[256] = {
 	/* 0xE0 - 0xE7 */
 	X3(I(SrcImmByte, em_loop)),
 	I(SrcImmByte, em_jcxz),
-	D2bvIP(SrcImmUByte | DstAcc, in,  check_perm_in),
-	D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
+	I2bvIP(SrcImmUByte | DstAcc, em_in,  in,  check_perm_in),
+	I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
 	/* 0xE8 - 0xEF */
 	D(SrcImm | Stack), D(SrcImm | ImplicitOps),
 	I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
-	D2bvIP(SrcDX | DstAcc, in,  check_perm_in),
-	D2bvIP(SrcAcc | DstDX, out, check_perm_out),
+	I2bvIP(SrcDX | DstAcc, em_in,  in,  check_perm_in),
+	I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
 	/* 0xF0 - 0xF7 */
 	N, DI(ImplicitOps, icebp), N, N,
 	DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
@@ -3325,6 +3345,7 @@ static struct opcode twobyte_table[256] = {
 #undef D2bv
 #undef D2bvIP
 #undef I2bv
+#undef I2bvIP
 #undef I6ALU
 
 static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
@@ -3867,11 +3888,12 @@ special_insn:
 	case 0x6c:		/* insb */
 	case 0x6d:		/* insw/insd */
 		ctxt->src.val = ctxt->regs[VCPU_REGS_RDX];
-		goto do_io_in;
+		rc = em_in(ctxt);
+		break;
 	case 0x6e:		/* outsb */
 	case 0x6f:		/* outsw/outsd */
 		ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX];
-		goto do_io_out;
+		rc = em_out(ctxt);
 		break;
 	case 0x70 ... 0x7f: /* jcc (short) */
 		if (test_cc(ctxt->b, ctxt->eflags))
@@ -3915,12 +3937,6 @@ special_insn:
 		ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
 		rc = em_grp2(ctxt);
 		break;
-	case 0xe4: 	/* inb */
-	case 0xe5: 	/* in */
-		goto do_io_in;
-	case 0xe6: /* outb */
-	case 0xe7: /* out */
-		goto do_io_out;
 	case 0xe8: /* call (near) */ {
 		long int rel = ctxt->src.val;
 		ctxt->src.val = (unsigned long) ctxt->_eip;
@@ -3933,20 +3949,6 @@ special_insn:
 		jmp_rel(ctxt, ctxt->src.val);
 		ctxt->dst.type = OP_NONE; /* Disable writeback. */
 		break;
-	case 0xec: /* in al,dx */
-	case 0xed: /* in (e/r)ax,dx */
-	do_io_in:
-		if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
-				     &ctxt->dst.val))
-			goto done; /* IO is needed */
-		break;
-	case 0xee: /* out dx,al */
-	case 0xef: /* out dx,(e/r)ax */
-	do_io_out:
-		ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
-				      &ctxt->src.val, 1);
-		ctxt->dst.type = OP_NONE;	/* Disable writeback. */
-		break;
 	case 0xf4:              /* hlt */
 		ctxt->ops->halt(ctxt);
 		break;
-- 
cgit v1.1


From ce7faab24fbfb0b5207636ee4795e924bcf97e8a Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 22 Nov 2011 15:17:48 +0900
Subject: KVM: x86 emulator: Use opcode::execute for BT family

BT : 0F A3
BTS: 0F AB
BTR: 0F B3
BTC: 0F BB

Group 8: 0F BA

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 77 +++++++++++++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 39 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8ba4ea8..7a9ce6d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2813,6 +2813,35 @@ static int em_sti(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_bt(struct x86_emulate_ctxt *ctxt)
+{
+	/* Disable writeback. */
+	ctxt->dst.type = OP_NONE;
+	/* only subword offset */
+	ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
+
+	emulate_2op_SrcV_nobyte(ctxt, "bt");
+	return X86EMUL_CONTINUE;
+}
+
+static int em_bts(struct x86_emulate_ctxt *ctxt)
+{
+	emulate_2op_SrcV_nobyte(ctxt, "bts");
+	return X86EMUL_CONTINUE;
+}
+
+static int em_btr(struct x86_emulate_ctxt *ctxt)
+{
+	emulate_2op_SrcV_nobyte(ctxt, "btr");
+	return X86EMUL_CONTINUE;
+}
+
+static int em_btc(struct x86_emulate_ctxt *ctxt)
+{
+	emulate_2op_SrcV_nobyte(ctxt, "btc");
+	return X86EMUL_CONTINUE;
+}
+
 static bool valid_cr(int nr)
 {
 	switch (nr) {
@@ -3117,10 +3146,10 @@ static struct group_dual group7 = { {
 
 static struct opcode group8[] = {
 	N, N, N, N,
-	D(DstMem | SrcImmByte | ModRM),
-	D(DstMem | SrcImmByte | ModRM | Lock | PageTable),
-	D(DstMem | SrcImmByte | ModRM | Lock),
-	D(DstMem | SrcImmByte | ModRM | Lock | PageTable),
+	I(DstMem | SrcImmByte | ModRM, em_bt),
+	I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts),
+	I(DstMem | SrcImmByte | ModRM | Lock, em_btr),
+	I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc),
 };
 
 static struct group_dual group9 = { {
@@ -3299,26 +3328,27 @@ static struct opcode twobyte_table[256] = {
 	X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
 	/* 0xA0 - 0xA7 */
 	I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
-	DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
+	DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
 	D(DstMem | SrcReg | Src2ImmByte | ModRM),
 	D(DstMem | SrcReg | Src2CL | ModRM), N, N,
 	/* 0xA8 - 0xAF */
 	I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
 	DI(ImplicitOps, rsm),
-	D(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable),
+	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
 	D(DstMem | SrcReg | Src2ImmByte | ModRM),
 	D(DstMem | SrcReg | Src2CL | ModRM),
 	D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
 	/* 0xB0 - 0xB7 */
 	D2bv(DstMem | SrcReg | ModRM | Lock | PageTable),
 	I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
-	D(DstMem | SrcReg | ModRM | BitOp | Lock),
+	I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xB8 - 0xBF */
 	N, N,
-	G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable),
+	G(BitOp, group8),
+	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
 	D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
@@ -4103,21 +4133,10 @@ twobyte_insn:
 	case 0x90 ... 0x9f:     /* setcc r/m8 */
 		ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
 		break;
-	case 0xa3:
-	      bt:		/* bt */
-		ctxt->dst.type = OP_NONE;
-		/* only subword offset */
-		ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
-		emulate_2op_SrcV_nobyte(ctxt, "bt");
-		break;
 	case 0xa4: /* shld imm8, r, r/m */
 	case 0xa5: /* shld cl, r, r/m */
 		emulate_2op_cl(ctxt, "shld");
 		break;
-	case 0xab:
-	      bts:		/* bts */
-		emulate_2op_SrcV_nobyte(ctxt, "bts");
-		break;
 	case 0xac: /* shrd imm8, r, r/m */
 	case 0xad: /* shrd cl, r, r/m */
 		emulate_2op_cl(ctxt, "shrd");
@@ -4141,31 +4160,11 @@ twobyte_insn:
 			ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
 		}
 		break;
-	case 0xb3:
-	      btr:		/* btr */
-		emulate_2op_SrcV_nobyte(ctxt, "btr");
-		break;
 	case 0xb6 ... 0xb7:	/* movzx */
 		ctxt->dst.bytes = ctxt->op_bytes;
 		ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
 						       : (u16) ctxt->src.val;
 		break;
-	case 0xba:		/* Grp8 */
-		switch (ctxt->modrm_reg & 3) {
-		case 0:
-			goto bt;
-		case 1:
-			goto bts;
-		case 2:
-			goto btr;
-		case 3:
-			goto btc;
-		}
-		break;
-	case 0xbb:
-	      btc:		/* btc */
-		emulate_2op_SrcV_nobyte(ctxt, "btc");
-		break;
 	case 0xbc: {		/* bsf */
 		u8 zf;
 		__asm__ ("bsf %2, %0; setz %1"
-- 
cgit v1.1


From d4ddafcdf2201326ec9717172767cfad0ede1472 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 22 Nov 2011 15:18:35 +0900
Subject: KVM: x86 emulator: Use opcode::execute for CALL

CALL: E8

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7a9ce6d..6b7a03b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2482,6 +2482,15 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_call(struct x86_emulate_ctxt *ctxt)
+{
+	long rel = ctxt->src.val;
+
+	ctxt->src.val = (unsigned long)ctxt->_eip;
+	jmp_rel(ctxt, rel);
+	return em_push(ctxt);
+}
+
 static int em_call_far(struct x86_emulate_ctxt *ctxt)
 {
 	u16 sel, old_cs;
@@ -3269,7 +3278,7 @@ static struct opcode opcode_table[256] = {
 	I2bvIP(SrcImmUByte | DstAcc, em_in,  in,  check_perm_in),
 	I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
 	/* 0xE8 - 0xEF */
-	D(SrcImm | Stack), D(SrcImm | ImplicitOps),
+	I(SrcImm | Stack, em_call), D(SrcImm | ImplicitOps),
 	I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
 	I2bvIP(SrcDX | DstAcc, em_in,  in,  check_perm_in),
 	I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
@@ -3967,13 +3976,6 @@ special_insn:
 		ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
 		rc = em_grp2(ctxt);
 		break;
-	case 0xe8: /* call (near) */ {
-		long int rel = ctxt->src.val;
-		ctxt->src.val = (unsigned long) ctxt->_eip;
-		jmp_rel(ctxt, rel);
-		rc = em_push(ctxt);
-		break;
-	}
 	case 0xe9: /* jmp rel */
 	case 0xeb: /* jmp rel short */
 		jmp_rel(ctxt, ctxt->src.val);
-- 
cgit v1.1


From bc00f8d2c2df33c6e7fc8a12c94c0c74d491e566 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 22 Nov 2011 15:19:19 +0900
Subject: KVM: x86 emulator: Use opcode::execute for MOV to cr/dr

MOV: 0F 22 (move to control registers)
MOV: 0F 23 (move to debug registers)

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 52 +++++++++++++++++++++++++++++---------------------
 1 file changed, 30 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6b7a03b..7fe5ed1 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2638,6 +2638,34 @@ static int em_mov(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_cr_write(struct x86_emulate_ctxt *ctxt)
+{
+	if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
+		return emulate_gp(ctxt, 0);
+
+	/* Disable writeback. */
+	ctxt->dst.type = OP_NONE;
+	return X86EMUL_CONTINUE;
+}
+
+static int em_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+	unsigned long val;
+
+	if (ctxt->mode == X86EMUL_MODE_PROT64)
+		val = ctxt->src.val & ~0ULL;
+	else
+		val = ctxt->src.val & ~0U;
+
+	/* #UD condition is already handled. */
+	if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0)
+		return emulate_gp(ctxt, 0);
+
+	/* Disable writeback. */
+	ctxt->dst.type = OP_NONE;
+	return X86EMUL_CONTINUE;
+}
+
 static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
 {
 	if (ctxt->modrm_reg > VCPU_SREG_GS)
@@ -3304,8 +3332,8 @@ static struct opcode twobyte_table[256] = {
 	/* 0x20 - 0x2F */
 	DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
 	DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
-	DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write),
-	DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write),
+	IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
+	IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
 	N, N, N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x30 - 0x3F */
@@ -4080,26 +4108,6 @@ twobyte_insn:
 	case 0x21: /* mov from dr to reg */
 		ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
 		break;
-	case 0x22: /* mov reg, cr */
-		if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) {
-			emulate_gp(ctxt, 0);
-			rc = X86EMUL_PROPAGATE_FAULT;
-			goto done;
-		}
-		ctxt->dst.type = OP_NONE;
-		break;
-	case 0x23: /* mov from reg to dr */
-		if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val &
-				((ctxt->mode == X86EMUL_MODE_PROT64) ?
-				 ~0ULL : ~0U)) < 0) {
-			/* #UD condition is already handled by the code above */
-			emulate_gp(ctxt, 0);
-			rc = X86EMUL_PROPAGATE_FAULT;
-			goto done;
-		}
-
-		ctxt->dst.type = OP_NONE;	/* no writeback */
-		break;
 	case 0x30:
 		/* wrmsr */
 		msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
-- 
cgit v1.1


From e1e210b0a7f7b4d14577bdc76719963f7facc0e7 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 22 Nov 2011 15:20:03 +0900
Subject: KVM: x86 emulator: Use opcode::execute for WRMSR/RDMSR

WRMSR: 0F 30
RDMSR: 0F 32

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 52 +++++++++++++++++++++++++-------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7fe5ed1..906c5eb 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2666,6 +2666,30 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
+{
+	u64 msr_data;
+
+	msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
+		| ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
+	if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data))
+		return emulate_gp(ctxt, 0);
+
+	return X86EMUL_CONTINUE;
+}
+
+static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
+{
+	u64 msr_data;
+
+	if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data))
+		return emulate_gp(ctxt, 0);
+
+	ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
+	ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
+	return X86EMUL_CONTINUE;
+}
+
 static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
 {
 	if (ctxt->modrm_reg > VCPU_SREG_GS)
@@ -3337,9 +3361,9 @@ static struct opcode twobyte_table[256] = {
 	N, N, N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x30 - 0x3F */
-	DI(ImplicitOps | Priv, wrmsr),
+	II(ImplicitOps | Priv, em_wrmsr, wrmsr),
 	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
-	DI(ImplicitOps | Priv, rdmsr),
+	II(ImplicitOps | Priv, em_rdmsr, rdmsr),
 	DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
 	I(ImplicitOps | VendorSpecific, em_sysenter),
 	I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
@@ -3818,7 +3842,6 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
-	u64 msr_data;
 	int rc = X86EMUL_CONTINUE;
 	int saved_dst_type = ctxt->dst.type;
 
@@ -4108,29 +4131,6 @@ twobyte_insn:
 	case 0x21: /* mov from dr to reg */
 		ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
 		break;
-	case 0x30:
-		/* wrmsr */
-		msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
-			| ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
-		if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) {
-			emulate_gp(ctxt, 0);
-			rc = X86EMUL_PROPAGATE_FAULT;
-			goto done;
-		}
-		rc = X86EMUL_CONTINUE;
-		break;
-	case 0x32:
-		/* rdmsr */
-		if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) {
-			emulate_gp(ctxt, 0);
-			rc = X86EMUL_PROPAGATE_FAULT;
-			goto done;
-		} else {
-			ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
-			ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
-		}
-		rc = X86EMUL_CONTINUE;
-		break;
 	case 0x40 ... 0x4f:	/* cmov */
 		ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
 		if (!test_cc(ctxt->b, ctxt->eflags))
-- 
cgit v1.1


From e940b5c20f89282fe826c5e2237932ab280497cf Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 22 Nov 2011 15:20:47 +0900
Subject: KVM: x86 emulator: Use opcode::execute for CMPXCHG

CMPXCHG: 0F B0, 0F B1

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 906c5eb..799000d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1832,6 +1832,24 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 	return rc;
 }
 
+static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
+{
+	/* Save real source value, then compare EAX against destination. */
+	ctxt->src.orig_val = ctxt->src.val;
+	ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
+	emulate_2op_SrcV(ctxt, "cmp");
+
+	if (ctxt->eflags & EFLG_ZF) {
+		/* Success: write back to memory. */
+		ctxt->dst.val = ctxt->src.orig_val;
+	} else {
+		/* Failure: write the value we saw to EAX. */
+		ctxt->dst.type = OP_REG;
+		ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
+	}
+	return X86EMUL_CONTINUE;
+}
+
 static int em_lseg(struct x86_emulate_ctxt *ctxt)
 {
 	int seg = ctxt->src2.val;
@@ -3400,7 +3418,7 @@ static struct opcode twobyte_table[256] = {
 	D(DstMem | SrcReg | Src2CL | ModRM),
 	D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
 	/* 0xB0 - 0xB7 */
-	D2bv(DstMem | SrcReg | ModRM | Lock | PageTable),
+	I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
 	I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
 	I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
 	I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
@@ -4153,23 +4171,6 @@ twobyte_insn:
 		break;
 	case 0xae:              /* clflush */
 		break;
-	case 0xb0 ... 0xb1:	/* cmpxchg */
-		/*
-		 * Save real source value, then compare EAX against
-		 * destination.
-		 */
-		ctxt->src.orig_val = ctxt->src.val;
-		ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
-		emulate_2op_SrcV(ctxt, "cmp");
-		if (ctxt->eflags & EFLG_ZF) {
-			/* Success: write back to memory. */
-			ctxt->dst.val = ctxt->src.orig_val;
-		} else {
-			/* Failure: write the value we saw to EAX. */
-			ctxt->dst.type = OP_REG;
-			ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
-		}
-		break;
 	case 0xb6 ... 0xb7:	/* movzx */
 		ctxt->dst.bytes = ctxt->op_bytes;
 		ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
-- 
cgit v1.1


From ff227392cd7b858b2b04732e02697122fd1b35b0 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 22 Nov 2011 15:21:33 +0900
Subject: KVM: x86 emulator: Use opcode::execute for BSF/BSR

BSF: 0F BC
BSR: 0F BD

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 60 +++++++++++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 25 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 799000d..4cd3313 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2921,6 +2921,40 @@ static int em_btc(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_bsf(struct x86_emulate_ctxt *ctxt)
+{
+	u8 zf;
+
+	__asm__ ("bsf %2, %0; setz %1"
+		 : "=r"(ctxt->dst.val), "=q"(zf)
+		 : "r"(ctxt->src.val));
+
+	ctxt->eflags &= ~X86_EFLAGS_ZF;
+	if (zf) {
+		ctxt->eflags |= X86_EFLAGS_ZF;
+		/* Disable writeback. */
+		ctxt->dst.type = OP_NONE;
+	}
+	return X86EMUL_CONTINUE;
+}
+
+static int em_bsr(struct x86_emulate_ctxt *ctxt)
+{
+	u8 zf;
+
+	__asm__ ("bsr %2, %0; setz %1"
+		 : "=r"(ctxt->dst.val), "=q"(zf)
+		 : "r"(ctxt->src.val));
+
+	ctxt->eflags &= ~X86_EFLAGS_ZF;
+	if (zf) {
+		ctxt->eflags |= X86_EFLAGS_ZF;
+		/* Disable writeback. */
+		ctxt->dst.type = OP_NONE;
+	}
+	return X86EMUL_CONTINUE;
+}
+
 static bool valid_cr(int nr)
 {
 	switch (nr) {
@@ -3428,7 +3462,7 @@ static struct opcode twobyte_table[256] = {
 	N, N,
 	G(BitOp, group8),
 	I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
-	D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+	I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
 	D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xCF */
 	D2bv(DstMem | SrcReg | ModRM | Lock),
@@ -4176,30 +4210,6 @@ twobyte_insn:
 		ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
 						       : (u16) ctxt->src.val;
 		break;
-	case 0xbc: {		/* bsf */
-		u8 zf;
-		__asm__ ("bsf %2, %0; setz %1"
-			 : "=r"(ctxt->dst.val), "=q"(zf)
-			 : "r"(ctxt->src.val));
-		ctxt->eflags &= ~X86_EFLAGS_ZF;
-		if (zf) {
-			ctxt->eflags |= X86_EFLAGS_ZF;
-			ctxt->dst.type = OP_NONE;	/* Disable writeback. */
-		}
-		break;
-	}
-	case 0xbd: {		/* bsr */
-		u8 zf;
-		__asm__ ("bsr %2, %0; setz %1"
-			 : "=r"(ctxt->dst.val), "=q"(zf)
-			 : "r"(ctxt->src.val));
-		ctxt->eflags &= ~X86_EFLAGS_ZF;
-		if (zf) {
-			ctxt->eflags |= X86_EFLAGS_ZF;
-			ctxt->dst.type = OP_NONE;	/* Disable writeback. */
-		}
-		break;
-	}
 	case 0xbe ... 0xbf:	/* movsx */
 		ctxt->dst.bytes = ctxt->op_bytes;
 		ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
-- 
cgit v1.1


From 93a5cef07d686a0341d056b0f930a762c7174a13 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Thu, 24 Nov 2011 17:37:48 +0800
Subject: KVM: introduce KVM_MEM_SLOTS_NUM macro

Introduce KVM_MEM_SLOTS_NUM macro to instead of
KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 4 +++-
 arch/x86/kvm/mmu.c              | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 69b6525..1769f3d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -31,6 +31,8 @@
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
+#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+
 #define KVM_MMIO_SIZE 16
 
 #define KVM_PIO_PAGE_OFFSET 1
@@ -228,7 +230,7 @@ struct kvm_mmu_page {
 	 * One bit set per slot which has memory
 	 * in this shadow page.
 	 */
-	DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+	DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM);
 	bool unsync;
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index aecdea2..715dcb4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1349,7 +1349,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 						  PAGE_SIZE);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+	bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
 	sp->parent_ptes = 0;
 	mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
-- 
cgit v1.1


From be593d6286075801bba6d60fa466a39c24cc7616 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Thu, 24 Nov 2011 17:38:24 +0800
Subject: KVM: introduce update_memslots function

Introduce update_memslots to update slot which will be update to
kvm->memslots

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index af546b7..917a287 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3546,7 +3546,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		memslot = &slots->memslots[log->slot];
 		memslot->dirty_bitmap = dirty_bitmap;
 		memslot->nr_dirty_pages = 0;
-		slots->generation++;
+		update_memslots(slots, NULL);
 
 		old_slots = kvm->memslots;
 		rcu_assign_pointer(kvm->memslots, slots);
-- 
cgit v1.1


From be6ba0f0962a39091c52eb9167ddea201fe80716 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Thu, 24 Nov 2011 17:39:18 +0800
Subject: KVM: introduce kvm_for_each_memslot macro

Introduce kvm_for_each_memslot to walk all valid memslot

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 715dcb4..d737443 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1128,15 +1128,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
 					 unsigned long data))
 {
-	int i, j;
+	int j;
 	int ret;
 	int retval = 0;
 	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
 
 	slots = kvm_memslots(kvm);
 
-	for (i = 0; i < slots->nmemslots; i++) {
-		struct kvm_memory_slot *memslot = &slots->memslots[i];
+	kvm_for_each_memslot(memslot, slots) {
 		unsigned long start = memslot->userspace_addr;
 		unsigned long end;
 
@@ -3985,15 +3985,15 @@ nomem:
  */
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 {
-	int i;
 	unsigned int nr_mmu_pages;
 	unsigned int  nr_pages = 0;
 	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
 
 	slots = kvm_memslots(kvm);
 
-	for (i = 0; i < slots->nmemslots; i++)
-		nr_pages += slots->memslots[i].npages;
+	kvm_for_each_memslot(memslot, slots)
+		nr_pages += memslot->npages;
 
 	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
 	nr_mmu_pages = max(nr_mmu_pages,
-- 
cgit v1.1


From 28a37544fb0223eb9805d2567b88f7360edec52a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong.eric@gmail.com>
Date: Thu, 24 Nov 2011 19:04:35 +0800
Subject: KVM: introduce id_to_memslot function

Introduce id_to_memslot to get memslot by slot id

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c |  6 ++++--
 arch/x86/kvm/x86.c | 18 +++++++++---------
 2 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ba24022..8f19d91 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2711,11 +2711,13 @@ static gva_t rmode_tss_base(struct kvm *kvm)
 {
 	if (!kvm->arch.tss_addr) {
 		struct kvm_memslots *slots;
+		struct kvm_memory_slot *slot;
 		gfn_t base_gfn;
 
 		slots = kvm_memslots(kvm);
-		base_gfn = slots->memslots[0].base_gfn +
-				 kvm->memslots->memslots[0].npages - 3;
+		slot = id_to_memslot(slots, 0);
+		base_gfn = slot->base_gfn + slot->npages - 3;
+
 		return base_gfn << PAGE_SHIFT;
 	}
 	return kvm->arch.tss_addr;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 917a287..b6776c6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3520,7 +3520,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	if (log->slot >= KVM_MEMORY_SLOTS)
 		goto out;
 
-	memslot = &kvm->memslots->memslots[log->slot];
+	memslot = id_to_memslot(kvm->memslots, log->slot);
 	r = -ENOENT;
 	if (!memslot->dirty_bitmap)
 		goto out;
@@ -3531,27 +3531,27 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	/* If nothing is dirty, don't bother messing with page tables. */
 	if (nr_dirty_pages) {
 		struct kvm_memslots *slots, *old_slots;
-		unsigned long *dirty_bitmap;
+		unsigned long *dirty_bitmap, *dirty_bitmap_head;
 
-		dirty_bitmap = memslot->dirty_bitmap_head;
-		if (memslot->dirty_bitmap == dirty_bitmap)
-			dirty_bitmap += n / sizeof(long);
-		memset(dirty_bitmap, 0, n);
+		dirty_bitmap = memslot->dirty_bitmap;
+		dirty_bitmap_head = memslot->dirty_bitmap_head;
+		if (dirty_bitmap == dirty_bitmap_head)
+			dirty_bitmap_head += n / sizeof(long);
+		memset(dirty_bitmap_head, 0, n);
 
 		r = -ENOMEM;
 		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
 		if (!slots)
 			goto out;
 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
-		memslot = &slots->memslots[log->slot];
-		memslot->dirty_bitmap = dirty_bitmap;
+		memslot = id_to_memslot(slots, log->slot);
 		memslot->nr_dirty_pages = 0;
+		memslot->dirty_bitmap = dirty_bitmap_head;
 		update_memslots(slots, NULL);
 
 		old_slots = kvm->memslots;
 		rcu_assign_pointer(kvm->memslots, slots);
 		synchronize_srcu_expedited(&kvm->srcu);
-		dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
 		kfree(old_slots);
 
 		write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages);
-- 
cgit v1.1


From 2b5e97e1fadf1ade87558f2a2003616879f9e228 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 23 Nov 2011 12:27:39 +0900
Subject: KVM: x86 emulator: Use opcode::execute for INS/OUTS from/to port in
 DX

INSB       : 6C
INSW/INSD  : 6D
OUTSB      : 6E
OUTSW/OUTSD: 6F

The I/O port address is read from the DX register when we decode the
operand because we see the SrcDX/DstDX flag is set.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4cd3313..ac8e5ed 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3321,8 +3321,8 @@ static struct opcode opcode_table[256] = {
 	I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
 	I(SrcImmByte | Mov | Stack, em_push),
 	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
-	D2bvIP(DstDI | SrcDX | Mov | String, ins, check_perm_in), /* insb, insw/insd */
-	D2bvIP(SrcSI | DstDX | String, outs, check_perm_out), /* outsb, outsw/outsd */
+	I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
+	I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
 	/* 0x70 - 0x7F */
 	X16(D(SrcImmByte)),
 	/* 0x80 - 0x87 */
@@ -4027,16 +4027,6 @@ special_insn:
 			goto cannot_emulate;
 		ctxt->dst.val = (s32) ctxt->src.val;
 		break;
-	case 0x6c:		/* insb */
-	case 0x6d:		/* insw/insd */
-		ctxt->src.val = ctxt->regs[VCPU_REGS_RDX];
-		rc = em_in(ctxt);
-		break;
-	case 0x6e:		/* outsb */
-	case 0x6f:		/* outsw/outsd */
-		ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX];
-		rc = em_out(ctxt);
-		break;
 	case 0x70 ... 0x7f: /* jcc (short) */
 		if (test_cc(ctxt->b, ctxt->eflags))
 			jmp_rel(ctxt, ctxt->src.val);
-- 
cgit v1.1


From 00b27a3efb116062ca5a276ad5cb01ea1b80b5f6 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 23 Nov 2011 16:30:32 +0200
Subject: KVM: Move cpuid code to new file

The cpuid code has grown; put it into a separate file.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/Makefile |   2 +-
 arch/x86/kvm/cpuid.c  | 625 +++++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/cpuid.h  |  46 ++++
 arch/x86/kvm/lapic.c  |   1 +
 arch/x86/kvm/vmx.c    |   1 +
 arch/x86/kvm/x86.c    | 634 +-------------------------------------------------
 arch/x86/kvm/x86.h    |   5 +-
 7 files changed, 679 insertions(+), 635 deletions(-)
 create mode 100644 arch/x86/kvm/cpuid.c
 create mode 100644 arch/x86/kvm/cpuid.h

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f43..161b76a 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
-			   i8254.o timer.o
+			   i8254.o timer.o cpuid.o
 kvm-intel-y		+= vmx.o
 kvm-amd-y		+= svm.o
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
new file mode 100644
index 0000000..0a332ec
--- /dev/null
+++ b/arch/x86/kvm/cpuid.c
@@ -0,0 +1,625 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ * cpuid support routines
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ * Copyright IBM Corporation, 2008
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <asm/user.h>
+#include <asm/xsave.h>
+#include "cpuid.h"
+#include "lapic.h"
+#include "mmu.h"
+#include "trace.h"
+
+void kvm_update_cpuid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	if (!best)
+		return;
+
+	/* Update OSXSAVE bit */
+	if (cpu_has_xsave && best->function == 0x1) {
+		best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
+			best->ecx |= bit(X86_FEATURE_OSXSAVE);
+	}
+
+	if (apic) {
+		if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
+			apic->lapic_timer.timer_mode_mask = 3 << 17;
+		else
+			apic->lapic_timer.timer_mode_mask = 1 << 17;
+	}
+}
+
+static int is_efer_nx(void)
+{
+	unsigned long long efer = 0;
+
+	rdmsrl_safe(MSR_EFER, &efer);
+	return efer & EFER_NX;
+}
+
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvm_cpuid_entry2 *e, *entry;
+
+	entry = NULL;
+	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+		e = &vcpu->arch.cpuid_entries[i];
+		if (e->function == 0x80000001) {
+			entry = e;
+			break;
+		}
+	}
+	if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
+		entry->edx &= ~(1 << 20);
+		printk(KERN_INFO "kvm: guest NX capability removed\n");
+	}
+}
+
+/* when an old userspace process fills a new kernel module */
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+			     struct kvm_cpuid *cpuid,
+			     struct kvm_cpuid_entry __user *entries)
+{
+	int r, i;
+	struct kvm_cpuid_entry *cpuid_entries;
+
+	r = -E2BIG;
+	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+		goto out;
+	r = -ENOMEM;
+	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
+	if (!cpuid_entries)
+		goto out;
+	r = -EFAULT;
+	if (copy_from_user(cpuid_entries, entries,
+			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+		goto out_free;
+	for (i = 0; i < cpuid->nent; i++) {
+		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
+		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
+		vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
+		vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
+		vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
+		vcpu->arch.cpuid_entries[i].index = 0;
+		vcpu->arch.cpuid_entries[i].flags = 0;
+		vcpu->arch.cpuid_entries[i].padding[0] = 0;
+		vcpu->arch.cpuid_entries[i].padding[1] = 0;
+		vcpu->arch.cpuid_entries[i].padding[2] = 0;
+	}
+	vcpu->arch.cpuid_nent = cpuid->nent;
+	cpuid_fix_nx_cap(vcpu);
+	r = 0;
+	kvm_apic_set_version(vcpu);
+	kvm_x86_ops->cpuid_update(vcpu);
+	kvm_update_cpuid(vcpu);
+
+out_free:
+	vfree(cpuid_entries);
+out:
+	return r;
+}
+
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+			      struct kvm_cpuid2 *cpuid,
+			      struct kvm_cpuid_entry2 __user *entries)
+{
+	int r;
+
+	r = -E2BIG;
+	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+		goto out;
+	r = -EFAULT;
+	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
+			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+		goto out;
+	vcpu->arch.cpuid_nent = cpuid->nent;
+	kvm_apic_set_version(vcpu);
+	kvm_x86_ops->cpuid_update(vcpu);
+	kvm_update_cpuid(vcpu);
+	return 0;
+
+out:
+	return r;
+}
+
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+			      struct kvm_cpuid2 *cpuid,
+			      struct kvm_cpuid_entry2 __user *entries)
+{
+	int r;
+
+	r = -E2BIG;
+	if (cpuid->nent < vcpu->arch.cpuid_nent)
+		goto out;
+	r = -EFAULT;
+	if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
+			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+		goto out;
+	return 0;
+
+out:
+	cpuid->nent = vcpu->arch.cpuid_nent;
+	return r;
+}
+
+static void cpuid_mask(u32 *word, int wordnum)
+{
+	*word &= boot_cpu_data.x86_capability[wordnum];
+}
+
+static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+			   u32 index)
+{
+	entry->function = function;
+	entry->index = index;
+	cpuid_count(entry->function, entry->index,
+		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+	entry->flags = 0;
+}
+
+static bool supported_xcr0_bit(unsigned bit)
+{
+	u64 mask = ((u64)1 << bit);
+
+	return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
+}
+
+#define F(x) bit(X86_FEATURE_##x)
+
+static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+			 u32 index, int *nent, int maxnent)
+{
+	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
+#ifdef CONFIG_X86_64
+	unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
+				? F(GBPAGES) : 0;
+	unsigned f_lm = F(LM);
+#else
+	unsigned f_gbpages = 0;
+	unsigned f_lm = 0;
+#endif
+	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
+
+	/* cpuid 1.edx */
+	const u32 kvm_supported_word0_x86_features =
+		F(FPU) | F(VME) | F(DE) | F(PSE) |
+		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
+		0 /* Reserved, DS, ACPI */ | F(MMX) |
+		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+		0 /* HTT, TM, Reserved, PBE */;
+	/* cpuid 0x80000001.edx */
+	const u32 kvm_supported_word1_x86_features =
+		F(FPU) | F(VME) | F(DE) | F(PSE) |
+		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+		F(PAT) | F(PSE36) | 0 /* Reserved */ |
+		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+		F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
+		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
+	/* cpuid 1.ecx */
+	const u32 kvm_supported_word4_x86_features =
+		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
+		0 /* DS-CPL, VMX, SMX, EST */ |
+		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
+		0 /* Reserved, DCA */ | F(XMM4_1) |
+		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
+		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
+		F(F16C) | F(RDRAND);
+	/* cpuid 0x80000001.ecx */
+	const u32 kvm_supported_word6_x86_features =
+		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
+		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
+
+	/* cpuid 0xC0000001.edx */
+	const u32 kvm_supported_word5_x86_features =
+		F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
+		F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
+		F(PMM) | F(PMM_EN);
+
+	/* cpuid 7.0.ebx */
+	const u32 kvm_supported_word9_x86_features =
+		F(SMEP) | F(FSGSBASE) | F(ERMS);
+
+	/* all calls to cpuid_count() should be made on the same cpu */
+	get_cpu();
+	do_cpuid_1_ent(entry, function, index);
+	++*nent;
+
+	switch (function) {
+	case 0:
+		entry->eax = min(entry->eax, (u32)0xd);
+		break;
+	case 1:
+		entry->edx &= kvm_supported_word0_x86_features;
+		cpuid_mask(&entry->edx, 0);
+		entry->ecx &= kvm_supported_word4_x86_features;
+		cpuid_mask(&entry->ecx, 4);
+		/* we support x2apic emulation even if host does not support
+		 * it since we emulate x2apic in software */
+		entry->ecx |= F(X2APIC);
+		break;
+	/* function 2 entries are STATEFUL. That is, repeated cpuid commands
+	 * may return different values. This forces us to get_cpu() before
+	 * issuing the first command, and also to emulate this annoying behavior
+	 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
+	case 2: {
+		int t, times = entry->eax & 0xff;
+
+		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+		for (t = 1; t < times && *nent < maxnent; ++t) {
+			do_cpuid_1_ent(&entry[t], function, 0);
+			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+			++*nent;
+		}
+		break;
+	}
+	/* function 4 has additional index. */
+	case 4: {
+		int i, cache_type;
+
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		/* read more entries until cache_type is zero */
+		for (i = 1; *nent < maxnent; ++i) {
+			cache_type = entry[i - 1].eax & 0x1f;
+			if (!cache_type)
+				break;
+			do_cpuid_1_ent(&entry[i], function, i);
+			entry[i].flags |=
+			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			++*nent;
+		}
+		break;
+	}
+	case 7: {
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		/* Mask ebx against host capbability word 9 */
+		if (index == 0) {
+			entry->ebx &= kvm_supported_word9_x86_features;
+			cpuid_mask(&entry->ebx, 9);
+		} else
+			entry->ebx = 0;
+		entry->eax = 0;
+		entry->ecx = 0;
+		entry->edx = 0;
+		break;
+	}
+	case 9:
+		break;
+	/* function 0xb has additional index. */
+	case 0xb: {
+		int i, level_type;
+
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		/* read more entries until level_type is zero */
+		for (i = 1; *nent < maxnent; ++i) {
+			level_type = entry[i - 1].ecx & 0xff00;
+			if (!level_type)
+				break;
+			do_cpuid_1_ent(&entry[i], function, i);
+			entry[i].flags |=
+			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			++*nent;
+		}
+		break;
+	}
+	case 0xd: {
+		int idx, i;
+
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
+			do_cpuid_1_ent(&entry[i], function, idx);
+			if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
+				continue;
+			entry[i].flags |=
+			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			++*nent;
+			++i;
+		}
+		break;
+	}
+	case KVM_CPUID_SIGNATURE: {
+		char signature[12] = "KVMKVMKVM\0\0";
+		u32 *sigptr = (u32 *)signature;
+		entry->eax = 0;
+		entry->ebx = sigptr[0];
+		entry->ecx = sigptr[1];
+		entry->edx = sigptr[2];
+		break;
+	}
+	case KVM_CPUID_FEATURES:
+		entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
+			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
+			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
+			     (1 << KVM_FEATURE_ASYNC_PF) |
+			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+
+		if (sched_info_on())
+			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
+
+		entry->ebx = 0;
+		entry->ecx = 0;
+		entry->edx = 0;
+		break;
+	case 0x80000000:
+		entry->eax = min(entry->eax, 0x8000001a);
+		break;
+	case 0x80000001:
+		entry->edx &= kvm_supported_word1_x86_features;
+		cpuid_mask(&entry->edx, 1);
+		entry->ecx &= kvm_supported_word6_x86_features;
+		cpuid_mask(&entry->ecx, 6);
+		break;
+	case 0x80000008: {
+		unsigned g_phys_as = (entry->eax >> 16) & 0xff;
+		unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
+		unsigned phys_as = entry->eax & 0xff;
+
+		if (!g_phys_as)
+			g_phys_as = phys_as;
+		entry->eax = g_phys_as | (virt_as << 8);
+		entry->ebx = entry->edx = 0;
+		break;
+	}
+	case 0x80000019:
+		entry->ecx = entry->edx = 0;
+		break;
+	case 0x8000001a:
+		break;
+	case 0x8000001d:
+		break;
+	/*Add support for Centaur's CPUID instruction*/
+	case 0xC0000000:
+		/*Just support up to 0xC0000004 now*/
+		entry->eax = min(entry->eax, 0xC0000004);
+		break;
+	case 0xC0000001:
+		entry->edx &= kvm_supported_word5_x86_features;
+		cpuid_mask(&entry->edx, 5);
+		break;
+	case 3: /* Processor serial number */
+	case 5: /* MONITOR/MWAIT */
+	case 6: /* Thermal management */
+	case 0xA: /* Architectural Performance Monitoring */
+	case 0x80000007: /* Advanced power management */
+	case 0xC0000002:
+	case 0xC0000003:
+	case 0xC0000004:
+	default:
+		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+		break;
+	}
+
+	kvm_x86_ops->set_supported_cpuid(function, entry);
+
+	put_cpu();
+}
+
+#undef F
+
+int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
+				      struct kvm_cpuid_entry2 __user *entries)
+{
+	struct kvm_cpuid_entry2 *cpuid_entries;
+	int limit, nent = 0, r = -E2BIG;
+	u32 func;
+
+	if (cpuid->nent < 1)
+		goto out;
+	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+		cpuid->nent = KVM_MAX_CPUID_ENTRIES;
+	r = -ENOMEM;
+	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+	if (!cpuid_entries)
+		goto out;
+
+	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
+	limit = cpuid_entries[0].eax;
+	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
+		do_cpuid_ent(&cpuid_entries[nent], func, 0,
+			     &nent, cpuid->nent);
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
+	limit = cpuid_entries[nent - 1].eax;
+	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
+		do_cpuid_ent(&cpuid_entries[nent], func, 0,
+			     &nent, cpuid->nent);
+
+
+
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	/* Add support for Centaur's CPUID instruction. */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
+		do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
+				&nent, cpuid->nent);
+
+		r = -E2BIG;
+		if (nent >= cpuid->nent)
+			goto out_free;
+
+		limit = cpuid_entries[nent - 1].eax;
+		for (func = 0xC0000001;
+			func <= limit && nent < cpuid->nent; ++func)
+			do_cpuid_ent(&cpuid_entries[nent], func, 0,
+					&nent, cpuid->nent);
+
+		r = -E2BIG;
+		if (nent >= cpuid->nent)
+			goto out_free;
+	}
+
+	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
+		     cpuid->nent);
+
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
+		     cpuid->nent);
+
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	r = -EFAULT;
+	if (copy_to_user(entries, cpuid_entries,
+			 nent * sizeof(struct kvm_cpuid_entry2)))
+		goto out_free;
+	cpuid->nent = nent;
+	r = 0;
+
+out_free:
+	vfree(cpuid_entries);
+out:
+	return r;
+}
+
+static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+{
+	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
+	int j, nent = vcpu->arch.cpuid_nent;
+
+	e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
+	/* when no next entry is found, the current entry[i] is reselected */
+	for (j = i + 1; ; j = (j + 1) % nent) {
+		struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
+		if (ej->function == e->function) {
+			ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+			return j;
+		}
+	}
+	return 0; /* silence gcc, even though control never reaches here */
+}
+
+/* find an entry with matching function, matching index (if needed), and that
+ * should be read next (if it's stateful) */
+static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
+	u32 function, u32 index)
+{
+	if (e->function != function)
+		return 0;
+	if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
+		return 0;
+	if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
+	    !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+		return 0;
+	return 1;
+}
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+					      u32 function, u32 index)
+{
+	int i;
+	struct kvm_cpuid_entry2 *best = NULL;
+
+	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+		struct kvm_cpuid_entry2 *e;
+
+		e = &vcpu->arch.cpuid_entries[i];
+		if (is_matching_cpuid_entry(e, function, index)) {
+			if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+				move_to_next_stateful_cpuid_entry(vcpu, i);
+			best = e;
+			break;
+		}
+	}
+	return best;
+}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
+
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+	if (!best || best->eax < 0x80000008)
+		goto not_found;
+	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+	if (best)
+		return best->eax & 0xff;
+not_found:
+	return 36;
+}
+
+/*
+ * If no match is found, check whether we exceed the vCPU's limit
+ * and return the content of the highest valid _standard_ leaf instead.
+ * This is to satisfy the CPUID specification.
+ */
+static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
+                                                  u32 function, u32 index)
+{
+	struct kvm_cpuid_entry2 *maxlevel;
+
+	maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
+	if (!maxlevel || maxlevel->eax >= function)
+		return NULL;
+	if (function & 0x80000000) {
+		maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
+		if (!maxlevel)
+			return NULL;
+	}
+	return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
+}
+
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+	u32 function, index;
+	struct kvm_cpuid_entry2 *best;
+
+	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
+	best = kvm_find_cpuid_entry(vcpu, function, index);
+
+	if (!best)
+		best = check_cpuid_limit(vcpu, function, index);
+
+	if (best) {
+		kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
+		kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
+		kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
+		kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
+	}
+	kvm_x86_ops->skip_emulated_instruction(vcpu);
+	trace_kvm_cpuid(function,
+			kvm_register_read(vcpu, VCPU_REGS_RAX),
+			kvm_register_read(vcpu, VCPU_REGS_RBX),
+			kvm_register_read(vcpu, VCPU_REGS_RCX),
+			kvm_register_read(vcpu, VCPU_REGS_RDX));
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
new file mode 100644
index 0000000..5b97e17
--- /dev/null
+++ b/arch/x86/kvm/cpuid.h
@@ -0,0 +1,46 @@
+#ifndef ARCH_X86_KVM_CPUID_H
+#define ARCH_X86_KVM_CPUID_H
+
+#include "x86.h"
+
+void kvm_update_cpuid(struct kvm_vcpu *vcpu);
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+					      u32 function, u32 index);
+int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
+				      struct kvm_cpuid_entry2 __user *entries);
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+			     struct kvm_cpuid *cpuid,
+			     struct kvm_cpuid_entry __user *entries);
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+			      struct kvm_cpuid2 *cpuid,
+			      struct kvm_cpuid_entry2 __user *entries);
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+			      struct kvm_cpuid2 *cpuid,
+			      struct kvm_cpuid_entry2 __user *entries);
+
+
+static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
+}
+
+static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_SMEP));
+}
+
+static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
+}
+
+#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 54abb40..a7f3e65 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -38,6 +38,7 @@
 #include "irq.h"
 #include "trace.h"
 #include "x86.h"
+#include "cpuid.h"
 
 #ifndef CONFIG_X86_64
 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8f19d91..4ceced2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -18,6 +18,7 @@
 
 #include "irq.h"
 #include "mmu.h"
+#include "cpuid.h"
 
 #include <linux/kvm_host.h>
 #include <linux/module.h>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b6776c6..4e533d2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -26,6 +26,7 @@
 #include "tss.h"
 #include "kvm_cache_regs.h"
 #include "x86.h"
+#include "cpuid.h"
 
 #include <linux/clocksource.h>
 #include <linux/interrupt.h>
@@ -82,8 +83,6 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
-static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
-				    struct kvm_cpuid_entry2 __user *entries);
 static void process_nmi(struct kvm_vcpu *vcpu);
 
 struct kvm_x86_ops *kvm_x86_ops;
@@ -574,54 +573,6 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 }
 EXPORT_SYMBOL_GPL(kvm_set_xcr);
 
-static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 1, 0);
-	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
-}
-
-static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 7, 0);
-	return best && (best->ebx & bit(X86_FEATURE_SMEP));
-}
-
-static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 7, 0);
-	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
-}
-
-static void update_cpuid(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-	struct kvm_lapic *apic = vcpu->arch.apic;
-
-	best = kvm_find_cpuid_entry(vcpu, 1, 0);
-	if (!best)
-		return;
-
-	/* Update OSXSAVE bit */
-	if (cpu_has_xsave && best->function == 0x1) {
-		best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
-			best->ecx |= bit(X86_FEATURE_OSXSAVE);
-	}
-
-	if (apic) {
-		if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
-			apic->lapic_timer.timer_mode_mask = 3 << 17;
-		else
-			apic->lapic_timer.timer_mode_mask = 1 << 17;
-	}
-}
-
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long old_cr4 = kvm_read_cr4(vcpu);
@@ -655,7 +606,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		kvm_mmu_reset_context(vcpu);
 
 	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
-		update_cpuid(vcpu);
+		kvm_update_cpuid(vcpu);
 
 	return 0;
 }
@@ -2265,466 +2216,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
 }
 
-static int is_efer_nx(void)
-{
-	unsigned long long efer = 0;
-
-	rdmsrl_safe(MSR_EFER, &efer);
-	return efer & EFER_NX;
-}
-
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
-{
-	int i;
-	struct kvm_cpuid_entry2 *e, *entry;
-
-	entry = NULL;
-	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
-		e = &vcpu->arch.cpuid_entries[i];
-		if (e->function == 0x80000001) {
-			entry = e;
-			break;
-		}
-	}
-	if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
-		entry->edx &= ~(1 << 20);
-		printk(KERN_INFO "kvm: guest NX capability removed\n");
-	}
-}
-
-/* when an old userspace process fills a new kernel module */
-static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
-				    struct kvm_cpuid *cpuid,
-				    struct kvm_cpuid_entry __user *entries)
-{
-	int r, i;
-	struct kvm_cpuid_entry *cpuid_entries;
-
-	r = -E2BIG;
-	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
-		goto out;
-	r = -ENOMEM;
-	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
-	if (!cpuid_entries)
-		goto out;
-	r = -EFAULT;
-	if (copy_from_user(cpuid_entries, entries,
-			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
-		goto out_free;
-	for (i = 0; i < cpuid->nent; i++) {
-		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
-		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
-		vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
-		vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
-		vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
-		vcpu->arch.cpuid_entries[i].index = 0;
-		vcpu->arch.cpuid_entries[i].flags = 0;
-		vcpu->arch.cpuid_entries[i].padding[0] = 0;
-		vcpu->arch.cpuid_entries[i].padding[1] = 0;
-		vcpu->arch.cpuid_entries[i].padding[2] = 0;
-	}
-	vcpu->arch.cpuid_nent = cpuid->nent;
-	cpuid_fix_nx_cap(vcpu);
-	r = 0;
-	kvm_apic_set_version(vcpu);
-	kvm_x86_ops->cpuid_update(vcpu);
-	update_cpuid(vcpu);
-
-out_free:
-	vfree(cpuid_entries);
-out:
-	return r;
-}
-
-static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
-				     struct kvm_cpuid2 *cpuid,
-				     struct kvm_cpuid_entry2 __user *entries)
-{
-	int r;
-
-	r = -E2BIG;
-	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
-		goto out;
-	r = -EFAULT;
-	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
-			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
-		goto out;
-	vcpu->arch.cpuid_nent = cpuid->nent;
-	kvm_apic_set_version(vcpu);
-	kvm_x86_ops->cpuid_update(vcpu);
-	update_cpuid(vcpu);
-	return 0;
-
-out:
-	return r;
-}
-
-static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
-				     struct kvm_cpuid2 *cpuid,
-				     struct kvm_cpuid_entry2 __user *entries)
-{
-	int r;
-
-	r = -E2BIG;
-	if (cpuid->nent < vcpu->arch.cpuid_nent)
-		goto out;
-	r = -EFAULT;
-	if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
-			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
-		goto out;
-	return 0;
-
-out:
-	cpuid->nent = vcpu->arch.cpuid_nent;
-	return r;
-}
-
-static void cpuid_mask(u32 *word, int wordnum)
-{
-	*word &= boot_cpu_data.x86_capability[wordnum];
-}
-
-static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-			   u32 index)
-{
-	entry->function = function;
-	entry->index = index;
-	cpuid_count(entry->function, entry->index,
-		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
-	entry->flags = 0;
-}
-
-static bool supported_xcr0_bit(unsigned bit)
-{
-	u64 mask = ((u64)1 << bit);
-
-	return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
-}
-
-#define F(x) bit(X86_FEATURE_##x)
-
-static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-			 u32 index, int *nent, int maxnent)
-{
-	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
-#ifdef CONFIG_X86_64
-	unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
-				? F(GBPAGES) : 0;
-	unsigned f_lm = F(LM);
-#else
-	unsigned f_gbpages = 0;
-	unsigned f_lm = 0;
-#endif
-	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
-
-	/* cpuid 1.edx */
-	const u32 kvm_supported_word0_x86_features =
-		F(FPU) | F(VME) | F(DE) | F(PSE) |
-		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
-		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
-		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
-		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
-		0 /* Reserved, DS, ACPI */ | F(MMX) |
-		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
-		0 /* HTT, TM, Reserved, PBE */;
-	/* cpuid 0x80000001.edx */
-	const u32 kvm_supported_word1_x86_features =
-		F(FPU) | F(VME) | F(DE) | F(PSE) |
-		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
-		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
-		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
-		F(PAT) | F(PSE36) | 0 /* Reserved */ |
-		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
-		F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
-		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
-	/* cpuid 1.ecx */
-	const u32 kvm_supported_word4_x86_features =
-		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
-		0 /* DS-CPL, VMX, SMX, EST */ |
-		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
-		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
-		0 /* Reserved, DCA */ | F(XMM4_1) |
-		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
-		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
-		F(F16C) | F(RDRAND);
-	/* cpuid 0x80000001.ecx */
-	const u32 kvm_supported_word6_x86_features =
-		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
-		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
-		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
-		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
-
-	/* cpuid 0xC0000001.edx */
-	const u32 kvm_supported_word5_x86_features =
-		F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
-		F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
-		F(PMM) | F(PMM_EN);
-
-	/* cpuid 7.0.ebx */
-	const u32 kvm_supported_word9_x86_features =
-		F(SMEP) | F(FSGSBASE) | F(ERMS);
-
-	/* all calls to cpuid_count() should be made on the same cpu */
-	get_cpu();
-	do_cpuid_1_ent(entry, function, index);
-	++*nent;
-
-	switch (function) {
-	case 0:
-		entry->eax = min(entry->eax, (u32)0xd);
-		break;
-	case 1:
-		entry->edx &= kvm_supported_word0_x86_features;
-		cpuid_mask(&entry->edx, 0);
-		entry->ecx &= kvm_supported_word4_x86_features;
-		cpuid_mask(&entry->ecx, 4);
-		/* we support x2apic emulation even if host does not support
-		 * it since we emulate x2apic in software */
-		entry->ecx |= F(X2APIC);
-		break;
-	/* function 2 entries are STATEFUL. That is, repeated cpuid commands
-	 * may return different values. This forces us to get_cpu() before
-	 * issuing the first command, and also to emulate this annoying behavior
-	 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
-	case 2: {
-		int t, times = entry->eax & 0xff;
-
-		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
-		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
-		for (t = 1; t < times && *nent < maxnent; ++t) {
-			do_cpuid_1_ent(&entry[t], function, 0);
-			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
-			++*nent;
-		}
-		break;
-	}
-	/* function 4 has additional index. */
-	case 4: {
-		int i, cache_type;
-
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		/* read more entries until cache_type is zero */
-		for (i = 1; *nent < maxnent; ++i) {
-			cache_type = entry[i - 1].eax & 0x1f;
-			if (!cache_type)
-				break;
-			do_cpuid_1_ent(&entry[i], function, i);
-			entry[i].flags |=
-			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-			++*nent;
-		}
-		break;
-	}
-	case 7: {
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		/* Mask ebx against host capbability word 9 */
-		if (index == 0) {
-			entry->ebx &= kvm_supported_word9_x86_features;
-			cpuid_mask(&entry->ebx, 9);
-		} else
-			entry->ebx = 0;
-		entry->eax = 0;
-		entry->ecx = 0;
-		entry->edx = 0;
-		break;
-	}
-	case 9:
-		break;
-	/* function 0xb has additional index. */
-	case 0xb: {
-		int i, level_type;
-
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		/* read more entries until level_type is zero */
-		for (i = 1; *nent < maxnent; ++i) {
-			level_type = entry[i - 1].ecx & 0xff00;
-			if (!level_type)
-				break;
-			do_cpuid_1_ent(&entry[i], function, i);
-			entry[i].flags |=
-			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-			++*nent;
-		}
-		break;
-	}
-	case 0xd: {
-		int idx, i;
-
-		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
-			do_cpuid_1_ent(&entry[i], function, idx);
-			if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
-				continue;
-			entry[i].flags |=
-			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-			++*nent;
-			++i;
-		}
-		break;
-	}
-	case KVM_CPUID_SIGNATURE: {
-		char signature[12] = "KVMKVMKVM\0\0";
-		u32 *sigptr = (u32 *)signature;
-		entry->eax = 0;
-		entry->ebx = sigptr[0];
-		entry->ecx = sigptr[1];
-		entry->edx = sigptr[2];
-		break;
-	}
-	case KVM_CPUID_FEATURES:
-		entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
-			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
-			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
-			     (1 << KVM_FEATURE_ASYNC_PF) |
-			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
-
-		if (sched_info_on())
-			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
-
-		entry->ebx = 0;
-		entry->ecx = 0;
-		entry->edx = 0;
-		break;
-	case 0x80000000:
-		entry->eax = min(entry->eax, 0x8000001a);
-		break;
-	case 0x80000001:
-		entry->edx &= kvm_supported_word1_x86_features;
-		cpuid_mask(&entry->edx, 1);
-		entry->ecx &= kvm_supported_word6_x86_features;
-		cpuid_mask(&entry->ecx, 6);
-		break;
-	case 0x80000008: {
-		unsigned g_phys_as = (entry->eax >> 16) & 0xff;
-		unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
-		unsigned phys_as = entry->eax & 0xff;
-
-		if (!g_phys_as)
-			g_phys_as = phys_as;
-		entry->eax = g_phys_as | (virt_as << 8);
-		entry->ebx = entry->edx = 0;
-		break;
-	}
-	case 0x80000019:
-		entry->ecx = entry->edx = 0;
-		break;
-	case 0x8000001a:
-		break;
-	case 0x8000001d:
-		break;
-	/*Add support for Centaur's CPUID instruction*/
-	case 0xC0000000:
-		/*Just support up to 0xC0000004 now*/
-		entry->eax = min(entry->eax, 0xC0000004);
-		break;
-	case 0xC0000001:
-		entry->edx &= kvm_supported_word5_x86_features;
-		cpuid_mask(&entry->edx, 5);
-		break;
-	case 3: /* Processor serial number */
-	case 5: /* MONITOR/MWAIT */
-	case 6: /* Thermal management */
-	case 0xA: /* Architectural Performance Monitoring */
-	case 0x80000007: /* Advanced power management */
-	case 0xC0000002:
-	case 0xC0000003:
-	case 0xC0000004:
-	default:
-		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
-		break;
-	}
-
-	kvm_x86_ops->set_supported_cpuid(function, entry);
-
-	put_cpu();
-}
-
-#undef F
-
-static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
-				     struct kvm_cpuid_entry2 __user *entries)
-{
-	struct kvm_cpuid_entry2 *cpuid_entries;
-	int limit, nent = 0, r = -E2BIG;
-	u32 func;
-
-	if (cpuid->nent < 1)
-		goto out;
-	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
-		cpuid->nent = KVM_MAX_CPUID_ENTRIES;
-	r = -ENOMEM;
-	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
-	if (!cpuid_entries)
-		goto out;
-
-	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
-	limit = cpuid_entries[0].eax;
-	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
-		do_cpuid_ent(&cpuid_entries[nent], func, 0,
-			     &nent, cpuid->nent);
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
-	limit = cpuid_entries[nent - 1].eax;
-	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
-		do_cpuid_ent(&cpuid_entries[nent], func, 0,
-			     &nent, cpuid->nent);
-
-
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	/* Add support for Centaur's CPUID instruction. */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
-		do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
-				&nent, cpuid->nent);
-
-		r = -E2BIG;
-		if (nent >= cpuid->nent)
-			goto out_free;
-
-		limit = cpuid_entries[nent - 1].eax;
-		for (func = 0xC0000001;
-			func <= limit && nent < cpuid->nent; ++func)
-			do_cpuid_ent(&cpuid_entries[nent], func, 0,
-					&nent, cpuid->nent);
-
-		r = -E2BIG;
-		if (nent >= cpuid->nent)
-			goto out_free;
-	}
-
-	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
-		     cpuid->nent);
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
-		     cpuid->nent);
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	r = -EFAULT;
-	if (copy_to_user(entries, cpuid_entries,
-			 nent * sizeof(struct kvm_cpuid_entry2)))
-		goto out_free;
-	cpuid->nent = nent;
-	r = 0;
-
-out_free:
-	vfree(cpuid_entries);
-out:
-	return r;
-}
-
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
@@ -5438,125 +4929,6 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 	return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
 }
 
-static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
-{
-	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
-	int j, nent = vcpu->arch.cpuid_nent;
-
-	e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
-	/* when no next entry is found, the current entry[i] is reselected */
-	for (j = i + 1; ; j = (j + 1) % nent) {
-		struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
-		if (ej->function == e->function) {
-			ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
-			return j;
-		}
-	}
-	return 0; /* silence gcc, even though control never reaches here */
-}
-
-/* find an entry with matching function, matching index (if needed), and that
- * should be read next (if it's stateful) */
-static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
-	u32 function, u32 index)
-{
-	if (e->function != function)
-		return 0;
-	if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
-		return 0;
-	if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
-	    !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
-		return 0;
-	return 1;
-}
-
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
-					      u32 function, u32 index)
-{
-	int i;
-	struct kvm_cpuid_entry2 *best = NULL;
-
-	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
-		struct kvm_cpuid_entry2 *e;
-
-		e = &vcpu->arch.cpuid_entries[i];
-		if (is_matching_cpuid_entry(e, function, index)) {
-			if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
-				move_to_next_stateful_cpuid_entry(vcpu, i);
-			best = e;
-			break;
-		}
-	}
-	return best;
-}
-EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
-
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
-	if (!best || best->eax < 0x80000008)
-		goto not_found;
-	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
-	if (best)
-		return best->eax & 0xff;
-not_found:
-	return 36;
-}
-
-/*
- * If no match is found, check whether we exceed the vCPU's limit
- * and return the content of the highest valid _standard_ leaf instead.
- * This is to satisfy the CPUID specification.
- */
-static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
-                                                  u32 function, u32 index)
-{
-	struct kvm_cpuid_entry2 *maxlevel;
-
-	maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
-	if (!maxlevel || maxlevel->eax >= function)
-		return NULL;
-	if (function & 0x80000000) {
-		maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
-		if (!maxlevel)
-			return NULL;
-	}
-	return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
-}
-
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
-{
-	u32 function, index;
-	struct kvm_cpuid_entry2 *best;
-
-	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
-	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
-	kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
-	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
-	best = kvm_find_cpuid_entry(vcpu, function, index);
-
-	if (!best)
-		best = check_cpuid_limit(vcpu, function, index);
-
-	if (best) {
-		kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
-		kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
-		kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
-		kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
-	}
-	kvm_x86_ops->skip_emulated_instruction(vcpu);
-	trace_kvm_cpuid(function,
-			kvm_register_read(vcpu, VCPU_REGS_RAX),
-			kvm_register_read(vcpu, VCPU_REGS_RBX),
-			kvm_register_read(vcpu, VCPU_REGS_RCX),
-			kvm_register_read(vcpu, VCPU_REGS_RDX));
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
-
 /*
  * Check if userspace requested an interrupt window, and that the
  * interrupt window is open.
@@ -6222,7 +5594,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
 	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
 	if (sregs->cr4 & X86_CR4_OSXSAVE)
-		update_cpuid(vcpu);
+		kvm_update_cpuid(vcpu);
 
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	if (!is_long_mode(vcpu) && is_pae(vcpu)) {
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d36fe23..cb80c29 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -33,9 +33,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
 	return (nr == BP_VECTOR) || (nr == OF_VECTOR);
 }
 
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
-                                             u32 function, u32 index);
-
 static inline bool is_protmode(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
@@ -125,4 +122,6 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 	gva_t addr, void *val, unsigned int bytes,
 	struct x86_exception *exception);
 
+extern u64 host_xcr0;
+
 #endif
-- 
cgit v1.1


From fb215366b3c7320ac25dca766a0152df16534932 Mon Sep 17 00:00:00 2001
From: "Liu, Jinsong" <jinsong.liu@intel.com>
Date: Mon, 28 Nov 2011 03:55:19 -0800
Subject: KVM: expose latest Intel cpu new features (BMI1/BMI2/FMA/AVX2) to
 guest

Intel latest cpu add 6 new features, refer http://software.intel.com/file/36945
The new feature cpuid listed as below:

1. FMA		CPUID.EAX=01H:ECX.FMA[bit 12]
2. MOVBE	CPUID.EAX=01H:ECX.MOVBE[bit 22]
3. BMI1		CPUID.EAX=07H,ECX=0H:EBX.BMI1[bit 3]
4. AVX2		CPUID.EAX=07H,ECX=0H:EBX.AVX2[bit 5]
5. BMI2		CPUID.EAX=07H,ECX=0H:EBX.BMI2[bit 8]
6. LZCNT	CPUID.EAX=80000001H:ECX.LZCNT[bit 5]

This patch expose these features to guest.
Among them, FMA/MOVBE/LZCNT has already been defined, MOVBE/LZCNT has
already been exposed.

This patch defines BMI1/AVX2/BMI2, and exposes FMA/BMI1/AVX2/BMI2 to guest.

Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/cpufeature.h | 3 +++
 arch/x86/kvm/cpuid.c              | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index f3444f7..17c5d4b 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -197,7 +197,10 @@
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE	(9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_BMI1	(9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_AVX2	(9*32+ 5) /* AVX2 instructions */
 #define X86_FEATURE_SMEP	(9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2	(9*32+ 8) /* 2nd group bit manipulation extensions */
 #define X86_FEATURE_ERMS	(9*32+ 9) /* Enhanced REP MOVSB/STOSB */
 
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0a332ec..47be763 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -222,7 +222,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
 		0 /* DS-CPL, VMX, SMX, EST */ |
 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
-		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
+		F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
 		0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
 		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
@@ -242,7 +242,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* cpuid 7.0.ebx */
 	const u32 kvm_supported_word9_x86_features =
-		F(SMEP) | F(FSGSBASE) | F(ERMS);
+		F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS);
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
-- 
cgit v1.1


From 831bf664e9c1fc08fc6b3984d00d275cac82f5e9 Mon Sep 17 00:00:00 2001
From: Sasha Levin <levinsasha928@gmail.com>
Date: Mon, 28 Nov 2011 11:20:29 +0200
Subject: KVM: Refactor and simplify kvm_dev_ioctl_get_supported_cpuid

This patch cleans and simplifies kvm_dev_ioctl_get_supported_cpuid by using a table
instead of duplicating code as Avi suggested.

This patch also fixes a bug where kvm_dev_ioctl_get_supported_cpuid would return
-E2BIG when amount of entries passed was just right.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/cpuid.c | 113 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 63 insertions(+), 50 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 47be763..52593e8 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -183,9 +183,10 @@ static bool supported_xcr0_bit(unsigned bit)
 
 #define F(x) bit(X86_FEATURE_##x)
 
-static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			 u32 index, int *nent, int maxnent)
 {
+	int r;
 	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
 #ifdef CONFIG_X86_64
 	unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
@@ -246,6 +247,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
+
+	r = -E2BIG;
+
+	if (*nent >= maxnent)
+		goto out;
+
 	do_cpuid_1_ent(entry, function, index);
 	++*nent;
 
@@ -271,7 +278,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
 		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
-		for (t = 1; t < times && *nent < maxnent; ++t) {
+		for (t = 1; t < times; ++t) {
+			if (*nent >= maxnent)
+				goto out;
+
 			do_cpuid_1_ent(&entry[t], function, 0);
 			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
 			++*nent;
@@ -284,7 +294,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		/* read more entries until cache_type is zero */
-		for (i = 1; *nent < maxnent; ++i) {
+		for (i = 1; ; ++i) {
+			if (*nent >= maxnent)
+				goto out;
+
 			cache_type = entry[i - 1].eax & 0x1f;
 			if (!cache_type)
 				break;
@@ -316,7 +329,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		/* read more entries until level_type is zero */
-		for (i = 1; *nent < maxnent; ++i) {
+		for (i = 1; ; ++i) {
+			if (*nent >= maxnent)
+				goto out;
+
 			level_type = entry[i - 1].ecx & 0xff00;
 			if (!level_type)
 				break;
@@ -331,7 +347,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		int idx, i;
 
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
+		for (idx = 1, i = 1; idx < 64; ++idx) {
+			if (*nent >= maxnent)
+				goto out;
+
 			do_cpuid_1_ent(&entry[i], function, idx);
 			if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
 				continue;
@@ -416,17 +435,41 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	kvm_x86_ops->set_supported_cpuid(function, entry);
 
+	r = 0;
+
+out:
 	put_cpu();
+
+	return r;
 }
 
 #undef F
 
+struct kvm_cpuid_param {
+	u32 func;
+	u32 idx;
+	bool has_leaf_count;
+	bool (*qualifier)(struct kvm_cpuid_param *param);
+};
+
+static bool is_centaur_cpu(struct kvm_cpuid_param *param)
+{
+	return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
+}
+
 int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 				      struct kvm_cpuid_entry2 __user *entries)
 {
 	struct kvm_cpuid_entry2 *cpuid_entries;
-	int limit, nent = 0, r = -E2BIG;
+	int limit, nent = 0, r = -E2BIG, i;
 	u32 func;
+	static struct kvm_cpuid_param param[] = {
+		{ .func = 0, .has_leaf_count = true },
+		{ .func = 0x80000000, .has_leaf_count = true },
+		{ .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
+		{ .func = KVM_CPUID_SIGNATURE },
+		{ .func = KVM_CPUID_FEATURES },
+	};
 
 	if (cpuid->nent < 1)
 		goto out;
@@ -437,61 +480,31 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 	if (!cpuid_entries)
 		goto out;
 
-	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
-	limit = cpuid_entries[0].eax;
-	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
-		do_cpuid_ent(&cpuid_entries[nent], func, 0,
-			     &nent, cpuid->nent);
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
-	limit = cpuid_entries[nent - 1].eax;
-	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
-		do_cpuid_ent(&cpuid_entries[nent], func, 0,
-			     &nent, cpuid->nent);
-
-
+	r = 0;
+	for (i = 0; i < ARRAY_SIZE(param); i++) {
+		struct kvm_cpuid_param *ent = &param[i];
 
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
+		if (ent->qualifier && !ent->qualifier(ent))
+			continue;
 
-	/* Add support for Centaur's CPUID instruction. */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
-		do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
+		r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
 				&nent, cpuid->nent);
 
-		r = -E2BIG;
-		if (nent >= cpuid->nent)
+		if (r)
 			goto out_free;
 
+		if (!ent->has_leaf_count)
+			continue;
+
 		limit = cpuid_entries[nent - 1].eax;
-		for (func = 0xC0000001;
-			func <= limit && nent < cpuid->nent; ++func)
-			do_cpuid_ent(&cpuid_entries[nent], func, 0,
-					&nent, cpuid->nent);
+		for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
+			r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
+				     &nent, cpuid->nent);
 
-		r = -E2BIG;
-		if (nent >= cpuid->nent)
+		if (r)
 			goto out_free;
 	}
 
-	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
-		     cpuid->nent);
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
-	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
-		     cpuid->nent);
-
-	r = -E2BIG;
-	if (nent >= cpuid->nent)
-		goto out_free;
-
 	r = -EFAULT;
 	if (copy_to_user(entries, cpuid_entries,
 			 nent * sizeof(struct kvm_cpuid_entry2)))
-- 
cgit v1.1


From 0375f7fad904b59502341ccecfc8faea70b34c91 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 28 Nov 2011 20:41:00 +0800
Subject: KVM: MMU: audit: replace mmu audit tracepoint with jump-label

The tracepoint is only used to audit mmu code, it should not be exposed to
user, let us replace it with jump-label.

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 16 +++++++++++-----
 arch/x86/kvm/mmu_audit.c   | 28 +++++++++++++---------------
 arch/x86/kvm/mmutrace.h    | 19 -------------------
 arch/x86/kvm/paging_tmpl.h |  4 ++--
 4 files changed, 26 insertions(+), 41 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d737443..62f69db 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -68,6 +68,12 @@ char *audit_point_name[] = {
 	"post sync"
 };
 
+#ifdef CONFIG_KVM_MMU_AUDIT
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point);
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
+#endif
+
 #undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
@@ -2852,12 +2858,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 		return;
 
 	vcpu_clear_mmio_info(vcpu, ~0ul);
-	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 		sp = page_header(root);
 		mmu_sync_children(vcpu, sp);
-		trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 		return;
 	}
 	for (i = 0; i < 4; ++i) {
@@ -2869,7 +2875,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 			mmu_sync_children(vcpu, sp);
 		}
 	}
-	trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 }
 
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -3667,7 +3673,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	++vcpu->kvm->stat.mmu_pte_write;
-	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
+	kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 
 	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
@@ -3700,7 +3706,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	}
 	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
+	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 746ec25..5df6736 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -224,30 +224,29 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
 	mmu_spte_walk(vcpu, audit_spte);
 }
 
-static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
+static bool mmu_audit;
+static struct jump_label_key mmu_audit_key;
+
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
 {
 	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
 
-	if (!__ratelimit(&ratelimit_state))
-		return;
+	if (static_branch((&mmu_audit_key))) {
+		if (!__ratelimit(&ratelimit_state))
+			return;
 
-	vcpu->kvm->arch.audit_point = point;
-	audit_all_active_sps(vcpu->kvm);
-	audit_vcpu_spte(vcpu);
+		vcpu->kvm->arch.audit_point = point;
+		audit_all_active_sps(vcpu->kvm);
+		audit_vcpu_spte(vcpu);
+	}
 }
 
-static bool mmu_audit;
-
 static void mmu_audit_enable(void)
 {
-	int ret;
-
 	if (mmu_audit)
 		return;
 
-	ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-	WARN_ON(ret);
-
+	jump_label_inc(&mmu_audit_key);
 	mmu_audit = true;
 }
 
@@ -256,8 +255,7 @@ static void mmu_audit_disable(void)
 	if (!mmu_audit)
 		return;
 
-	unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
-	tracepoint_synchronize_unregister();
+	jump_label_dec(&mmu_audit_key);
 	mmu_audit = false;
 }
 
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index eed67f3..89fb0e8 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -243,25 +243,6 @@ TRACE_EVENT(
 	TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
 		  __entry->access)
 );
-
-TRACE_EVENT(
-	kvm_mmu_audit,
-	TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
-	TP_ARGS(vcpu, audit_point),
-
-	TP_STRUCT__entry(
-		__field(struct kvm_vcpu *, vcpu)
-		__field(int, audit_point)
-	),
-
-	TP_fast_assign(
-		__entry->vcpu = vcpu;
-		__entry->audit_point = audit_point;
-	),
-
-	TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
-		  audit_point_name[__entry->audit_point])
-);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 52e9d58..1561028 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -632,7 +632,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 
-	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
+	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
 	if (!force_pt_level)
 		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
@@ -643,7 +643,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		 sptep, *sptep, emulate);
 
 	++vcpu->stat.pf_fixed;
-	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
+	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return emulate;
-- 
cgit v1.1


From 9edb17d55f3ea4943f9654f2aad7a99b4c55840a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 28 Nov 2011 20:41:38 +0800
Subject: KVM: x86: remove the dead code of KVM_EXIT_HYPERCALL

KVM_EXIT_HYPERCALL is not used anymore, so remove the code

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4e533d2..4650531 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5389,10 +5389,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (r <= 0)
 		goto out;
 
-	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
-		kvm_register_write(vcpu, VCPU_REGS_RAX,
-				     kvm_run->hypercall.ret);
-
 	r = __vcpu_run(vcpu);
 
 out:
-- 
cgit v1.1


From e459e3228dc57f7160e564ce0f09edb5bee656d3 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 28 Nov 2011 20:42:16 +0800
Subject: KVM: MMU: move the relevant mmu code to mmu.c

Move the mmu code in kvm_arch_vcpu_init() to kvm_mmu_create()

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  6 ++++++
 arch/x86/kvm/mmu.c              |  6 +++++-
 arch/x86/kvm/x86.c              | 11 +----------
 3 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1769f3d..020413a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -752,6 +752,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
 			      struct x86_exception *exception);
 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
@@ -773,6 +774,11 @@ void kvm_disable_tdp(void);
 int complete_pio(struct kvm_vcpu *vcpu);
 bool kvm_check_iopl(struct kvm_vcpu *vcpu);
 
+static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+	return gpa;
+}
+
 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 {
 	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 62f69db..262a3af 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3839,7 +3839,11 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+	vcpu->arch.mmu.translate_gpa = translate_gpa;
+	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 
 	return alloc_mmu_pages(vcpu);
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4650531..d99976e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3430,12 +3430,7 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
 	kvm_x86_ops->get_segment(vcpu, var, seg);
 }
 
-static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
-{
-	return gpa;
-}
-
-static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 {
 	gpa_t t_gpa;
 	struct x86_exception exception;
@@ -5915,10 +5910,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	kvm = vcpu->kvm;
 
 	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
-	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
-	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-	vcpu->arch.mmu.translate_gpa = translate_gpa;
-	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	else
-- 
cgit v1.1


From d750ea28865dbff6a73444358d189dd811c68c50 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Mon, 28 Nov 2011 20:43:18 +0800
Subject: KVM: MMU: remove oos_shadow parameter

The unsync code should be stable now, maybe it is the time to remove this
parameter to cleanup the code a little bit

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 262a3af..b1178d1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -93,9 +93,6 @@ static int dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
 
-static int oos_shadow = 1;
-module_param(oos_shadow, bool, 0644);
-
 #ifndef MMU_DEBUG
 #define ASSERT(x) do { } while (0)
 #else
@@ -2196,8 +2193,6 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 			return 1;
 
 		if (!need_unsync && !s->unsync) {
-			if (!oos_shadow)
-				return 1;
 			need_unsync = true;
 		}
 	}
-- 
cgit v1.1


From e37fa7853c276403da2fea8792c579e8bfd75042 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Wed, 30 Nov 2011 17:43:24 +0800
Subject: KVM: MMU: audit: inline audit function

inline audit function and little cleanup

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c       | 28 +++++++---------------------
 arch/x86/kvm/mmu_audit.c | 29 +++++++++++++++++++++--------
 2 files changed, 28 insertions(+), 29 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b1178d1..7a8e99c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -59,21 +59,6 @@ enum {
 	AUDIT_POST_SYNC
 };
 
-char *audit_point_name[] = {
-	"pre page fault",
-	"post page fault",
-	"pre pte write",
-	"post pte write",
-	"pre sync",
-	"post sync"
-};
-
-#ifdef CONFIG_KVM_MMU_AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
-#endif
-
 #undef MMU_DEBUG
 
 #ifdef MMU_DEBUG
@@ -1539,6 +1524,13 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
 	return ret;
 }
 
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
+static void mmu_audit_disable(void) { }
+#endif
+
 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			 struct list_head *invalid_list)
 {
@@ -4035,12 +4027,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 	mmu_free_memory_caches(vcpu);
 }
 
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void mmu_audit_disable(void) { }
-#endif
-
 void kvm_mmu_module_exit(void)
 {
 	mmu_destroy_caches();
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 5df6736..fe15dcc 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,6 +19,15 @@
 
 #include <linux/ratelimit.h>
 
+char const *audit_point_name[] = {
+	"pre page fault",
+	"post page fault",
+	"pre pte write",
+	"post pte write",
+	"pre sync",
+	"post sync"
+};
+
 #define audit_printk(kvm, fmt, args...)		\
 	printk(KERN_ERR "audit: (%s) error: "	\
 		fmt, audit_point_name[kvm->arch.audit_point], ##args)
@@ -227,18 +236,22 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
 static bool mmu_audit;
 static struct jump_label_key mmu_audit_key;
 
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
+static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
 {
 	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
 
-	if (static_branch((&mmu_audit_key))) {
-		if (!__ratelimit(&ratelimit_state))
-			return;
+	if (!__ratelimit(&ratelimit_state))
+		return;
 
-		vcpu->kvm->arch.audit_point = point;
-		audit_all_active_sps(vcpu->kvm);
-		audit_vcpu_spte(vcpu);
-	}
+	vcpu->kvm->arch.audit_point = point;
+	audit_all_active_sps(vcpu->kvm);
+	audit_vcpu_spte(vcpu);
+}
+
+static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
+{
+	if (static_branch((&mmu_audit_key)))
+		__kvm_mmu_audit(vcpu, point);
 }
 
 static void mmu_audit_enable(void)
-- 
cgit v1.1


From 086c9855019935854311b477b33498a6ea357ef6 Mon Sep 17 00:00:00 2001
From: "Alex,Shi" <alex.shi@intel.com>
Date: Thu, 20 Oct 2011 15:34:01 +0800
Subject: KVM: use this_cpu_xxx replace percpu_xxx funcs

percpu_xxx funcs are duplicated with this_cpu_xxx funcs, so replace them
for further code clean up.

And in preempt safe scenario, __this_cpu_xxx funcs has a bit better
performance since __this_cpu_xxx has no redundant preempt_disable()

Signed-off-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d99976e..d55a94f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4664,15 +4664,15 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
 
 static int kvm_is_in_guest(void)
 {
-	return percpu_read(current_vcpu) != NULL;
+	return __this_cpu_read(current_vcpu) != NULL;
 }
 
 static int kvm_is_user_mode(void)
 {
 	int user_mode = 3;
 
-	if (percpu_read(current_vcpu))
-		user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
+	if (__this_cpu_read(current_vcpu))
+		user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
 
 	return user_mode != 0;
 }
@@ -4681,8 +4681,8 @@ static unsigned long kvm_get_guest_ip(void)
 {
 	unsigned long ip = 0;
 
-	if (percpu_read(current_vcpu))
-		ip = kvm_rip_read(percpu_read(current_vcpu));
+	if (__this_cpu_read(current_vcpu))
+		ip = kvm_rip_read(__this_cpu_read(current_vcpu));
 
 	return ip;
 }
@@ -4695,13 +4695,13 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
 
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
 {
-	percpu_write(current_vcpu, vcpu);
+	__this_cpu_write(current_vcpu, vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
 
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
 {
-	percpu_write(current_vcpu, NULL);
+	__this_cpu_write(current_vcpu, NULL);
 }
 EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
 
-- 
cgit v1.1


From 3d56cbdf359c953f8bfcab68aa5cf766e4480799 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 2 Dec 2011 18:35:24 +0100
Subject: KVM: MMU: Drop unused return value of
 kvm_mmu_remove_some_alloc_mmu_pages

freed_pages is never evaluated, so remove it as well as the return code
kvm_mmu_remove_some_alloc_mmu_pages so far delivered to its only user.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7a8e99c..2a2a9b4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3890,14 +3890,14 @@ restart:
 	spin_unlock(&kvm->mmu_lock);
 }
 
-static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
-					       struct list_head *invalid_list)
+static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
+						struct list_head *invalid_list)
 {
 	struct kvm_mmu_page *page;
 
 	page = container_of(kvm->arch.active_mmu_pages.prev,
 			    struct kvm_mmu_page, link);
-	return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
+	kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
 }
 
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
@@ -3912,15 +3912,15 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	raw_spin_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		int idx, freed_pages;
+		int idx;
 		LIST_HEAD(invalid_list);
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
 		if (!kvm_freed && nr_to_scan > 0 &&
 		    kvm->arch.n_used_mmu_pages > 0) {
-			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
-							  &invalid_list);
+			kvm_mmu_remove_some_alloc_mmu_pages(kvm,
+							    &invalid_list);
 			kvm_freed = kvm;
 		}
 		nr_to_scan--;
-- 
cgit v1.1


From 234b639206a7d9d5ca362cff64ceddd4f27e4a46 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 2 Dec 2011 18:26:28 +0100
Subject: KVM: x86 emulator: Remove set-but-unused cr4 from check_cr_write

This was probably copy&pasted from the cr0 case, but it's unneeded here.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ac8e5ed..f641201 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3028,9 +3028,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
 		break;
 		}
 	case 4: {
-		u64 cr4;
-
-		cr4 = ctxt->ops->get_cr(ctxt, 4);
 		ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
 
 		if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
-- 
cgit v1.1


From cdfca7b346e6dbab1ba33260c28ccb8333485a5b Mon Sep 17 00:00:00 2001
From: Sasha Levin <levinsasha928@gmail.com>
Date: Sun, 4 Dec 2011 19:36:28 +0200
Subject: KVM: Use kmemdup() instead of kmalloc/memcpy

Switch to kmemdup() in two places to shorten the code and avoid possible bugs.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d55a94f..03042d6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3031,10 +3031,10 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		memset(dirty_bitmap_head, 0, n);
 
 		r = -ENOMEM;
-		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+		slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL);
 		if (!slots)
 			goto out;
-		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
+
 		memslot = id_to_memslot(slots, log->slot);
 		memslot->nr_dirty_pages = 0;
 		memslot->dirty_bitmap = dirty_bitmap_head;
-- 
cgit v1.1


From ff5c2c0316ff0e3e2dba3ca14167d994453df093 Mon Sep 17 00:00:00 2001
From: Sasha Levin <levinsasha928@gmail.com>
Date: Sun, 4 Dec 2011 19:36:29 +0200
Subject: KVM: Use memdup_user instead of kmalloc/copy_from_user

Switch to using memdup_user when possible. This makes code more
smaller and compact, and prevents errors.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 82 +++++++++++++++++++++++-------------------------------
 1 file changed, 35 insertions(+), 47 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03042d6..0a646e2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1309,12 +1309,11 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
 	if (page_num >= blob_size)
 		goto out;
 	r = -ENOMEM;
-	page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!page)
+	page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
+	if (IS_ERR(page)) {
+		r = PTR_ERR(page);
 		goto out;
-	r = -EFAULT;
-	if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
-		goto out_free;
+	}
 	if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
 		goto out_free;
 	r = 0;
@@ -1988,15 +1987,12 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
 	if (msrs.nmsrs >= MAX_IO_MSRS)
 		goto out;
 
-	r = -ENOMEM;
 	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
-	entries = kmalloc(size, GFP_KERNEL);
-	if (!entries)
+	entries = memdup_user(user_msrs->entries, size);
+	if (IS_ERR(entries)) {
+		r = PTR_ERR(entries);
 		goto out;
-
-	r = -EFAULT;
-	if (copy_from_user(entries, user_msrs->entries, size))
-		goto out_free;
+	}
 
 	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
 	if (r < 0)
@@ -2533,13 +2529,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (!vcpu->arch.apic)
 			goto out;
-		u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!u.lapic)
-			goto out;
-		r = -EFAULT;
-		if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
+		u.lapic = memdup_user(argp, sizeof(*u.lapic));
+		if (IS_ERR(u.lapic)) {
+			r = PTR_ERR(u.lapic);
 			goto out;
+		}
+
 		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
 		if (r)
 			goto out;
@@ -2718,14 +2713,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_SET_XSAVE: {
-		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!u.xsave)
-			break;
-
-		r = -EFAULT;
-		if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
-			break;
+		u.xsave = memdup_user(argp, sizeof(*u.xsave));
+		if (IS_ERR(u.xsave)) {
+			r = PTR_ERR(u.xsave);
+			goto out;
+		}
 
 		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
 		break;
@@ -2746,15 +2738,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_SET_XCRS: {
-		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
-		r = -ENOMEM;
-		if (!u.xcrs)
-			break;
-
-		r = -EFAULT;
-		if (copy_from_user(u.xcrs, argp,
-				   sizeof(struct kvm_xcrs)))
-			break;
+		u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
+		if (IS_ERR(u.xcrs)) {
+			r = PTR_ERR(u.xcrs);
+			goto out;
+		}
 
 		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
 		break;
@@ -3190,14 +3178,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 	case KVM_GET_IRQCHIP: {
 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
+		struct kvm_irqchip *chip;
 
-		r = -ENOMEM;
-		if (!chip)
+		chip = memdup_user(argp, sizeof(*chip));
+		if (IS_ERR(chip)) {
+			r = PTR_ERR(chip);
 			goto out;
-		r = -EFAULT;
-		if (copy_from_user(chip, argp, sizeof *chip))
-			goto get_irqchip_out;
+		}
+
 		r = -ENXIO;
 		if (!irqchip_in_kernel(kvm))
 			goto get_irqchip_out;
@@ -3216,14 +3204,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 	case KVM_SET_IRQCHIP: {
 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
-		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
+		struct kvm_irqchip *chip;
 
-		r = -ENOMEM;
-		if (!chip)
+		chip = memdup_user(argp, sizeof(*chip));
+		if (IS_ERR(chip)) {
+			r = PTR_ERR(chip);
 			goto out;
-		r = -EFAULT;
-		if (copy_from_user(chip, argp, sizeof *chip))
-			goto set_irqchip_out;
+		}
+
 		r = -ENXIO;
 		if (!irqchip_in_kernel(kvm))
 			goto set_irqchip_out;
-- 
cgit v1.1


From 43771ebfc9d34ab1f74095d052d225a82ae0898c Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 14 Dec 2011 12:27:47 +0200
Subject: KVM: Make KVM_INTEL depend on CPU_SUP_INTEL

PMU virtualization needs to talk to Intel-specific bits of perf; these are
only available when CPU_SUP_INTEL=y.

Fixes

  arch/x86/built-in.o: In function `atomic_switch_perf_msrs':
  vmx.c:(.text+0x6b1d4): undefined reference to `perf_guest_get_msrs'

Reported-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ff5790d..ca4d49e 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -52,6 +52,8 @@ config KVM
 config KVM_INTEL
 	tristate "KVM for Intel processors support"
 	depends on KVM
+	# for perf_guest_get_msrs():
+	depends on CPU_SUP_INTEL
 	---help---
 	  Provides support for KVM on Intel processors equipped with the VT
 	  extensions.
-- 
cgit v1.1


From bb5a798ad58996e4d666ead1016705854d5ca616 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 14 Dec 2011 17:58:18 +0100
Subject: KVM: x86: Do not rely on implicit inclusions

Works so far by change, but it is not guaranteed to stay like this.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/cpuid.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 52593e8..ca63032 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -14,6 +14,8 @@
 
 #include <linux/kvm_host.h>
 #include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
 #include <asm/user.h>
 #include <asm/xsave.h>
 #include "cpuid.h"
-- 
cgit v1.1


From a647795efbedeedf8a1dc6deded26defa23562bd Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 14 Dec 2011 19:25:33 +0100
Subject: KVM: x86: Consolidate PIT legacy test

Move the test for KVM_PIT_FLAGS_HPET_LEGACY into create_pit_timer
instead of replicating it on the caller site.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/i8254.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 405f262..d68f99d 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -344,7 +344,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 	struct kvm_timer *pt = &ps->pit_timer;
 	s64 interval;
 
-	if (!irqchip_in_kernel(kvm))
+	if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
 		return;
 
 	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -397,15 +397,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	case 1:
         /* FIXME: enhance mode 4 precision */
 	case 4:
-		if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
-			create_pit_timer(kvm, val, 0);
-		}
+		create_pit_timer(kvm, val, 0);
 		break;
 	case 2:
 	case 3:
-		if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
-			create_pit_timer(kvm, val, 1);
-		}
+		create_pit_timer(kvm, val, 1);
 		break;
 	default:
 		destroy_pit_timer(kvm->arch.vpit);
-- 
cgit v1.1


From d546cb406ea0d83e2d39ec14221957a24f88a622 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 15 Dec 2011 12:38:40 +0200
Subject: KVM: drop bsp_vcpu pointer from kvm struct

Drop bsp_vcpu pointer from kvm struct since its only use is incorrect
anyway.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/i8259.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index cac4746..b6a7353 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -262,9 +262,10 @@ int kvm_pic_read_irq(struct kvm *kvm)
 
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-	int irq;
-	struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
+	int irq, i;
+	struct kvm_vcpu *vcpu;
 	u8 irr = s->irr, isr = s->imr;
+	bool found = false;
 
 	s->last_irr = 0;
 	s->irr = 0;
@@ -281,12 +282,19 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
 	s->special_fully_nested_mode = 0;
 	s->init4 = 0;
 
-	for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
-		if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
-			if (irr & (1 << irq) || isr & (1 << irq)) {
-				pic_clear_isr(s, irq);
-			}
-	}
+	kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
+		if (kvm_apic_accept_pic_intr(vcpu)) {
+			found = true;
+			break;
+		}
+
+
+	if (!found)
+		return;
+
+	for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+		if (irr & (1 << irq) || isr & (1 << irq))
+			pic_clear_isr(s, irq);
 }
 
 static void pic_ioport_write(void *opaque, u32 addr, u32 val)
-- 
cgit v1.1


From c15af35f54631b9e9b7ad1981016cc6e73cec794 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 6 Dec 2011 18:06:02 +0900
Subject: KVM: x86 emulator: Use opcode::execute for Group 1A instruction

Group 1A: 8F

Register em_pop() directly and remove em_grp1a().

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f641201..cd49774 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1675,11 +1675,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
-static int em_grp1a(struct x86_emulate_ctxt *ctxt)
-{
-	return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes);
-}
-
 static int em_grp2(struct x86_emulate_ctxt *ctxt)
 {
 	switch (ctxt->modrm_reg) {
@@ -3203,7 +3198,7 @@ static struct opcode group1[] = {
 };
 
 static struct opcode group1A[] = {
-	D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
+	I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N,
 };
 
 static struct opcode group3[] = {
@@ -4031,9 +4026,6 @@ special_insn:
 	case 0x8d: /* lea r16/r32, m */
 		ctxt->dst.val = ctxt->src.addr.mem.ea;
 		break;
-	case 0x8f:		/* pop (sole member of Grp1a) */
-		rc = em_grp1a(ctxt);
-		break;
 	case 0x90 ... 0x97: /* nop / xchg reg, rax */
 		if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
 			break;
-- 
cgit v1.1


From c04ec8393f3815e0f60dde1d6b29040bf1875d52 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 6 Dec 2011 18:06:44 +0900
Subject: KVM: x86 emulator: Use opcode::execute for Group 4/5 instructions

Group 4: FE
Group 5: FF

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index cd49774..5b78785 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3213,16 +3213,19 @@ static struct opcode group3[] = {
 };
 
 static struct opcode group4[] = {
-	D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
+	I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
+	I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
 	N, N, N, N, N, N,
 };
 
 static struct opcode group5[] = {
-	D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
-	D(SrcMem | ModRM | Stack),
+	I(DstMem | SrcNone | ModRM | Lock, em_grp45),
+	I(DstMem | SrcNone | ModRM | Lock, em_grp45),
+	I(SrcMem | ModRM | Stack, em_grp45),
 	I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
-	D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
-	D(SrcMem | ModRM | Stack), N,
+	I(SrcMem | ModRM | Stack, em_grp45),
+	I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45),
+	I(SrcMem | ModRM | Stack, em_grp45), N,
 };
 
 static struct opcode group6[] = {
@@ -4082,12 +4085,6 @@ special_insn:
 	case 0xfd: /* std */
 		ctxt->eflags |= EFLG_DF;
 		break;
-	case 0xfe: /* Grp4 */
-		rc = em_grp45(ctxt);
-		break;
-	case 0xff: /* Grp5 */
-		rc = em_grp45(ctxt);
-		break;
 	default:
 		goto cannot_emulate;
 	}
-- 
cgit v1.1


From e0dac408d08c2a5e1bed2a6a9da7f3af3f7a9827 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Tue, 6 Dec 2011 18:07:27 +0900
Subject: KVM: x86 emulator: Use opcode::execute for Group 9 instruction

Group 9: 0F C7

Rename em_grp9() to em_cmpxchg8b() and register it.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5b78785..de7be77 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1784,7 +1784,7 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
 	return rc;
 }
 
-static int em_grp9(struct x86_emulate_ctxt *ctxt)
+static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
 {
 	u64 old = ctxt->dst.orig_val64;
 
@@ -3261,7 +3261,7 @@ static struct opcode group8[] = {
 };
 
 static struct group_dual group9 = { {
-	N, D(DstMem64 | ModRM | Lock | PageTable), N, N, N, N, N, N,
+	N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
 }, {
 	N, N, N, N, N, N, N, N,
 } };
@@ -4202,9 +4202,6 @@ twobyte_insn:
 		ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
 							(u64) ctxt->src.val;
 		break;
-	case 0xc7:		/* Grp9 (cmpxchg8b) */
-		rc = em_grp9(ctxt);
-		break;
 	default:
 		goto cannot_emulate;
 	}
-- 
cgit v1.1


From 893420822192f717af6fde927c9e78c9b82f8327 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 10 Nov 2011 14:57:21 +0200
Subject: KVM: Expose kvm_lapic_local_deliver()

Needed to deliver performance monitoring interrupts.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/lapic.c | 2 +-
 arch/x86/kvm/lapic.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a7f3e65..cfdc6e0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1121,7 +1121,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
 {
 	u32 reg = apic_get_reg(apic, lvt_type);
 	int vector, mode, trig_mode;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 138e8cc..6f4ce25 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -34,6 +34,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
-- 
cgit v1.1


From f5132b01386b5a67f1ff673bb2b96a507a3f7e41 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 10 Nov 2011 14:57:22 +0200
Subject: KVM: Expose a version 2 architectural PMU to a guests

Use perf_events to emulate an architectural PMU, version 2.

Based on PMU version 1 emulation by Avi Kivity.

[avi: adjust for cpuid.c]
[jan: fix anonymous field initialization for older gcc]

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  48 ++++
 arch/x86/kvm/Kconfig            |   1 +
 arch/x86/kvm/Makefile           |   2 +-
 arch/x86/kvm/cpuid.c            |   2 +
 arch/x86/kvm/pmu.c              | 533 ++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              |  22 +-
 6 files changed, 598 insertions(+), 10 deletions(-)
 create mode 100644 arch/x86/kvm/pmu.c

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 020413a..fb60ffd 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -16,10 +16,12 @@
 #include <linux/mmu_notifier.h>
 #include <linux/tracepoint.h>
 #include <linux/cpumask.h>
+#include <linux/irq_work.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
 #include <linux/kvm_types.h>
+#include <linux/perf_event.h>
 
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
@@ -291,6 +293,37 @@ struct kvm_mmu {
 	u64 pdptrs[4]; /* pae */
 };
 
+enum pmc_type {
+	KVM_PMC_GP = 0,
+	KVM_PMC_FIXED,
+};
+
+struct kvm_pmc {
+	enum pmc_type type;
+	u8 idx;
+	u64 counter;
+	u64 eventsel;
+	struct perf_event *perf_event;
+	struct kvm_vcpu *vcpu;
+};
+
+struct kvm_pmu {
+	unsigned nr_arch_gp_counters;
+	unsigned nr_arch_fixed_counters;
+	unsigned available_event_types;
+	u64 fixed_ctr_ctrl;
+	u64 global_ctrl;
+	u64 global_status;
+	u64 global_ovf_ctrl;
+	u64 counter_bitmask[2];
+	u64 global_ctrl_mask;
+	u8 version;
+	struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC];
+	struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED];
+	struct irq_work irq_work;
+	u64 reprogram_pmi;
+};
+
 struct kvm_vcpu_arch {
 	/*
 	 * rip and regs accesses must go through
@@ -424,6 +457,8 @@ struct kvm_vcpu_arch {
 	unsigned access;
 	gfn_t mmio_gfn;
 
+	struct kvm_pmu pmu;
+
 	/* used for guest single stepping over the given code position */
 	unsigned long singlestep_rip;
 
@@ -891,4 +926,17 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
 
+int kvm_is_in_guest(void);
+
+void kvm_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+void kvm_pmu_reset(struct kvm_vcpu *vcpu);
+void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
+bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
+void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
+void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ca4d49e..1a7fe86 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -35,6 +35,7 @@ config KVM
 	select KVM_MMIO
 	select TASKSTATS
 	select TASK_DELAY_ACCT
+	select PERF_EVENTS
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 161b76a..4f579e8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
-			   i8254.o timer.o cpuid.o
+			   i8254.o timer.o cpuid.o pmu.o
 kvm-intel-y		+= vmx.o
 kvm-amd-y		+= svm.o
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ca63032..e70be46 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -45,6 +45,8 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu)
 		else
 			apic->lapic_timer.timer_mode_mask = 1 << 17;
 	}
+
+	kvm_pmu_cpuid_update(vcpu);
 }
 
 static int is_efer_nx(void)
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
new file mode 100644
index 0000000..7aad544
--- /dev/null
+++ b/arch/x86/kvm/pmu.c
@@ -0,0 +1,533 @@
+/*
+ * Kernel-based Virtual Machine -- Performane Monitoring Unit support
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ *   Avi Kivity   <avi@redhat.com>
+ *   Gleb Natapov <gleb@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+
+static struct kvm_arch_event_perf_mapping {
+	u8 eventsel;
+	u8 unit_mask;
+	unsigned event_type;
+	bool inexact;
+} arch_events[] = {
+	/* Index must match CPUID 0x0A.EBX bit vector */
+	[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+	[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+	[2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES  },
+	[3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
+	[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
+	[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+	[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+};
+
+/* mapping between fixed pmc index and arch_events array */
+int fixed_pmc_events[] = {1, 0, 2};
+
+static bool pmc_is_gp(struct kvm_pmc *pmc)
+{
+	return pmc->type == KVM_PMC_GP;
+}
+
+static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
+{
+	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
+
+	return pmu->counter_bitmask[pmc->type];
+}
+
+static inline bool pmc_enabled(struct kvm_pmc *pmc)
+{
+	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
+	return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+}
+
+static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
+					 u32 base)
+{
+	if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
+		return &pmu->gp_counters[msr - base];
+	return NULL;
+}
+
+static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+	int base = MSR_CORE_PERF_FIXED_CTR0;
+	if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
+		return &pmu->fixed_counters[msr - base];
+	return NULL;
+}
+
+static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
+{
+	return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx);
+}
+
+static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
+{
+	if (idx < X86_PMC_IDX_FIXED)
+		return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
+	else
+		return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED);
+}
+
+void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.apic)
+		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+}
+
+static void trigger_pmi(struct irq_work *irq_work)
+{
+	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu,
+			irq_work);
+	struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu,
+			arch.pmu);
+
+	kvm_deliver_pmi(vcpu);
+}
+
+static void kvm_perf_overflow(struct perf_event *perf_event,
+			      struct perf_sample_data *data,
+			      struct pt_regs *regs)
+{
+	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
+	__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+}
+
+static void kvm_perf_overflow_intr(struct perf_event *perf_event,
+		struct perf_sample_data *data, struct pt_regs *regs)
+{
+	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
+	if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
+		kvm_perf_overflow(perf_event, data, regs);
+		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+		/*
+		 * Inject PMI. If vcpu was in a guest mode during NMI PMI
+		 * can be ejected on a guest mode re-entry. Otherwise we can't
+		 * be sure that vcpu wasn't executing hlt instruction at the
+		 * time of vmexit and is not going to re-enter guest mode until,
+		 * woken up. So we should wake it, but this is impossible from
+		 * NMI context. Do it from irq work instead.
+		 */
+		if (!kvm_is_in_guest())
+			irq_work_queue(&pmc->vcpu->arch.pmu.irq_work);
+		else
+			kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
+	}
+}
+
+static u64 read_pmc(struct kvm_pmc *pmc)
+{
+	u64 counter, enabled, running;
+
+	counter = pmc->counter;
+
+	if (pmc->perf_event)
+		counter += perf_event_read_value(pmc->perf_event,
+						 &enabled, &running);
+
+	/* FIXME: Scaling needed? */
+
+	return counter & pmc_bitmask(pmc);
+}
+
+static void stop_counter(struct kvm_pmc *pmc)
+{
+	if (pmc->perf_event) {
+		pmc->counter = read_pmc(pmc);
+		perf_event_release_kernel(pmc->perf_event);
+		pmc->perf_event = NULL;
+	}
+}
+
+static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
+		unsigned config, bool exclude_user, bool exclude_kernel,
+		bool intr)
+{
+	struct perf_event *event;
+	struct perf_event_attr attr = {
+		.type = type,
+		.size = sizeof(attr),
+		.pinned = true,
+		.exclude_idle = true,
+		.exclude_host = 1,
+		.exclude_user = exclude_user,
+		.exclude_kernel = exclude_kernel,
+		.config = config,
+	};
+
+	attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
+
+	event = perf_event_create_kernel_counter(&attr, -1, current,
+						 intr ? kvm_perf_overflow_intr :
+						 kvm_perf_overflow, pmc);
+	if (IS_ERR(event)) {
+		printk_once("kvm: pmu event creation failed %ld\n",
+				PTR_ERR(event));
+		return;
+	}
+
+	pmc->perf_event = event;
+	clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi);
+}
+
+static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select,
+		u8 unit_mask)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(arch_events); i++)
+		if (arch_events[i].eventsel == event_select
+				&& arch_events[i].unit_mask == unit_mask
+				&& (pmu->available_event_types & (1 << i)))
+			break;
+
+	if (i == ARRAY_SIZE(arch_events))
+		return PERF_COUNT_HW_MAX;
+
+	return arch_events[i].event_type;
+}
+
+static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
+{
+	unsigned config, type = PERF_TYPE_RAW;
+	u8 event_select, unit_mask;
+
+	pmc->eventsel = eventsel;
+
+	stop_counter(pmc);
+
+	if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc))
+		return;
+
+	event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+	unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
+
+	if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE |
+				ARCH_PERFMON_EVENTSEL_INV |
+				ARCH_PERFMON_EVENTSEL_CMASK))) {
+		config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
+				unit_mask);
+		if (config != PERF_COUNT_HW_MAX)
+			type = PERF_TYPE_HARDWARE;
+	}
+
+	if (type == PERF_TYPE_RAW)
+		config = eventsel & X86_RAW_EVENT_MASK;
+
+	reprogram_counter(pmc, type, config,
+			!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+			!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+			eventsel & ARCH_PERFMON_EVENTSEL_INT);
+}
+
+static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
+{
+	unsigned en = en_pmi & 0x3;
+	bool pmi = en_pmi & 0x8;
+
+	stop_counter(pmc);
+
+	if (!en || !pmc_enabled(pmc))
+		return;
+
+	reprogram_counter(pmc, PERF_TYPE_HARDWARE,
+			arch_events[fixed_pmc_events[idx]].event_type,
+			!(en & 0x2), /* exclude user */
+			!(en & 0x1), /* exclude kernel */
+			pmi);
+}
+
+static inline u8 fixed_en_pmi(u64 ctrl, int idx)
+{
+	return (ctrl >> (idx * 4)) & 0xf;
+}
+
+static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
+{
+	int i;
+
+	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+		u8 en_pmi = fixed_en_pmi(data, i);
+		struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i);
+
+		if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi)
+			continue;
+
+		reprogram_fixed_counter(pmc, en_pmi, i);
+	}
+
+	pmu->fixed_ctr_ctrl = data;
+}
+
+static void reprogram_idx(struct kvm_pmu *pmu, int idx)
+{
+	struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx);
+
+	if (!pmc)
+		return;
+
+	if (pmc_is_gp(pmc))
+		reprogram_gp_counter(pmc, pmc->eventsel);
+	else {
+		int fidx = idx - X86_PMC_IDX_FIXED;
+		reprogram_fixed_counter(pmc,
+				fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
+	}
+}
+
+static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
+{
+	int bit;
+	u64 diff = pmu->global_ctrl ^ data;
+
+	pmu->global_ctrl = data;
+
+	for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+		reprogram_idx(pmu, bit);
+}
+
+bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	int ret;
+
+	switch (msr) {
+	case MSR_CORE_PERF_FIXED_CTR_CTRL:
+	case MSR_CORE_PERF_GLOBAL_STATUS:
+	case MSR_CORE_PERF_GLOBAL_CTRL:
+	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+		ret = pmu->version > 1;
+		break;
+	default:
+		ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)
+			|| get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0)
+			|| get_fixed_pmc(pmu, msr);
+		break;
+	}
+	return ret;
+}
+
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	struct kvm_pmc *pmc;
+
+	switch (index) {
+	case MSR_CORE_PERF_FIXED_CTR_CTRL:
+		*data = pmu->fixed_ctr_ctrl;
+		return 0;
+	case MSR_CORE_PERF_GLOBAL_STATUS:
+		*data = pmu->global_status;
+		return 0;
+	case MSR_CORE_PERF_GLOBAL_CTRL:
+		*data = pmu->global_ctrl;
+		return 0;
+	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+		*data = pmu->global_ovf_ctrl;
+		return 0;
+	default:
+		if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
+				(pmc = get_fixed_pmc(pmu, index))) {
+			*data = read_pmc(pmc);
+			return 0;
+		} else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
+			*data = pmc->eventsel;
+			return 0;
+		}
+	}
+	return 1;
+}
+
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	struct kvm_pmc *pmc;
+
+	switch (index) {
+	case MSR_CORE_PERF_FIXED_CTR_CTRL:
+		if (pmu->fixed_ctr_ctrl == data)
+			return 0;
+		if (!(data & 0xfffffffffffff444)) {
+			reprogram_fixed_counters(pmu, data);
+			return 0;
+		}
+		break;
+	case MSR_CORE_PERF_GLOBAL_STATUS:
+		break; /* RO MSR */
+	case MSR_CORE_PERF_GLOBAL_CTRL:
+		if (pmu->global_ctrl == data)
+			return 0;
+		if (!(data & pmu->global_ctrl_mask)) {
+			global_ctrl_changed(pmu, data);
+			return 0;
+		}
+		break;
+	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+		if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
+			pmu->global_status &= ~data;
+			pmu->global_ovf_ctrl = data;
+			return 0;
+		}
+		break;
+	default:
+		if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
+				(pmc = get_fixed_pmc(pmu, index))) {
+			data = (s64)(s32)data;
+			pmc->counter += data - read_pmc(pmc);
+			return 0;
+		} else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
+			if (data == pmc->eventsel)
+				return 0;
+			if (!(data & 0xffffffff00200000ull)) {
+				reprogram_gp_counter(pmc, data);
+				return 0;
+			}
+		}
+	}
+	return 1;
+}
+
+int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	bool fast_mode = pmc & (1u << 31);
+	bool fixed = pmc & (1u << 30);
+	struct kvm_pmc *counters;
+	u64 ctr;
+
+	pmc &= (3u << 30) - 1;
+	if (!fixed && pmc >= pmu->nr_arch_gp_counters)
+		return 1;
+	if (fixed && pmc >= pmu->nr_arch_fixed_counters)
+		return 1;
+	counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
+	ctr = read_pmc(&counters[pmc]);
+	if (fast_mode)
+		ctr = (u32)ctr;
+	*data = ctr;
+
+	return 0;
+}
+
+void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	struct kvm_cpuid_entry2 *entry;
+	unsigned bitmap_len;
+
+	pmu->nr_arch_gp_counters = 0;
+	pmu->nr_arch_fixed_counters = 0;
+	pmu->counter_bitmask[KVM_PMC_GP] = 0;
+	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+	pmu->version = 0;
+
+	entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
+	if (!entry)
+		return;
+
+	pmu->version = entry->eax & 0xff;
+	if (!pmu->version)
+		return;
+
+	pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
+			X86_PMC_MAX_GENERIC);
+	pmu->counter_bitmask[KVM_PMC_GP] =
+		((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
+	bitmap_len = (entry->eax >> 24) & 0xff;
+	pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1);
+
+	if (pmu->version == 1) {
+		pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1;
+		return;
+	}
+
+	pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f),
+			X86_PMC_MAX_FIXED);
+	pmu->counter_bitmask[KVM_PMC_FIXED] =
+		((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;
+	pmu->global_ctrl_mask = ~(((1 << pmu->nr_arch_gp_counters) - 1)
+			| (((1ull << pmu->nr_arch_fixed_counters) - 1)
+				<< X86_PMC_IDX_FIXED));
+}
+
+void kvm_pmu_init(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+
+	memset(pmu, 0, sizeof(*pmu));
+	for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
+		pmu->gp_counters[i].type = KVM_PMC_GP;
+		pmu->gp_counters[i].vcpu = vcpu;
+		pmu->gp_counters[i].idx = i;
+	}
+	for (i = 0; i < X86_PMC_MAX_FIXED; i++) {
+		pmu->fixed_counters[i].type = KVM_PMC_FIXED;
+		pmu->fixed_counters[i].vcpu = vcpu;
+		pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED;
+	}
+	init_irq_work(&pmu->irq_work, trigger_pmi);
+	kvm_pmu_cpuid_update(vcpu);
+}
+
+void kvm_pmu_reset(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	int i;
+
+	irq_work_sync(&pmu->irq_work);
+	for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
+		struct kvm_pmc *pmc = &pmu->gp_counters[i];
+		stop_counter(pmc);
+		pmc->counter = pmc->eventsel = 0;
+	}
+
+	for (i = 0; i < X86_PMC_MAX_FIXED; i++)
+		stop_counter(&pmu->fixed_counters[i]);
+
+	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
+		pmu->global_ovf_ctrl = 0;
+}
+
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
+{
+	kvm_pmu_reset(vcpu);
+}
+
+void kvm_handle_pmu_event(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	u64 bitmask;
+	int bit;
+
+	bitmask = pmu->reprogram_pmi;
+
+	for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
+		struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit);
+
+		if (unlikely(!pmc || !pmc->perf_event)) {
+			clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
+			continue;
+		}
+
+		reprogram_idx(pmu, bit);
+	}
+}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0a646e2..08ae951 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1602,8 +1602,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	 * which we perfectly emulate ;-). Any other value should be at least
 	 * reported, some guests depend on them.
 	 */
-	case MSR_P6_EVNTSEL0:
-	case MSR_P6_EVNTSEL1:
 	case MSR_K7_EVNTSEL0:
 	case MSR_K7_EVNTSEL1:
 	case MSR_K7_EVNTSEL2:
@@ -1615,8 +1613,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	/* at least RHEL 4 unconditionally writes to the perfctr registers,
 	 * so we ignore writes to make it happy.
 	 */
-	case MSR_P6_PERFCTR0:
-	case MSR_P6_PERFCTR1:
 	case MSR_K7_PERFCTR0:
 	case MSR_K7_PERFCTR1:
 	case MSR_K7_PERFCTR2:
@@ -1653,6 +1649,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	default:
 		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
 			return xen_hvm_config(vcpu, data);
+		if (kvm_pmu_msr(vcpu, msr))
+			return kvm_pmu_set_msr(vcpu, msr, data);
 		if (!ignore_msrs) {
 			pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
 				msr, data);
@@ -1815,10 +1813,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_K8_SYSCFG:
 	case MSR_K7_HWCR:
 	case MSR_VM_HSAVE_PA:
-	case MSR_P6_PERFCTR0:
-	case MSR_P6_PERFCTR1:
-	case MSR_P6_EVNTSEL0:
-	case MSR_P6_EVNTSEL1:
 	case MSR_K7_EVNTSEL0:
 	case MSR_K7_PERFCTR0:
 	case MSR_K8_INT_PENDING_MSG:
@@ -1929,6 +1923,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		data = 0xbe702111;
 		break;
 	default:
+		if (kvm_pmu_msr(vcpu, msr))
+			return kvm_pmu_get_msr(vcpu, msr, pdata);
 		if (!ignore_msrs) {
 			pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
 			return 1;
@@ -4650,7 +4646,7 @@ static void kvm_timer_init(void)
 
 static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
 
-static int kvm_is_in_guest(void)
+int kvm_is_in_guest(void)
 {
 	return __this_cpu_read(current_vcpu) != NULL;
 }
@@ -5114,6 +5110,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			process_nmi(vcpu);
 		req_immediate_exit =
 			kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
+		if (kvm_check_request(KVM_REQ_PMU, vcpu))
+			kvm_handle_pmu_event(vcpu);
+		if (kvm_check_request(KVM_REQ_PMI, vcpu))
+			kvm_deliver_pmi(vcpu);
 	}
 
 	r = kvm_mmu_reload(vcpu);
@@ -5850,6 +5850,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 	kvm_async_pf_hash_reset(vcpu);
 	vcpu->arch.apf.halted = false;
 
+	kvm_pmu_reset(vcpu);
+
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
@@ -5934,6 +5936,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 		goto fail_free_mce_banks;
 
 	kvm_async_pf_hash_reset(vcpu);
+	kvm_pmu_init(vcpu);
 
 	return 0;
 fail_free_mce_banks:
@@ -5952,6 +5955,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 	int idx;
 
+	kvm_pmu_destroy(vcpu);
 	kfree(vcpu->arch.mce_banks);
 	kvm_free_lapic(vcpu);
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
-- 
cgit v1.1


From 022cd0e84020eec8b589bc119699c935c7b29584 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 10 Nov 2011 14:57:23 +0200
Subject: KVM: Add generic RDPMC support

Add a helper function that emulates the RDPMC instruction operation.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c              | 15 +++++++++++++++
 2 files changed, 16 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fb60ffd..52d6640 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -760,6 +760,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+bool kvm_rdpmc(struct kvm_vcpu *vcpu);
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 08ae951..27d18b7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -760,6 +760,21 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 }
 EXPORT_SYMBOL_GPL(kvm_get_dr);
 
+bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+{
+	u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+	u64 data;
+	int err;
+
+	err = kvm_pmu_read_pmc(vcpu, ecx, &data);
+	if (err)
+		return err;
+	kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
+	kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
+	return err;
+}
+EXPORT_SYMBOL_GPL(kvm_rdpmc);
+
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
-- 
cgit v1.1


From 332b56e4841ef62db4dbf1b4b92195575e1c7338 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 10 Nov 2011 14:57:24 +0200
Subject: KVM: SVM: Intercept RDPMC

Intercept RDPMC and forward it to the PMU emulation code.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e32243e..5fa553b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1014,6 +1014,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	set_intercept(svm, INTERCEPT_NMI);
 	set_intercept(svm, INTERCEPT_SMI);
 	set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+	set_intercept(svm, INTERCEPT_RDPMC);
 	set_intercept(svm, INTERCEPT_CPUID);
 	set_intercept(svm, INTERCEPT_INVD);
 	set_intercept(svm, INTERCEPT_HLT);
@@ -2770,6 +2771,19 @@ static int emulate_on_interception(struct vcpu_svm *svm)
 	return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
 }
 
+static int rdpmc_interception(struct vcpu_svm *svm)
+{
+	int err;
+
+	if (!static_cpu_has(X86_FEATURE_NRIPS))
+		return emulate_on_interception(svm);
+
+	err = kvm_rdpmc(&svm->vcpu);
+	kvm_complete_insn_gp(&svm->vcpu, err);
+
+	return 1;
+}
+
 bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
 {
 	unsigned long cr0 = svm->vcpu.arch.cr0;
@@ -3190,6 +3204,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_SMI]				= nop_on_interception,
 	[SVM_EXIT_INIT]				= nop_on_interception,
 	[SVM_EXIT_VINTR]			= interrupt_window_interception,
+	[SVM_EXIT_RDPMC]			= rdpmc_interception,
 	[SVM_EXIT_CPUID]			= cpuid_interception,
 	[SVM_EXIT_IRET]                         = iret_interception,
 	[SVM_EXIT_INVD]                         = emulate_on_interception,
-- 
cgit v1.1


From fee84b079d5ddee2247b5c1f53162c330c622902 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 10 Nov 2011 14:57:25 +0200
Subject: KVM: VMX: Intercept RDPMC

Intercept RDPMC and forward it to the PMU emulation code.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4ceced2..906a7e8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1956,6 +1956,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 #endif
 		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
 		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
+		CPU_BASED_RDPMC_EXITING |
 		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
 	/*
 	 * We can allow some features even when not supported by the
@@ -2410,7 +2411,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	      CPU_BASED_USE_TSC_OFFSETING |
 	      CPU_BASED_MWAIT_EXITING |
 	      CPU_BASED_MONITOR_EXITING |
-	      CPU_BASED_INVLPG_EXITING;
+	      CPU_BASED_INVLPG_EXITING |
+	      CPU_BASED_RDPMC_EXITING;
 
 	if (yield_on_hlt)
 		min |= CPU_BASED_HLT_EXITING;
@@ -4613,6 +4615,16 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_rdpmc(struct kvm_vcpu *vcpu)
+{
+	int err;
+
+	err = kvm_rdpmc(vcpu);
+	kvm_complete_insn_gp(vcpu, err);
+
+	return 1;
+}
+
 static int handle_wbinvd(struct kvm_vcpu *vcpu)
 {
 	skip_emulated_instruction(vcpu);
@@ -5563,6 +5575,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_HLT]                     = handle_halt,
 	[EXIT_REASON_INVD]		      = handle_invd,
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
+	[EXIT_REASON_RDPMC]                   = handle_rdpmc,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
 	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
 	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
-- 
cgit v1.1


From a6c06ed1a60aff77b27ba558c315c3fed4e35565 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 10 Nov 2011 14:57:28 +0200
Subject: KVM: Expose the architectural performance monitoring CPUID leaf

Provide a CPUID leaf that describes the emulated PMU.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/cpuid.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index e70be46..89b02bf 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -327,6 +327,35 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	}
 	case 9:
 		break;
+	case 0xa: { /* Architectural Performance Monitoring */
+		struct x86_pmu_capability cap;
+		union cpuid10_eax eax;
+		union cpuid10_edx edx;
+
+		perf_get_x86_pmu_capability(&cap);
+
+		/*
+		 * Only support guest architectural pmu on a host
+		 * with architectural pmu.
+		 */
+		if (!cap.version)
+			memset(&cap, 0, sizeof(cap));
+
+		eax.split.version_id = min(cap.version, 2);
+		eax.split.num_counters = cap.num_counters_gp;
+		eax.split.bit_width = cap.bit_width_gp;
+		eax.split.mask_length = cap.events_mask_len;
+
+		edx.split.num_counters_fixed = cap.num_counters_fixed;
+		edx.split.bit_width_fixed = cap.bit_width_fixed;
+		edx.split.reserved = 0;
+
+		entry->eax = eax.full;
+		entry->ebx = cap.events_mask;
+		entry->ecx = 0;
+		entry->edx = edx.full;
+		break;
+	}
 	/* function 0xb has additional index. */
 	case 0xb: {
 		int i, level_type;
@@ -427,7 +456,6 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	case 3: /* Processor serial number */
 	case 5: /* MONITOR/MWAIT */
 	case 6: /* Thermal management */
-	case 0xA: /* Architectural Performance Monitoring */
 	case 0x80000007: /* Advanced power management */
 	case 0xC0000002:
 	case 0xC0000003:
-- 
cgit v1.1


From 80bdec64c05b645708b0dd97919783ad077fcdc8 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 10 Nov 2011 14:57:29 +0200
Subject: KVM: x86 emulator: fix RDPMC privilege check

RDPMC is only privileged if CR4.PCE=0.  check_rdpmc() already implements this,
so all we need to do is drop the Priv flag.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index de7be77..d270f1a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3411,7 +3411,7 @@ static struct opcode twobyte_table[256] = {
 	II(ImplicitOps | Priv, em_wrmsr, wrmsr),
 	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
 	II(ImplicitOps | Priv, em_rdmsr, rdmsr),
-	DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
+	DIP(ImplicitOps, rdpmc, check_rdpmc),
 	I(ImplicitOps | VendorSpecific, em_sysenter),
 	I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
 	N, N,
-- 
cgit v1.1


From 222d21aa070a4885ce3c7125a1b7ce07429ea4a1 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 10 Nov 2011 14:57:30 +0200
Subject: KVM: x86 emulator: implement RDPMC (0F 33)

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  1 +
 arch/x86/kvm/emulate.c             | 13 ++++++++++++-
 arch/x86/kvm/x86.c                 |  7 +++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 9a4acf4..ab4092e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -181,6 +181,7 @@ struct x86_emulate_ops {
 	int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
 	int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
 	int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+	int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
 	void (*halt)(struct x86_emulate_ctxt *ctxt);
 	void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
 	int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d270f1a..05a562b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2645,6 +2645,17 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+	u64 pmc;
+
+	if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc))
+		return emulate_gp(ctxt, 0);
+	ctxt->regs[VCPU_REGS_RAX] = (u32)pmc;
+	ctxt->regs[VCPU_REGS_RDX] = pmc >> 32;
+	return X86EMUL_CONTINUE;
+}
+
 static int em_mov(struct x86_emulate_ctxt *ctxt)
 {
 	ctxt->dst.val = ctxt->src.val;
@@ -3411,7 +3422,7 @@ static struct opcode twobyte_table[256] = {
 	II(ImplicitOps | Priv, em_wrmsr, wrmsr),
 	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
 	II(ImplicitOps | Priv, em_rdmsr, rdmsr),
-	DIP(ImplicitOps, rdpmc, check_rdpmc),
+	IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
 	I(ImplicitOps | VendorSpecific, em_sysenter),
 	I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
 	N, N,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 27d18b7..1171def 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4146,6 +4146,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
 	return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
 }
 
+static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
+			     u32 pmc, u64 *pdata)
+{
+	return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
+}
+
 static void emulator_halt(struct x86_emulate_ctxt *ctxt)
 {
 	emul_to_vcpu(ctxt)->arch.halt_request = 1;
@@ -4198,6 +4204,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.set_dr              = emulator_set_dr,
 	.set_msr             = emulator_set_msr,
 	.get_msr             = emulator_get_msr,
+	.read_pmc            = emulator_read_pmc,
 	.halt                = emulator_halt,
 	.wbinvd              = emulator_wbinvd,
 	.fix_hypercall       = emulator_fix_hypercall,
-- 
cgit v1.1


From 7c9c3a1e5fc8728e948b8fa3cbcfcfb86db3afda Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 29 Dec 2011 14:43:16 +0000
Subject: x86/intel config: Fix the APB_TIMER selection

Seems Kconfig SELECT isn't selecting things hierarchically when
selected.

config APB_TIMER
       def_bool y if X86_INTEL_MID
       prompt "Intel MID APB Timer Support" if X86_INTEL_MID
       select DW_APB_TIMER
       depends on X86_INTEL_MID && SFI

when we select APB_TIMER doesn't select DW_APB_TIMER so do it by
hand.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Link: http://lkml.kernel.org/n/tip-kpnaimplltk6d1lolusqj3ae@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 07620bc91..78fbb34 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -409,12 +409,14 @@ config X86_MRST
 	depends on PCI
 	depends on PCI_GOANY
 	depends on X86_IO_APIC
+	select X86_INTEL_MID
+	select SFI
+	select DW_APB_TIMER
 	select APB_TIMER
 	select I2C
 	select SPI
 	select INTEL_SCU_IPC
 	select X86_PLATFORM_DEVICES
-	select X86_INTEL_MID
 	---help---
 	  Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
 	  Internet Device(MID) platform. Moorestown consists of two chips:
@@ -428,12 +430,14 @@ config X86_MDFLD
 	depends on PCI
 	depends on PCI_GOANY
 	depends on X86_IO_APIC
+	select X86_INTEL_MID
+	select SFI
+	select DW_APB_TIMER
 	select APB_TIMER
 	select I2C
 	select SPI
 	select INTEL_SCU_IPC
 	select X86_PLATFORM_DEVICES
-	select X86_INTEL_MID
 	---help---
 	  Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
 	  Internet Device(MID) platform. 
-- 
cgit v1.1


From 2c9ede55ecec58099b72e4bb8eab719f32f72c31 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 23 Jul 2011 20:24:48 -0400
Subject: switch device_get_devnode() and ->devnode() to umode_t *

both callers of device_get_devnode() are only interested in lower 16bits
and nobody tries to return anything wider than 16bit anyway.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/kernel/cpuid.c | 2 +-
 arch/x86/kernel/msr.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 212a6a4..a524353 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
 	.notifier_call = cpuid_class_cpu_callback,
 };
 
-static char *cpuid_devnode(struct device *dev, mode_t *mode)
+static char *cpuid_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
 }
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 12fcbe2..9635676 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -236,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
 	.notifier_call = msr_class_cpu_callback,
 };
 
-static char *msr_devnode(struct device *dev, mode_t *mode)
+static char *msr_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
 }
-- 
cgit v1.1


From f4ae40a6a50a98ac23d4b285f739455e926a473e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 24 Jul 2011 04:33:43 -0400
Subject: switch debugfs to umode_t

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/xen/debugfs.c | 2 +-
 arch/x86/xen/debugfs.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index 7c0fedd..ef1db19 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -109,7 +109,7 @@ static const struct file_operations u32_array_fops = {
 	.llseek = no_llseek,
 };
 
-struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
 					    struct dentry *parent,
 					    u32 *array, unsigned elements)
 {
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
index e281320..78d2549 100644
--- a/arch/x86/xen/debugfs.h
+++ b/arch/x86/xen/debugfs.h
@@ -3,7 +3,7 @@
 
 struct dentry * __init xen_init_debugfs(void);
 
-struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
 					    struct dentry *parent,
 					    u32 *array, unsigned elements);
 
-- 
cgit v1.1


From 28c3c05d337f6fdf84faf69374e6325b80cbf9ad Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Fri, 30 Dec 2011 14:37:05 -0500
Subject: PCI: add set_nouse_crs for use by a pci=nocrs blacklist

Some machines don't boot unless passed pci=nocrs.
(See https://bugzilla.redhat.com/show_bug.cgi?id=770308 for details of
  one report. Waiting on dmidecode output for others).

Currently there is a DMI whitelist, even though the default is on.

v2: drop the 1536 blacklist entry, superceded by the PNP/MMCONFIG changes from
    Bjorn

Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 404f21a..f5ccf29 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -24,6 +24,12 @@ static int __init set_use_crs(const struct dmi_system_id *id)
 	return 0;
 }
 
+static int __init set_nouse_crs(const struct dmi_system_id *id)
+{
+	pci_use_crs = false;
+	return 0;
+}
+
 static const struct dmi_system_id pci_use_crs_table[] __initconst = {
 	/* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */
 	{
@@ -54,6 +60,7 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
 			DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
 		},
 	},
+
 	{}
 };
 
-- 
cgit v1.1


From e702781fa846dd726b73e673f91ffbd3b0e8d114 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Wed, 4 Jan 2012 11:33:12 -0500
Subject: PCI: Add Dell Studio 1557 to pci=nocrs blacklist

The Dell Studio 1557 also doesn't suspend correctly when CRS is enabled.
Details at https://bugzilla.redhat.com/show_bug.cgi?id=769657

Reported-by: Gregory S. Hoerner <ghoerner@transcendingthought.com>
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index f5ccf29..0d9329f 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -61,6 +61,18 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
 		},
 	},
 
+	/* Now for the blacklist.. */
+
+	/* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */
+	{
+		.callback = set_nouse_crs,
+		.ident = "Dell Studio 1557",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Studio 1557"),
+			DMI_MATCH(DMI_BIOS_VERSION, "A09"),
+		},
+	},
 	{}
 };
 
-- 
cgit v1.1


From 8b6a5af92c03b363df050da906480085b6cd6e00 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Wed, 4 Jan 2012 11:30:52 -0500
Subject: PCI: Add Thinkpad SL510 to pci=nocrs blacklist

Enabling CRS by default breaks suspend on the Thinkpad SL510.
Details in https://bugzilla.redhat.com/show_bug.cgi?id=769657

Reported-by: Stefan Kirrmann <stefan.kirrmann@gmail.com>
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 0d9329f..e662cee 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -73,6 +73,16 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
 			DMI_MATCH(DMI_BIOS_VERSION, "A09"),
 		},
 	},
+	/* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */
+	{
+		.callback = set_nouse_crs,
+		.ident = "Thinkpad SL510",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_BOARD_NAME, "2847DFG"),
+			DMI_MATCH(DMI_BIOS_VERSION, "6JET85WW (1.43 )"),
+		},
+	},
 	{}
 };
 
-- 
cgit v1.1


From ae5cd86455381282ece162966183d3f208c6fad7 Mon Sep 17 00:00:00 2001
From: Gary Hade <garyhade@us.ibm.com>
Date: Mon, 14 Nov 2011 15:42:16 -0800
Subject: x86/PCI: Ignore CPU non-addressable _CRS reserved memory resources

This assures that a _CRS reserved host bridge window or window region is
not used if it is not addressable by the CPU.  The new code either trims
the window to exclude the non-addressable portion or totally ignores the
window if the entire window is non-addressable.

The current code has been shown to be problematic with 32-bit non-PAE
kernels on systems where _CRS reserves resources above 4GB.

Signed-off-by: Gary Hade <garyhade@us.ibm.com>
Reviewed-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Thomas Renninger <trenn@novell.com>
Cc: linux-kernel@vger.kernel.org
Cc: stable@kernel.org
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index e662cee..425500b 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -178,7 +178,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
 	struct acpi_resource_address64 addr;
 	acpi_status status;
 	unsigned long flags;
-	u64 start, end;
+	u64 start, orig_end, end;
 
 	status = resource_to_addr(acpi_res, &addr);
 	if (!ACPI_SUCCESS(status))
@@ -194,7 +194,21 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
 		return AE_OK;
 
 	start = addr.minimum + addr.translation_offset;
-	end = addr.maximum + addr.translation_offset;
+	orig_end = end = addr.maximum + addr.translation_offset;
+
+	/* Exclude non-addressable range or non-addressable portion of range */
+	end = min(end, (u64)iomem_resource.end);
+	if (end <= start) {
+		dev_info(&info->bridge->dev,
+			"host bridge window [%#llx-%#llx] "
+			"(ignored, not CPU addressable)\n", start, orig_end);
+		return AE_OK;
+	} else if (orig_end != end) {
+		dev_info(&info->bridge->dev,
+			"host bridge window [%#llx-%#llx] "
+			"([%#llx-%#llx] ignored, not CPU addressable)\n", 
+			start, orig_end, end + 1, orig_end);
+	}
 
 	res = &info->res[info->res_num];
 	res->name = info->name;
-- 
cgit v1.1


From 96c5590058d7fded14f43af2ab521436cecf3125 Mon Sep 17 00:00:00 2001
From: Myron Stowe <mstowe@redhat.com>
Date: Fri, 28 Oct 2011 15:48:38 -0600
Subject: PCI: Pull PCI 'latency timer' setup up into the core

The 'latency timer' of PCI devices, both Type 0 and Type 1,
is setup in architecture-specific code [see: 'pcibios_set_master()'].
There are two approaches being taken by all the architectures - check
if the 'latency timer' is currently set between 16 and 255 and if not
bring it within bounds, or, do nothing (and then there is the
gratuitously different PA-RISC implementation).

There is nothing architecture-specific about PCI's 'latency timer' so
this patch pulls its setup functionality up into the PCI core by
creating a generic 'pcibios_set_master()' function using the '__weak'
attribute which can be used by all architectures as a default which,
if necessary, can then be over-ridden by architecture-specific code.

No functional change.

Signed-off-by: Myron Stowe <myron.stowe@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/include/asm/pci_x86.h | 2 --
 arch/x86/pci/i386.c            | 6 ------
 2 files changed, 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index e381978..b3a5317 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -44,8 +44,6 @@ enum pci_bf_sort_state {
 
 /* pci-i386.c */
 
-extern unsigned int pcibios_max_latency;
-
 void pcibios_resource_survey(void);
 void pcibios_set_cache_line_size(void);
 
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 794b092..dd5806b 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -254,12 +254,6 @@ void __init pcibios_resource_survey(void)
  */
 fs_initcall(pcibios_assign_resources);
 
-/*
- *  If we set up a device for bus mastering, we need to check the latency
- *  timer as certain crappy BIOSes forget to set it properly.
- */
-unsigned int pcibios_max_latency = 255;
-
 void pcibios_set_master(struct pci_dev *dev)
 {
 	u8 lat;
-- 
cgit v1.1


From b9a276ad262815d88f4dd232d578864949aab3b9 Mon Sep 17 00:00:00 2001
From: Myron Stowe <mstowe@redhat.com>
Date: Fri, 28 Oct 2011 15:49:13 -0600
Subject: PCI: x86: use generic pcibios_set_master()

This patch removes x86's architecture-specific 'pcibios_set_master()'
routine and lets the default PCI core based implementation handle PCI
device 'latency timer' setup.

No functional change.

Signed-off-by: Myron Stowe <myron.stowe@redhat.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/i386.c | 14 --------------
 1 file changed, 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index dd5806b..91821a1 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -254,20 +254,6 @@ void __init pcibios_resource_survey(void)
  */
 fs_initcall(pcibios_assign_resources);
 
-void pcibios_set_master(struct pci_dev *dev)
-{
-	u8 lat;
-	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
-	if (lat < 16)
-		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
-	else if (lat > pcibios_max_latency)
-		lat = pcibios_max_latency;
-	else
-		return;
-	dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat);
-	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
-}
-
 static const struct vm_operations_struct pci_mmap_ops = {
 	.access = generic_access_phys,
 };
-- 
cgit v1.1


From ca3671a83389eea1458929d22c66a69e955bfb07 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Mon, 5 Dec 2011 18:12:28 +0100
Subject: x86/PCI: amd: Kill misleading message about enablement of IO access
 to PCI ECS]

Commit 24d9b70b8c679264756a6980e668b96b3f964826 (x86: Use PCI method
for enabling AMD extended config space before MSR method) added a
message when IO access to PCI ECS was enabled via access to the NB_CFG
PCI register.  This can lead to a bogus message like

[    0.365177] Extended Config Space enabled on 0 nodes

which is misleading because IO ECS access is subsequently enabled for
AMD CPUs (that support this) by modifying the corresponding NB_CFG
MSR.

Furthermore it's not "Extended Config Space" that is enabled by this
register setting. It's the IO access that is enabled for extended
configruation space.

IMHO the ambiguous message needs to be cancelled.

Cc: Jan Beulich <jbeulich@novell.com>
Cc: Robert Richter <robert.richter@amd.com>
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/amd_bus.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 026e493..7b7a897 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -403,7 +403,6 @@ static void __init pci_enable_pci_io_ecs(void)
 			++n;
 		}
 	}
-	pr_info("Extended Config Space enabled on %u nodes\n", n);
 #endif
 }
 
-- 
cgit v1.1


From 6361d72b04d1f77736142bc3911a32b814370729 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Fri, 28 Oct 2011 16:28:03 -0600
Subject: x86/PCI: read Broadcom CNB20LE host bridge info before PCI scan

We currently read the CNB20LE aperture information in a PCI quirk,
which happens after we've already created the root bus.  This patch
changes it to read the apertures earlier so we can create the root
bus with the correct resources.

I believe the CNB20LE lives at "pci 0000:00:00" based on
https://lkml.org/lkml/2010/8/13/220

CC: Ira W. Snyder <iws@ovro.caltech.edu>
CC: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/broadcom_bus.c | 62 ++++++++++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index ab8269b..f3a7c56 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -15,10 +15,11 @@
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <asm/pci_x86.h>
+#include <asm/pci-direct.h>
 
 #include "bus_numa.h"
 
-static void __devinit cnb20le_res(struct pci_dev *dev)
+static void __init cnb20le_res(u8 bus, u8 slot, u8 func)
 {
 	struct pci_root_info *info;
 	struct resource res;
@@ -26,21 +27,12 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
 	u8 fbus, lbus;
 	int i;
 
-#ifdef CONFIG_ACPI
-	/*
-	 * We should get host bridge information from ACPI unless the BIOS
-	 * doesn't support it.
-	 */
-	if (acpi_os_get_root_pointer())
-		return;
-#endif
-
 	info = &pci_root_info[pci_root_num];
 	pci_root_num++;
 
 	/* read the PCI bus numbers */
-	pci_read_config_byte(dev, 0x44, &fbus);
-	pci_read_config_byte(dev, 0x45, &lbus);
+	fbus = read_pci_config_byte(bus, slot, func, 0x44);
+	lbus = read_pci_config_byte(bus, slot, func, 0x45);
 	info->bus_min = fbus;
 	info->bus_max = lbus;
 
@@ -59,8 +51,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
 	}
 
 	/* read the non-prefetchable memory window */
-	pci_read_config_word(dev, 0xc0, &word1);
-	pci_read_config_word(dev, 0xc2, &word2);
+	word1 = read_pci_config_16(bus, slot, func, 0xc0);
+	word2 = read_pci_config_16(bus, slot, func, 0xc2);
 	if (word1 != word2) {
 		res.start = (word1 << 16) | 0x0000;
 		res.end   = (word2 << 16) | 0xffff;
@@ -69,8 +61,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
 	}
 
 	/* read the prefetchable memory window */
-	pci_read_config_word(dev, 0xc4, &word1);
-	pci_read_config_word(dev, 0xc6, &word2);
+	word1 = read_pci_config_16(bus, slot, func, 0xc4);
+	word2 = read_pci_config_16(bus, slot, func, 0xc6);
 	if (word1 != word2) {
 		res.start = (word1 << 16) | 0x0000;
 		res.end   = (word2 << 16) | 0xffff;
@@ -79,8 +71,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
 	}
 
 	/* read the IO port window */
-	pci_read_config_word(dev, 0xd0, &word1);
-	pci_read_config_word(dev, 0xd2, &word2);
+	word1 = read_pci_config_16(bus, slot, func, 0xd0);
+	word2 = read_pci_config_16(bus, slot, func, 0xd2);
 	if (word1 != word2) {
 		res.start = word1;
 		res.end   = word2;
@@ -92,13 +84,37 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
 	res.start = fbus;
 	res.end   = lbus;
 	res.flags = IORESOURCE_BUS;
-	dev_info(&dev->dev, "CNB20LE PCI Host Bridge (domain %04x %pR)\n",
-			    pci_domain_nr(dev->bus), &res);
+	printk(KERN_INFO "CNB20LE PCI Host Bridge (domain 0000 %pR)\n", &res);
 
 	for (i = 0; i < info->res_num; i++)
-		dev_info(&dev->dev, "host bridge window %pR\n", &info->res[i]);
+		printk(KERN_INFO "host bridge window %pR\n", &info->res[i]);
 }
 
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE,
-			cnb20le_res);
+static int __init broadcom_postcore_init(void)
+{
+	u8 bus = 0, slot = 0;
+	u32 id;
+	u16 vendor, device;
+
+#ifdef CONFIG_ACPI
+	/*
+	 * We should get host bridge information from ACPI unless the BIOS
+	 * doesn't support it.
+	 */
+	if (acpi_os_get_root_pointer())
+		return 0;
+#endif
+
+	id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID);
+	vendor = id & 0xffff;
+	device = (id >> 16) & 0xffff;
+
+	if (vendor == PCI_VENDOR_ID_SERVERWORKS &&
+	    device == PCI_DEVICE_ID_SERVERWORKS_LE) {
+		cnb20le_res(bus, slot, 0);
+		cnb20le_res(bus, slot, 1);
+	}
+	return 0;
+}
 
+postcore_initcall(broadcom_postcore_init);
-- 
cgit v1.1


From 46fbade05ca0784ca3c959bd7bf2aae7d81306c2 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Fri, 28 Oct 2011 16:28:08 -0600
Subject: x86/PCI: use pci_scan_bus() instead of pci_scan_bus_parented()

This doesn't change any functionality, but it makes a subsequent patch
slightly simpler.

pci_scan_bus(NULL, ...) and pci_scan_bus_parented() are identical except
that pci_scan_bus() also calls pci_bus_add_devices():

  pci_scan_bus_parented
    pci_create_bus
    pci_scan_child_bus

  pci_scan_bus
    pci_create_bus
    pci_scan_child_bus
    pci_bus_add_devices

All callers of pcibios_scan_root() call pci_bus_add_devices() explicitly,
and we don't pass a parent device, so we might as well use pci_scan_bus().

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/common.c   | 2 +-
 arch/x86/pci/legacy.c   | 3 ---
 arch/x86/pci/numaq_32.c | 2 --
 3 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 7962ccb..07c55ce 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -456,7 +456,7 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
 	sd->node = get_mp_bus_to_node(busnum);
 
 	printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
-	bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
+	bus = pci_scan_bus(busnum, &pci_root_ops, sd);
 	if (!bus)
 		kfree(sd);
 
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 2c2aeab..a1df191 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -31,9 +31,6 @@ int __init pci_legacy_init(void)
 
 	printk("PCI: Probing PCI hardware\n");
 	pci_root_bus = pcibios_scan_root(0);
-	if (pci_root_bus)
-		pci_bus_add_devices(pci_root_bus);
-
 	return 0;
 }
 
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 51abf02..83e125b 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -153,8 +153,6 @@ int __init pci_numaq_init(void)
 	raw_pci_ops = &pci_direct_conf1_mq;
 
 	pci_root_bus = pcibios_scan_root(0);
-	if (pci_root_bus)
-		pci_bus_add_devices(pci_root_bus);
 	if (num_online_nodes() > 1)
 		for_each_online_node(quad) {
 			if (quad == 0)
-- 
cgit v1.1


From 2cd6975a4ff92a75e46240109d01c1daf4682e5d Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Fri, 28 Oct 2011 16:28:14 -0600
Subject: x86/PCI: convert to pci_create_root_bus() and pci_scan_root_bus()

x86 has two kinds of PCI root bus scanning:

(1) ACPI-based, using _CRS resources.  This used pci_create_bus(), not
    pci_scan_bus(), because ACPI hotplug needed to split the
    pci_bus_add_devices() into a separate host bridge .start() method.

    This patch parses the _CRS resources earlier, so we can build a list of
    resources and pass it to pci_create_root_bus().

    Note that as before, we parse the _CRS even if we aren't going to use
    it so we can print it for debugging purposes.

(2) All other, which used either default resources (ioport_resource and
    iomem_resource) or information read from the hardware via amd_bus.c or
    similar.  This used pci_scan_bus().

    This patch converts x86_pci_root_bus_res_quirks() (previously called
    from pcibios_fixup_bus()) to x86_pci_root_bus_resources(), which builds
    a list of resources before we call pci_scan_root_bus().

    We also use x86_pci_root_bus_resources() if we have ACPI but are
    ignoring _CRS.

CC: Yinghai Lu <yinghai.lu@oracle.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/include/asm/topology.h |  2 +-
 arch/x86/pci/acpi.c             | 28 ++++++++++++++--------------
 arch/x86/pci/bus_numa.c         | 31 ++++++++++++++++++-------------
 arch/x86/pci/common.c           | 19 ++++++++++++-------
 4 files changed, 45 insertions(+), 35 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index c006924..5f83b13 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -174,7 +174,7 @@ static inline void arch_fix_phys_package_id(int num, u32 slot)
 }
 
 struct pci_bus;
-void x86_pci_root_bus_res_quirks(struct pci_bus *b);
+void x86_pci_root_bus_resources(int bus, struct list_head *resources);
 
 #ifdef CONFIG_SMP
 #define mc_capable()	((boot_cpu_data.x86_max_cores > 1) && \
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 425500b..a312e76 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -12,7 +12,7 @@ struct pci_root_info {
 	char *name;
 	unsigned int res_num;
 	struct resource *res;
-	struct pci_bus *bus;
+	struct list_head *resources;
 	int busnum;
 };
 
@@ -304,23 +304,20 @@ static void add_resources(struct pci_root_info *info)
 				 "ignoring host bridge window %pR (conflicts with %s %pR)\n",
 				 res, conflict->name, conflict);
 		else
-			pci_bus_add_resource(info->bus, res, 0);
+			pci_add_resource(info->resources, res);
 	}
 }
 
 static void
 get_current_resources(struct acpi_device *device, int busnum,
-			int domain, struct pci_bus *bus)
+		      int domain, struct list_head *resources)
 {
 	struct pci_root_info info;
 	size_t size;
 
-	if (pci_use_crs)
-		pci_bus_remove_resources(bus);
-
 	info.bridge = device;
-	info.bus = bus;
 	info.res_num = 0;
+	info.resources = resources;
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
 				&info);
 	if (!info.res_num)
@@ -329,7 +326,7 @@ get_current_resources(struct acpi_device *device, int busnum,
 	size = sizeof(*info.res) * info.res_num;
 	info.res = kmalloc(size, GFP_KERNEL);
 	if (!info.res)
-		goto res_alloc_fail;
+		return;
 
 	info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum);
 	if (!info.name)
@@ -344,8 +341,6 @@ get_current_resources(struct acpi_device *device, int busnum,
 
 name_alloc_fail:
 	kfree(info.res);
-res_alloc_fail:
-	return;
 }
 
 struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
@@ -353,6 +348,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 	struct acpi_device *device = root->device;
 	int domain = root->segment;
 	int busnum = root->secondary.start;
+	LIST_HEAD(resources);
 	struct pci_bus *bus;
 	struct pci_sysdata *sd;
 	int node;
@@ -407,11 +403,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 		memcpy(bus->sysdata, sd, sizeof(*sd));
 		kfree(sd);
 	} else {
-		bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd);
-		if (bus) {
-			get_current_resources(device, busnum, domain, bus);
+		get_current_resources(device, busnum, domain, &resources);
+		if (list_empty(&resources))
+			x86_pci_root_bus_resources(busnum, &resources);
+		bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd,
+					  &resources);
+		if (bus)
 			bus->subordinate = pci_scan_child_bus(bus);
-		}
+		else
+			pci_free_resource_list(&resources);
 	}
 
 	/* After the PCI-E bus has been walked and all devices discovered,
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 64a1228..fd3f655 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -7,45 +7,50 @@
 int pci_root_num;
 struct pci_root_info pci_root_info[PCI_ROOT_NR];
 
-void x86_pci_root_bus_res_quirks(struct pci_bus *b)
+void x86_pci_root_bus_resources(int bus, struct list_head *resources)
 {
 	int i;
 	int j;
 	struct pci_root_info *info;
 
-	/* don't go for it if _CRS is used already */
-	if (b->resource[0] != &ioport_resource ||
-	    b->resource[1] != &iomem_resource)
-		return;
-
 	if (!pci_root_num)
-		return;
+		goto default_resources;
 
 	for (i = 0; i < pci_root_num; i++) {
-		if (pci_root_info[i].bus_min == b->number)
+		if (pci_root_info[i].bus_min == bus)
 			break;
 	}
 
 	if (i == pci_root_num)
-		return;
+		goto default_resources;
 
-	printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
-			b->number);
+	printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n",
+	       bus);
 
-	pci_bus_remove_resources(b);
 	info = &pci_root_info[i];
 	for (j = 0; j < info->res_num; j++) {
 		struct resource *res;
 		struct resource *root;
 
 		res = &info->res[j];
-		pci_bus_add_resource(b, res, 0);
+		pci_add_resource(resources, res);
 		if (res->flags & IORESOURCE_IO)
 			root = &ioport_resource;
 		else
 			root = &iomem_resource;
 		insert_resource(root, res);
 	}
+	return;
+
+default_resources:
+	/*
+	 * We don't have any host bridge aperture information from the
+	 * "native host bridge drivers," e.g., amd_bus or broadcom_bus,
+	 * so fall back to the defaults historically used by pci_create_bus().
+	 */
+	printk(KERN_DEBUG "PCI: root bus %02x: using default resources\n", bus);
+	pci_add_resource(resources, &ioport_resource);
+	pci_add_resource(resources, &iomem_resource);
 }
 
 void __devinit update_res(struct pci_root_info *info, resource_size_t start,
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 07c55ce..323481e 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -164,9 +164,6 @@ void __devinit pcibios_fixup_bus(struct pci_bus *b)
 {
 	struct pci_dev *dev;
 
-	/* root bus? */
-	if (!b->parent)
-		x86_pci_root_bus_res_quirks(b);
 	pci_read_bridge_bases(b);
 	list_for_each_entry(dev, &b->devices, bus_list)
 		pcibios_fixup_device_resources(dev);
@@ -433,6 +430,7 @@ void __init dmi_check_pciprobe(void)
 
 struct pci_bus * __devinit pcibios_scan_root(int busnum)
 {
+	LIST_HEAD(resources);
 	struct pci_bus *bus = NULL;
 	struct pci_sysdata *sd;
 
@@ -456,9 +454,12 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
 	sd->node = get_mp_bus_to_node(busnum);
 
 	printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
-	bus = pci_scan_bus(busnum, &pci_root_ops, sd);
-	if (!bus)
+	x86_pci_root_bus_resources(busnum, &resources);
+	bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources);
+	if (!bus) {
+		pci_free_resource_list(&resources);
 		kfree(sd);
+	}
 
 	return bus;
 }
@@ -639,6 +640,7 @@ int pci_ext_cfg_avail(struct pci_dev *dev)
 
 struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
 {
+	LIST_HEAD(resources);
 	struct pci_bus *bus = NULL;
 	struct pci_sysdata *sd;
 
@@ -653,9 +655,12 @@ struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops,
 		return NULL;
 	}
 	sd->node = node;
-	bus = pci_scan_bus(busno, ops, sd);
-	if (!bus)
+	x86_pci_root_bus_resources(busno, &resources);
+	bus = pci_scan_root_bus(NULL, busno, ops, sd, &resources);
+	if (!bus) {
+		pci_free_resource_list(&resources);
 		kfree(sd);
+	}
 
 	return bus;
 }
-- 
cgit v1.1


From 24d25dbfa63c376323096660bfa9ad45a08870ce Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 5 Jan 2012 14:27:19 -0700
Subject: x86/PCI: amd: factor out MMCONFIG discovery

This factors out the AMD native MMCONFIG discovery so we can use it
outside amd_bus.c.

amd_bus.c reads AMD MSRs so it can remove the MMCONFIG area from the
PCI resources.  We may also need the MMCONFIG information to work
around BIOS defects in the ACPI MCFG table.

Cc: Borislav Petkov <borislav.petkov@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: stable@kernel.org       # 2.6.34+
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/include/asm/amd_nb.h |  2 ++
 arch/x86/kernel/amd_nb.c      | 31 +++++++++++++++++++++++++++++++
 arch/x86/pci/amd_bus.c        | 42 +++++++++++-------------------------------
 3 files changed, 44 insertions(+), 31 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 8e41071..49ad773 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_AMD_NB_H
 #define _ASM_X86_AMD_NB_H
 
+#include <linux/ioport.h>
 #include <linux/pci.h>
 
 struct amd_nb_bus_dev_range {
@@ -13,6 +14,7 @@ extern const struct pci_device_id amd_nb_misc_ids[];
 extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
 
 extern bool early_is_amd_nb(u32 value);
+extern struct resource *amd_get_mmconfig_range(struct resource *res);
 extern int amd_cache_northbridges(void);
 extern void amd_flush_garts(void);
 extern int amd_numa_init(void);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 4c39baa..bae1efe 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -119,6 +119,37 @@ bool __init early_is_amd_nb(u32 device)
 	return false;
 }
 
+struct resource *amd_get_mmconfig_range(struct resource *res)
+{
+	u32 address;
+	u64 base, msr;
+	unsigned segn_busn_bits;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+		return NULL;
+
+	/* assume all cpus from fam10h have mmconfig */
+        if (boot_cpu_data.x86 < 0x10)
+		return NULL;
+
+	address = MSR_FAM10H_MMIO_CONF_BASE;
+	rdmsrl(address, msr);
+
+	/* mmconfig is not enabled */
+	if (!(msr & FAM10H_MMIO_CONF_ENABLE))
+		return NULL;
+
+	base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
+
+	segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
+			 FAM10H_MMIO_CONF_BUSRANGE_MASK;
+
+	res->flags = IORESOURCE_MEM;
+	res->start = base;
+	res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
+	return res;
+}
+
 int amd_get_subcaches(int cpu)
 {
 	struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 7b7a897..0567df3 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -30,34 +30,6 @@ static struct pci_hostbridge_probe pci_probes[] __initdata = {
 	{ 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 },
 };
 
-static u64 __initdata fam10h_mmconf_start;
-static u64 __initdata fam10h_mmconf_end;
-static void __init get_pci_mmcfg_amd_fam10h_range(void)
-{
-	u32 address;
-	u64 base, msr;
-	unsigned segn_busn_bits;
-
-	/* assume all cpus from fam10h have mmconf */
-        if (boot_cpu_data.x86 < 0x10)
-		return;
-
-	address = MSR_FAM10H_MMIO_CONF_BASE;
-	rdmsrl(address, msr);
-
-	/* mmconfig is not enable */
-	if (!(msr & FAM10H_MMIO_CONF_ENABLE))
-		return;
-
-	base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
-
-	segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
-			 FAM10H_MMIO_CONF_BUSRANGE_MASK;
-
-	fam10h_mmconf_start = base;
-	fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
-}
-
 #define RANGE_NUM 16
 
 /**
@@ -85,6 +57,9 @@ static int __init early_fill_mp_bus_info(void)
 	u64 val;
 	u32 address;
 	bool found;
+	struct resource fam10h_mmconf_res, *fam10h_mmconf;
+	u64 fam10h_mmconf_start;
+	u64 fam10h_mmconf_end;
 
 	if (!early_pci_allowed())
 		return -1;
@@ -211,12 +186,17 @@ static int __init early_fill_mp_bus_info(void)
 		subtract_range(range, RANGE_NUM, 0, end);
 
 	/* get mmconfig */
-	get_pci_mmcfg_amd_fam10h_range();
+	fam10h_mmconf = amd_get_mmconfig_range(&fam10h_mmconf_res);
 	/* need to take out mmconf range */
-	if (fam10h_mmconf_end) {
-		printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end);
+	if (fam10h_mmconf) {
+		printk(KERN_DEBUG "Fam 10h mmconf %pR\n", fam10h_mmconf);
+		fam10h_mmconf_start = fam10h_mmconf->start;
+		fam10h_mmconf_end = fam10h_mmconf->end;
 		subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
 				 fam10h_mmconf_end + 1);
+	} else {
+		fam10h_mmconf_start = 0;
+		fam10h_mmconf_end = 0;
 	}
 
 	/* mmio resource */
-- 
cgit v1.1


From 76ccc297018d25d55b789bbd508861ef1e2cdb0c Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 16 Dec 2011 17:38:18 -0500
Subject: x86/PCI: Expand the x86_msi_ops to have a restore MSIs.

The MSI restore function will become a function pointer in an
x86_msi_ops struct. It defaults to the implementation in the
io_apic.c and msi.c. We piggyback on the indirection mechanism
introduced by "x86: Introduce x86_msi_ops".

Cc: x86@kernel.org
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: linux-pci@vger.kernel.org
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/include/asm/pci.h      | 9 +++++++++
 arch/x86/include/asm/x86_init.h | 1 +
 arch/x86/kernel/x86_init.c      | 1 +
 3 files changed, 11 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d498943..df75d07 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -112,19 +112,28 @@ static inline void x86_teardown_msi_irq(unsigned int irq)
 {
 	x86_msi.teardown_msi_irq(irq);
 }
+static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq)
+{
+	x86_msi.restore_msi_irqs(dev, irq);
+}
 #define arch_setup_msi_irqs x86_setup_msi_irqs
 #define arch_teardown_msi_irqs x86_teardown_msi_irqs
 #define arch_teardown_msi_irq x86_teardown_msi_irq
+#define arch_restore_msi_irqs x86_restore_msi_irqs
 /* implemented in arch/x86/kernel/apic/io_apic. */
 int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void native_teardown_msi_irq(unsigned int irq);
+void native_restore_msi_irqs(struct pci_dev *dev, int irq);
 /* default to the implementation in drivers/lib/msi.c */
 #define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
+#define HAVE_DEFAULT_MSI_RESTORE_IRQS
 void default_teardown_msi_irqs(struct pci_dev *dev);
+void default_restore_msi_irqs(struct pci_dev *dev, int irq);
 #else
 #define native_setup_msi_irqs		NULL
 #define native_teardown_msi_irq		NULL
 #define default_teardown_msi_irqs	NULL
+#define default_restore_msi_irqs	NULL
 #endif
 
 #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 1971e65..cd52084 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -177,6 +177,7 @@ struct x86_msi_ops {
 	int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
 	void (*teardown_msi_irq)(unsigned int irq);
 	void (*teardown_msi_irqs)(struct pci_dev *dev);
+	void (*restore_msi_irqs)(struct pci_dev *dev, int irq);
 };
 
 extern struct x86_init_ops x86_init;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c1d6cd5..83b05ad 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -114,4 +114,5 @@ struct x86_msi_ops x86_msi = {
 	.setup_msi_irqs = native_setup_msi_irqs,
 	.teardown_msi_irq = native_teardown_msi_irq,
 	.teardown_msi_irqs = default_teardown_msi_irqs,
+	.restore_msi_irqs = default_restore_msi_irqs,
 };
-- 
cgit v1.1


From e58d429209105e698e9d0357481d62b37fe9a7dd Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Fri, 6 Jan 2012 11:17:51 -0500
Subject: x86, reboot: Fix typo in nmi reboot path

It was brought to my attention that my x86 change to use NMI in
the reboot path broke Intel Nehalem and Westmere boxes when
using kexec.

I realized I had mistyped the if statement in commit
3603a2512f9e69dc87914ba922eb4a0812b21cd6 and stuck the ')' in
the wrong spot.  Putting it in the right spot fixes kexec again.

Doh.

Reported-by: Yinghai Lu <yinghai@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Link: http://lkml.kernel.org/r/1325866671-9797-1-git-send-email-dzickus@redhat.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 113acda..66c74f4 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -176,7 +176,7 @@ static void native_nmi_stop_other_cpus(int wait)
 	 */
 	if (num_online_cpus() > 1) {
 		/* did someone beat us here? */
-		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id() != -1))
+		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
 			return;
 
 		if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
-- 
cgit v1.1


From 72142fd4109105c6bd21658966ca5e93c1684081 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sat, 7 Jan 2012 14:10:18 -0800
Subject: x86: Move <asm/asm-offsets.h> from trace_syscalls.c to asm/syscall.h

This reverts commit d5e553d6e0a4bdea43adae7373e3fa144b9a1aaa, which
caused large numbers of build warnings on PowerPC.

This moves the #include <asm/asm-offsets.h> to <asm/syscall.h>, which
makes some kind of sense since NR_syscalls is syscalls related.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Link: http://lkml.kernel.org/r/20111214181545.6e13bc954cb7ddce9086e861@canb.auug.org.au
---
 arch/x86/include/asm/syscall.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index c4a348f..d962e56 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -15,6 +15,7 @@
 
 #include <linux/sched.h>
 #include <linux/err.h>
+#include <asm/asm-offsets.h>	/* For NR_syscalls */
 
 extern const unsigned long sys_call_table[];
 
-- 
cgit v1.1


From dc6821e0cfe74802aefd2067b40fcdc03fc4599e Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Sat, 7 Jan 2012 21:27:38 -0500
Subject: xen/mmu: Fix compile errors introduced by x86/memblock mismerge.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The git commit d4bbf7e7759afc172e2bfbc5c416324590049cdd
"Merge branch 'master' into x86/memblock" mismerged the 32-bit
section causing:

arch/x86/xen/mmu.c: In function ‘xen_setup_kernel_pagetable’:
arch/x86/xen/mmu.c:1855: error: expected ‘;’ before ‘)’ token
arch/x86/xen/mmu.c:1855: error: expected statement before ‘)’ token

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index f4bf8aa..58a0e46 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1852,7 +1852,7 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
 	xen_write_cr3(__pa(initial_page_table));
 
 	memblock_reserve(__pa(xen_start_info->pt_base),
-			 xen_start_info->nr_pt_frames * PAGE_SIZE));
+			 xen_start_info->nr_pt_frames * PAGE_SIZE);
 
 	return initial_page_table;
 }
-- 
cgit v1.1


From e39f560239984c3098237ad94c9449b1494163f8 Mon Sep 17 00:00:00 2001
From: David Daney <ddaney.cavm@gmail.com>
Date: Tue, 10 Jan 2012 15:10:21 -0800
Subject: fs: binfmt_elf: create Kconfig variable for PIE randomization

Randomization of PIE load address is hard coded in binfmt_elf.c for X86
and ARM.  Create a new Kconfig variable
(CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE) for this and use it instead.  Thus
architecture specific policy is pushed out of the generic binfmt_elf.c and
into the architecture Kconfig files.

X86 and ARM Kconfigs are modified to select the new variable so there is
no change in behavior.  A follow on patch will select it for MIPS too.

Signed-off-by: David Daney <david.daney@cavium.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d2a69d..d6ddc0b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -62,6 +62,7 @@ config X86
 	select ANON_INODES
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
+	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_TEXT_POKE_SMP
 	select HAVE_GENERIC_HARDIRQS
-- 
cgit v1.1


From 5e6292c0f28f03dfdb8ea3d685f0b838a23bfba4 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Tue, 10 Jan 2012 15:11:17 -0800
Subject: signal: add block_sigmask() for adding sigmask to current->blocked

Abstract the code sequence for adding a signal handler's sa_mask to
current->blocked because the sequence is identical for all architectures.
Furthermore, in the past some architectures actually got this code wrong,
so introduce a wrapper that all architectures can use.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/signal.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 54ddaeb2..46a01bdc 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -682,7 +682,6 @@ static int
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 		struct pt_regs *regs)
 {
-	sigset_t blocked;
 	int ret;
 
 	/* Are we from a system call? */
@@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	 */
 	regs->flags &= ~X86_EFLAGS_TF;
 
-	sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
-	if (!(ka->sa.sa_flags & SA_NODEFER))
-		sigaddset(&blocked, sig);
-	set_current_blocked(&blocked);
+	block_sigmask(ka, sig);
 
 	tracehook_signal_handler(sig, info, ka, regs,
 				 test_thread_flag(TIF_SINGLESTEP));
-- 
cgit v1.1


From b6c96c0214138186f495e3ee73737c6fc5e4efa2 Mon Sep 17 00:00:00 2001
From: Stratos Psomadakis <psomas@cslab.ece.ntua.gr>
Date: Thu, 12 Jan 2012 15:44:47 +1030
Subject: lguest: Make sure interrupt is allocated ok by lguest_setup_irq

Make sure the interrupt is allocated correctly by lguest_setup_irq (check the
return value of irq_alloc_desc_at for -ENOMEM)

Signed-off-by: Stratos Psomadakis <psomas@cslab.ece.ntua.gr>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (cleanups and commentry)
---
 arch/x86/lguest/boot.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index cf4603b..642d880 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -856,18 +856,23 @@ static void __init lguest_init_IRQ(void)
 }
 
 /*
- * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
- * rather than set them in lguest_init_IRQ we are called here every time an
- * lguest device needs an interrupt.
- *
- * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should
- * pass that up!
+ * Interrupt descriptors are allocated as-needed, but low-numbered ones are
+ * reserved by the generic x86 code.  So we ignore irq_alloc_desc_at if it
+ * tells us the irq is already used: other errors (ie. ENOMEM) we take
+ * seriously.
  */
-void lguest_setup_irq(unsigned int irq)
+int lguest_setup_irq(unsigned int irq)
 {
-	irq_alloc_desc_at(irq, 0);
+	int err;
+
+	/* Returns -ve error or vector number. */
+	err = irq_alloc_desc_at(irq, 0);
+	if (err < 0 && err != -EEXIST)
+		return err;
+
 	irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
 				      handle_level_irq, "level");
+	return 0;
 }
 
 /*
-- 
cgit v1.1


From 5cf9a4e69c1ff0ccdd1d2b7404f95c0531355274 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 12 Jan 2012 08:01:40 -0700
Subject: x86/PCI: build amd_bus.o only when CONFIG_AMD_NB=y

We only need amd_bus.o for AMD systems with PCI.  arch/x86/pci/Makefile
already depends on CONFIG_PCI=y, so this patch just adds the dependency
on CONFIG_AMD_NB.

Cc: Yinghai Lu <yinghai@kernel.org>
Cc: stable@kernel.org	# 2.6.34+ (needs adjustment for k8 -> amd rename)
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/pci/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 75b06f3..e76e18c 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -18,8 +18,9 @@ obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o
 obj-$(CONFIG_X86_INTEL_MID)	+= mrst.o
 
 obj-y				+= common.o early.o
-obj-y				+= amd_bus.o bus_numa.o
+obj-y				+= bus_numa.o
 
+obj-$(CONFIG_AMD_NB)		+= amd_bus.o
 obj-$(CONFIG_PCI_CNB20LE_QUIRK)	+= broadcom_bus.o
 
 ifeq ($(CONFIG_PCI_DEBUG),y)
-- 
cgit v1.1


From bccd17294a26b67a8a19aaa120e3eeaa7da49281 Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <cbouatmailru@gmail.com>
Date: Wed, 11 Jan 2012 05:11:46 +0400
Subject: x86: Get rid of 'dubious one-bit signed bitfield' sprase warning

This very noisy sparse warning appears on almost every file in the
kernel:

  CHECK   init/main.c
  arch/x86/include/asm/thread_info.h:43:55: error: dubious one-bit signed bitfield
  arch/x86/include/asm/thread_info.h:44:46: error: dubious one-bit signed bitfield

This patch changes sig_on_uaccess_error and uaccess_err flags to unsigned
type and thus fixes the warning.

Signed-off-by: Anton Vorontsov <cbouatmailru@gmail.com>
Acked-by: Andy Lutomirski <luto@mit.edu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/thread_info.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 7404715..bc817cd 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -40,8 +40,8 @@ struct thread_info {
 						*/
 	__u8			supervisor_stack[0];
 #endif
-	int			sig_on_uaccess_error:1;
-	int			uaccess_err:1;	/* uaccess failed */
+	unsigned int		sig_on_uaccess_error:1;
+	unsigned int		uaccess_err:1;	/* uaccess failed */
 };
 
 #define INIT_THREAD_INFO(tsk)			\
-- 
cgit v1.1


From 476bc0015bf09dad39d36a8b19f76f0c181d1ec9 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 13 Jan 2012 09:32:18 +1030
Subject: module_param: make bool parameters really bool (arch)

module_param(bool) used to counter-intuitively take an int.  In
fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy
trick.

It's time to remove the int/unsigned int option.  For this version
it'll simply give a warning, but it'll break next kernel version.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/kernel/apm_32.c       | 16 ++++++++--------
 arch/x86/kvm/mmu.c             |  2 +-
 arch/x86/kvm/vmx.c             | 18 +++++++++---------
 arch/x86/kvm/x86.c             |  4 ++--
 arch/x86/mm/mmio-mod.c         |  4 ++--
 arch/x86/platform/geode/alix.c |  2 +-
 arch/x86/platform/iris/iris.c  |  2 +-
 7 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index a46bd38..f76623c 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -383,21 +383,21 @@ static int ignore_sys_suspend;
 static int ignore_normal_resume;
 static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
 
-static int debug __read_mostly;
-static int smp __read_mostly;
+static bool debug __read_mostly;
+static bool smp __read_mostly;
 static int apm_disabled = -1;
 #ifdef CONFIG_SMP
-static int power_off;
+static bool power_off;
 #else
-static int power_off = 1;
+static bool power_off = 1;
 #endif
-static int realmode_power_off;
+static bool realmode_power_off;
 #ifdef CONFIG_APM_ALLOW_INTS
-static int allow_ints = 1;
+static bool allow_ints = 1;
 #else
-static int allow_ints;
+static bool allow_ints;
 #endif
-static int broken_psr;
+static bool broken_psr;
 
 static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
 static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2a2a9b4..224b02c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -74,7 +74,7 @@ enum {
 #endif
 
 #ifdef MMU_DEBUG
-static int dbg = 0;
+static bool dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 906a7e8..d29216c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -51,29 +51,29 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static int __read_mostly enable_vpid = 1;
+static bool __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
-static int __read_mostly flexpriority_enabled = 1;
+static bool __read_mostly flexpriority_enabled = 1;
 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
 
-static int __read_mostly enable_ept = 1;
+static bool __read_mostly enable_ept = 1;
 module_param_named(ept, enable_ept, bool, S_IRUGO);
 
-static int __read_mostly enable_unrestricted_guest = 1;
+static bool __read_mostly enable_unrestricted_guest = 1;
 module_param_named(unrestricted_guest,
 			enable_unrestricted_guest, bool, S_IRUGO);
 
-static int __read_mostly emulate_invalid_guest_state = 0;
+static bool __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
-static int __read_mostly vmm_exclusive = 1;
+static bool __read_mostly vmm_exclusive = 1;
 module_param(vmm_exclusive, bool, S_IRUGO);
 
-static int __read_mostly yield_on_hlt = 1;
+static bool __read_mostly yield_on_hlt = 1;
 module_param(yield_on_hlt, bool, S_IRUGO);
 
-static int __read_mostly fasteoi = 1;
+static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
 /*
@@ -81,7 +81,7 @@ module_param(fasteoi, bool, S_IRUGO);
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
  * use VMX instructions.
  */
-static int __read_mostly nested = 0;
+static bool __read_mostly nested = 0;
 module_param(nested, bool, S_IRUGO);
 
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1171def..14d6cad 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -88,8 +88,8 @@ static void process_nmi(struct kvm_vcpu *vcpu);
 struct kvm_x86_ops *kvm_x86_ops;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
-int ignore_msrs = 0;
-module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+static bool ignore_msrs = 0;
+module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
 
 bool kvm_has_tsc_control;
 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index de54b9b..dc0b727 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -75,8 +75,8 @@ static LIST_HEAD(trace_list);		/* struct remap_trace */
 
 /* module parameters */
 static unsigned long	filter_offset;
-static int		nommiotrace;
-static int		trace_pc;
+static bool		nommiotrace;
+static bool		trace_pc;
 
 module_param(filter_offset, ulong, 0);
 module_param(nommiotrace, bool, 0);
diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
index ca19736..dc5f1d3 100644
--- a/arch/x86/platform/geode/alix.c
+++ b/arch/x86/platform/geode/alix.c
@@ -27,7 +27,7 @@
 
 #include <asm/geode.h>
 
-static int force = 0;
+static bool force = 0;
 module_param(force, bool, 0444);
 /* FIXME: Award bios is not automatically detected as Alix platform */
 MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform");
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index 1ba7f5e..5917eb5 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -42,7 +42,7 @@ MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
 MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
 MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
 
-static int force;
+static bool force;
 
 module_param(force, bool, 0);
 MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation.");
-- 
cgit v1.1


From 43570fd2f47ba518145e9289f54cde3dba4c8b25 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 12 Jan 2012 17:17:27 -0800
Subject: mm,slub,x86: decouple size of struct page from CONFIG_CMPXCHG_LOCAL

While implementing cmpxchg_double() on s390 I realized that we don't set
CONFIG_CMPXCHG_LOCAL despite the fact that we have support for it.

However setting that option will increase the size of struct page by
eight bytes on 64 bit, which we certainly do not want.  Also, it doesn't
make sense that a present cpu feature should increase the size of struct
page.

Besides that it looks like the dependency to CMPXCHG_LOCAL is wrong and
that it should depend on CMPXCHG_DOUBLE instead.

This patch:

If an architecture supports CMPXCHG_LOCAL this shouldn't result
automatically in larger struct pages if the SLUB allocator is used.
Instead introduce a new config option "HAVE_ALIGNED_STRUCT_PAGE" which
can be selected if a double word aligned struct page is required.  Also
update x86 Kconfig so that it should work as before.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a150f4c..5201a2c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -60,6 +60,7 @@ config X86
 	select PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI
 	select ANON_INODES
+	select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
-- 
cgit v1.1


From 4156153c4daddf12dd386016f96a947a01e93bf4 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 12 Jan 2012 17:17:30 -0800
Subject: mm,x86,um: move CMPXCHG_LOCAL config option

Move CMPXCHG_LOCAL and rename it to HAVE_CMPXCHG_LOCAL so architectures
can simply select the option if it is supported.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig     | 1 +
 arch/x86/Kconfig.cpu | 3 ---
 arch/x86/um/Kconfig  | 4 ----
 3 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5201a2c..59717fd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -61,6 +61,7 @@ config X86
 	select HAVE_PERF_EVENTS_NMI
 	select ANON_INODES
 	select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
+	select HAVE_CMPXCHG_LOCAL if !M386
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index e3ca7e0..99d2ab8 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -309,9 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
 
-config CMPXCHG_LOCAL
-	def_bool X86_64 || (X86_32 && !M386)
-
 config CMPXCHG_DOUBLE
 	def_bool y
 
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 1d97bd8..a62bfc6 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -6,10 +6,6 @@ menu "UML-specific options"
 
 menu "Host processor type and features"
 
-config CMPXCHG_LOCAL
-	bool
-	default n
-
 config CMPXCHG_DOUBLE
 	bool
 	default n
-- 
cgit v1.1


From 2565409fc0303f3ab8d66b8326702a687962a29b Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 12 Jan 2012 17:17:33 -0800
Subject: mm,x86,um: move CMPXCHG_DOUBLE config option

Move CMPXCHG_DOUBLE and rename it to HAVE_CMPXCHG_DOUBLE so architectures
can simply select the option if it is supported.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig     | 1 +
 arch/x86/Kconfig.cpu | 3 ---
 arch/x86/um/Kconfig  | 4 ----
 3 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 59717fd..6c14ecd 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -62,6 +62,7 @@ config X86
 	select ANON_INODES
 	select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
 	select HAVE_CMPXCHG_LOCAL if !M386
+	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 99d2ab8..3c57033 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -309,9 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
 
-config CMPXCHG_DOUBLE
-	def_bool y
-
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index a62bfc6..b2b54d2 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -6,10 +6,6 @@ menu "UML-specific options"
 
 menu "Host processor type and features"
 
-config CMPXCHG_DOUBLE
-	bool
-	default n
-
 source "arch/x86/Kconfig.cpu"
 
 endmenu
-- 
cgit v1.1


From 9512938b885304f72c847379611d6018064af840 Mon Sep 17 00:00:00 2001
From: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Date: Thu, 12 Jan 2012 17:20:09 -0800
Subject: cpumask: update setup_node_to_cpumask_map() comments

node_to_cpumask() has been replaced by cpumask_of_node(), and wholly
removed since commit 29c337a0 ("cpumask: remove obsolete node_to_cpumask
now everyone uses cpumask_of_node").

So update the comments for setup_node_to_cpumask_map().

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/numa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 020cd2e..19d3fa0 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -110,7 +110,7 @@ void __cpuinit numa_clear_node(int cpu)
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
- * Note: node_to_cpumask() is not valid until after this is done.
+ * Note: cpumask_of_node() is not valid until after this is done.
  * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
  */
 void __init setup_node_to_cpumask_map(void)
-- 
cgit v1.1


From a3301b751b19f0efbafddc4034f8e7ce6bf3007b Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Sat, 14 Jan 2012 08:11:31 +0530
Subject: x86/mce: Fix CPU hotplug and suspend regression related to MCE

Commit 8a25a2fd126c ("cpu: convert 'cpu' and 'machinecheck' sysdev_class
to a regular subsystem") changed how things are dealt with in the MCE
subsystem.  Some of the things that got broken due to this are CPU
hotplug and suspend/hibernate.

MCE uses per_cpu allocations of struct device.  So, when a CPU goes
offline and comes back online, in order to ensure that we start from a
clean slate with respect to the MCE subsystem, zero out the entire
per_cpu device structure to 0 before using it.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index f22a9f7..29ba329 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2011,7 +2011,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	memset(&dev->kobj, 0, sizeof(struct kobject));
+	memset(dev, 0, sizeof(struct device));
 	dev->id  = cpu;
 	dev->bus = &mce_subsys;
 
-- 
cgit v1.1


From e032d80774315869aa2285b217fdbbfed86c0b49 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Mon, 16 Jan 2012 14:40:28 -0800
Subject: mce: fix warning messages about static struct mce_device

When suspending, there was a large list of warnings going something like:

	Device 'machinecheck1' does not have a release() function, it is broken and must be fixed

This patch turns the static mce_devices into dynamically allocated, and
properly frees them when they are removed from the system.  It solves
the warning messages on my laptop here.

Reported-by: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Tested-by: Djalal Harouni <tixxdz@opendz.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@amd64.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/mce.h           |  2 +-
 arch/x86/kernel/cpu/mcheck/mce.c     | 18 ++++++++++++++----
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 18 +++++++++++-------
 3 files changed, 26 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f35ce43..6aefb14 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -151,7 +151,7 @@ static inline void enable_p5_mce(void) {}
 
 void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct device, mce_device);
+extern struct device *mce_device[CONFIG_NR_CPUS];
 
 /*
  * Maximum banks number.
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 29ba329..5a11ae2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1859,7 +1859,7 @@ static struct bus_type mce_subsys = {
 	.dev_name	= "machinecheck",
 };
 
-DEFINE_PER_CPU(struct device, mce_device);
+struct device *mce_device[CONFIG_NR_CPUS];
 
 __cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -2001,19 +2001,27 @@ static struct device_attribute *mce_device_attrs[] = {
 
 static cpumask_var_t mce_device_initialized;
 
+static void mce_device_release(struct device *dev)
+{
+	kfree(dev);
+}
+
 /* Per cpu device init. All of the cpus still share the same ctrl bank: */
 static __cpuinit int mce_device_create(unsigned int cpu)
 {
-	struct device *dev = &per_cpu(mce_device, cpu);
+	struct device *dev;
 	int err;
 	int i, j;
 
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	memset(dev, 0, sizeof(struct device));
+	dev = kzalloc(sizeof *dev, GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
 	dev->id  = cpu;
 	dev->bus = &mce_subsys;
+	dev->release = &mce_device_release;
 
 	err = device_register(dev);
 	if (err)
@@ -2030,6 +2038,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
 			goto error2;
 	}
 	cpumask_set_cpu(cpu, mce_device_initialized);
+	mce_device[cpu] = dev;
 
 	return 0;
 error2:
@@ -2046,7 +2055,7 @@ error:
 
 static __cpuinit void mce_device_remove(unsigned int cpu)
 {
-	struct device *dev = &per_cpu(mce_device, cpu);
+	struct device *dev = mce_device[cpu];
 	int i;
 
 	if (!cpumask_test_cpu(cpu, mce_device_initialized))
@@ -2060,6 +2069,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
 
 	device_unregister(dev);
 	cpumask_clear_cpu(cpu, mce_device_initialized);
+	mce_device[cpu] = NULL;
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index ba0b94a..786e76a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -523,6 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 {
 	int i, err = 0;
 	struct threshold_bank *b = NULL;
+	struct device *dev = mce_device[cpu];
 	char name[32];
 
 	sprintf(name, "threshold_bank%i", bank);
@@ -543,8 +544,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (!b)
 			goto out;
 
-		err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj,
-					b->kobj, name);
+		err = sysfs_create_link(&dev->kobj, b->kobj, name);
 		if (err)
 			goto out;
 
@@ -565,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		goto out;
 	}
 
-	b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj);
+	b->kobj = kobject_create_and_add(name, &dev->kobj);
 	if (!b->kobj)
 		goto out_free;
 
@@ -585,8 +585,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (i == cpu)
 			continue;
 
-		err = sysfs_create_link(&per_cpu(mce_device, i).kobj,
-					b->kobj, name);
+		dev = mce_device[i];
+		if (dev)
+			err = sysfs_create_link(&dev->kobj,b->kobj, name);
 		if (err)
 			goto out;
 
@@ -649,6 +650,7 @@ static void deallocate_threshold_block(unsigned int cpu,
 static void threshold_remove_bank(unsigned int cpu, int bank)
 {
 	struct threshold_bank *b;
+	struct device *dev;
 	char name[32];
 	int i = 0;
 
@@ -663,7 +665,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #ifdef CONFIG_SMP
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
-		sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name);
+		sysfs_remove_link(&mce_device[cpu]->kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
 
 		return;
@@ -675,7 +677,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 		if (i == cpu)
 			continue;
 
-		sysfs_remove_link(&per_cpu(mce_device, i).kobj, name);
+		dev = mce_device[i];
+		if (dev)
+			sysfs_remove_link(&dev->kobj, name);
 		per_cpu(threshold_banks, i)[bank] = NULL;
 	}
 
-- 
cgit v1.1


From b54ac6d2a25084667da781c7ca2cebef52a2bcdd Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Thu, 8 Dec 2011 11:25:49 +0800
Subject: ACPI, Record ACPI NVS regions

Some firmware will access memory in ACPI NVS region via APEI.  That
is, instructions in APEI ERST/EINJ table will read/write ACPI NVS
region.  The original resource conflict checking in APEI code will
check memory/ioport accessed by APEI via general resource management
mechanism.  But ACPI NVS region is marked as busy already, so that the
false resource conflict will prevent APEI ERST/EINJ to work.

To fix this, this patch record ACPI NVS regions, so that we can avoid
request resources for memory region inside it.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/e820.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 303a0e4..51c3b18 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -714,7 +714,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
 }
 #endif
 
-#ifdef CONFIG_HIBERNATION
+#ifdef CONFIG_ACPI
 /**
  * Mark ACPI NVS memory region, so that we can save/restore it during
  * hibernation and the subsequent resume.
@@ -727,7 +727,7 @@ static int __init e820_mark_nvs_memory(void)
 		struct e820entry *ei = &e820.map[i];
 
 		if (ei->type == E820_NVS)
-			suspend_nvs_register(ei->addr, ei->size);
+			acpi_nvs_register(ei->addr, ei->size);
 	}
 
 	return 0;
-- 
cgit v1.1


From cd298f60a2451a16e0f077404bf69b62ec868733 Mon Sep 17 00:00:00 2001
From: Kurt Garloff <kurt@garloff.de>
Date: Tue, 17 Jan 2012 04:20:31 -0500
Subject: ACPI, x86: Use SRAT table rev to use 8bit or 32bit PXM fields
 (x86/x86-64)

In SRAT v1, we had 8bit proximity domain (PXM) fields; SRAT v2 provides
32bits for these. The new fields were reserved before.
According to the ACPI spec, the OS must disregrard reserved fields.

x86/x86-64 was rather inconsistent prior to this patch; it used 8 bits
for the pxm field in cpu_affinity, but 32 bits in mem_affinity.
This patch makes it consistent: Either use 8 bits consistently (SRAT
rev 1 or lower) or 32 bits (SRAT rev 2 or higher).

cc: x86@kernel.org
Signed-off-by: Kurt Garloff <kurt@garloff.de>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/mm/srat.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 81dbfde..7efd0c6 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -104,6 +104,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
 		return;
 	pxm = pa->proximity_domain_lo;
+	if (acpi_srat_revision >= 2)
+		pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8;
 	node = setup_node(pxm);
 	if (node < 0) {
 		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
@@ -155,6 +157,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 	start = ma->base_address;
 	end = start + ma->length;
 	pxm = ma->proximity_domain;
+	if (acpi_srat_revision <= 1)
+		pxm &= 0xff;
 	node = setup_node(pxm);
 	if (node < 0) {
 		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
-- 
cgit v1.1


From d7e7528bcd456f5c36ad4a202ccfb43c5aa98bc4 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:06 -0500
Subject: Audit: push audit success and retcode into arch ptrace.h

The audit system previously expected arches calling to audit_syscall_exit to
supply as arguments if the syscall was a success and what the return code was.
Audit also provides a helper AUDITSC_RESULT which was supposed to simplify things
by converting from negative retcodes to an audit internal magic value stating
success or failure.  This helper was wrong and could indicate that a valid
pointer returned to userspace was a failed syscall.  The fix is to fix the
layering foolishness.  We now pass audit_syscall_exit a struct pt_reg and it
in turns calls back into arch code to collect the return value and to
determine if the syscall was a success or failure.  We also define a generic
is_syscall_success() macro which determines success/failure based on if the
value is < -MAX_ERRNO.  This works for arches like x86 which do not use a
separate mechanism to indicate syscall failure.

We make both the is_syscall_success() and regs_return_value() static inlines
instead of macros.  The reason is because the audit function must take a void*
for the regs.  (uml calls theirs struct uml_pt_regs instead of just struct
pt_regs so audit_syscall_exit can't take a struct pt_regs).  Since the audit
function takes a void* we need to use static inlines to cast it back to the
arch correct structure to dereference it.

The other major change is that on some arches, like ia64, MIPS and ppc, we
change regs_return_value() to give us the negative value on syscall failure.
THE only other user of this macro, kretprobe_example.c, won't notice and it
makes the value signed consistently for the audit functions across all archs.

In arch/sh/kernel/ptrace_64.c I see that we were using regs[9] in the old
audit code as the return value.  But the ptrace_64.h code defined the macro
regs_return_value() as regs[3].  I have no idea which one is correct, but this
patch now uses the regs_return_value() function, so it now uses regs[3].

For powerpc we previously used regs->result but now use the
regs_return_value() function which uses regs->gprs[3].  regs->gprs[3] is
always positive so the regs_return_value(), much like ia64 makes it negative
before calling the audit code when appropriate.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: H. Peter Anvin <hpa@zytor.com> [for x86 portion]
Acked-by: Tony Luck <tony.luck@intel.com> [for ia64]
Acked-by: Richard Weinberger <richard@nod.at> [for uml]
Acked-by: David S. Miller <davem@davemloft.net> [for sparc]
Acked-by: Ralf Baechle <ralf@linux-mips.org> [for mips]
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> [for ppc]
---
 arch/x86/ia32/ia32entry.S          | 10 +++++-----
 arch/x86/kernel/entry_32.S         |  8 ++++----
 arch/x86/kernel/entry_64.S         | 10 +++++-----
 arch/x86/kernel/ptrace.c           |  3 +--
 arch/x86/kernel/vm86_32.c          |  4 ++--
 arch/x86/um/shared/sysdep/ptrace.h |  5 +++++
 6 files changed, 22 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 3e27456..64ced0b 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -14,6 +14,7 @@
 #include <asm/segment.h>
 #include <asm/irqflags.h>
 #include <linux/linkage.h>
+#include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -208,12 +209,11 @@ sysexit_from_sys_call:
 	TRACE_IRQS_ON
 	sti
 	movl %eax,%esi		/* second arg, syscall return value */
-	cmpl $0,%eax		/* is it < 0? */
-	setl %al		/* 1 if so, 0 if not */
+	cmpl $-MAX_ERRNO,%eax	/* is it an error ? */
+	setbe %al		/* 1 if so, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
-	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-	call audit_syscall_exit
-	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall return value */
+	call __audit_syscall_exit
+	movq RAX-ARGOFFSET(%rsp),%rax	/* reload syscall return value */
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	cli
 	TRACE_IRQS_OFF
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 22d0e21..a22facf 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -42,6 +42,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/err.h>
 #include <asm/thread_info.h>
 #include <asm/irqflags.h>
 #include <asm/errno.h>
@@ -466,11 +467,10 @@ sysexit_audit:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_ANY)
 	movl %eax,%edx		/* second arg, syscall return value */
-	cmpl $0,%eax		/* is it < 0? */
-	setl %al		/* 1 if so, 0 if not */
+	cmpl $-MAX_ERRNO,%eax	/* is it an error ? */
+	setbe %al		/* 1 if so, 0 if not */
 	movzbl %al,%eax		/* zero-extend that */
-	inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-	call audit_syscall_exit
+	call __audit_syscall_exit
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a20e1cb..e51393d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,6 +55,7 @@
 #include <asm/paravirt.h>
 #include <asm/ftrace.h>
 #include <asm/percpu.h>
+#include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -563,17 +564,16 @@ auditsys:
 	jmp system_call_fastpath
 
 	/*
-	 * Return fast path for syscall audit.  Call audit_syscall_exit()
+	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
 	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
 	 * masked off.
 	 */
 sysret_audit:
 	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
-	cmpq $0,%rsi		/* is it < 0? */
-	setl %al		/* 1 if so, 0 if not */
+	cmpq $-MAX_ERRNO,%rsi	/* is it < -MAX_ERRNO? */
+	setbe %al		/* 1 if so, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
-	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-	call audit_syscall_exit
+	call __audit_syscall_exit
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	jmp sysret_check
 #endif	/* CONFIG_AUDITSYSCALL */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 89a04c7..8b02187 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1414,8 +1414,7 @@ void syscall_trace_leave(struct pt_regs *regs)
 {
 	bool step;
 
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
+	audit_syscall_exit(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_exit(regs, regs->ax);
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 863f875..af17e1c 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -335,9 +335,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	if (info->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
 
-	/*call audit_syscall_exit since we do not exit via the normal paths */
+	/*call __audit_syscall_exit since we do not exit via the normal paths */
 	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(0), 0);
+		__audit_syscall_exit(1, 0);
 
 	__asm__ __volatile__(
 		"movl %0,%%esp\n\t"
diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
index 711b162..5ef9344 100644
--- a/arch/x86/um/shared/sysdep/ptrace.h
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -3,3 +3,8 @@
 #else
 #include "ptrace_64.h"
 #endif
+
+static inline long regs_return_value(struct uml_pt_regs *regs)
+{
+	return UPT_SYSCALL_RET(regs);
+}
-- 
cgit v1.1


From f031cd25568a390dc2c9c3a4015054183753449a Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:06 -0500
Subject: audit: ia32entry.S sign extend error codes when calling 64 bit code

In the ia32entry syscall exit audit fastpath we have assembly code which calls
__audit_syscall_exit directly.  This code was, however, zeroes the upper 32
bits of the return code.  It then proceeded to call code which expects longs
to be 64bits long.  In order to handle code which expects longs to be 64bit we
sign extend the return code if that code is an error.  Thus the
__audit_syscall_exit function can correctly handle using the values in
snprintf("%ld").  This fixes the regression introduced in 5cbf1565f29eb57a86a.

Old record:
type=SYSCALL msg=audit(1306197182.256:281): arch=40000003 syscall=192 success=no exit=4294967283
New record:
type=SYSCALL msg=audit(1306197182.256:281): arch=40000003 syscall=192 success=no exit=-13

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/ia32/ia32entry.S | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 64ced0b..025f0f0 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -210,7 +210,9 @@ sysexit_from_sys_call:
 	sti
 	movl %eax,%esi		/* second arg, syscall return value */
 	cmpl $-MAX_ERRNO,%eax	/* is it an error ? */
-	setbe %al		/* 1 if so, 0 if not */
+	jbe 1f
+	movslq %eax, %rsi	/* if error sign extend to 64 bits */
+1:	setbe %al		/* 1 if error, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
 	call __audit_syscall_exit
 	movq RAX-ARGOFFSET(%rsp),%rax	/* reload syscall return value */
-- 
cgit v1.1


From b05d8447e7821695bc2fa3359431f7a664232743 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 3 Jan 2012 14:23:06 -0500
Subject: audit: inline audit_syscall_entry to reduce burden on archs

Every arch calls:

if (unlikely(current->audit_context))
	audit_syscall_entry()

which requires knowledge about audit (the existance of audit_context) in
the arch code.  Just do it all in static inline in audit.h so that arch's
can remain blissfully ignorant.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 arch/x86/ia32/ia32entry.S  |  2 +-
 arch/x86/kernel/entry_32.S |  2 +-
 arch/x86/kernel/entry_64.S |  4 ++--
 arch/x86/kernel/ptrace.c   | 22 ++++++++++------------
 4 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 025f0f0..cecfd9a 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -192,7 +192,7 @@ sysexit_from_sys_call:
 	movl %ebx,%edx			/* 3rd arg: 1st syscall arg */
 	movl %eax,%esi			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_I386,%edi	/* 1st arg: audit arch */
-	call audit_syscall_entry
+	call __audit_syscall_entry
 	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall number */
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index a22facf..1ccd742 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -456,7 +456,7 @@ sysenter_audit:
 	movl %ebx,%ecx			/* 3rd arg: 1st syscall arg */
 	movl %eax,%edx			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_I386,%eax	/* 1st arg: audit arch */
-	call audit_syscall_entry
+	call __audit_syscall_entry
 	pushl_cfi %ebx
 	movl PT_EAX(%esp),%eax		/* reload syscall number */
 	jmp sysenter_do_call
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e51393d..1ca66b6 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -549,7 +549,7 @@ badsys:
 #ifdef CONFIG_AUDITSYSCALL
 	/*
 	 * Fast path for syscall audit without full syscall trace.
-	 * We just call audit_syscall_entry() directly, and then
+	 * We just call __audit_syscall_entry() directly, and then
 	 * jump back to the normal fast path.
 	 */
 auditsys:
@@ -559,7 +559,7 @@ auditsys:
 	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
 	movq %rax,%rsi			/* 2nd arg: syscall number */
 	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
-	call audit_syscall_entry
+	call __audit_syscall_entry
 	LOAD_ARGS 0		/* reload call-clobbered registers */
 	jmp system_call_fastpath
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 8b02187..5026738 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->orig_ax);
 
-	if (unlikely(current->audit_context)) {
-		if (IS_IA32)
-			audit_syscall_entry(AUDIT_ARCH_I386,
-					    regs->orig_ax,
-					    regs->bx, regs->cx,
-					    regs->dx, regs->si);
+	if (IS_IA32)
+		audit_syscall_entry(AUDIT_ARCH_I386,
+				    regs->orig_ax,
+				    regs->bx, regs->cx,
+				    regs->dx, regs->si);
 #ifdef CONFIG_X86_64
-		else
-			audit_syscall_entry(AUDIT_ARCH_X86_64,
-					    regs->orig_ax,
-					    regs->di, regs->si,
-					    regs->dx, regs->r10);
+	else
+		audit_syscall_entry(AUDIT_ARCH_X86_64,
+				    regs->orig_ax,
+				    regs->di, regs->si,
+				    regs->dx, regs->r10);
 #endif
-	}
 
 	return ret ?: regs->orig_ax;
 }
-- 
cgit v1.1


From 6015ff103133c7e50a753c198c69bcabc3a5e3b0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 18 Jan 2012 01:51:22 +0000
Subject: x86-32: Fix build failure with AUDIT=y, AUDITSYSCALL=n

JONGMAN HEO reports:

  With current linus git (commit a25a2b84), I got following build error,

  arch/x86/kernel/vm86_32.c: In function 'do_sys_vm86':
  arch/x86/kernel/vm86_32.c:340: error: implicit declaration of function '__audit_syscall_exit'
  make[3]: *** [arch/x86/kernel/vm86_32.o] Error 1

OK, I can reproduce it (32bit allmodconfig with AUDIT=y, AUDITSYSCALL=n)

It's due to commit d7e7528bcd45: "Audit: push audit success and retcode
into arch ptrace.h".

Reported-by: JONGMAN HEO <jongman.heo@samsung.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/vm86_32.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index af17e1c..b466cab 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -336,8 +336,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 		mark_screen_rdonly(tsk->mm);
 
 	/*call __audit_syscall_exit since we do not exit via the normal paths */
+#ifdef CONFIG_AUDITSYSCALL
 	if (unlikely(current->audit_context))
 		__audit_syscall_exit(1, 0);
+#endif
 
 	__asm__ __volatile__(
 		"movl %0,%%esp\n\t"
-- 
cgit v1.1


From 90a4c0f51e8e44111a926be6f4c87af3938a79c3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 18 Jan 2012 19:26:11 -0800
Subject: uml: fix compile for x86-64

Randy Dunlap reports that we get

  arch/x86/um/shared/sysdep/ptrace.h:7:20: error: redefinition of 'regs_return_value'
  arch/x86/um/shared/sysdep/ptrace.h:7:20: note: previous definition of 'regs_return_value' was here

when compiling UML for x86-64.

Stephen Rothwell root-caused it and says:

 "Caused by commit d7e7528bcd45 ("Audit: push audit success and retcode
  into arch ptrace.h") (another patch that was never in linux-next :-().

  This file now needs protection against double inclusion."

so let's do as the man says.

Reported-by: Randy Dunlap <rdunlap@xenotime.net>
Analyzed-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/um/shared/sysdep/ptrace.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
index 5ef9344..2bbe1ec2 100644
--- a/arch/x86/um/shared/sysdep/ptrace.h
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -1,3 +1,6 @@
+#ifndef __SYSDEP_X86_PTRACE_H
+#define __SYSDEP_X86_PTRACE_H
+
 #ifdef __i386__
 #include "ptrace_32.h"
 #else
@@ -8,3 +11,5 @@ static inline long regs_return_value(struct uml_pt_regs *regs)
 {
 	return UPT_SYSCALL_RET(regs);
 }
+
+#endif /* __SYSDEP_X86_PTRACE_H */
-- 
cgit v1.1