From 81559f9ad3d88c033e4ec3b6468012dbfda3b31d Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 18 Oct 2011 13:33:02 +0300 Subject: crypto: twofish-x86_64-3way - add lrw support Patch adds LRW support for twofish-x86_64-3way by using lrw_crypt(). Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmarks results (twofish-3way/twofish-asm speed ratios): Intel Celeron T1600 (fam:6, model:15, step:13): size lrw-enc lrw-dec 16B 0.99x 1.00x 64B 1.17x 1.17x 256B 1.26x 1.27x 1024B 1.30x 1.31x 8192B 1.31x 1.32x AMD Phenom II 1055T (fam:16, model:10): size lrw-enc lrw-dec 16B 1.06x 1.01x 64B 1.08x 1.14x 256B 1.19x 1.20x 1024B 1.21x 1.22x 8192B 1.23x 1.24x Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_glue_3way.c | 135 ++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 5ede9c4..fa9151d 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -32,6 +32,11 @@ #include #include #include +#include + +#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE) +#define HAS_LRW +#endif /* regular block cipher functions from twofish_x86_64 module */ asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, @@ -432,6 +437,124 @@ static struct crypto_alg blk_ctr_alg = { }, }; +#ifdef HAS_LRW + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = TF_BLOCK_SIZE; + struct twofish_ctx *ctx = priv; + int i; + + if (nbytes == 3 * bsize) { + twofish_enc_blk_3way(ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + twofish_enc_blk(ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = TF_BLOCK_SIZE; + struct twofish_ctx *ctx = priv; + int i; + + if (nbytes == 3 * bsize) { + twofish_dec_blk_3way(ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + twofish_dec_blk(ctx, srcdst, srcdst); +} + +struct twofish_lrw_ctx { + struct lrw_table_ctx lrw_table; + struct twofish_ctx twofish_ctx; +}; + +static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + int err; + + err = __twofish_setkey(&ctx->twofish_ctx, key, keylen - TF_BLOCK_SIZE, + &tfm->crt_flags); + if (err) + return err; + + return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[3]; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &ctx->twofish_ctx, + .crypt_fn = encrypt_callback, + }; + + return lrw_crypt(desc, dst, src, nbytes, &req); +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[3]; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &ctx->twofish_ctx, + .crypt_fn = decrypt_callback, + }; + + return lrw_crypt(desc, dst, src, nbytes, &req); +} + +static void lrw_exit_tfm(struct crypto_tfm *tfm) +{ + struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + + lrw_free_table(&ctx->lrw_table); +} + +static struct crypto_alg blk_lrw_alg = { + .cra_name = "lrw(twofish)", + .cra_driver_name = "lrw-twofish-3way", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_lrw_alg.cra_list), + .cra_exit = lrw_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE, + .max_keysize = TF_MAX_KEY_SIZE + TF_BLOCK_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = lrw_twofish_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}; + +#endif + int __init init(void) { int err; @@ -445,9 +568,18 @@ int __init init(void) err = crypto_register_alg(&blk_ctr_alg); if (err) goto ctr_err; +#ifdef HAS_LRW + err = crypto_register_alg(&blk_lrw_alg); + if (err) + goto blk_lrw_err; +#endif return 0; +#ifdef HAS_LRW +blk_lrw_err: + crypto_unregister_alg(&blk_ctr_alg); +#endif ctr_err: crypto_unregister_alg(&blk_cbc_alg); cbc_err: @@ -458,6 +590,9 @@ ecb_err: void __exit fini(void) { +#ifdef HAS_LRW + crypto_unregister_alg(&blk_lrw_alg); +#endif crypto_unregister_alg(&blk_ctr_alg); crypto_unregister_alg(&blk_cbc_alg); crypto_unregister_alg(&blk_ecb_alg); -- cgit v1.1 From bae6d3038b7faff187f4207448a40b9912cf787d Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 18 Oct 2011 13:33:43 +0300 Subject: crypto: twofish-x86_64-3way - add xts support Patch adds XTS support for twofish-x86_64-3way by using xts_crypt(). Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmarks results (twofish-3way/twofish-asm speed ratios): Intel Celeron T1600 (fam:6, model:15, step:13): size xts-enc xts-dec 16B 0.98x 1.00x 64B 1.14x 1.15x 256B 1.23x 1.25x 1024B 1.26x 1.29x 8192B 1.28x 1.30x AMD Phenom II 1055T (fam:16, model:10): size xts-enc xts-dec 16B 1.03x 1.03x 64B 1.13x 1.16x 256B 1.20x 1.20x 1024B 1.22x 1.22x 8192B 1.22x 1.21x Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_glue_3way.c | 119 +++++++++++++++++++++++++++++++++++- 1 file changed, 117 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index fa9151d..954f59e 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -33,11 +33,16 @@ #include #include #include +#include #if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE) #define HAS_LRW #endif +#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE) +#define HAS_XTS +#endif + /* regular block cipher functions from twofish_x86_64 module */ asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, const u8 *src); @@ -437,7 +442,7 @@ static struct crypto_alg blk_ctr_alg = { }, }; -#ifdef HAS_LRW +#if defined(HAS_LRW) || defined(HAS_XTS) static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) { @@ -469,6 +474,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) twofish_dec_blk(ctx, srcdst, srcdst); } +#endif + +#ifdef HAS_LRW + struct twofish_lrw_ctx { struct lrw_table_ctx lrw_table; struct twofish_ctx twofish_ctx; @@ -555,6 +564,99 @@ static struct crypto_alg blk_lrw_alg = { #endif +#ifdef HAS_XTS + +struct twofish_xts_ctx { + struct twofish_ctx tweak_ctx; + struct twofish_ctx crypt_ctx; +}; + +static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + int err; + + /* key consists of keys of equal size concatenated, therefore + * the length must be even + */ + if (keylen % 2) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + /* first half of xts-key is for crypt */ + err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, + flags); +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[3]; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), + .crypt_ctx = &ctx->crypt_ctx, + .crypt_fn = encrypt_callback, + }; + + return xts_crypt(desc, dst, src, nbytes, &req); +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[3]; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk), + .crypt_ctx = &ctx->crypt_ctx, + .crypt_fn = decrypt_callback, + }; + + return xts_crypt(desc, dst, src, nbytes, &req); +} + +static struct crypto_alg blk_xts_alg = { + .cra_name = "xts(twofish)", + .cra_driver_name = "xts-twofish-3way", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_xts_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE * 2, + .max_keysize = TF_MAX_KEY_SIZE * 2, + .ivsize = TF_BLOCK_SIZE, + .setkey = xts_twofish_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}; + +#endif + int __init init(void) { int err; @@ -573,13 +675,23 @@ int __init init(void) if (err) goto blk_lrw_err; #endif +#ifdef HAS_XTS + err = crypto_register_alg(&blk_xts_alg); + if (err) + goto blk_xts_err; +#endif return 0; +#ifdef HAS_XTS + crypto_unregister_alg(&blk_xts_alg); +blk_xts_err: +#endif #ifdef HAS_LRW + crypto_unregister_alg(&blk_lrw_alg); blk_lrw_err: - crypto_unregister_alg(&blk_ctr_alg); #endif + crypto_unregister_alg(&blk_ctr_alg); ctr_err: crypto_unregister_alg(&blk_cbc_alg); cbc_err: @@ -590,6 +702,9 @@ ecb_err: void __exit fini(void) { +#ifdef HAS_XTS + crypto_unregister_alg(&blk_xts_alg); +#endif #ifdef HAS_LRW crypto_unregister_alg(&blk_lrw_alg); #endif -- cgit v1.1 From ff14c1d01576fb839a925a42596582f6c68a1a1a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 1 Nov 2011 15:58:16 +0200 Subject: x86, mm: Use MAX_DMA_PFN for ZONE_DMA on 32-bit Use MAX_DMA_PFN which represents the 16 MB ISA DMA limit on 32-bit x86 just like we do on 64-bit. Acked-by: Tejun Heo Acked-by: Yinghai Lu Acked-by: David Rientjes Signed-off-by: Pekka Enberg Link: http://lkml.kernel.org/r/1320155902-10424-1-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 29f7c6d9..434c97d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -679,8 +679,7 @@ static void __init zone_sizes_init(void) unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); #ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = - virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM -- cgit v1.1 From 4c0b2e5f8940fec7cbeafcf641fecd5e746329c5 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 1 Nov 2011 15:58:17 +0200 Subject: x86, mm: Move zone init from paging_init() on 64-bit This patch introduces a zone_sizes_init() helper function on 64-bit to make it more similar to 32-bit init. Acked-by: Tejun Heo Acked-by: Yinghai Lu Acked-by: David Rientjes Signed-off-by: Pekka Enberg Link: http://lkml.kernel.org/r/1320155902-10424-2-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bbaaa00..3ddda59 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -612,7 +612,7 @@ void __init initmem_init(void) } #endif -void __init paging_init(void) +static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; @@ -623,6 +623,11 @@ void __init paging_init(void) max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; max_zone_pfns[ZONE_NORMAL] = max_pfn; + free_area_init_nodes(max_zone_pfns); +} + +void __init paging_init(void) +{ sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); @@ -634,7 +639,7 @@ void __init paging_init(void) */ node_clear_state(0, N_NORMAL_MEMORY); - free_area_init_nodes(max_zone_pfns); + zone_sizes_init(); } /* -- cgit v1.1 From e4794640ca408acda18eb31b126f58a58803b9c9 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 1 Nov 2011 15:58:18 +0200 Subject: x86, mm: Use max_pfn instead of highend_pfn The 'highend_pfn' variable is always set to 'max_pfn' so just use the latter directly. Acked-by: Tejun Heo Acked-by: Yinghai Lu Signed-off-by: Pekka Enberg Link: http://lkml.kernel.org/r/1320155902-10424-3-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 434c97d..5ac0118 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -683,7 +683,7 @@ static void __init zone_sizes_init(void) #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif free_area_init_nodes(max_zone_pfns); -- cgit v1.1 From 80b3cac97bc14fdf839d967602e599cbf82ea336 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 1 Nov 2011 15:58:19 +0200 Subject: x86, mm: Wrap ZONE_DMA32 with CONFIG_ZONE_DMA32 In preparation for unifying 32-bit and 64-bit zone_sizes_init() make sure ZONE_DMA32 is wrapped in CONFIG_ZONE_DMA32. Acked-by: Tejun Heo Acked-by: Yinghai Lu Acked-by: David Rientjes Acked-by: Arun Sharma Signed-off-by: Pekka Enberg Link: http://lkml.kernel.org/r/1320155902-10424-4-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3ddda59..a9214e6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -620,7 +620,9 @@ static void __init zone_sizes_init(void) #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; #endif +#ifdef CONFIG_ZONE_DMA32 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; +#endif max_zone_pfns[ZONE_NORMAL] = max_pfn; free_area_init_nodes(max_zone_pfns); -- cgit v1.1 From ece838b6257412647197c072fe59dfc6615df144 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 1 Nov 2011 15:58:20 +0200 Subject: x86, mm: Use max_low_pfn for ZONE_NORMAL on 64-bit 64-bit has no highmem so max_low_pfn is always the same as 'max_pfn'. Acked-by: Tejun Heo Acked-by: Yinghai Lu Acked-by: David Rientjes Signed-off-by: Pekka Enberg Link: http://lkml.kernel.org/r/1320155902-10424-5-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a9214e6..f6b1f08 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -623,7 +623,7 @@ static void __init zone_sizes_init(void) #ifdef CONFIG_ZONE_DMA32 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; #endif - max_zone_pfns[ZONE_NORMAL] = max_pfn; + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; free_area_init_nodes(max_zone_pfns); } -- cgit v1.1 From 248b52b97da7a712d2263a51d8d84c959f38ef75 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 1 Nov 2011 15:58:21 +0200 Subject: x86, mm: Prepare zone_sizes_init() for unification Make 32-bit and 64-bit zone_sizes_init() identical in preparation for unification. Acked-by: Tejun Heo Acked-by: Yinghai Lu Signed-off-by: Pekka Enberg Link: http://lkml.kernel.org/r/1320155902-10424-6-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ++++ arch/x86/mm/init_64.c | 3 +++ 2 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5ac0118..27455b9 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -677,10 +677,14 @@ void __init initmem_init(void) static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; #endif +#ifdef CONFIG_ZONE_DMA32 + max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; +#endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM max_zone_pfns[ZONE_HIGHMEM] = max_pfn; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f6b1f08..06c4360 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -624,6 +624,9 @@ static void __init zone_sizes_init(void) max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; +#endif free_area_init_nodes(max_zone_pfns); } -- cgit v1.1 From 176239153049a023d060ce95b05f7ef31667e362 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 1 Nov 2011 15:58:22 +0200 Subject: x86, mm: Unify zone_sizes_init() Now that zone_sizes_init() is identical on 32-bit and 64-bit, move the code to arch/x86/mm/init.c and use it for both architectures. Acked-by: Tejun Heo Acked-by: Yinghai Lu Signed-off-by: Pekka Enberg Link: http://lkml.kernel.org/r/1320155902-10424-7-git-send-email-penberg@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/init.h | 2 ++ arch/x86/mm/init.c | 23 +++++++++++++++++++++++ arch/x86/mm/init_32.c | 19 ------------------- arch/x86/mm/init_64.c | 19 ------------------- 4 files changed, 25 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h index 8dbe353..adcc0ae 100644 --- a/arch/x86/include/asm/init.h +++ b/arch/x86/include/asm/init.h @@ -5,6 +5,8 @@ extern void __init early_ioremap_page_table_range_init(void); #endif +extern void __init zone_sizes_init(void); + extern unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long end, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 87488b9..2426b60 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -3,6 +3,7 @@ #include #include #include +#include /* for max_low_pfn */ #include #include @@ -15,6 +16,7 @@ #include #include #include +#include /* for MAX_DMA_PFN */ unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; @@ -392,3 +394,24 @@ void free_initrd_mem(unsigned long start, unsigned long end) free_init_pages("initrd memory", start, PAGE_ALIGN(end)); } #endif + +void __init zone_sizes_init(void) +{ + unsigned long max_zone_pfns[MAX_NR_ZONES]; + + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + +#ifdef CONFIG_ZONE_DMA + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; +#endif +#ifdef CONFIG_ZONE_DMA32 + max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; +#endif + max_zone_pfns[ZONE_NORMAL] = max_low_pfn; +#ifdef CONFIG_HIGHMEM + max_zone_pfns[ZONE_HIGHMEM] = max_pfn; +#endif + + free_area_init_nodes(max_zone_pfns); +} + diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 27455b9..3bebaed 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -674,25 +674,6 @@ void __init initmem_init(void) } #endif /* !CONFIG_NEED_MULTIPLE_NODES */ -static void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); -#ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; -#endif -#ifdef CONFIG_ZONE_DMA32 - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; -#endif - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; -#ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_HIGHMEM] = max_pfn; -#endif - - free_area_init_nodes(max_zone_pfns); -} - void __init setup_bootmem_allocator(void) { printk(KERN_INFO " mapped low ram: 0 - %08lx\n", diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 06c4360..6fcce7d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -612,25 +612,6 @@ void __init initmem_init(void) } #endif -static void __init zone_sizes_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); -#ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; -#endif -#ifdef CONFIG_ZONE_DMA32 - max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; -#endif - max_zone_pfns[ZONE_NORMAL] = max_low_pfn; -#ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_HIGHMEM] = max_pfn; -#endif - - free_area_init_nodes(max_zone_pfns); -} - void __init paging_init(void) { sparse_memory_present_with_active_regions(MAX_NUMNODES); -- cgit v1.1 From bcb71abe7d4c5a0d0368c67da0a7def4fc73497a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Oct 2011 15:56:24 -0400 Subject: iommu: Add option to group multi-function devices The option iommu=group_mf indicates the that the iommu driver should expose all functions of a multi-function PCI device as the same iommu_device_group. This is useful for disallowing individual functions being exposed as independent devices to userspace as there are often hidden dependencies. Virtual functions are not affected by this option. Signed-off-by: Alex Williamson Signed-off-by: Joerg Roedel --- arch/x86/include/asm/iommu.h | 1 + arch/x86/kernel/pci-dma.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index 345c99c..dffc38e 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -5,6 +5,7 @@ extern struct dma_map_ops nommu_dma_ops; extern int force_iommu, no_iommu; extern int iommu_detected; extern int iommu_pass_through; +extern int iommu_group_mf; /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 80dc793..1c4d769 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -45,6 +45,15 @@ int iommu_detected __read_mostly = 0; */ int iommu_pass_through __read_mostly; +/* + * Group multi-function PCI devices into a single device-group for the + * iommu_device_group interface. This tells the iommu driver to pretend + * it cannot distinguish between functions of a device, exposing only one + * group for the device. Useful for disallowing use of individual PCI + * functions from userspace drivers. + */ +int iommu_group_mf __read_mostly; + extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; /* Dummy device used for NULL arguments (normally ISA). */ @@ -169,6 +178,8 @@ static __init int iommu_setup(char *p) #endif if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; + if (!strncmp(p, "group_mf", 8)) + iommu_group_mf = 1; gart_parse_options(p); -- cgit v1.1 From b82e324b3c46a554595c12b45465d1943a57326c Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 10 Nov 2011 13:18:09 +0000 Subject: serial, mfd: don't hardcode the console Add support to specify which HSU port to use as an early console. This can be selected by passing "earlyprintk=hsu" on the kernel command line. By default port 0 is still used. Signed-off-by: Mika Westerberg Signed-off-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mrst.h | 2 +- arch/x86/kernel/early_printk.c | 2 +- arch/x86/platform/mrst/early_printk_mrst.c | 16 ++++++++++++---- 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 719f00b..4707760 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -51,7 +51,7 @@ extern struct console early_mrst_console; extern void mrst_early_console_init(void); extern struct console early_hsu_console; -extern void hsu_early_console_init(void); +extern void hsu_early_console_init(const char *); extern void intel_scu_devices_create(void); extern void intel_scu_devices_destroy(void); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index cd28a35..9d42a52 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -247,7 +247,7 @@ static int __init setup_early_printk(char *buf) } if (!strncmp(buf, "hsu", 3)) { - hsu_early_console_init(); + hsu_early_console_init(buf + 3); early_console_register(&early_hsu_console, keep); } #endif diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c index 25bfdbb..3c6e328 100644 --- a/arch/x86/platform/mrst/early_printk_mrst.c +++ b/arch/x86/platform/mrst/early_printk_mrst.c @@ -245,16 +245,24 @@ struct console early_mrst_console = { * Following is the early console based on Medfield HSU (High * Speed UART) device. */ -#define HSU_PORT2_PADDR 0xffa28180 +#define HSU_PORT_BASE 0xffa28080 static void __iomem *phsu; -void hsu_early_console_init(void) +void hsu_early_console_init(const char *s) { + unsigned long paddr, port = 0; u8 lcr; - phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, - HSU_PORT2_PADDR); + /* + * Select the early HSU console port if specified by user in the + * kernel command line. + */ + if (*s && !kstrtoul(s, 10, &port)) + port = clamp_val(port, 0, 2); + + paddr = HSU_PORT_BASE + port * 0x80; + phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr); /* Disable FIFO */ writeb(0x0, phsu + UART_FCR); -- cgit v1.1 From b7641d2c83aa10031bf45afd82619bfaaedcbc6f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 11 Nov 2011 15:43:02 -0800 Subject: x86-64, syscall: Adjust comment spacing and remove typo Adjust spacing for comment so that it matches the multiline comment style used in the rest of the kernel, and remove word duplication. It is not really clear what version of gcc this refers to, but the extra & doesn't cause any harm, so there is no reason to remove it. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/syscall_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index de87d60..0edfafa 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -21,9 +21,9 @@ extern void sys_ni_syscall(void); const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* - *Smells like a like a compiler bug -- it doesn't work - *when the & below is removed. - */ + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ [0 ... __NR_syscall_max] = &sys_ni_syscall, #include }; -- cgit v1.1 From e79a7fccfb2ab10f8753ac634a1c8473e870ae6c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 11 Nov 2011 15:48:42 -0800 Subject: x86-64, ia32: Move compat_ni_syscall into C and its own file Move compat_ni_syscall out of ia32entry.S and into its own .c file. Although this is a trivial function, it is not performance-critical, and this will simplify further cleanups. Signed-off-by: H. Peter Anvin --- arch/x86/ia32/Makefile | 1 + arch/x86/ia32/ia32entry.S | 3 --- arch/x86/ia32/nosyscall.c | 7 +++++++ 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 arch/x86/ia32/nosyscall.c (limited to 'arch/x86') diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index 52d0ccf..eea9a1c 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -3,6 +3,7 @@ # obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o +obj-$(CONFIG_IA32_EMULATION) += nosyscall.o sysv-$(CONFIG_SYSVIPC) := ipc32.o obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a6253ec..59538a77 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -453,9 +453,6 @@ ia32_badsys: movq $-ENOSYS,%rax jmp ia32_sysret -quiet_ni_syscall: - movq $-ENOSYS,%rax - ret CFI_ENDPROC .macro PTREGSCALL label, func, arg diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c new file mode 100644 index 0000000..51ecd5b --- /dev/null +++ b/arch/x86/ia32/nosyscall.c @@ -0,0 +1,7 @@ +#include +#include + +long compat_ni_syscall(void) +{ + return -ENOSYS; +} -- cgit v1.1 From d181764ccf6207e02abb95fb3052639b947f4833 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 11 Nov 2011 15:55:49 -0800 Subject: x86: Machine-readable syscall tables and scripts to process them Create a simple set of syscall tables and scripts to turn them into both header files (unistd_*.h) and macros for generating the system call tables. Signed-off-by: H. Peter Anvin --- arch/x86/syscalls/Makefile | 43 +++++ arch/x86/syscalls/syscall_32.tbl | 357 +++++++++++++++++++++++++++++++++++++++ arch/x86/syscalls/syscall_64.tbl | 320 +++++++++++++++++++++++++++++++++++ arch/x86/syscalls/syscallhdr.sh | 36 ++++ arch/x86/syscalls/syscalltbl.sh | 15 ++ 5 files changed, 771 insertions(+) create mode 100644 arch/x86/syscalls/Makefile create mode 100644 arch/x86/syscalls/syscall_32.tbl create mode 100644 arch/x86/syscalls/syscall_64.tbl create mode 100644 arch/x86/syscalls/syscallhdr.sh create mode 100644 arch/x86/syscalls/syscalltbl.sh (limited to 'arch/x86') diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile new file mode 100644 index 0000000..564b247 --- /dev/null +++ b/arch/x86/syscalls/Makefile @@ -0,0 +1,43 @@ +out := $(obj)/../include/generated/asm + +# Create output directory if not already present +_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') + +syscall32 := $(srctree)/$(src)/syscall_32.tbl +syscall64 := $(srctree)/$(src)/syscall_64.tbl + +syshdr := $(srctree)/$(src)/syscallhdr.sh +systbl := $(srctree)/$(src)/syscalltbl.sh + +quiet_cmd_syshdr = SYSHDR $@ + cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' $< $@ \ + $(syshdr_abi_$(basetarget)) $(syshdr_pfx_$(basetarget)) +quiet_cmd_systbl = SYSTBL $@ + cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@ + +syshdr_abi_unistd_32 := i386 +$(out)/unistd_32.h: $(syscall32) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_32_ia32 := i386 +syshdr_pfx_unistd_32_ia32 := ia32_ +$(out)/unistd_32_ia32.h: $(syscall32) $(syshdr) + $(call if_changed,syshdr) + +syshdr_abi_unistd_64 := 64 +$(out)/unistd_64.h: $(syscall64) $(syshdr) + $(call if_changed,syshdr) + +$(out)/syscalls_32.h: $(syscall32) $(systbl) + $(call if_changed,systbl) +$(out)/syscalls_64.h: $(syscall64) $(systbl) + $(call if_changed,systbl) + +syshdr-y += unistd_32.h unistd_64.h +syshdr-y += syscalls_32.h +syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h +syshdr-$(CONFIG_X86_64) += syscalls_64.h + +targets += $(syshdr-y) + +all: $(addprefix $(out)/,$(targets)) diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl new file mode 100644 index 0000000..ce98e28 --- /dev/null +++ b/arch/x86/syscalls/syscall_32.tbl @@ -0,0 +1,357 @@ +# +# 32-bit system call numbers and entry vectors +# +# The format is: +# +# +# The abi is always "i386" for this file. +# +0 i386 restart_syscall sys_restart_syscall +1 i386 exit sys_exit +2 i386 fork ptregs_fork stub32_fork +3 i386 read sys_read +4 i386 write sys_write +5 i386 open sys_open compat_sys_open +6 i386 close sys_close +7 i386 waitpid sys_waitpid sys32_waitpid +8 i386 creat sys_creat +9 i386 link sys_link +10 i386 unlink sys_unlink +11 i386 execve ptregs_execve stub32_execve +12 i386 chdir sys_chdir +13 i386 time sys_time compat_sys_time +14 i386 mknod sys_mknod +15 i386 chmod sys_chmod +16 i386 lchown sys_lchown16 +17 i386 break +18 i386 oldstat sys_stat +19 i386 lseek sys_lseek sys32_lseek +20 i386 getpid sys_getpid +21 i386 mount sys_mount compat_sys_mount +22 i386 umount sys_oldumount +23 i386 setuid sys_setuid16 +24 i386 getuid sys_getuid16 +25 i386 stime sys_stime compat_sys_stime +26 i386 ptrace sys_ptrace compat_sys_ptrace +27 i386 alarm sys_alarm +28 i386 oldfstat sys_fstat +29 i386 pause sys_pause +30 i386 utime sys_utime compat_sys_utime +31 i386 stty +32 i386 gtty +33 i386 access sys_access +34 i386 nice sys_nice +35 i386 ftime +36 i386 sync sys_sync +37 i386 kill sys_kill sys32_kill +38 i386 rename sys_rename +39 i386 mkdir sys_mkdir +40 i386 rmdir sys_rmdir +41 i386 dup sys_dup +42 i386 pipe sys_pipe +43 i386 times sys_times compat_sys_times +44 i386 prof +45 i386 brk sys_brk +46 i386 setgid sys_setgid16 +47 i386 getgid sys_getgid16 +48 i386 signal sys_signal +49 i386 geteuid sys_geteuid16 +50 i386 getegid sys_getegid16 +51 i386 acct sys_acct +52 i386 umount2 sys_umount +53 i386 lock +54 i386 ioctl sys_ioctl compat_sys_ioctl +55 i386 fcntl sys_fcntl compat_sys_fcntl64 +56 i386 mpx +57 i386 setpgid sys_setpgid +58 i386 ulimit +59 i386 oldolduname sys_olduname +60 i386 umask sys_umask +61 i386 chroot sys_chroot +62 i386 ustat sys_ustat compat_sys_ustat +63 i386 dup2 sys_dup2 +64 i386 getppid sys_getppid +65 i386 getpgrp sys_getpgrp +66 i386 setsid sys_setsid +67 i386 sigaction sys_sigaction sys32_sigaction +68 i386 sgetmask sys_sgetmask +69 i386 ssetmask sys_ssetmask +70 i386 setreuid sys_setreuid16 +71 i386 setregid sys_setregid16 +72 i386 sigsuspend sys_sigsuspend sys32_sigsuspend +73 i386 sigpending sys_sigpending compat_sys_sigpending +74 i386 sethostname sys_sethostname +75 i386 setrlimit sys_setrlimit compat_sys_setrlimit +76 i386 getrlimit sys_old_getrlimit compat_sys_old_getrlimit +77 i386 getrusage sys_getrusage compat_sys_getrusage +78 i386 gettimeofday sys_gettimeofday compat_sys_gettimeofday +79 i386 settimeofday sys_settimeofday compat_sys_settimeofday +80 i386 getgroups sys_getgroups16 +81 i386 setgroups sys_setgroups16 +82 i386 select sys_old_select compat_sys_old_select +83 i386 symlink sys_symlink +84 i386 oldlstat sys_lstat +85 i386 readlink sys_readlink +86 i386 uselib sys_uselib +87 i386 swapon sys_swapon +88 i386 reboot sys_reboot +89 i386 readdir sys_old_readdir compat_sys_old_readdir +90 i386 mmap sys_old_mmap sys32_mmap +91 i386 munmap sys_munmap +92 i386 truncate sys_truncate +93 i386 ftruncate sys_ftruncate +94 i386 fchmod sys_fchmod +95 i386 fchown sys_fchown16 +96 i386 getpriority sys_getpriority +97 i386 setpriority sys_setpriority +98 i386 profil +99 i386 statfs sys_statfs compat_sys_statfs +100 i386 fstatfs sys_fstatfs compat_sys_fstatfs +101 i386 ioperm sys_ioperm +102 i386 socketcall sys_socketcall compat_sys_socketcall +103 i386 syslog sys_syslog +104 i386 setitimer sys_setitimer compat_sys_setitimer +105 i386 getitimer sys_getitimer compat_sys_getitimer +106 i386 stat sys_newstat compat_sys_newstat +107 i386 lstat sys_newlstat compat_sys_newlstat +108 i386 fstat sys_newfstat compat_sys_newfstat +109 i386 olduname sys_uname +110 i386 iopl ptregs_iopl stub32_iopl +111 i386 vhangup sys_vhangup +112 i386 idle +113 i386 vm86old ptregs_vm86old sys32_vm86_warning +114 i386 wait4 sys_wait4 compat_sys_wait4 +115 i386 swapoff sys_swapoff +116 i386 sysinfo sys_sysinfo compat_sys_sysinfo +117 i386 ipc sys_ipc sys32_ipc +118 i386 fsync sys_fsync +119 i386 sigreturn ptregs_sigreturn stub32_sigreturn +120 i386 clone ptregs_clone stub32_clone +121 i386 setdomainname sys_setdomainname +122 i386 uname sys_newuname +123 i386 modify_ldt sys_modify_ldt +124 i386 adjtimex sys_adjtimex compat_sys_adjtimex +125 i386 mprotect sys_mprotect sys32_mprotect +126 i386 sigprocmask sys_sigprocmask compat_sys_sigprocmask +127 i386 create_module +128 i386 init_module sys_init_module +129 i386 delete_module sys_delete_module +130 i386 get_kernel_syms +131 i386 quotactl sys_quotactl sys32_quotactl +132 i386 getpgid sys_getpgid +133 i386 fchdir sys_fchdir +134 i386 bdflush sys_bdflush +135 i386 sysfs sys_sysfs +136 i386 personality sys_personality +137 i386 afs_syscall +138 i386 setfsuid sys_setfsuid16 +139 i386 setfsgid sys_setfsgid16 +140 i386 _llseek sys_llseek +141 i386 getdents sys_getdents compat_sys_getdents +142 i386 _newselect sys_select compat_sys_select +143 i386 flock sys_flock +144 i386 msync sys_msync +145 i386 readv sys_readv compat_sys_readv +146 i386 writev sys_writev compat_sys_writev +147 i386 getsid sys_getsid +148 i386 fdatasync sys_fdatasync +149 i386 _sysctl sys_sysctl compat_sys_sysctl +150 i386 mlock sys_mlock +151 i386 munlock sys_munlock +152 i386 mlockall sys_mlockall +153 i386 munlockall sys_munlockall +154 i386 sched_setparam sys_sched_setparam +155 i386 sched_getparam sys_sched_getparam +156 i386 sched_setscheduler sys_sched_setscheduler +157 i386 sched_getscheduler sys_sched_getscheduler +158 i386 sched_yield sys_sched_yield +159 i386 sched_get_priority_max sys_sched_get_priority_max +160 i386 sched_get_priority_min sys_sched_get_priority_min +161 i386 sched_rr_get_interval sys_sched_rr_get_interval sys32_sched_rr_get_interval +162 i386 nanosleep sys_nanosleep compat_sys_nanosleep +163 i386 mremap sys_mremap +164 i386 setresuid sys_setresuid16 +165 i386 getresuid sys_getresuid16 +166 i386 vm86 ptregs_vm86 sys32_vm86_warning +167 i386 query_module +168 i386 poll sys_poll +169 i386 nfsservctl +170 i386 setresgid sys_setresgid16 +171 i386 getresgid sys_getresgid16 +172 i386 prctl sys_prctl +173 i386 rt_sigreturn ptregs_rt_sigreturn stub32_rt_sigreturn +174 i386 rt_sigaction sys_rt_sigaction sys32_rt_sigaction +175 i386 rt_sigprocmask sys_rt_sigprocmask sys32_rt_sigprocmask +176 i386 rt_sigpending sys_rt_sigpending sys32_rt_sigpending +177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait +178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo sys32_rt_sigqueueinfo +179 i386 rt_sigsuspend sys_rt_sigsuspend +180 i386 pread64 sys_pread64 sys32_pread +181 i386 pwrite64 sys_pwrite64 sys32_pwrite +182 i386 chown sys_chown16 +183 i386 getcwd sys_getcwd +184 i386 capget sys_capget +185 i386 capset sys_capset +186 i386 sigaltstack ptregs_sigaltstack stub32_sigaltstack +187 i386 sendfile sys_sendfile sys32_sendfile +188 i386 getpmsg +189 i386 putpmsg +190 i386 vfork ptregs_vfork stub32_vfork +191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit +192 i386 mmap2 sys_mmap_pgoff +193 i386 truncate64 sys_truncate64 sys32_truncate64 +194 i386 ftruncate64 sys_ftruncate64 sys32_ftruncate64 +195 i386 stat64 sys_stat64 sys32_stat64 +196 i386 lstat64 sys_lstat64 sys32_lstat64 +197 i386 fstat64 sys_fstat64 sys32_fstat64 +198 i386 lchown32 sys_lchown +199 i386 getuid32 sys_getuid +200 i386 getgid32 sys_getgid +201 i386 geteuid32 sys_geteuid +202 i386 getegid32 sys_getegid +203 i386 setreuid32 sys_setreuid +204 i386 setregid32 sys_setregid +205 i386 getgroups32 sys_getgroups +206 i386 setgroups32 sys_setgroups +207 i386 fchown32 sys_fchown +208 i386 setresuid32 sys_setresuid +209 i386 getresuid32 sys_getresuid +210 i386 setresgid32 sys_setresgid +211 i386 getresgid32 sys_getresgid +212 i386 chown32 sys_chown +213 i386 setuid32 sys_setuid +214 i386 setgid32 sys_setgid +215 i386 setfsuid32 sys_setfsuid +216 i386 setfsgid32 sys_setfsgid +217 i386 pivot_root sys_pivot_root +218 i386 mincore sys_mincore +219 i386 madvise sys_madvise +220 i386 getdents64 sys_getdents64 compat_sys_getdents64 +221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64 +# 222 is unused +# 223 is unused +224 i386 gettid sys_gettid +225 i386 readahead sys_readahead sys32_readahead +226 i386 setxattr sys_setxattr +227 i386 lsetxattr sys_lsetxattr +228 i386 fsetxattr sys_fsetxattr +229 i386 getxattr sys_getxattr +230 i386 lgetxattr sys_lgetxattr +231 i386 fgetxattr sys_fgetxattr +232 i386 listxattr sys_listxattr +233 i386 llistxattr sys_llistxattr +234 i386 flistxattr sys_flistxattr +235 i386 removexattr sys_removexattr +236 i386 lremovexattr sys_lremovexattr +237 i386 fremovexattr sys_fremovexattr +238 i386 tkill sys_tkill +239 i386 sendfile64 sys_sendfile64 +240 i386 futex sys_futex compat_sys_futex +241 i386 sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity +242 i386 sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity +243 i386 set_thread_area sys_set_thread_area +244 i386 get_thread_area sys_get_thread_area +245 i386 io_setup sys_io_setup compat_sys_io_setup +246 i386 io_destroy sys_io_destroy +247 i386 io_getevents sys_io_getevents compat_sys_io_getevents +248 i386 io_submit sys_io_submit compat_sys_io_submit +249 i386 io_cancel sys_io_cancel +250 i386 fadvise64 sys_fadvise64 sys32_fadvise64 +# 251 is available for reuse (was briefly sys_set_zone_reclaim) +252 i386 exit_group sys_exit_group +253 i386 lookup_dcookie sys_lookup_dcookie sys32_lookup_dcookie +254 i386 epoll_create sys_epoll_create +255 i386 epoll_ctl sys_epoll_ctl +256 i386 epoll_wait sys_epoll_wait +257 i386 remap_file_pages sys_remap_file_pages +258 i386 set_tid_address sys_set_tid_address +259 i386 timer_create sys_timer_create compat_sys_timer_create +260 i386 timer_settime sys_timer_settime compat_sys_timer_settime +261 i386 timer_gettime sys_timer_gettime compat_sys_timer_gettime +262 i386 timer_getoverrun sys_timer_getoverrun +263 i386 timer_delete sys_timer_delete +264 i386 clock_settime sys_clock_settime compat_sys_clock_settime +265 i386 clock_gettime sys_clock_gettime compat_sys_clock_gettime +266 i386 clock_getres sys_clock_getres compat_sys_clock_getres +267 i386 clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep +268 i386 statfs64 sys_statfs64 compat_sys_statfs64 +269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 +270 i386 tgkill sys_tgkill +271 i386 utimes sys_utimes compat_sys_utimes +272 i386 fadvise64_64 sys_fadvise64_64 sys32_fadvise64_64 +273 i386 vserver +274 i386 mbind sys_mbind +275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy +276 i386 set_mempolicy sys_set_mempolicy +277 i386 mq_open sys_mq_open compat_sys_mq_open +278 i386 mq_unlink sys_mq_unlink +279 i386 mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend +280 i386 mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive +281 i386 mq_notify sys_mq_notify compat_sys_mq_notify +282 i386 mq_getsetaddr sys_mq_getsetattr compat_sys_mq_getsetattr +283 i386 kexec_load sys_kexec_load compat_sys_kexec_load +284 i386 waitid sys_waitid compat_sys_waitid +# 285 sys_setaltroot +286 i386 add_key sys_add_key +287 i386 request_key sys_request_key +288 i386 keyctl sys_keyctl +289 i386 ioprio_set sys_ioprio_set +290 i386 ioprio_get sys_ioprio_get +291 i386 inotify_init sys_inotify_init +292 i386 inotify_add_watch sys_inotify_add_watch +293 i386 inotify_rm_watch sys_inotify_rm_watch +294 i386 migrate_pages sys_migrate_pages +295 i386 openat sys_openat compat_sys_openat +296 i386 mkdirat sys_mkdirat +297 i386 mknodat sys_mknodat +298 i386 fchownat sys_fchownat +299 i386 futimesat sys_futimesat compat_sys_futimesat +300 i386 fstatat64 sys_fstatat64 sys32_fstatat +301 i386 unlinkat sys_unlinkat +302 i386 renameat sys_renameat +303 i386 linkat sys_linkat +304 i386 symlinkat sys_symlinkat +305 i386 readlinkat sys_readlinkat +306 i386 fchmodat sys_fchmodat +307 i386 faccessat sys_faccessat +308 i386 pselect6 sys_pselect6 compat_sys_pselect6 +309 i386 ppoll sys_ppoll compat_sys_ppoll +310 i386 unshare sys_unshare +311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list +312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list +313 i386 splice sys_splice +314 i386 sync_file_range sys_sync_file_range sys32_sync_file_range +315 i386 tee sys_tee +316 i386 vmsplice sys_vmsplice compat_sys_vmsplice +317 i386 move_pages sys_move_pages compat_sys_move_pages +318 i386 getcpu sys_getcpu +319 i386 epoll_pwait sys_epoll_pwait +320 i386 utimensat sys_utimensat compat_sys_utimensat +321 i386 signalfd sys_signalfd compat_sys_signalfd +322 i386 timerfd_create sys_timerfd_create +323 i386 eventfd sys_eventfd +324 i386 fallocate sys_fallocate sys32_fallocate +325 i386 timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime +326 i386 timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime +327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4 +328 i386 eventfd2 sys_eventfd2 +329 i386 epoll_create1 sys_epoll_create1 +330 i386 dup3 sys_dup3 +331 i386 pipe2 sys_pipe2 +332 i386 inotify_init1 sys_inotify_init1 +333 i386 preadv sys_preadv compat_sys_preadv +334 i386 pwritev sys_pwritev compat_sys_pwritev +335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +336 i386 perf_event_open sys_perf_event_open +337 i386 recvmmsg sys_recvmmsg compat_sys_recvmmsg +338 i386 fanotify_init sys_fanotify_init +339 i386 fanotify_mark sys_fanotify_mark sys32_fanotify_mark +340 i386 prlimit64 sys_prlimit64 +341 i386 name_to_handle_at sys_name_to_handle_at +342 i386 open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at +343 i386 clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime +344 i386 syncfs sys_syncfs +345 i386 sendmmsg sys_sendmmsg compat_sys_sendmmsg +346 i386 setns sys_setns +347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv +348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl new file mode 100644 index 0000000..b440a8f --- /dev/null +++ b/arch/x86/syscalls/syscall_64.tbl @@ -0,0 +1,320 @@ +# +# 64-bit system call numbers and entry vectors +# +# The format is: +# +# +# The abi is always "64" for this file (for now.) +# +0 64 read sys_read +1 64 write sys_write +2 64 open sys_open +3 64 close sys_close +4 64 stat sys_newstat +5 64 fstat sys_newfstat +6 64 lstat sys_newlstat +7 64 poll sys_poll +8 64 lseek sys_lseek +9 64 mmap sys_mmap +10 64 mprotect sys_mprotect +11 64 munmap sys_munmap +12 64 brk sys_brk +13 64 rt_sigaction sys_rt_sigaction +14 64 rt_sigprocmask sys_rt_sigprocmask +15 64 rt_sigreturn stub_rt_sigreturn +16 64 ioctl sys_ioctl +17 64 pread64 sys_pread64 +18 64 pwrite64 sys_pwrite64 +19 64 readv sys_readv +20 64 writev sys_writev +21 64 access sys_access +22 64 pipe sys_pipe +23 64 select sys_select +24 64 sched_yield sys_sched_yield +25 64 mremap sys_mremap +26 64 msync sys_msync +27 64 mincore sys_mincore +28 64 madvise sys_madvise +29 64 shmget sys_shmget +30 64 shmat sys_shmat +31 64 shmctl sys_shmctl +32 64 dup sys_dup +33 64 dup2 sys_dup2 +34 64 pause sys_pause +35 64 nanosleep sys_nanosleep +36 64 getitimer sys_getitimer +37 64 alarm sys_alarm +38 64 setitimer sys_setitimer +39 64 getpid sys_getpid +40 64 sendfile sys_sendfile64 +41 64 socket sys_socket +42 64 connect sys_connect +43 64 accept sys_accept +44 64 sendto sys_sendto +45 64 recvfrom sys_recvfrom +46 64 sendmsg sys_sendmsg +47 64 recvmsg sys_recvmsg +48 64 shutdown sys_shutdown +49 64 bind sys_bind +50 64 listen sys_listen +51 64 getsockname sys_getsockname +52 64 getpeername sys_getpeername +53 64 socketpair sys_socketpair +54 64 setsockopt sys_setsockopt +55 64 getsockopt sys_getsockopt +56 64 clone stub_clone +57 64 fork stub_fork +58 64 vfork stub_vfork +59 64 execve stub_execve +60 64 exit sys_exit +61 64 wait4 sys_wait4 +62 64 kill sys_kill +63 64 uname sys_newuname +64 64 semget sys_semget +65 64 semop sys_semop +66 64 semctl sys_semctl +67 64 shmdt sys_shmdt +68 64 msgget sys_msgget +69 64 msgsnd sys_msgsnd +70 64 msgrcv sys_msgrcv +71 64 msgctl sys_msgctl +72 64 fcntl sys_fcntl +73 64 flock sys_flock +74 64 fsync sys_fsync +75 64 fdatasync sys_fdatasync +76 64 truncate sys_truncate +77 64 ftruncate sys_ftruncate +78 64 getdents sys_getdents +79 64 getcwd sys_getcwd +80 64 chdir sys_chdir +81 64 fchdir sys_fchdir +82 64 rename sys_rename +83 64 mkdir sys_mkdir +84 64 rmdir sys_rmdir +85 64 creat sys_creat +86 64 link sys_link +87 64 unlink sys_unlink +88 64 symlink sys_symlink +89 64 readlink sys_readlink +90 64 chmod sys_chmod +91 64 fchmod sys_fchmod +92 64 chown sys_chown +93 64 fchown sys_fchown +94 64 lchown sys_lchown +95 64 umask sys_umask +96 64 gettimeofday sys_gettimeofday +97 64 getrlimit sys_getrlimit +98 64 getrusage sys_getrusage +99 64 sysinfo sys_sysinfo +100 64 times sys_times +101 64 ptrace sys_ptrace +102 64 getuid sys_getuid +103 64 syslog sys_syslog +104 64 getgid sys_getgid +105 64 setuid sys_setuid +106 64 setgid sys_setgid +107 64 geteuid sys_geteuid +108 64 getegid sys_getegid +109 64 setpgid sys_setpgid +110 64 getppid sys_getppid +111 64 getpgrp sys_getpgrp +112 64 setsid sys_setsid +113 64 setreuid sys_setreuid +114 64 setregid sys_setregid +115 64 getgroups sys_getgroups +116 64 setgroups sys_setgroups +117 64 setresuid sys_setresuid +118 64 getresuid sys_getresuid +119 64 setresgid sys_setresgid +120 64 getresgid sys_getresgid +121 64 getpgid sys_getpgid +122 64 setfsuid sys_setfsuid +123 64 setfsgid sys_setfsgid +124 64 getsid sys_getsid +125 64 capget sys_capget +126 64 capset sys_capset +127 64 rt_sigpending sys_rt_sigpending +128 64 rt_sigtimedwait sys_rt_sigtimedwait +129 64 rt_sigqueueinfo sys_rt_sigqueueinfo +130 64 rt_sigsuspend sys_rt_sigsuspend +131 64 sigaltstack stub_sigaltstack +132 64 utime sys_utime +133 64 mknod sys_mknod +134 64 uselib +135 64 personality sys_personality +136 64 ustat sys_ustat +137 64 statfs sys_statfs +138 64 fstatfs sys_fstatfs +139 64 sysfs sys_sysfs +140 64 getpriority sys_getpriority +141 64 setpriority sys_setpriority +142 64 sched_setparam sys_sched_setparam +143 64 sched_getparam sys_sched_getparam +144 64 sched_setscheduler sys_sched_setscheduler +145 64 sched_getscheduler sys_sched_getscheduler +146 64 sched_get_priority_max sys_sched_get_priority_max +147 64 sched_get_priority_min sys_sched_get_priority_min +148 64 sched_rr_get_interval sys_sched_rr_get_interval +149 64 mlock sys_mlock +150 64 munlock sys_munlock +151 64 mlockall sys_mlockall +152 64 munlockall sys_munlockall +153 64 vhangup sys_vhangup +154 64 modify_ldt sys_modify_ldt +155 64 pivot_root sys_pivot_root +156 64 _sysctl sys_sysctl +157 64 prctl sys_prctl +158 64 arch_prctl sys_arch_prctl +159 64 adjtimex sys_adjtimex +160 64 setrlimit sys_setrlimit +161 64 chroot sys_chroot +162 64 sync sys_sync +163 64 acct sys_acct +164 64 settimeofday sys_settimeofday +165 64 mount sys_mount +166 64 umount2 sys_umount +167 64 swapon sys_swapon +168 64 swapoff sys_swapoff +169 64 reboot sys_reboot +170 64 sethostname sys_sethostname +171 64 setdomainname sys_setdomainname +172 64 iopl stub_iopl +173 64 ioperm sys_ioperm +174 64 create_module +175 64 init_module sys_init_module +176 64 delete_module sys_delete_module +177 64 get_kernel_syms +178 64 query_module +179 64 quotactl sys_quotactl +180 64 nfsservctl +181 64 getpmsg +182 64 putpmsg +183 64 afs_syscall +184 64 tuxcall +185 64 security +186 64 gettid sys_gettid +187 64 readahead sys_readahead +188 64 setxattr sys_setxattr +189 64 lsetxattr sys_lsetxattr +190 64 fsetxattr sys_fsetxattr +191 64 getxattr sys_getxattr +192 64 lgetxattr sys_lgetxattr +193 64 fgetxattr sys_fgetxattr +194 64 listxattr sys_listxattr +195 64 llistxattr sys_llistxattr +196 64 flistxattr sys_flistxattr +197 64 removexattr sys_removexattr +198 64 lremovexattr sys_lremovexattr +199 64 fremovexattr sys_fremovexattr +200 64 tkill sys_tkill +201 64 time sys_time +202 64 futex sys_futex +203 64 sched_setaffinity sys_sched_setaffinity +204 64 sched_getaffinity sys_sched_getaffinity +205 64 set_thread_area +206 64 io_setup sys_io_setup +207 64 io_destroy sys_io_destroy +208 64 io_getevents sys_io_getevents +209 64 io_submit sys_io_submit +210 64 io_cancel sys_io_cancel +211 64 get_thread_area +212 64 lookup_dcookie sys_lookup_dcookie +213 64 epoll_create sys_epoll_create +214 64 epoll_ctl_old +215 64 epoll_wait_old +216 64 remap_file_pages sys_remap_file_pages +217 64 getdents64 sys_getdents64 +218 64 set_tid_address sys_set_tid_address +219 64 restart_syscall sys_restart_syscall +220 64 semtimedop sys_semtimedop +221 64 fadvise64 sys_fadvise64 +222 64 timer_create sys_timer_create +223 64 timer_settime sys_timer_settime +224 64 timer_gettime sys_timer_gettime +225 64 timer_getoverrun sys_timer_getoverrun +226 64 timer_delete sys_timer_delete +227 64 clock_settime sys_clock_settime +228 64 clock_gettime sys_clock_gettime +229 64 clock_getres sys_clock_getres +230 64 clock_nanosleep sys_clock_nanosleep +231 64 exit_group sys_exit_group +232 64 epoll_wait sys_epoll_wait +233 64 epoll_ctl sys_epoll_ctl +234 64 tgkill sys_tgkill +235 64 utimes sys_utimes +236 64 vserver +237 64 mbind sys_mbind +238 64 set_mempolicy sys_set_mempolicy +239 64 get_mempolicy sys_get_mempolicy +240 64 mq_open sys_mq_open +241 64 mq_unlink sys_mq_unlink +242 64 mq_timedsend sys_mq_timedsend +243 64 mq_timedreceive sys_mq_timedreceive +244 64 mq_notify sys_mq_notify +245 64 mq_getsetattr sys_mq_getsetattr +246 64 kexec_load sys_kexec_load +247 64 waitid sys_waitid +248 64 add_key sys_add_key +249 64 request_key sys_request_key +250 64 keyctl sys_keyctl +251 64 ioprio_set sys_ioprio_set +252 64 ioprio_get sys_ioprio_get +253 64 inotify_init sys_inotify_init +254 64 inotify_add_watch sys_inotify_add_watch +255 64 inotify_rm_watch sys_inotify_rm_watch +256 64 migrate_pages sys_migrate_pages +257 64 openat sys_openat +258 64 mkdirat sys_mkdirat +259 64 mknodat sys_mknodat +260 64 fchownat sys_fchownat +261 64 futimesat sys_futimesat +262 64 newfstatat sys_newfstatat +263 64 unlinkat sys_unlinkat +264 64 renameat sys_renameat +265 64 linkat sys_linkat +266 64 symlinkat sys_symlinkat +267 64 readlinkat sys_readlinkat +268 64 fchmodat sys_fchmodat +269 64 faccessat sys_faccessat +270 64 pselect6 sys_pselect6 +271 64 ppoll sys_ppoll +272 64 unshare sys_unshare +273 64 set_robust_list sys_set_robust_list +274 64 get_robust_list sys_get_robust_list +275 64 splice sys_splice +276 64 tee sys_tee +277 64 sync_file_range sys_sync_file_range +278 64 vmsplice sys_vmsplice +279 64 move_pages sys_move_pages +280 64 utimensat sys_utimensat +281 64 epoll_pwait sys_epoll_pwait +282 64 signalfd sys_signalfd +283 64 timerfd_create sys_timerfd_create +284 64 eventfd sys_eventfd +285 64 fallocate sys_fallocate +286 64 timerfd_settime sys_timerfd_settime +287 64 timerfd_gettime sys_timerfd_gettime +288 64 accept4 sys_accept4 +289 64 signalfd4 sys_signalfd4 +290 64 eventfd2 sys_eventfd2 +291 64 epoll_create1 sys_epoll_create1 +292 64 dup3 sys_dup3 +293 64 pipe2 sys_pipe2 +294 64 inotify_init1 sys_inotify_init1 +295 64 preadv sys_preadv +296 64 pwritev sys_pwritev +297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo +298 64 perf_event_open sys_perf_event_open +299 64 recvmmsg sys_recvmmsg +300 64 fanotify_init sys_fanotify_init +301 64 fanotify_mark sys_fanotify_mark +302 64 prlimit64 sys_prlimit64 +303 64 name_to_handle_at sys_name_to_handle_at +304 64 open_by_handle_at sys_open_by_handle_at +305 64 clock_adjtime sys_clock_adjtime +306 64 syncfs sys_syncfs +307 64 sendmmsg sys_sendmmsg +308 64 setns sys_setns +309 64 getcpu sys_getcpu +310 64 process_vm_readv sys_process_vm_readv +311 64 process_vm_writev sys_process_vm_writev diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh new file mode 100644 index 0000000..0d473ff --- /dev/null +++ b/arch/x86/syscalls/syscallhdr.sh @@ -0,0 +1,36 @@ +#!/bin/sh + +in="$1" +out="$2" +my_abis=`echo "$3" | tr ',' ' '` +prefix="$4" +offset="$5" + +fileguard=_ASM_X86_`basename "$out" | sed \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ + -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` + +in_list () { + local x + for x in $1; do + if [ x"$x" = x"$2" ]; then + return 0 + fi + done + return 1 +} + +grep '^[0-9]' "$in" | sort -n | ( + echo "#ifndef ${fileguard}" + echo "#define ${fileguard} 1" + echo "" + + while read nr abi name entry ; do + if in_list "$my_abis" "$abi"; then + echo "#define __NR_${prefix}${name}" $((nr+offset)) + fi + done + + echo "" + echo "#endif /* ${fileguard} */" +) > "$out" diff --git a/arch/x86/syscalls/syscalltbl.sh b/arch/x86/syscalls/syscalltbl.sh new file mode 100644 index 0000000..0e7f8ec --- /dev/null +++ b/arch/x86/syscalls/syscalltbl.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +in="$1" +out="$2" + +grep '^[0-9]' "$in" | sort -n | ( + while read nr abi name entry compat; do + abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` + if [ -n "$compat" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $compat)" + elif [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $entry)" + fi + done +) > "$out" -- cgit v1.1 From 303395ac3bf3e2cb488435537d416bc840438fcb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 11 Nov 2011 16:07:41 -0800 Subject: x86: Generate system call tables and unistd_*.h from tables Generate system call tables and unistd_*.h automatically from the tables in arch/x86/syscalls. All other information, like NR_syscalls, is auto-generated, some of which is in asm-offsets_*.c. This allows us to keep all the system call information in one place, and allows for kernel space and user space to see different information; this is currently used for the ia32 system call numbers when building the 64-bit kernel, but will be used by the x32 ABI in the near future. This also removes some gratuitious differences between i386, x86-64 and ia32; in particular, now all system call tables are generated with the same mechanism. Cc: H. J. Lu Cc: Sam Ravnborg Cc: Michal Marek Signed-off-by: H. Peter Anvin --- arch/x86/Makefile | 6 + arch/x86/ia32/Makefile | 2 +- arch/x86/ia32/ia32entry.S | 356 ------------------ arch/x86/ia32/syscall_ia32.c | 25 ++ arch/x86/include/asm/Kbuild | 5 +- arch/x86/include/asm/ia32_unistd.h | 13 +- arch/x86/include/asm/unistd.h | 54 ++- arch/x86/include/asm/unistd_32.h | 401 -------------------- arch/x86/include/asm/unistd_64.h | 732 ------------------------------------- arch/x86/kernel/Makefile | 3 +- arch/x86/kernel/asm-offsets_32.c | 8 + arch/x86/kernel/asm-offsets_64.c | 19 +- arch/x86/kernel/entry_32.S | 37 +- arch/x86/kernel/syscall_32.c | 25 ++ arch/x86/kernel/syscall_64.c | 14 +- arch/x86/kernel/syscall_table_32.S | 350 ------------------ 16 files changed, 154 insertions(+), 1896 deletions(-) create mode 100644 arch/x86/ia32/syscall_ia32.c delete mode 100644 arch/x86/include/asm/unistd_32.h delete mode 100644 arch/x86/include/asm/unistd_64.h create mode 100644 arch/x86/kernel/syscall_32.c delete mode 100644 arch/x86/kernel/syscall_table_32.S (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index b02e509..209ba12 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -118,6 +118,12 @@ KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) ### +# Syscall table generation + +archheaders: + $(Q)$(MAKE) $(build)=arch/x86/syscalls all + +### # Kernel objects head-y := arch/x86/kernel/head_$(BITS).o diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index eea9a1c..455646e 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -3,7 +3,7 @@ # obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o -obj-$(CONFIG_IA32_EMULATION) += nosyscall.o +obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o sysv-$(CONFIG_SYSVIPC) := ipc32.o obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 59538a77..72f853a 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -27,8 +27,6 @@ .section .entry.text, "ax" -#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) - .macro IA32_ARG_FIXUP noebp=0 movl %edi,%r8d .if \noebp @@ -496,357 +494,3 @@ ENTRY(ia32_ptregs_common) jmp ia32_sysret /* misbalances the return cache */ CFI_ENDPROC END(ia32_ptregs_common) - - .section .rodata,"a" - .align 8 -ia32_sys_call_table: - .quad sys_restart_syscall - .quad sys_exit - .quad stub32_fork - .quad sys_read - .quad sys_write - .quad compat_sys_open /* 5 */ - .quad sys_close - .quad sys32_waitpid - .quad sys_creat - .quad sys_link - .quad sys_unlink /* 10 */ - .quad stub32_execve - .quad sys_chdir - .quad compat_sys_time - .quad sys_mknod - .quad sys_chmod /* 15 */ - .quad sys_lchown16 - .quad quiet_ni_syscall /* old break syscall holder */ - .quad sys_stat - .quad sys32_lseek - .quad sys_getpid /* 20 */ - .quad compat_sys_mount /* mount */ - .quad sys_oldumount /* old_umount */ - .quad sys_setuid16 - .quad sys_getuid16 - .quad compat_sys_stime /* stime */ /* 25 */ - .quad compat_sys_ptrace /* ptrace */ - .quad sys_alarm - .quad sys_fstat /* (old)fstat */ - .quad sys_pause - .quad compat_sys_utime /* 30 */ - .quad quiet_ni_syscall /* old stty syscall holder */ - .quad quiet_ni_syscall /* old gtty syscall holder */ - .quad sys_access - .quad sys_nice - .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */ - .quad sys_sync - .quad sys32_kill - .quad sys_rename - .quad sys_mkdir - .quad sys_rmdir /* 40 */ - .quad sys_dup - .quad sys_pipe - .quad compat_sys_times - .quad quiet_ni_syscall /* old prof syscall holder */ - .quad sys_brk /* 45 */ - .quad sys_setgid16 - .quad sys_getgid16 - .quad sys_signal - .quad sys_geteuid16 - .quad sys_getegid16 /* 50 */ - .quad sys_acct - .quad sys_umount /* new_umount */ - .quad quiet_ni_syscall /* old lock syscall holder */ - .quad compat_sys_ioctl - .quad compat_sys_fcntl64 /* 55 */ - .quad quiet_ni_syscall /* old mpx syscall holder */ - .quad sys_setpgid - .quad quiet_ni_syscall /* old ulimit syscall holder */ - .quad sys_olduname - .quad sys_umask /* 60 */ - .quad sys_chroot - .quad compat_sys_ustat - .quad sys_dup2 - .quad sys_getppid - .quad sys_getpgrp /* 65 */ - .quad sys_setsid - .quad sys32_sigaction - .quad sys_sgetmask - .quad sys_ssetmask - .quad sys_setreuid16 /* 70 */ - .quad sys_setregid16 - .quad sys32_sigsuspend - .quad compat_sys_sigpending - .quad sys_sethostname - .quad compat_sys_setrlimit /* 75 */ - .quad compat_sys_old_getrlimit /* old_getrlimit */ - .quad compat_sys_getrusage - .quad compat_sys_gettimeofday - .quad compat_sys_settimeofday - .quad sys_getgroups16 /* 80 */ - .quad sys_setgroups16 - .quad compat_sys_old_select - .quad sys_symlink - .quad sys_lstat - .quad sys_readlink /* 85 */ - .quad sys_uselib - .quad sys_swapon - .quad sys_reboot - .quad compat_sys_old_readdir - .quad sys32_mmap /* 90 */ - .quad sys_munmap - .quad sys_truncate - .quad sys_ftruncate - .quad sys_fchmod - .quad sys_fchown16 /* 95 */ - .quad sys_getpriority - .quad sys_setpriority - .quad quiet_ni_syscall /* old profil syscall holder */ - .quad compat_sys_statfs - .quad compat_sys_fstatfs /* 100 */ - .quad sys_ioperm - .quad compat_sys_socketcall - .quad sys_syslog - .quad compat_sys_setitimer - .quad compat_sys_getitimer /* 105 */ - .quad compat_sys_newstat - .quad compat_sys_newlstat - .quad compat_sys_newfstat - .quad sys_uname - .quad stub32_iopl /* 110 */ - .quad sys_vhangup - .quad quiet_ni_syscall /* old "idle" system call */ - .quad sys32_vm86_warning /* vm86old */ - .quad compat_sys_wait4 - .quad sys_swapoff /* 115 */ - .quad compat_sys_sysinfo - .quad sys32_ipc - .quad sys_fsync - .quad stub32_sigreturn - .quad stub32_clone /* 120 */ - .quad sys_setdomainname - .quad sys_newuname - .quad sys_modify_ldt - .quad compat_sys_adjtimex - .quad sys32_mprotect /* 125 */ - .quad compat_sys_sigprocmask - .quad quiet_ni_syscall /* create_module */ - .quad sys_init_module - .quad sys_delete_module - .quad quiet_ni_syscall /* 130 get_kernel_syms */ - .quad sys32_quotactl - .quad sys_getpgid - .quad sys_fchdir - .quad quiet_ni_syscall /* bdflush */ - .quad sys_sysfs /* 135 */ - .quad sys_personality - .quad quiet_ni_syscall /* for afs_syscall */ - .quad sys_setfsuid16 - .quad sys_setfsgid16 - .quad sys_llseek /* 140 */ - .quad compat_sys_getdents - .quad compat_sys_select - .quad sys_flock - .quad sys_msync - .quad compat_sys_readv /* 145 */ - .quad compat_sys_writev - .quad sys_getsid - .quad sys_fdatasync - .quad compat_sys_sysctl /* sysctl */ - .quad sys_mlock /* 150 */ - .quad sys_munlock - .quad sys_mlockall - .quad sys_munlockall - .quad sys_sched_setparam - .quad sys_sched_getparam /* 155 */ - .quad sys_sched_setscheduler - .quad sys_sched_getscheduler - .quad sys_sched_yield - .quad sys_sched_get_priority_max - .quad sys_sched_get_priority_min /* 160 */ - .quad sys32_sched_rr_get_interval - .quad compat_sys_nanosleep - .quad sys_mremap - .quad sys_setresuid16 - .quad sys_getresuid16 /* 165 */ - .quad sys32_vm86_warning /* vm86 */ - .quad quiet_ni_syscall /* query_module */ - .quad sys_poll - .quad quiet_ni_syscall /* old nfsservctl */ - .quad sys_setresgid16 /* 170 */ - .quad sys_getresgid16 - .quad sys_prctl - .quad stub32_rt_sigreturn - .quad sys32_rt_sigaction - .quad sys32_rt_sigprocmask /* 175 */ - .quad sys32_rt_sigpending - .quad compat_sys_rt_sigtimedwait - .quad sys32_rt_sigqueueinfo - .quad sys_rt_sigsuspend - .quad sys32_pread /* 180 */ - .quad sys32_pwrite - .quad sys_chown16 - .quad sys_getcwd - .quad sys_capget - .quad sys_capset - .quad stub32_sigaltstack - .quad sys32_sendfile - .quad quiet_ni_syscall /* streams1 */ - .quad quiet_ni_syscall /* streams2 */ - .quad stub32_vfork /* 190 */ - .quad compat_sys_getrlimit - .quad sys_mmap_pgoff - .quad sys32_truncate64 - .quad sys32_ftruncate64 - .quad sys32_stat64 /* 195 */ - .quad sys32_lstat64 - .quad sys32_fstat64 - .quad sys_lchown - .quad sys_getuid - .quad sys_getgid /* 200 */ - .quad sys_geteuid - .quad sys_getegid - .quad sys_setreuid - .quad sys_setregid - .quad sys_getgroups /* 205 */ - .quad sys_setgroups - .quad sys_fchown - .quad sys_setresuid - .quad sys_getresuid - .quad sys_setresgid /* 210 */ - .quad sys_getresgid - .quad sys_chown - .quad sys_setuid - .quad sys_setgid - .quad sys_setfsuid /* 215 */ - .quad sys_setfsgid - .quad sys_pivot_root - .quad sys_mincore - .quad sys_madvise - .quad compat_sys_getdents64 /* 220 getdents64 */ - .quad compat_sys_fcntl64 - .quad quiet_ni_syscall /* tux */ - .quad quiet_ni_syscall /* security */ - .quad sys_gettid - .quad sys32_readahead /* 225 */ - .quad sys_setxattr - .quad sys_lsetxattr - .quad sys_fsetxattr - .quad sys_getxattr - .quad sys_lgetxattr /* 230 */ - .quad sys_fgetxattr - .quad sys_listxattr - .quad sys_llistxattr - .quad sys_flistxattr - .quad sys_removexattr /* 235 */ - .quad sys_lremovexattr - .quad sys_fremovexattr - .quad sys_tkill - .quad sys_sendfile64 - .quad compat_sys_futex /* 240 */ - .quad compat_sys_sched_setaffinity - .quad compat_sys_sched_getaffinity - .quad sys_set_thread_area - .quad sys_get_thread_area - .quad compat_sys_io_setup /* 245 */ - .quad sys_io_destroy - .quad compat_sys_io_getevents - .quad compat_sys_io_submit - .quad sys_io_cancel - .quad sys32_fadvise64 /* 250 */ - .quad quiet_ni_syscall /* free_huge_pages */ - .quad sys_exit_group - .quad sys32_lookup_dcookie - .quad sys_epoll_create - .quad sys_epoll_ctl /* 255 */ - .quad sys_epoll_wait - .quad sys_remap_file_pages - .quad sys_set_tid_address - .quad compat_sys_timer_create - .quad compat_sys_timer_settime /* 260 */ - .quad compat_sys_timer_gettime - .quad sys_timer_getoverrun - .quad sys_timer_delete - .quad compat_sys_clock_settime - .quad compat_sys_clock_gettime /* 265 */ - .quad compat_sys_clock_getres - .quad compat_sys_clock_nanosleep - .quad compat_sys_statfs64 - .quad compat_sys_fstatfs64 - .quad sys_tgkill /* 270 */ - .quad compat_sys_utimes - .quad sys32_fadvise64_64 - .quad quiet_ni_syscall /* sys_vserver */ - .quad sys_mbind - .quad compat_sys_get_mempolicy /* 275 */ - .quad sys_set_mempolicy - .quad compat_sys_mq_open - .quad sys_mq_unlink - .quad compat_sys_mq_timedsend - .quad compat_sys_mq_timedreceive /* 280 */ - .quad compat_sys_mq_notify - .quad compat_sys_mq_getsetattr - .quad compat_sys_kexec_load /* reserved for kexec */ - .quad compat_sys_waitid - .quad quiet_ni_syscall /* 285: sys_altroot */ - .quad sys_add_key - .quad sys_request_key - .quad sys_keyctl - .quad sys_ioprio_set - .quad sys_ioprio_get /* 290 */ - .quad sys_inotify_init - .quad sys_inotify_add_watch - .quad sys_inotify_rm_watch - .quad sys_migrate_pages - .quad compat_sys_openat /* 295 */ - .quad sys_mkdirat - .quad sys_mknodat - .quad sys_fchownat - .quad compat_sys_futimesat - .quad sys32_fstatat /* 300 */ - .quad sys_unlinkat - .quad sys_renameat - .quad sys_linkat - .quad sys_symlinkat - .quad sys_readlinkat /* 305 */ - .quad sys_fchmodat - .quad sys_faccessat - .quad compat_sys_pselect6 - .quad compat_sys_ppoll - .quad sys_unshare /* 310 */ - .quad compat_sys_set_robust_list - .quad compat_sys_get_robust_list - .quad sys_splice - .quad sys32_sync_file_range - .quad sys_tee /* 315 */ - .quad compat_sys_vmsplice - .quad compat_sys_move_pages - .quad sys_getcpu - .quad sys_epoll_pwait - .quad compat_sys_utimensat /* 320 */ - .quad compat_sys_signalfd - .quad sys_timerfd_create - .quad sys_eventfd - .quad sys32_fallocate - .quad compat_sys_timerfd_settime /* 325 */ - .quad compat_sys_timerfd_gettime - .quad compat_sys_signalfd4 - .quad sys_eventfd2 - .quad sys_epoll_create1 - .quad sys_dup3 /* 330 */ - .quad sys_pipe2 - .quad sys_inotify_init1 - .quad compat_sys_preadv - .quad compat_sys_pwritev - .quad compat_sys_rt_tgsigqueueinfo /* 335 */ - .quad sys_perf_event_open - .quad compat_sys_recvmmsg - .quad sys_fanotify_init - .quad sys32_fanotify_mark - .quad sys_prlimit64 /* 340 */ - .quad sys_name_to_handle_at - .quad compat_sys_open_by_handle_at - .quad compat_sys_clock_adjtime - .quad sys_syncfs - .quad compat_sys_sendmmsg /* 345 */ - .quad sys_setns - .quad compat_sys_process_vm_readv - .quad compat_sys_process_vm_writev -ia32_syscall_end: diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c new file mode 100644 index 0000000..d04d3db --- /dev/null +++ b/arch/x86/ia32/syscall_ia32.c @@ -0,0 +1,25 @@ +/* System call table for ia32 emulation. */ + +#include +#include +#include +#include + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ; +#include +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = compat, + +typedef void (*sys_call_ptr_t)(void); + +extern void compat_ni_syscall(void); + +const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { + /* + * Smells like a like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall, +#include +}; diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 6fa90a8..b57e6a4 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -19,7 +19,8 @@ header-y += processor-flags.h header-y += ptrace-abi.h header-y += sigcontext32.h header-y += ucontext.h -header-y += unistd_32.h -header-y += unistd_64.h header-y += vm86.h header-y += vsyscall.h + +genhdr-y += unistd_32.h +genhdr-y += unistd_64.h diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h index 976f6ec..b0d5716 100644 --- a/arch/x86/include/asm/ia32_unistd.h +++ b/arch/x86/include/asm/ia32_unistd.h @@ -2,17 +2,10 @@ #define _ASM_X86_IA32_UNISTD_H /* - * This file contains the system call numbers of the ia32 port, + * This file contains the system call numbers of the ia32 compat ABI, * this is for the kernel only. - * Only add syscalls here where some part of the kernel needs to know - * the number. This should be otherwise in sync with asm-x86/unistd_32.h. -AK */ - -#define __NR_ia32_restart_syscall 0 -#define __NR_ia32_exit 1 -#define __NR_ia32_read 3 -#define __NR_ia32_write 4 -#define __NR_ia32_sigreturn 119 -#define __NR_ia32_rt_sigreturn 173 +#define __SYSCALL_ia32_NR(x) (x) +#include #endif /* _ASM_X86_IA32_UNISTD_H */ diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 2a58ed3..b4a3db7 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -1,13 +1,59 @@ +#ifndef _ASM_X86_UNISTD_H +#define _ASM_X86_UNISTD_H 1 + #ifdef __KERNEL__ # ifdef CONFIG_X86_32 -# include "unistd_32.h" + +# include +# define __ARCH_WANT_IPC_PARSE_VERSION +# define __ARCH_WANT_STAT64 +# define __ARCH_WANT_SYS_OLD_MMAP +# define __ARCH_WANT_SYS_OLD_SELECT + # else -# include "unistd_64.h" + +# include +# define __ARCH_WANT_COMPAT_SYS_TIME + # endif + +# define __ARCH_WANT_OLD_READDIR +# define __ARCH_WANT_OLD_STAT +# define __ARCH_WANT_SYS_ALARM +# define __ARCH_WANT_SYS_FADVISE64 +# define __ARCH_WANT_SYS_GETHOSTNAME +# define __ARCH_WANT_SYS_GETPGRP +# define __ARCH_WANT_SYS_LLSEEK +# define __ARCH_WANT_SYS_NICE +# define __ARCH_WANT_SYS_OLDUMOUNT +# define __ARCH_WANT_SYS_OLD_GETRLIMIT +# define __ARCH_WANT_SYS_OLD_UNAME +# define __ARCH_WANT_SYS_PAUSE +# define __ARCH_WANT_SYS_RT_SIGACTION +# define __ARCH_WANT_SYS_RT_SIGSUSPEND +# define __ARCH_WANT_SYS_SGETMASK +# define __ARCH_WANT_SYS_SIGNAL +# define __ARCH_WANT_SYS_SIGPENDING +# define __ARCH_WANT_SYS_SIGPROCMASK +# define __ARCH_WANT_SYS_SOCKETCALL +# define __ARCH_WANT_SYS_TIME +# define __ARCH_WANT_SYS_UTIME +# define __ARCH_WANT_SYS_WAITPID + +/* + * "Conditional" syscalls + * + * What we want is __attribute__((weak,alias("sys_ni_syscall"))), + * but it doesn't work on all toolchains, so we just do it by hand + */ +# define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") + #else # ifdef __i386__ -# include "unistd_32.h" +# include # else -# include "unistd_64.h" +# include # endif #endif + +#endif /* _ASM_X86_UNISTD_H */ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h deleted file mode 100644 index 599c77d..0000000 --- a/arch/x86/include/asm/unistd_32.h +++ /dev/null @@ -1,401 +0,0 @@ -#ifndef _ASM_X86_UNISTD_32_H -#define _ASM_X86_UNISTD_32_H - -/* - * This file contains the system call numbers. - */ - -#define __NR_restart_syscall 0 -#define __NR_exit 1 -#define __NR_fork 2 -#define __NR_read 3 -#define __NR_write 4 -#define __NR_open 5 -#define __NR_close 6 -#define __NR_waitpid 7 -#define __NR_creat 8 -#define __NR_link 9 -#define __NR_unlink 10 -#define __NR_execve 11 -#define __NR_chdir 12 -#define __NR_time 13 -#define __NR_mknod 14 -#define __NR_chmod 15 -#define __NR_lchown 16 -#define __NR_break 17 -#define __NR_oldstat 18 -#define __NR_lseek 19 -#define __NR_getpid 20 -#define __NR_mount 21 -#define __NR_umount 22 -#define __NR_setuid 23 -#define __NR_getuid 24 -#define __NR_stime 25 -#define __NR_ptrace 26 -#define __NR_alarm 27 -#define __NR_oldfstat 28 -#define __NR_pause 29 -#define __NR_utime 30 -#define __NR_stty 31 -#define __NR_gtty 32 -#define __NR_access 33 -#define __NR_nice 34 -#define __NR_ftime 35 -#define __NR_sync 36 -#define __NR_kill 37 -#define __NR_rename 38 -#define __NR_mkdir 39 -#define __NR_rmdir 40 -#define __NR_dup 41 -#define __NR_pipe 42 -#define __NR_times 43 -#define __NR_prof 44 -#define __NR_brk 45 -#define __NR_setgid 46 -#define __NR_getgid 47 -#define __NR_signal 48 -#define __NR_geteuid 49 -#define __NR_getegid 50 -#define __NR_acct 51 -#define __NR_umount2 52 -#define __NR_lock 53 -#define __NR_ioctl 54 -#define __NR_fcntl 55 -#define __NR_mpx 56 -#define __NR_setpgid 57 -#define __NR_ulimit 58 -#define __NR_oldolduname 59 -#define __NR_umask 60 -#define __NR_chroot 61 -#define __NR_ustat 62 -#define __NR_dup2 63 -#define __NR_getppid 64 -#define __NR_getpgrp 65 -#define __NR_setsid 66 -#define __NR_sigaction 67 -#define __NR_sgetmask 68 -#define __NR_ssetmask 69 -#define __NR_setreuid 70 -#define __NR_setregid 71 -#define __NR_sigsuspend 72 -#define __NR_sigpending 73 -#define __NR_sethostname 74 -#define __NR_setrlimit 75 -#define __NR_getrlimit 76 /* Back compatible 2Gig limited rlimit */ -#define __NR_getrusage 77 -#define __NR_gettimeofday 78 -#define __NR_settimeofday 79 -#define __NR_getgroups 80 -#define __NR_setgroups 81 -#define __NR_select 82 -#define __NR_symlink 83 -#define __NR_oldlstat 84 -#define __NR_readlink 85 -#define __NR_uselib 86 -#define __NR_swapon 87 -#define __NR_reboot 88 -#define __NR_readdir 89 -#define __NR_mmap 90 -#define __NR_munmap 91 -#define __NR_truncate 92 -#define __NR_ftruncate 93 -#define __NR_fchmod 94 -#define __NR_fchown 95 -#define __NR_getpriority 96 -#define __NR_setpriority 97 -#define __NR_profil 98 -#define __NR_statfs 99 -#define __NR_fstatfs 100 -#define __NR_ioperm 101 -#define __NR_socketcall 102 -#define __NR_syslog 103 -#define __NR_setitimer 104 -#define __NR_getitimer 105 -#define __NR_stat 106 -#define __NR_lstat 107 -#define __NR_fstat 108 -#define __NR_olduname 109 -#define __NR_iopl 110 -#define __NR_vhangup 111 -#define __NR_idle 112 -#define __NR_vm86old 113 -#define __NR_wait4 114 -#define __NR_swapoff 115 -#define __NR_sysinfo 116 -#define __NR_ipc 117 -#define __NR_fsync 118 -#define __NR_sigreturn 119 -#define __NR_clone 120 -#define __NR_setdomainname 121 -#define __NR_uname 122 -#define __NR_modify_ldt 123 -#define __NR_adjtimex 124 -#define __NR_mprotect 125 -#define __NR_sigprocmask 126 -#define __NR_create_module 127 -#define __NR_init_module 128 -#define __NR_delete_module 129 -#define __NR_get_kernel_syms 130 -#define __NR_quotactl 131 -#define __NR_getpgid 132 -#define __NR_fchdir 133 -#define __NR_bdflush 134 -#define __NR_sysfs 135 -#define __NR_personality 136 -#define __NR_afs_syscall 137 /* Syscall for Andrew File System */ -#define __NR_setfsuid 138 -#define __NR_setfsgid 139 -#define __NR__llseek 140 -#define __NR_getdents 141 -#define __NR__newselect 142 -#define __NR_flock 143 -#define __NR_msync 144 -#define __NR_readv 145 -#define __NR_writev 146 -#define __NR_getsid 147 -#define __NR_fdatasync 148 -#define __NR__sysctl 149 -#define __NR_mlock 150 -#define __NR_munlock 151 -#define __NR_mlockall 152 -#define __NR_munlockall 153 -#define __NR_sched_setparam 154 -#define __NR_sched_getparam 155 -#define __NR_sched_setscheduler 156 -#define __NR_sched_getscheduler 157 -#define __NR_sched_yield 158 -#define __NR_sched_get_priority_max 159 -#define __NR_sched_get_priority_min 160 -#define __NR_sched_rr_get_interval 161 -#define __NR_nanosleep 162 -#define __NR_mremap 163 -#define __NR_setresuid 164 -#define __NR_getresuid 165 -#define __NR_vm86 166 -#define __NR_query_module 167 -#define __NR_poll 168 -#define __NR_nfsservctl 169 -#define __NR_setresgid 170 -#define __NR_getresgid 171 -#define __NR_prctl 172 -#define __NR_rt_sigreturn 173 -#define __NR_rt_sigaction 174 -#define __NR_rt_sigprocmask 175 -#define __NR_rt_sigpending 176 -#define __NR_rt_sigtimedwait 177 -#define __NR_rt_sigqueueinfo 178 -#define __NR_rt_sigsuspend 179 -#define __NR_pread64 180 -#define __NR_pwrite64 181 -#define __NR_chown 182 -#define __NR_getcwd 183 -#define __NR_capget 184 -#define __NR_capset 185 -#define __NR_sigaltstack 186 -#define __NR_sendfile 187 -#define __NR_getpmsg 188 /* some people actually want streams */ -#define __NR_putpmsg 189 /* some people actually want streams */ -#define __NR_vfork 190 -#define __NR_ugetrlimit 191 /* SuS compliant getrlimit */ -#define __NR_mmap2 192 -#define __NR_truncate64 193 -#define __NR_ftruncate64 194 -#define __NR_stat64 195 -#define __NR_lstat64 196 -#define __NR_fstat64 197 -#define __NR_lchown32 198 -#define __NR_getuid32 199 -#define __NR_getgid32 200 -#define __NR_geteuid32 201 -#define __NR_getegid32 202 -#define __NR_setreuid32 203 -#define __NR_setregid32 204 -#define __NR_getgroups32 205 -#define __NR_setgroups32 206 -#define __NR_fchown32 207 -#define __NR_setresuid32 208 -#define __NR_getresuid32 209 -#define __NR_setresgid32 210 -#define __NR_getresgid32 211 -#define __NR_chown32 212 -#define __NR_setuid32 213 -#define __NR_setgid32 214 -#define __NR_setfsuid32 215 -#define __NR_setfsgid32 216 -#define __NR_pivot_root 217 -#define __NR_mincore 218 -#define __NR_madvise 219 -#define __NR_madvise1 219 /* delete when C lib stub is removed */ -#define __NR_getdents64 220 -#define __NR_fcntl64 221 -/* 223 is unused */ -#define __NR_gettid 224 -#define __NR_readahead 225 -#define __NR_setxattr 226 -#define __NR_lsetxattr 227 -#define __NR_fsetxattr 228 -#define __NR_getxattr 229 -#define __NR_lgetxattr 230 -#define __NR_fgetxattr 231 -#define __NR_listxattr 232 -#define __NR_llistxattr 233 -#define __NR_flistxattr 234 -#define __NR_removexattr 235 -#define __NR_lremovexattr 236 -#define __NR_fremovexattr 237 -#define __NR_tkill 238 -#define __NR_sendfile64 239 -#define __NR_futex 240 -#define __NR_sched_setaffinity 241 -#define __NR_sched_getaffinity 242 -#define __NR_set_thread_area 243 -#define __NR_get_thread_area 244 -#define __NR_io_setup 245 -#define __NR_io_destroy 246 -#define __NR_io_getevents 247 -#define __NR_io_submit 248 -#define __NR_io_cancel 249 -#define __NR_fadvise64 250 -/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */ -#define __NR_exit_group 252 -#define __NR_lookup_dcookie 253 -#define __NR_epoll_create 254 -#define __NR_epoll_ctl 255 -#define __NR_epoll_wait 256 -#define __NR_remap_file_pages 257 -#define __NR_set_tid_address 258 -#define __NR_timer_create 259 -#define __NR_timer_settime (__NR_timer_create+1) -#define __NR_timer_gettime (__NR_timer_create+2) -#define __NR_timer_getoverrun (__NR_timer_create+3) -#define __NR_timer_delete (__NR_timer_create+4) -#define __NR_clock_settime (__NR_timer_create+5) -#define __NR_clock_gettime (__NR_timer_create+6) -#define __NR_clock_getres (__NR_timer_create+7) -#define __NR_clock_nanosleep (__NR_timer_create+8) -#define __NR_statfs64 268 -#define __NR_fstatfs64 269 -#define __NR_tgkill 270 -#define __NR_utimes 271 -#define __NR_fadvise64_64 272 -#define __NR_vserver 273 -#define __NR_mbind 274 -#define __NR_get_mempolicy 275 -#define __NR_set_mempolicy 276 -#define __NR_mq_open 277 -#define __NR_mq_unlink (__NR_mq_open+1) -#define __NR_mq_timedsend (__NR_mq_open+2) -#define __NR_mq_timedreceive (__NR_mq_open+3) -#define __NR_mq_notify (__NR_mq_open+4) -#define __NR_mq_getsetattr (__NR_mq_open+5) -#define __NR_kexec_load 283 -#define __NR_waitid 284 -/* #define __NR_sys_setaltroot 285 */ -#define __NR_add_key 286 -#define __NR_request_key 287 -#define __NR_keyctl 288 -#define __NR_ioprio_set 289 -#define __NR_ioprio_get 290 -#define __NR_inotify_init 291 -#define __NR_inotify_add_watch 292 -#define __NR_inotify_rm_watch 293 -#define __NR_migrate_pages 294 -#define __NR_openat 295 -#define __NR_mkdirat 296 -#define __NR_mknodat 297 -#define __NR_fchownat 298 -#define __NR_futimesat 299 -#define __NR_fstatat64 300 -#define __NR_unlinkat 301 -#define __NR_renameat 302 -#define __NR_linkat 303 -#define __NR_symlinkat 304 -#define __NR_readlinkat 305 -#define __NR_fchmodat 306 -#define __NR_faccessat 307 -#define __NR_pselect6 308 -#define __NR_ppoll 309 -#define __NR_unshare 310 -#define __NR_set_robust_list 311 -#define __NR_get_robust_list 312 -#define __NR_splice 313 -#define __NR_sync_file_range 314 -#define __NR_tee 315 -#define __NR_vmsplice 316 -#define __NR_move_pages 317 -#define __NR_getcpu 318 -#define __NR_epoll_pwait 319 -#define __NR_utimensat 320 -#define __NR_signalfd 321 -#define __NR_timerfd_create 322 -#define __NR_eventfd 323 -#define __NR_fallocate 324 -#define __NR_timerfd_settime 325 -#define __NR_timerfd_gettime 326 -#define __NR_signalfd4 327 -#define __NR_eventfd2 328 -#define __NR_epoll_create1 329 -#define __NR_dup3 330 -#define __NR_pipe2 331 -#define __NR_inotify_init1 332 -#define __NR_preadv 333 -#define __NR_pwritev 334 -#define __NR_rt_tgsigqueueinfo 335 -#define __NR_perf_event_open 336 -#define __NR_recvmmsg 337 -#define __NR_fanotify_init 338 -#define __NR_fanotify_mark 339 -#define __NR_prlimit64 340 -#define __NR_name_to_handle_at 341 -#define __NR_open_by_handle_at 342 -#define __NR_clock_adjtime 343 -#define __NR_syncfs 344 -#define __NR_sendmmsg 345 -#define __NR_setns 346 -#define __NR_process_vm_readv 347 -#define __NR_process_vm_writev 348 - -#ifdef __KERNEL__ - -#define NR_syscalls 349 - -#define __ARCH_WANT_IPC_PARSE_VERSION -#define __ARCH_WANT_OLD_READDIR -#define __ARCH_WANT_OLD_STAT -#define __ARCH_WANT_STAT64 -#define __ARCH_WANT_SYS_ALARM -#define __ARCH_WANT_SYS_GETHOSTNAME -#define __ARCH_WANT_SYS_IPC -#define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK -#define __ARCH_WANT_SYS_SIGNAL -#define __ARCH_WANT_SYS_TIME -#define __ARCH_WANT_SYS_UTIME -#define __ARCH_WANT_SYS_WAITPID -#define __ARCH_WANT_SYS_SOCKETCALL -#define __ARCH_WANT_SYS_FADVISE64 -#define __ARCH_WANT_SYS_GETPGRP -#define __ARCH_WANT_SYS_LLSEEK -#define __ARCH_WANT_SYS_NICE -#define __ARCH_WANT_SYS_OLD_GETRLIMIT -#define __ARCH_WANT_SYS_OLD_UNAME -#define __ARCH_WANT_SYS_OLD_MMAP -#define __ARCH_WANT_SYS_OLD_SELECT -#define __ARCH_WANT_SYS_OLDUMOUNT -#define __ARCH_WANT_SYS_SIGPENDING -#define __ARCH_WANT_SYS_SIGPROCMASK -#define __ARCH_WANT_SYS_RT_SIGACTION -#define __ARCH_WANT_SYS_RT_SIGSUSPEND - -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#ifndef cond_syscall -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") -#endif - -#endif /* __KERNEL__ */ -#endif /* _ASM_X86_UNISTD_32_H */ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h deleted file mode 100644 index 0431f19..0000000 --- a/arch/x86/include/asm/unistd_64.h +++ /dev/null @@ -1,732 +0,0 @@ -#ifndef _ASM_X86_UNISTD_64_H -#define _ASM_X86_UNISTD_64_H - -#ifndef __SYSCALL -#define __SYSCALL(a, b) -#endif - -/* - * This file contains the system call numbers. - * - * Note: holes are not allowed. - */ - -/* at least 8 syscall per cacheline */ -#define __NR_read 0 -__SYSCALL(__NR_read, sys_read) -#define __NR_write 1 -__SYSCALL(__NR_write, sys_write) -#define __NR_open 2 -__SYSCALL(__NR_open, sys_open) -#define __NR_close 3 -__SYSCALL(__NR_close, sys_close) -#define __NR_stat 4 -__SYSCALL(__NR_stat, sys_newstat) -#define __NR_fstat 5 -__SYSCALL(__NR_fstat, sys_newfstat) -#define __NR_lstat 6 -__SYSCALL(__NR_lstat, sys_newlstat) -#define __NR_poll 7 -__SYSCALL(__NR_poll, sys_poll) - -#define __NR_lseek 8 -__SYSCALL(__NR_lseek, sys_lseek) -#define __NR_mmap 9 -__SYSCALL(__NR_mmap, sys_mmap) -#define __NR_mprotect 10 -__SYSCALL(__NR_mprotect, sys_mprotect) -#define __NR_munmap 11 -__SYSCALL(__NR_munmap, sys_munmap) -#define __NR_brk 12 -__SYSCALL(__NR_brk, sys_brk) -#define __NR_rt_sigaction 13 -__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction) -#define __NR_rt_sigprocmask 14 -__SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask) -#define __NR_rt_sigreturn 15 -__SYSCALL(__NR_rt_sigreturn, stub_rt_sigreturn) - -#define __NR_ioctl 16 -__SYSCALL(__NR_ioctl, sys_ioctl) -#define __NR_pread64 17 -__SYSCALL(__NR_pread64, sys_pread64) -#define __NR_pwrite64 18 -__SYSCALL(__NR_pwrite64, sys_pwrite64) -#define __NR_readv 19 -__SYSCALL(__NR_readv, sys_readv) -#define __NR_writev 20 -__SYSCALL(__NR_writev, sys_writev) -#define __NR_access 21 -__SYSCALL(__NR_access, sys_access) -#define __NR_pipe 22 -__SYSCALL(__NR_pipe, sys_pipe) -#define __NR_select 23 -__SYSCALL(__NR_select, sys_select) - -#define __NR_sched_yield 24 -__SYSCALL(__NR_sched_yield, sys_sched_yield) -#define __NR_mremap 25 -__SYSCALL(__NR_mremap, sys_mremap) -#define __NR_msync 26 -__SYSCALL(__NR_msync, sys_msync) -#define __NR_mincore 27 -__SYSCALL(__NR_mincore, sys_mincore) -#define __NR_madvise 28 -__SYSCALL(__NR_madvise, sys_madvise) -#define __NR_shmget 29 -__SYSCALL(__NR_shmget, sys_shmget) -#define __NR_shmat 30 -__SYSCALL(__NR_shmat, sys_shmat) -#define __NR_shmctl 31 -__SYSCALL(__NR_shmctl, sys_shmctl) - -#define __NR_dup 32 -__SYSCALL(__NR_dup, sys_dup) -#define __NR_dup2 33 -__SYSCALL(__NR_dup2, sys_dup2) -#define __NR_pause 34 -__SYSCALL(__NR_pause, sys_pause) -#define __NR_nanosleep 35 -__SYSCALL(__NR_nanosleep, sys_nanosleep) -#define __NR_getitimer 36 -__SYSCALL(__NR_getitimer, sys_getitimer) -#define __NR_alarm 37 -__SYSCALL(__NR_alarm, sys_alarm) -#define __NR_setitimer 38 -__SYSCALL(__NR_setitimer, sys_setitimer) -#define __NR_getpid 39 -__SYSCALL(__NR_getpid, sys_getpid) - -#define __NR_sendfile 40 -__SYSCALL(__NR_sendfile, sys_sendfile64) -#define __NR_socket 41 -__SYSCALL(__NR_socket, sys_socket) -#define __NR_connect 42 -__SYSCALL(__NR_connect, sys_connect) -#define __NR_accept 43 -__SYSCALL(__NR_accept, sys_accept) -#define __NR_sendto 44 -__SYSCALL(__NR_sendto, sys_sendto) -#define __NR_recvfrom 45 -__SYSCALL(__NR_recvfrom, sys_recvfrom) -#define __NR_sendmsg 46 -__SYSCALL(__NR_sendmsg, sys_sendmsg) -#define __NR_recvmsg 47 -__SYSCALL(__NR_recvmsg, sys_recvmsg) - -#define __NR_shutdown 48 -__SYSCALL(__NR_shutdown, sys_shutdown) -#define __NR_bind 49 -__SYSCALL(__NR_bind, sys_bind) -#define __NR_listen 50 -__SYSCALL(__NR_listen, sys_listen) -#define __NR_getsockname 51 -__SYSCALL(__NR_getsockname, sys_getsockname) -#define __NR_getpeername 52 -__SYSCALL(__NR_getpeername, sys_getpeername) -#define __NR_socketpair 53 -__SYSCALL(__NR_socketpair, sys_socketpair) -#define __NR_setsockopt 54 -__SYSCALL(__NR_setsockopt, sys_setsockopt) -#define __NR_getsockopt 55 -__SYSCALL(__NR_getsockopt, sys_getsockopt) - -#define __NR_clone 56 -__SYSCALL(__NR_clone, stub_clone) -#define __NR_fork 57 -__SYSCALL(__NR_fork, stub_fork) -#define __NR_vfork 58 -__SYSCALL(__NR_vfork, stub_vfork) -#define __NR_execve 59 -__SYSCALL(__NR_execve, stub_execve) -#define __NR_exit 60 -__SYSCALL(__NR_exit, sys_exit) -#define __NR_wait4 61 -__SYSCALL(__NR_wait4, sys_wait4) -#define __NR_kill 62 -__SYSCALL(__NR_kill, sys_kill) -#define __NR_uname 63 -__SYSCALL(__NR_uname, sys_newuname) - -#define __NR_semget 64 -__SYSCALL(__NR_semget, sys_semget) -#define __NR_semop 65 -__SYSCALL(__NR_semop, sys_semop) -#define __NR_semctl 66 -__SYSCALL(__NR_semctl, sys_semctl) -#define __NR_shmdt 67 -__SYSCALL(__NR_shmdt, sys_shmdt) -#define __NR_msgget 68 -__SYSCALL(__NR_msgget, sys_msgget) -#define __NR_msgsnd 69 -__SYSCALL(__NR_msgsnd, sys_msgsnd) -#define __NR_msgrcv 70 -__SYSCALL(__NR_msgrcv, sys_msgrcv) -#define __NR_msgctl 71 -__SYSCALL(__NR_msgctl, sys_msgctl) - -#define __NR_fcntl 72 -__SYSCALL(__NR_fcntl, sys_fcntl) -#define __NR_flock 73 -__SYSCALL(__NR_flock, sys_flock) -#define __NR_fsync 74 -__SYSCALL(__NR_fsync, sys_fsync) -#define __NR_fdatasync 75 -__SYSCALL(__NR_fdatasync, sys_fdatasync) -#define __NR_truncate 76 -__SYSCALL(__NR_truncate, sys_truncate) -#define __NR_ftruncate 77 -__SYSCALL(__NR_ftruncate, sys_ftruncate) -#define __NR_getdents 78 -__SYSCALL(__NR_getdents, sys_getdents) -#define __NR_getcwd 79 -__SYSCALL(__NR_getcwd, sys_getcwd) - -#define __NR_chdir 80 -__SYSCALL(__NR_chdir, sys_chdir) -#define __NR_fchdir 81 -__SYSCALL(__NR_fchdir, sys_fchdir) -#define __NR_rename 82 -__SYSCALL(__NR_rename, sys_rename) -#define __NR_mkdir 83 -__SYSCALL(__NR_mkdir, sys_mkdir) -#define __NR_rmdir 84 -__SYSCALL(__NR_rmdir, sys_rmdir) -#define __NR_creat 85 -__SYSCALL(__NR_creat, sys_creat) -#define __NR_link 86 -__SYSCALL(__NR_link, sys_link) -#define __NR_unlink 87 -__SYSCALL(__NR_unlink, sys_unlink) - -#define __NR_symlink 88 -__SYSCALL(__NR_symlink, sys_symlink) -#define __NR_readlink 89 -__SYSCALL(__NR_readlink, sys_readlink) -#define __NR_chmod 90 -__SYSCALL(__NR_chmod, sys_chmod) -#define __NR_fchmod 91 -__SYSCALL(__NR_fchmod, sys_fchmod) -#define __NR_chown 92 -__SYSCALL(__NR_chown, sys_chown) -#define __NR_fchown 93 -__SYSCALL(__NR_fchown, sys_fchown) -#define __NR_lchown 94 -__SYSCALL(__NR_lchown, sys_lchown) -#define __NR_umask 95 -__SYSCALL(__NR_umask, sys_umask) - -#define __NR_gettimeofday 96 -__SYSCALL(__NR_gettimeofday, sys_gettimeofday) -#define __NR_getrlimit 97 -__SYSCALL(__NR_getrlimit, sys_getrlimit) -#define __NR_getrusage 98 -__SYSCALL(__NR_getrusage, sys_getrusage) -#define __NR_sysinfo 99 -__SYSCALL(__NR_sysinfo, sys_sysinfo) -#define __NR_times 100 -__SYSCALL(__NR_times, sys_times) -#define __NR_ptrace 101 -__SYSCALL(__NR_ptrace, sys_ptrace) -#define __NR_getuid 102 -__SYSCALL(__NR_getuid, sys_getuid) -#define __NR_syslog 103 -__SYSCALL(__NR_syslog, sys_syslog) - -/* at the very end the stuff that never runs during the benchmarks */ -#define __NR_getgid 104 -__SYSCALL(__NR_getgid, sys_getgid) -#define __NR_setuid 105 -__SYSCALL(__NR_setuid, sys_setuid) -#define __NR_setgid 106 -__SYSCALL(__NR_setgid, sys_setgid) -#define __NR_geteuid 107 -__SYSCALL(__NR_geteuid, sys_geteuid) -#define __NR_getegid 108 -__SYSCALL(__NR_getegid, sys_getegid) -#define __NR_setpgid 109 -__SYSCALL(__NR_setpgid, sys_setpgid) -#define __NR_getppid 110 -__SYSCALL(__NR_getppid, sys_getppid) -#define __NR_getpgrp 111 -__SYSCALL(__NR_getpgrp, sys_getpgrp) - -#define __NR_setsid 112 -__SYSCALL(__NR_setsid, sys_setsid) -#define __NR_setreuid 113 -__SYSCALL(__NR_setreuid, sys_setreuid) -#define __NR_setregid 114 -__SYSCALL(__NR_setregid, sys_setregid) -#define __NR_getgroups 115 -__SYSCALL(__NR_getgroups, sys_getgroups) -#define __NR_setgroups 116 -__SYSCALL(__NR_setgroups, sys_setgroups) -#define __NR_setresuid 117 -__SYSCALL(__NR_setresuid, sys_setresuid) -#define __NR_getresuid 118 -__SYSCALL(__NR_getresuid, sys_getresuid) -#define __NR_setresgid 119 -__SYSCALL(__NR_setresgid, sys_setresgid) - -#define __NR_getresgid 120 -__SYSCALL(__NR_getresgid, sys_getresgid) -#define __NR_getpgid 121 -__SYSCALL(__NR_getpgid, sys_getpgid) -#define __NR_setfsuid 122 -__SYSCALL(__NR_setfsuid, sys_setfsuid) -#define __NR_setfsgid 123 -__SYSCALL(__NR_setfsgid, sys_setfsgid) -#define __NR_getsid 124 -__SYSCALL(__NR_getsid, sys_getsid) -#define __NR_capget 125 -__SYSCALL(__NR_capget, sys_capget) -#define __NR_capset 126 -__SYSCALL(__NR_capset, sys_capset) - -#define __NR_rt_sigpending 127 -__SYSCALL(__NR_rt_sigpending, sys_rt_sigpending) -#define __NR_rt_sigtimedwait 128 -__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait) -#define __NR_rt_sigqueueinfo 129 -__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo) -#define __NR_rt_sigsuspend 130 -__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend) -#define __NR_sigaltstack 131 -__SYSCALL(__NR_sigaltstack, stub_sigaltstack) -#define __NR_utime 132 -__SYSCALL(__NR_utime, sys_utime) -#define __NR_mknod 133 -__SYSCALL(__NR_mknod, sys_mknod) - -/* Only needed for a.out */ -#define __NR_uselib 134 -__SYSCALL(__NR_uselib, sys_ni_syscall) -#define __NR_personality 135 -__SYSCALL(__NR_personality, sys_personality) - -#define __NR_ustat 136 -__SYSCALL(__NR_ustat, sys_ustat) -#define __NR_statfs 137 -__SYSCALL(__NR_statfs, sys_statfs) -#define __NR_fstatfs 138 -__SYSCALL(__NR_fstatfs, sys_fstatfs) -#define __NR_sysfs 139 -__SYSCALL(__NR_sysfs, sys_sysfs) - -#define __NR_getpriority 140 -__SYSCALL(__NR_getpriority, sys_getpriority) -#define __NR_setpriority 141 -__SYSCALL(__NR_setpriority, sys_setpriority) -#define __NR_sched_setparam 142 -__SYSCALL(__NR_sched_setparam, sys_sched_setparam) -#define __NR_sched_getparam 143 -__SYSCALL(__NR_sched_getparam, sys_sched_getparam) -#define __NR_sched_setscheduler 144 -__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler) -#define __NR_sched_getscheduler 145 -__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler) -#define __NR_sched_get_priority_max 146 -__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max) -#define __NR_sched_get_priority_min 147 -__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min) -#define __NR_sched_rr_get_interval 148 -__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval) - -#define __NR_mlock 149 -__SYSCALL(__NR_mlock, sys_mlock) -#define __NR_munlock 150 -__SYSCALL(__NR_munlock, sys_munlock) -#define __NR_mlockall 151 -__SYSCALL(__NR_mlockall, sys_mlockall) -#define __NR_munlockall 152 -__SYSCALL(__NR_munlockall, sys_munlockall) - -#define __NR_vhangup 153 -__SYSCALL(__NR_vhangup, sys_vhangup) - -#define __NR_modify_ldt 154 -__SYSCALL(__NR_modify_ldt, sys_modify_ldt) - -#define __NR_pivot_root 155 -__SYSCALL(__NR_pivot_root, sys_pivot_root) - -#define __NR__sysctl 156 -__SYSCALL(__NR__sysctl, sys_sysctl) - -#define __NR_prctl 157 -__SYSCALL(__NR_prctl, sys_prctl) -#define __NR_arch_prctl 158 -__SYSCALL(__NR_arch_prctl, sys_arch_prctl) - -#define __NR_adjtimex 159 -__SYSCALL(__NR_adjtimex, sys_adjtimex) - -#define __NR_setrlimit 160 -__SYSCALL(__NR_setrlimit, sys_setrlimit) - -#define __NR_chroot 161 -__SYSCALL(__NR_chroot, sys_chroot) - -#define __NR_sync 162 -__SYSCALL(__NR_sync, sys_sync) - -#define __NR_acct 163 -__SYSCALL(__NR_acct, sys_acct) - -#define __NR_settimeofday 164 -__SYSCALL(__NR_settimeofday, sys_settimeofday) - -#define __NR_mount 165 -__SYSCALL(__NR_mount, sys_mount) -#define __NR_umount2 166 -__SYSCALL(__NR_umount2, sys_umount) - -#define __NR_swapon 167 -__SYSCALL(__NR_swapon, sys_swapon) -#define __NR_swapoff 168 -__SYSCALL(__NR_swapoff, sys_swapoff) - -#define __NR_reboot 169 -__SYSCALL(__NR_reboot, sys_reboot) - -#define __NR_sethostname 170 -__SYSCALL(__NR_sethostname, sys_sethostname) -#define __NR_setdomainname 171 -__SYSCALL(__NR_setdomainname, sys_setdomainname) - -#define __NR_iopl 172 -__SYSCALL(__NR_iopl, stub_iopl) -#define __NR_ioperm 173 -__SYSCALL(__NR_ioperm, sys_ioperm) - -#define __NR_create_module 174 -__SYSCALL(__NR_create_module, sys_ni_syscall) -#define __NR_init_module 175 -__SYSCALL(__NR_init_module, sys_init_module) -#define __NR_delete_module 176 -__SYSCALL(__NR_delete_module, sys_delete_module) -#define __NR_get_kernel_syms 177 -__SYSCALL(__NR_get_kernel_syms, sys_ni_syscall) -#define __NR_query_module 178 -__SYSCALL(__NR_query_module, sys_ni_syscall) - -#define __NR_quotactl 179 -__SYSCALL(__NR_quotactl, sys_quotactl) - -#define __NR_nfsservctl 180 -__SYSCALL(__NR_nfsservctl, sys_ni_syscall) - -/* reserved for LiS/STREAMS */ -#define __NR_getpmsg 181 -__SYSCALL(__NR_getpmsg, sys_ni_syscall) -#define __NR_putpmsg 182 -__SYSCALL(__NR_putpmsg, sys_ni_syscall) - -/* reserved for AFS */ -#define __NR_afs_syscall 183 -__SYSCALL(__NR_afs_syscall, sys_ni_syscall) - -/* reserved for tux */ -#define __NR_tuxcall 184 -__SYSCALL(__NR_tuxcall, sys_ni_syscall) - -#define __NR_security 185 -__SYSCALL(__NR_security, sys_ni_syscall) - -#define __NR_gettid 186 -__SYSCALL(__NR_gettid, sys_gettid) - -#define __NR_readahead 187 -__SYSCALL(__NR_readahead, sys_readahead) -#define __NR_setxattr 188 -__SYSCALL(__NR_setxattr, sys_setxattr) -#define __NR_lsetxattr 189 -__SYSCALL(__NR_lsetxattr, sys_lsetxattr) -#define __NR_fsetxattr 190 -__SYSCALL(__NR_fsetxattr, sys_fsetxattr) -#define __NR_getxattr 191 -__SYSCALL(__NR_getxattr, sys_getxattr) -#define __NR_lgetxattr 192 -__SYSCALL(__NR_lgetxattr, sys_lgetxattr) -#define __NR_fgetxattr 193 -__SYSCALL(__NR_fgetxattr, sys_fgetxattr) -#define __NR_listxattr 194 -__SYSCALL(__NR_listxattr, sys_listxattr) -#define __NR_llistxattr 195 -__SYSCALL(__NR_llistxattr, sys_llistxattr) -#define __NR_flistxattr 196 -__SYSCALL(__NR_flistxattr, sys_flistxattr) -#define __NR_removexattr 197 -__SYSCALL(__NR_removexattr, sys_removexattr) -#define __NR_lremovexattr 198 -__SYSCALL(__NR_lremovexattr, sys_lremovexattr) -#define __NR_fremovexattr 199 -__SYSCALL(__NR_fremovexattr, sys_fremovexattr) -#define __NR_tkill 200 -__SYSCALL(__NR_tkill, sys_tkill) -#define __NR_time 201 -__SYSCALL(__NR_time, sys_time) -#define __NR_futex 202 -__SYSCALL(__NR_futex, sys_futex) -#define __NR_sched_setaffinity 203 -__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity) -#define __NR_sched_getaffinity 204 -__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity) -#define __NR_set_thread_area 205 -__SYSCALL(__NR_set_thread_area, sys_ni_syscall) /* use arch_prctl */ -#define __NR_io_setup 206 -__SYSCALL(__NR_io_setup, sys_io_setup) -#define __NR_io_destroy 207 -__SYSCALL(__NR_io_destroy, sys_io_destroy) -#define __NR_io_getevents 208 -__SYSCALL(__NR_io_getevents, sys_io_getevents) -#define __NR_io_submit 209 -__SYSCALL(__NR_io_submit, sys_io_submit) -#define __NR_io_cancel 210 -__SYSCALL(__NR_io_cancel, sys_io_cancel) -#define __NR_get_thread_area 211 -__SYSCALL(__NR_get_thread_area, sys_ni_syscall) /* use arch_prctl */ -#define __NR_lookup_dcookie 212 -__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie) -#define __NR_epoll_create 213 -__SYSCALL(__NR_epoll_create, sys_epoll_create) -#define __NR_epoll_ctl_old 214 -__SYSCALL(__NR_epoll_ctl_old, sys_ni_syscall) -#define __NR_epoll_wait_old 215 -__SYSCALL(__NR_epoll_wait_old, sys_ni_syscall) -#define __NR_remap_file_pages 216 -__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages) -#define __NR_getdents64 217 -__SYSCALL(__NR_getdents64, sys_getdents64) -#define __NR_set_tid_address 218 -__SYSCALL(__NR_set_tid_address, sys_set_tid_address) -#define __NR_restart_syscall 219 -__SYSCALL(__NR_restart_syscall, sys_restart_syscall) -#define __NR_semtimedop 220 -__SYSCALL(__NR_semtimedop, sys_semtimedop) -#define __NR_fadvise64 221 -__SYSCALL(__NR_fadvise64, sys_fadvise64) -#define __NR_timer_create 222 -__SYSCALL(__NR_timer_create, sys_timer_create) -#define __NR_timer_settime 223 -__SYSCALL(__NR_timer_settime, sys_timer_settime) -#define __NR_timer_gettime 224 -__SYSCALL(__NR_timer_gettime, sys_timer_gettime) -#define __NR_timer_getoverrun 225 -__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun) -#define __NR_timer_delete 226 -__SYSCALL(__NR_timer_delete, sys_timer_delete) -#define __NR_clock_settime 227 -__SYSCALL(__NR_clock_settime, sys_clock_settime) -#define __NR_clock_gettime 228 -__SYSCALL(__NR_clock_gettime, sys_clock_gettime) -#define __NR_clock_getres 229 -__SYSCALL(__NR_clock_getres, sys_clock_getres) -#define __NR_clock_nanosleep 230 -__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep) -#define __NR_exit_group 231 -__SYSCALL(__NR_exit_group, sys_exit_group) -#define __NR_epoll_wait 232 -__SYSCALL(__NR_epoll_wait, sys_epoll_wait) -#define __NR_epoll_ctl 233 -__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl) -#define __NR_tgkill 234 -__SYSCALL(__NR_tgkill, sys_tgkill) -#define __NR_utimes 235 -__SYSCALL(__NR_utimes, sys_utimes) -#define __NR_vserver 236 -__SYSCALL(__NR_vserver, sys_ni_syscall) -#define __NR_mbind 237 -__SYSCALL(__NR_mbind, sys_mbind) -#define __NR_set_mempolicy 238 -__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy) -#define __NR_get_mempolicy 239 -__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy) -#define __NR_mq_open 240 -__SYSCALL(__NR_mq_open, sys_mq_open) -#define __NR_mq_unlink 241 -__SYSCALL(__NR_mq_unlink, sys_mq_unlink) -#define __NR_mq_timedsend 242 -__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend) -#define __NR_mq_timedreceive 243 -__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive) -#define __NR_mq_notify 244 -__SYSCALL(__NR_mq_notify, sys_mq_notify) -#define __NR_mq_getsetattr 245 -__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr) -#define __NR_kexec_load 246 -__SYSCALL(__NR_kexec_load, sys_kexec_load) -#define __NR_waitid 247 -__SYSCALL(__NR_waitid, sys_waitid) -#define __NR_add_key 248 -__SYSCALL(__NR_add_key, sys_add_key) -#define __NR_request_key 249 -__SYSCALL(__NR_request_key, sys_request_key) -#define __NR_keyctl 250 -__SYSCALL(__NR_keyctl, sys_keyctl) -#define __NR_ioprio_set 251 -__SYSCALL(__NR_ioprio_set, sys_ioprio_set) -#define __NR_ioprio_get 252 -__SYSCALL(__NR_ioprio_get, sys_ioprio_get) -#define __NR_inotify_init 253 -__SYSCALL(__NR_inotify_init, sys_inotify_init) -#define __NR_inotify_add_watch 254 -__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch) -#define __NR_inotify_rm_watch 255 -__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch) -#define __NR_migrate_pages 256 -__SYSCALL(__NR_migrate_pages, sys_migrate_pages) -#define __NR_openat 257 -__SYSCALL(__NR_openat, sys_openat) -#define __NR_mkdirat 258 -__SYSCALL(__NR_mkdirat, sys_mkdirat) -#define __NR_mknodat 259 -__SYSCALL(__NR_mknodat, sys_mknodat) -#define __NR_fchownat 260 -__SYSCALL(__NR_fchownat, sys_fchownat) -#define __NR_futimesat 261 -__SYSCALL(__NR_futimesat, sys_futimesat) -#define __NR_newfstatat 262 -__SYSCALL(__NR_newfstatat, sys_newfstatat) -#define __NR_unlinkat 263 -__SYSCALL(__NR_unlinkat, sys_unlinkat) -#define __NR_renameat 264 -__SYSCALL(__NR_renameat, sys_renameat) -#define __NR_linkat 265 -__SYSCALL(__NR_linkat, sys_linkat) -#define __NR_symlinkat 266 -__SYSCALL(__NR_symlinkat, sys_symlinkat) -#define __NR_readlinkat 267 -__SYSCALL(__NR_readlinkat, sys_readlinkat) -#define __NR_fchmodat 268 -__SYSCALL(__NR_fchmodat, sys_fchmodat) -#define __NR_faccessat 269 -__SYSCALL(__NR_faccessat, sys_faccessat) -#define __NR_pselect6 270 -__SYSCALL(__NR_pselect6, sys_pselect6) -#define __NR_ppoll 271 -__SYSCALL(__NR_ppoll, sys_ppoll) -#define __NR_unshare 272 -__SYSCALL(__NR_unshare, sys_unshare) -#define __NR_set_robust_list 273 -__SYSCALL(__NR_set_robust_list, sys_set_robust_list) -#define __NR_get_robust_list 274 -__SYSCALL(__NR_get_robust_list, sys_get_robust_list) -#define __NR_splice 275 -__SYSCALL(__NR_splice, sys_splice) -#define __NR_tee 276 -__SYSCALL(__NR_tee, sys_tee) -#define __NR_sync_file_range 277 -__SYSCALL(__NR_sync_file_range, sys_sync_file_range) -#define __NR_vmsplice 278 -__SYSCALL(__NR_vmsplice, sys_vmsplice) -#define __NR_move_pages 279 -__SYSCALL(__NR_move_pages, sys_move_pages) -#define __NR_utimensat 280 -__SYSCALL(__NR_utimensat, sys_utimensat) -#define __NR_epoll_pwait 281 -__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait) -#define __NR_signalfd 282 -__SYSCALL(__NR_signalfd, sys_signalfd) -#define __NR_timerfd_create 283 -__SYSCALL(__NR_timerfd_create, sys_timerfd_create) -#define __NR_eventfd 284 -__SYSCALL(__NR_eventfd, sys_eventfd) -#define __NR_fallocate 285 -__SYSCALL(__NR_fallocate, sys_fallocate) -#define __NR_timerfd_settime 286 -__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime) -#define __NR_timerfd_gettime 287 -__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime) -#define __NR_accept4 288 -__SYSCALL(__NR_accept4, sys_accept4) -#define __NR_signalfd4 289 -__SYSCALL(__NR_signalfd4, sys_signalfd4) -#define __NR_eventfd2 290 -__SYSCALL(__NR_eventfd2, sys_eventfd2) -#define __NR_epoll_create1 291 -__SYSCALL(__NR_epoll_create1, sys_epoll_create1) -#define __NR_dup3 292 -__SYSCALL(__NR_dup3, sys_dup3) -#define __NR_pipe2 293 -__SYSCALL(__NR_pipe2, sys_pipe2) -#define __NR_inotify_init1 294 -__SYSCALL(__NR_inotify_init1, sys_inotify_init1) -#define __NR_preadv 295 -__SYSCALL(__NR_preadv, sys_preadv) -#define __NR_pwritev 296 -__SYSCALL(__NR_pwritev, sys_pwritev) -#define __NR_rt_tgsigqueueinfo 297 -__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) -#define __NR_perf_event_open 298 -__SYSCALL(__NR_perf_event_open, sys_perf_event_open) -#define __NR_recvmmsg 299 -__SYSCALL(__NR_recvmmsg, sys_recvmmsg) -#define __NR_fanotify_init 300 -__SYSCALL(__NR_fanotify_init, sys_fanotify_init) -#define __NR_fanotify_mark 301 -__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) -#define __NR_prlimit64 302 -__SYSCALL(__NR_prlimit64, sys_prlimit64) -#define __NR_name_to_handle_at 303 -__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) -#define __NR_open_by_handle_at 304 -__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) -#define __NR_clock_adjtime 305 -__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) -#define __NR_syncfs 306 -__SYSCALL(__NR_syncfs, sys_syncfs) -#define __NR_sendmmsg 307 -__SYSCALL(__NR_sendmmsg, sys_sendmmsg) -#define __NR_setns 308 -__SYSCALL(__NR_setns, sys_setns) -#define __NR_getcpu 309 -__SYSCALL(__NR_getcpu, sys_getcpu) -#define __NR_process_vm_readv 310 -__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv) -#define __NR_process_vm_writev 311 -__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev) - -#ifndef __NO_STUBS -#define __ARCH_WANT_OLD_READDIR -#define __ARCH_WANT_OLD_STAT -#define __ARCH_WANT_SYS_ALARM -#define __ARCH_WANT_SYS_GETHOSTNAME -#define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK -#define __ARCH_WANT_SYS_SIGNAL -#define __ARCH_WANT_SYS_UTIME -#define __ARCH_WANT_SYS_WAITPID -#define __ARCH_WANT_SYS_SOCKETCALL -#define __ARCH_WANT_SYS_FADVISE64 -#define __ARCH_WANT_SYS_GETPGRP -#define __ARCH_WANT_SYS_LLSEEK -#define __ARCH_WANT_SYS_NICE -#define __ARCH_WANT_SYS_OLD_GETRLIMIT -#define __ARCH_WANT_SYS_OLD_UNAME -#define __ARCH_WANT_SYS_OLDUMOUNT -#define __ARCH_WANT_SYS_SIGPENDING -#define __ARCH_WANT_SYS_SIGPROCMASK -#define __ARCH_WANT_SYS_RT_SIGACTION -#define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_TIME -#define __ARCH_WANT_COMPAT_SYS_TIME -#endif /* __NO_STUBS */ - -#ifdef __KERNEL__ - -#ifndef COMPILE_OFFSETS -#include -#define NR_syscalls (__NR_syscall_max + 1) -#endif - -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_UNISTD_64_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8baca3c..8c473d9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,7 +25,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o +obj-y += syscall_$(BITS).o +obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 395a10e..85d98ab 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -3,6 +3,11 @@ #include #include "../../../drivers/lguest/lg.h" +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls[] = { +#include +}; + /* workaround for a warning with -Wmissing-prototypes */ void foo(void); @@ -76,4 +81,7 @@ void foo(void) OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); #endif + BLANK(); + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + DEFINE(NR_syscalls, sizeof(syscalls)); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72a119..834e897 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,11 +1,12 @@ #include -#define __NO_STUBS 1 -#undef __SYSCALL -#undef _ASM_X86_UNISTD_64_H -#define __SYSCALL(nr, sym) [nr] = 1, -static char syscalls[] = { -#include +#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +static char syscalls_64[] = { +#include +}; +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls_ia32[] = { +#include }; int main(void) @@ -72,7 +73,11 @@ int main(void) OFFSET(TSS_ist, tss_struct, x86_tss.ist); BLANK(); - DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); + DEFINE(NR_syscalls, sizeof(syscalls_64)); + + DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1); + DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); return 0; } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f3f6f53..1ffcda2 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -81,8 +81,6 @@ * enough to patch inline, increasing performance. */ -#define nr_syscalls ((syscall_table_size)/4) - #ifdef CONFIG_PREEMPT #define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else @@ -423,7 +421,7 @@ sysenter_past_esp: testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz sysenter_audit sysenter_do_call: - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) @@ -504,7 +502,7 @@ ENTRY(system_call) # system call tracing in operation / emulation testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jae syscall_badsys syscall_call: call *sys_call_table(,%eax,4) @@ -650,7 +648,7 @@ syscall_trace_entry: movl %esp, %eax call syscall_trace_enter /* What it returned is what we'll actually use. */ - cmpl $(nr_syscalls), %eax + cmpl $(NR_syscalls), %eax jnae syscall_call jmp syscall_exit END(syscall_trace_entry) @@ -690,29 +688,28 @@ END(syscall_badsys) * System calls that need a pt_regs pointer. */ #define PTREGSCALL0(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL1(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%edx; \ movl (PT_EBX+4)(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL2(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ leal 4(%esp),%ecx; \ movl (PT_ECX+4)(%esp),%edx; \ movl (PT_EBX+4)(%esp),%eax; \ - jmp sys_##name; + jmp sys_##name; \ +ENDPROC(ptregs_##name) #define PTREGSCALL3(name) \ - ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \ CFI_STARTPROC; \ leal 4(%esp),%eax; \ pushl_cfi %eax; \ @@ -737,8 +734,7 @@ PTREGSCALL2(vm86) PTREGSCALL1(vm86old) /* Clone is an oddball. The 4th arg is in %edi */ - ALIGN; -ptregs_clone: +ENTRY(ptregs_clone) CFI_STARTPROC leal 4(%esp),%eax pushl_cfi %eax @@ -1209,11 +1205,6 @@ return_to_handler: jmp *%ecx #endif -.section .rodata,"a" -#include "syscall_table_32.S" - -syscall_table_size=(.-sys_call_table) - /* * Some functions should be protected against kprobes */ diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c new file mode 100644 index 0000000..b37a573 --- /dev/null +++ b/arch/x86/kernel/syscall_32.c @@ -0,0 +1,25 @@ +/* System call table for i386. */ + +#include +#include +#include +#include + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#include +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, + +typedef asmlinkage void (*sys_call_ptr_t)(void); + +extern asmlinkage void sys_ni_syscall(void); + +const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { + /* + * Smells like a like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include +}; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 0edfafa..7ac7943 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -5,15 +5,11 @@ #include #include -#define __NO_STUBS +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include +#undef __SYSCALL_64 -#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_UNISTD_64_H -#include - -#undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = sym, -#undef _ASM_X86_UNISTD_64_H +#define __SYSCALL_64(nr, sym, compat) [nr] = sym, typedef void (*sys_call_ptr_t)(void); @@ -25,5 +21,5 @@ const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { * when the & below is removed. */ [0 ... __NR_syscall_max] = &sys_ni_syscall, -#include +#include }; diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S deleted file mode 100644 index 9a0e312..0000000 --- a/arch/x86/kernel/syscall_table_32.S +++ /dev/null @@ -1,350 +0,0 @@ -ENTRY(sys_call_table) - .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ - .long sys_exit - .long ptregs_fork - .long sys_read - .long sys_write - .long sys_open /* 5 */ - .long sys_close - .long sys_waitpid - .long sys_creat - .long sys_link - .long sys_unlink /* 10 */ - .long ptregs_execve - .long sys_chdir - .long sys_time - .long sys_mknod - .long sys_chmod /* 15 */ - .long sys_lchown16 - .long sys_ni_syscall /* old break syscall holder */ - .long sys_stat - .long sys_lseek - .long sys_getpid /* 20 */ - .long sys_mount - .long sys_oldumount - .long sys_setuid16 - .long sys_getuid16 - .long sys_stime /* 25 */ - .long sys_ptrace - .long sys_alarm - .long sys_fstat - .long sys_pause - .long sys_utime /* 30 */ - .long sys_ni_syscall /* old stty syscall holder */ - .long sys_ni_syscall /* old gtty syscall holder */ - .long sys_access - .long sys_nice - .long sys_ni_syscall /* 35 - old ftime syscall holder */ - .long sys_sync - .long sys_kill - .long sys_rename - .long sys_mkdir - .long sys_rmdir /* 40 */ - .long sys_dup - .long sys_pipe - .long sys_times - .long sys_ni_syscall /* old prof syscall holder */ - .long sys_brk /* 45 */ - .long sys_setgid16 - .long sys_getgid16 - .long sys_signal - .long sys_geteuid16 - .long sys_getegid16 /* 50 */ - .long sys_acct - .long sys_umount /* recycled never used phys() */ - .long sys_ni_syscall /* old lock syscall holder */ - .long sys_ioctl - .long sys_fcntl /* 55 */ - .long sys_ni_syscall /* old mpx syscall holder */ - .long sys_setpgid - .long sys_ni_syscall /* old ulimit syscall holder */ - .long sys_olduname - .long sys_umask /* 60 */ - .long sys_chroot - .long sys_ustat - .long sys_dup2 - .long sys_getppid - .long sys_getpgrp /* 65 */ - .long sys_setsid - .long sys_sigaction - .long sys_sgetmask - .long sys_ssetmask - .long sys_setreuid16 /* 70 */ - .long sys_setregid16 - .long sys_sigsuspend - .long sys_sigpending - .long sys_sethostname - .long sys_setrlimit /* 75 */ - .long sys_old_getrlimit - .long sys_getrusage - .long sys_gettimeofday - .long sys_settimeofday - .long sys_getgroups16 /* 80 */ - .long sys_setgroups16 - .long sys_old_select - .long sys_symlink - .long sys_lstat - .long sys_readlink /* 85 */ - .long sys_uselib - .long sys_swapon - .long sys_reboot - .long sys_old_readdir - .long sys_old_mmap /* 90 */ - .long sys_munmap - .long sys_truncate - .long sys_ftruncate - .long sys_fchmod - .long sys_fchown16 /* 95 */ - .long sys_getpriority - .long sys_setpriority - .long sys_ni_syscall /* old profil syscall holder */ - .long sys_statfs - .long sys_fstatfs /* 100 */ - .long sys_ioperm - .long sys_socketcall - .long sys_syslog - .long sys_setitimer - .long sys_getitimer /* 105 */ - .long sys_newstat - .long sys_newlstat - .long sys_newfstat - .long sys_uname - .long ptregs_iopl /* 110 */ - .long sys_vhangup - .long sys_ni_syscall /* old "idle" system call */ - .long ptregs_vm86old - .long sys_wait4 - .long sys_swapoff /* 115 */ - .long sys_sysinfo - .long sys_ipc - .long sys_fsync - .long ptregs_sigreturn - .long ptregs_clone /* 120 */ - .long sys_setdomainname - .long sys_newuname - .long sys_modify_ldt - .long sys_adjtimex - .long sys_mprotect /* 125 */ - .long sys_sigprocmask - .long sys_ni_syscall /* old "create_module" */ - .long sys_init_module - .long sys_delete_module - .long sys_ni_syscall /* 130: old "get_kernel_syms" */ - .long sys_quotactl - .long sys_getpgid - .long sys_fchdir - .long sys_bdflush - .long sys_sysfs /* 135 */ - .long sys_personality - .long sys_ni_syscall /* reserved for afs_syscall */ - .long sys_setfsuid16 - .long sys_setfsgid16 - .long sys_llseek /* 140 */ - .long sys_getdents - .long sys_select - .long sys_flock - .long sys_msync - .long sys_readv /* 145 */ - .long sys_writev - .long sys_getsid - .long sys_fdatasync - .long sys_sysctl - .long sys_mlock /* 150 */ - .long sys_munlock - .long sys_mlockall - .long sys_munlockall - .long sys_sched_setparam - .long sys_sched_getparam /* 155 */ - .long sys_sched_setscheduler - .long sys_sched_getscheduler - .long sys_sched_yield - .long sys_sched_get_priority_max - .long sys_sched_get_priority_min /* 160 */ - .long sys_sched_rr_get_interval - .long sys_nanosleep - .long sys_mremap - .long sys_setresuid16 - .long sys_getresuid16 /* 165 */ - .long ptregs_vm86 - .long sys_ni_syscall /* Old sys_query_module */ - .long sys_poll - .long sys_ni_syscall /* Old nfsservctl */ - .long sys_setresgid16 /* 170 */ - .long sys_getresgid16 - .long sys_prctl - .long ptregs_rt_sigreturn - .long sys_rt_sigaction - .long sys_rt_sigprocmask /* 175 */ - .long sys_rt_sigpending - .long sys_rt_sigtimedwait - .long sys_rt_sigqueueinfo - .long sys_rt_sigsuspend - .long sys_pread64 /* 180 */ - .long sys_pwrite64 - .long sys_chown16 - .long sys_getcwd - .long sys_capget - .long sys_capset /* 185 */ - .long ptregs_sigaltstack - .long sys_sendfile - .long sys_ni_syscall /* reserved for streams1 */ - .long sys_ni_syscall /* reserved for streams2 */ - .long ptregs_vfork /* 190 */ - .long sys_getrlimit - .long sys_mmap_pgoff - .long sys_truncate64 - .long sys_ftruncate64 - .long sys_stat64 /* 195 */ - .long sys_lstat64 - .long sys_fstat64 - .long sys_lchown - .long sys_getuid - .long sys_getgid /* 200 */ - .long sys_geteuid - .long sys_getegid - .long sys_setreuid - .long sys_setregid - .long sys_getgroups /* 205 */ - .long sys_setgroups - .long sys_fchown - .long sys_setresuid - .long sys_getresuid - .long sys_setresgid /* 210 */ - .long sys_getresgid - .long sys_chown - .long sys_setuid - .long sys_setgid - .long sys_setfsuid /* 215 */ - .long sys_setfsgid - .long sys_pivot_root - .long sys_mincore - .long sys_madvise - .long sys_getdents64 /* 220 */ - .long sys_fcntl64 - .long sys_ni_syscall /* reserved for TUX */ - .long sys_ni_syscall - .long sys_gettid - .long sys_readahead /* 225 */ - .long sys_setxattr - .long sys_lsetxattr - .long sys_fsetxattr - .long sys_getxattr - .long sys_lgetxattr /* 230 */ - .long sys_fgetxattr - .long sys_listxattr - .long sys_llistxattr - .long sys_flistxattr - .long sys_removexattr /* 235 */ - .long sys_lremovexattr - .long sys_fremovexattr - .long sys_tkill - .long sys_sendfile64 - .long sys_futex /* 240 */ - .long sys_sched_setaffinity - .long sys_sched_getaffinity - .long sys_set_thread_area - .long sys_get_thread_area - .long sys_io_setup /* 245 */ - .long sys_io_destroy - .long sys_io_getevents - .long sys_io_submit - .long sys_io_cancel - .long sys_fadvise64 /* 250 */ - .long sys_ni_syscall - .long sys_exit_group - .long sys_lookup_dcookie - .long sys_epoll_create - .long sys_epoll_ctl /* 255 */ - .long sys_epoll_wait - .long sys_remap_file_pages - .long sys_set_tid_address - .long sys_timer_create - .long sys_timer_settime /* 260 */ - .long sys_timer_gettime - .long sys_timer_getoverrun - .long sys_timer_delete - .long sys_clock_settime - .long sys_clock_gettime /* 265 */ - .long sys_clock_getres - .long sys_clock_nanosleep - .long sys_statfs64 - .long sys_fstatfs64 - .long sys_tgkill /* 270 */ - .long sys_utimes - .long sys_fadvise64_64 - .long sys_ni_syscall /* sys_vserver */ - .long sys_mbind - .long sys_get_mempolicy - .long sys_set_mempolicy - .long sys_mq_open - .long sys_mq_unlink - .long sys_mq_timedsend - .long sys_mq_timedreceive /* 280 */ - .long sys_mq_notify - .long sys_mq_getsetattr - .long sys_kexec_load - .long sys_waitid - .long sys_ni_syscall /* 285 */ /* available */ - .long sys_add_key - .long sys_request_key - .long sys_keyctl - .long sys_ioprio_set - .long sys_ioprio_get /* 290 */ - .long sys_inotify_init - .long sys_inotify_add_watch - .long sys_inotify_rm_watch - .long sys_migrate_pages - .long sys_openat /* 295 */ - .long sys_mkdirat - .long sys_mknodat - .long sys_fchownat - .long sys_futimesat - .long sys_fstatat64 /* 300 */ - .long sys_unlinkat - .long sys_renameat - .long sys_linkat - .long sys_symlinkat - .long sys_readlinkat /* 305 */ - .long sys_fchmodat - .long sys_faccessat - .long sys_pselect6 - .long sys_ppoll - .long sys_unshare /* 310 */ - .long sys_set_robust_list - .long sys_get_robust_list - .long sys_splice - .long sys_sync_file_range - .long sys_tee /* 315 */ - .long sys_vmsplice - .long sys_move_pages - .long sys_getcpu - .long sys_epoll_pwait - .long sys_utimensat /* 320 */ - .long sys_signalfd - .long sys_timerfd_create - .long sys_eventfd - .long sys_fallocate - .long sys_timerfd_settime /* 325 */ - .long sys_timerfd_gettime - .long sys_signalfd4 - .long sys_eventfd2 - .long sys_epoll_create1 - .long sys_dup3 /* 330 */ - .long sys_pipe2 - .long sys_inotify_init1 - .long sys_preadv - .long sys_pwritev - .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_event_open - .long sys_recvmmsg - .long sys_fanotify_init - .long sys_fanotify_mark - .long sys_prlimit64 /* 340 */ - .long sys_name_to_handle_at - .long sys_open_by_handle_at - .long sys_clock_adjtime - .long sys_syncfs - .long sys_sendmmsg /* 345 */ - .long sys_setns - .long sys_process_vm_readv - .long sys_process_vm_writev -- cgit v1.1 From f14525f9e033f344996905744f41680ea2b877ce Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 Nov 2011 16:03:27 -0800 Subject: x86: Simplify syscallhdr.sh Simplify syscallhdr.sh by letting grep sort out the ABIs that we want, rather than relying on manual list matching. This is safe since the ABI strings already have to consist only of characters which are valid in C macro names. Suggested-by: Matt Helsley Link: http://lkml.kernel.org/r/20111118221558.GA6408@count0.beaverton.ibm.com Signed-off-by: H. Peter Anvin --- arch/x86/syscalls/syscallhdr.sh | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh index 0d473ff..b3c5930 100644 --- a/arch/x86/syscalls/syscallhdr.sh +++ b/arch/x86/syscalls/syscallhdr.sh @@ -2,33 +2,20 @@ in="$1" out="$2" -my_abis=`echo "$3" | tr ',' ' '` +my_abis=`echo "($3)" | tr ',' '|'` prefix="$4" offset="$5" fileguard=_ASM_X86_`basename "$out" | sed \ -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` - -in_list () { - local x - for x in $1; do - if [ x"$x" = x"$2" ]; then - return 0 - fi - done - return 1 -} - -grep '^[0-9]' "$in" | sort -n | ( +grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( echo "#ifndef ${fileguard}" echo "#define ${fileguard} 1" echo "" while read nr abi name entry ; do - if in_list "$my_abis" "$abi"; then - echo "#define __NR_${prefix}${name}" $((nr+offset)) - fi + echo "#define __NR_${prefix}${name}" $((nr+offset)) done echo "" -- cgit v1.1 From 61f1e7e20874e8f11dab69b6a4bf7616badd4fe8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 Nov 2011 16:25:07 -0800 Subject: x86, syscall: Re-fix typo in comment Fix the same typo as was fixed in: b7641d2c x86-64, syscall: Adjust comment spacing and remove typo ... for the new versions of this file (32-bit and IA32 compat). Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1321569446-20433-4-git-send-email-hpa@linux.intel.com --- arch/x86/ia32/syscall_ia32.c | 2 +- arch/x86/kernel/syscall_32.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c index d04d3db..4754ba0 100644 --- a/arch/x86/ia32/syscall_ia32.c +++ b/arch/x86/ia32/syscall_ia32.c @@ -17,7 +17,7 @@ extern void compat_ni_syscall(void); const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { /* - * Smells like a like a compiler bug -- it doesn't work + * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall, diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c index b37a573..147fcd4 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/kernel/syscall_32.c @@ -17,7 +17,7 @@ extern asmlinkage void sys_ni_syscall(void); const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* - * Smells like a like a compiler bug -- it doesn't work + * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ [0 ... __NR_syscall_max] = &sys_ni_syscall, -- cgit v1.1 From 3f86886c72fb68088162c7e08cc7f85282f1860c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 Nov 2011 17:01:19 -0800 Subject: x86, syscall: Allow syscall offset to be symbolic Allow the specified syscall offset to be symbolic, e.g. a macro. For offset system calls, this if nothing else makes the generated code easier to read. Suggested-by: H. J. Lu Link: http://lkml.kernel.org/r/1321569446-20433-7-git-send-email-hpa@linux.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/syscalls/syscallhdr.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/syscalls/syscallhdr.sh index b3c5930..31fd5f1 100644 --- a/arch/x86/syscalls/syscallhdr.sh +++ b/arch/x86/syscalls/syscallhdr.sh @@ -15,7 +15,11 @@ grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( echo "" while read nr abi name entry ; do - echo "#define __NR_${prefix}${name}" $((nr+offset)) + if [ -z "$offset" ]; then + echo "#define __NR_${prefix}${name} $nr" + else + echo "#define __NR_${prefix}${name} ($offset + $nr)" + fi done echo "" -- cgit v1.1 From 937c30d7f560210b0163035edd42b2aef78fed9e Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Wed, 9 Nov 2011 16:26:25 +0200 Subject: crypto: serpent - add 8-way parallel x86_64/SSE2 assembler implementation Patch adds x86_64/SSE2 assembler implementation of serpent cipher. Assembler functions crypt data in eigth block chunks (two 4 block chunk SSE2 operations in parallel to improve performance on out-of-order CPUs). Glue code is based on one from AES-NI implementation, so requests from irq context are redirected to cryptd. v2: - add missing include of linux/module.h (appearently crypto.h used to include module.h, which changed for 3.2 by commit 7c926402a7e8c9b279968fd94efec8700ba3859e) Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios): AMD Phenom II 1055T (fam:16, model:10): size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec 16B 1.03x 1.01x 1.03x 1.05x 1.00x 0.99x 64B 1.00x 1.01x 1.02x 1.04x 1.02x 1.01x 256B 2.34x 2.41x 0.99x 2.43x 2.39x 2.40x 1024B 2.51x 2.57x 1.00x 2.59x 2.56x 2.56x 8192B 2.50x 2.54x 1.00x 2.55x 2.57x 2.57x Intel Celeron T1600 (fam:6, model:15, step:13): size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec 16B 0.97x 0.97x 1.01x 1.01x 1.01x 1.02x 64B 1.00x 1.00x 1.00x 1.02x 1.01x 1.01x 256B 3.41x 3.35x 1.00x 3.39x 3.42x 3.44x 1024B 3.75x 3.72x 0.99x 3.74x 3.75x 3.75x 8192B 3.70x 3.68x 0.99x 3.68x 3.69x 3.69x Full output: http://koti.mbnet.fi/axh/kernel/crypto/phenom-ii-1055t/serpent-generic.txt http://koti.mbnet.fi/axh/kernel/crypto/phenom-ii-1055t/serpent-sse2.txt http://koti.mbnet.fi/axh/kernel/crypto/celeron-t1600/serpent-generic.txt http://koti.mbnet.fi/axh/kernel/crypto/celeron-t1600/serpent-sse2.txt Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/serpent-sse2-x86_64-asm_64.S | 761 +++++++++++++++++++++++++++ arch/x86/crypto/serpent_sse2_glue.c | 719 +++++++++++++++++++++++++ arch/x86/include/asm/serpent.h | 32 ++ 4 files changed, 1514 insertions(+) create mode 100644 arch/x86/crypto/serpent-sse2-x86_64-asm_64.S create mode 100644 arch/x86/crypto/serpent_sse2_glue.c create mode 100644 arch/x86/include/asm/serpent.h (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 3537d4b..12ebdbd 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o +obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o @@ -26,6 +27,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o +serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S new file mode 100644 index 0000000..7f24a15 --- /dev/null +++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S @@ -0,0 +1,761 @@ +/* + * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2) + * + * Copyright (C) 2011 Jussi Kivilinna + * + * Based on crypto/serpent.c by + * Copyright (C) 2002 Dag Arne Osvik + * 2003 Herbert Valerio Riedel + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "serpent-sse2-x86_64-asm_64.S" +.text + +#define CTX %rdi + +/********************************************************************** + 8-way SSE2 serpent + **********************************************************************/ +#define RA1 %xmm0 +#define RB1 %xmm1 +#define RC1 %xmm2 +#define RD1 %xmm3 +#define RE1 %xmm4 + +#define RA2 %xmm5 +#define RB2 %xmm6 +#define RC2 %xmm7 +#define RD2 %xmm8 +#define RE2 %xmm9 + +#define RNOT %xmm10 + +#define RK0 %xmm11 +#define RK1 %xmm12 +#define RK2 %xmm13 +#define RK3 %xmm14 + +#define S0_1(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + por x0, x3; \ + pxor x4, x0; \ + pxor x2, x4; \ + pxor RNOT, x4; \ + pxor x1, x3; \ + pand x0, x1; \ + pxor x4, x1; \ + pxor x0, x2; +#define S0_2(x0, x1, x2, x3, x4) \ + pxor x3, x0; \ + por x0, x4; \ + pxor x2, x0; \ + pand x1, x2; \ + pxor x2, x3; \ + pxor RNOT, x1; \ + pxor x4, x2; \ + pxor x2, x1; + +#define S1_1(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + pxor x0, x1; \ + pxor x3, x0; \ + pxor RNOT, x3; \ + pand x1, x4; \ + por x1, x0; \ + pxor x2, x3; \ + pxor x3, x0; \ + pxor x3, x1; +#define S1_2(x0, x1, x2, x3, x4) \ + pxor x4, x3; \ + por x4, x1; \ + pxor x2, x4; \ + pand x0, x2; \ + pxor x1, x2; \ + por x0, x1; \ + pxor RNOT, x0; \ + pxor x2, x0; \ + pxor x1, x4; + +#define S2_1(x0, x1, x2, x3, x4) \ + pxor RNOT, x3; \ + pxor x0, x1; \ + movdqa x0, x4; \ + pand x2, x0; \ + pxor x3, x0; \ + por x4, x3; \ + pxor x1, x2; \ + pxor x1, x3; \ + pand x0, x1; +#define S2_2(x0, x1, x2, x3, x4) \ + pxor x2, x0; \ + pand x3, x2; \ + por x1, x3; \ + pxor RNOT, x0; \ + pxor x0, x3; \ + pxor x0, x4; \ + pxor x2, x0; \ + por x2, x1; + +#define S3_1(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + pxor x3, x1; \ + por x0, x3; \ + pand x0, x4; \ + pxor x2, x0; \ + pxor x1, x2; \ + pand x3, x1; \ + pxor x3, x2; \ + por x4, x0; \ + pxor x3, x4; +#define S3_2(x0, x1, x2, x3, x4) \ + pxor x0, x1; \ + pand x3, x0; \ + pand x4, x3; \ + pxor x2, x3; \ + por x1, x4; \ + pand x1, x2; \ + pxor x3, x4; \ + pxor x3, x0; \ + pxor x2, x3; + +#define S4_1(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + pand x0, x3; \ + pxor x4, x0; \ + pxor x2, x3; \ + por x4, x2; \ + pxor x1, x0; \ + pxor x3, x4; \ + por x0, x2; \ + pxor x1, x2; +#define S4_2(x0, x1, x2, x3, x4) \ + pand x0, x1; \ + pxor x4, x1; \ + pand x2, x4; \ + pxor x3, x2; \ + pxor x0, x4; \ + por x1, x3; \ + pxor RNOT, x1; \ + pxor x0, x3; + +#define S5_1(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + por x0, x1; \ + pxor x1, x2; \ + pxor RNOT, x3; \ + pxor x0, x4; \ + pxor x2, x0; \ + pand x4, x1; \ + por x3, x4; \ + pxor x0, x4; +#define S5_2(x0, x1, x2, x3, x4) \ + pand x3, x0; \ + pxor x3, x1; \ + pxor x2, x3; \ + pxor x1, x0; \ + pand x4, x2; \ + pxor x2, x1; \ + pand x0, x2; \ + pxor x2, x3; + +#define S6_1(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + pxor x0, x3; \ + pxor x2, x1; \ + pxor x0, x2; \ + pand x3, x0; \ + por x3, x1; \ + pxor RNOT, x4; \ + pxor x1, x0; \ + pxor x2, x1; +#define S6_2(x0, x1, x2, x3, x4) \ + pxor x4, x3; \ + pxor x0, x4; \ + pand x0, x2; \ + pxor x1, x4; \ + pxor x3, x2; \ + pand x1, x3; \ + pxor x0, x3; \ + pxor x2, x1; + +#define S7_1(x0, x1, x2, x3, x4) \ + pxor RNOT, x1; \ + movdqa x1, x4; \ + pxor RNOT, x0; \ + pand x2, x1; \ + pxor x3, x1; \ + por x4, x3; \ + pxor x2, x4; \ + pxor x3, x2; \ + pxor x0, x3; \ + por x1, x0; +#define S7_2(x0, x1, x2, x3, x4) \ + pand x0, x2; \ + pxor x4, x0; \ + pxor x3, x4; \ + pand x0, x3; \ + pxor x1, x4; \ + pxor x4, x2; \ + pxor x1, x3; \ + por x0, x4; \ + pxor x1, x4; + +#define SI0_1(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + pxor x0, x1; \ + por x1, x3; \ + pxor x1, x4; \ + pxor RNOT, x0; \ + pxor x3, x2; \ + pxor x0, x3; \ + pand x1, x0; \ + pxor x2, x0; +#define SI0_2(x0, x1, x2, x3, x4) \ + pand x3, x2; \ + pxor x4, x3; \ + pxor x3, x2; \ + pxor x3, x1; \ + pand x0, x3; \ + pxor x0, x1; \ + pxor x2, x0; \ + pxor x3, x4; + +#define SI1_1(x0, x1, x2, x3, x4) \ + pxor x3, x1; \ + movdqa x0, x4; \ + pxor x2, x0; \ + pxor RNOT, x2; \ + por x1, x4; \ + pxor x3, x4; \ + pand x1, x3; \ + pxor x2, x1; \ + pand x4, x2; +#define SI1_2(x0, x1, x2, x3, x4) \ + pxor x1, x4; \ + por x3, x1; \ + pxor x0, x3; \ + pxor x0, x2; \ + por x4, x0; \ + pxor x4, x2; \ + pxor x0, x1; \ + pxor x1, x4; + +#define SI2_1(x0, x1, x2, x3, x4) \ + pxor x1, x2; \ + movdqa x3, x4; \ + pxor RNOT, x3; \ + por x2, x3; \ + pxor x4, x2; \ + pxor x0, x4; \ + pxor x1, x3; \ + por x2, x1; \ + pxor x0, x2; +#define SI2_2(x0, x1, x2, x3, x4) \ + pxor x4, x1; \ + por x3, x4; \ + pxor x3, x2; \ + pxor x2, x4; \ + pand x1, x2; \ + pxor x3, x2; \ + pxor x4, x3; \ + pxor x0, x4; + +#define SI3_1(x0, x1, x2, x3, x4) \ + pxor x1, x2; \ + movdqa x1, x4; \ + pand x2, x1; \ + pxor x0, x1; \ + por x4, x0; \ + pxor x3, x4; \ + pxor x3, x0; \ + por x1, x3; \ + pxor x2, x1; +#define SI3_2(x0, x1, x2, x3, x4) \ + pxor x3, x1; \ + pxor x2, x0; \ + pxor x3, x2; \ + pand x1, x3; \ + pxor x0, x1; \ + pand x2, x0; \ + pxor x3, x4; \ + pxor x0, x3; \ + pxor x1, x0; + +#define SI4_1(x0, x1, x2, x3, x4) \ + pxor x3, x2; \ + movdqa x0, x4; \ + pand x1, x0; \ + pxor x2, x0; \ + por x3, x2; \ + pxor RNOT, x4; \ + pxor x0, x1; \ + pxor x2, x0; \ + pand x4, x2; +#define SI4_2(x0, x1, x2, x3, x4) \ + pxor x0, x2; \ + por x4, x0; \ + pxor x3, x0; \ + pand x2, x3; \ + pxor x3, x4; \ + pxor x1, x3; \ + pand x0, x1; \ + pxor x1, x4; \ + pxor x3, x0; + +#define SI5_1(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + por x2, x1; \ + pxor x4, x2; \ + pxor x3, x1; \ + pand x4, x3; \ + pxor x3, x2; \ + por x0, x3; \ + pxor RNOT, x0; \ + pxor x2, x3; \ + por x0, x2; +#define SI5_2(x0, x1, x2, x3, x4) \ + pxor x1, x4; \ + pxor x4, x2; \ + pand x0, x4; \ + pxor x1, x0; \ + pxor x3, x1; \ + pand x2, x0; \ + pxor x3, x2; \ + pxor x2, x0; \ + pxor x4, x2; \ + pxor x3, x4; + +#define SI6_1(x0, x1, x2, x3, x4) \ + pxor x2, x0; \ + movdqa x0, x4; \ + pand x3, x0; \ + pxor x3, x2; \ + pxor x2, x0; \ + pxor x1, x3; \ + por x4, x2; \ + pxor x3, x2; \ + pand x0, x3; +#define SI6_2(x0, x1, x2, x3, x4) \ + pxor RNOT, x0; \ + pxor x1, x3; \ + pand x2, x1; \ + pxor x0, x4; \ + pxor x4, x3; \ + pxor x2, x4; \ + pxor x1, x0; \ + pxor x0, x2; + +#define SI7_1(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + pand x0, x3; \ + pxor x2, x0; \ + por x4, x2; \ + pxor x1, x4; \ + pxor RNOT, x0; \ + por x3, x1; \ + pxor x0, x4; \ + pand x2, x0; \ + pxor x1, x0; +#define SI7_2(x0, x1, x2, x3, x4) \ + pand x2, x1; \ + pxor x2, x3; \ + pxor x3, x4; \ + pand x3, x2; \ + por x0, x3; \ + pxor x4, x1; \ + pxor x4, x3; \ + pand x0, x4; \ + pxor x2, x4; + +#define get_key(i, j, t) \ + movd (4*(i)+(j))*4(CTX), t; \ + pshufd $0, t, t; + +#define K2(x0, x1, x2, x3, x4, i) \ + get_key(i, 0, RK0); \ + get_key(i, 1, RK1); \ + get_key(i, 2, RK2); \ + get_key(i, 3, RK3); \ + pxor RK0, x0 ## 1; \ + pxor RK1, x1 ## 1; \ + pxor RK2, x2 ## 1; \ + pxor RK3, x3 ## 1; \ + pxor RK0, x0 ## 2; \ + pxor RK1, x1 ## 2; \ + pxor RK2, x2 ## 2; \ + pxor RK3, x3 ## 2; + +#define LK2(x0, x1, x2, x3, x4, i) \ + movdqa x0 ## 1, x4 ## 1; \ + pslld $13, x0 ## 1; \ + psrld $(32 - 13), x4 ## 1; \ + por x4 ## 1, x0 ## 1; \ + pxor x0 ## 1, x1 ## 1; \ + movdqa x2 ## 1, x4 ## 1; \ + pslld $3, x2 ## 1; \ + psrld $(32 - 3), x4 ## 1; \ + por x4 ## 1, x2 ## 1; \ + pxor x2 ## 1, x1 ## 1; \ + movdqa x0 ## 2, x4 ## 2; \ + pslld $13, x0 ## 2; \ + psrld $(32 - 13), x4 ## 2; \ + por x4 ## 2, x0 ## 2; \ + pxor x0 ## 2, x1 ## 2; \ + movdqa x2 ## 2, x4 ## 2; \ + pslld $3, x2 ## 2; \ + psrld $(32 - 3), x4 ## 2; \ + por x4 ## 2, x2 ## 2; \ + pxor x2 ## 2, x1 ## 2; \ + movdqa x1 ## 1, x4 ## 1; \ + pslld $1, x1 ## 1; \ + psrld $(32 - 1), x4 ## 1; \ + por x4 ## 1, x1 ## 1; \ + movdqa x0 ## 1, x4 ## 1; \ + pslld $3, x4 ## 1; \ + pxor x2 ## 1, x3 ## 1; \ + pxor x4 ## 1, x3 ## 1; \ + movdqa x3 ## 1, x4 ## 1; \ + get_key(i, 1, RK1); \ + movdqa x1 ## 2, x4 ## 2; \ + pslld $1, x1 ## 2; \ + psrld $(32 - 1), x4 ## 2; \ + por x4 ## 2, x1 ## 2; \ + movdqa x0 ## 2, x4 ## 2; \ + pslld $3, x4 ## 2; \ + pxor x2 ## 2, x3 ## 2; \ + pxor x4 ## 2, x3 ## 2; \ + movdqa x3 ## 2, x4 ## 2; \ + get_key(i, 3, RK3); \ + pslld $7, x3 ## 1; \ + psrld $(32 - 7), x4 ## 1; \ + por x4 ## 1, x3 ## 1; \ + movdqa x1 ## 1, x4 ## 1; \ + pslld $7, x4 ## 1; \ + pxor x1 ## 1, x0 ## 1; \ + pxor x3 ## 1, x0 ## 1; \ + pxor x3 ## 1, x2 ## 1; \ + pxor x4 ## 1, x2 ## 1; \ + get_key(i, 0, RK0); \ + pslld $7, x3 ## 2; \ + psrld $(32 - 7), x4 ## 2; \ + por x4 ## 2, x3 ## 2; \ + movdqa x1 ## 2, x4 ## 2; \ + pslld $7, x4 ## 2; \ + pxor x1 ## 2, x0 ## 2; \ + pxor x3 ## 2, x0 ## 2; \ + pxor x3 ## 2, x2 ## 2; \ + pxor x4 ## 2, x2 ## 2; \ + get_key(i, 2, RK2); \ + pxor RK1, x1 ## 1; \ + pxor RK3, x3 ## 1; \ + movdqa x0 ## 1, x4 ## 1; \ + pslld $5, x0 ## 1; \ + psrld $(32 - 5), x4 ## 1; \ + por x4 ## 1, x0 ## 1; \ + movdqa x2 ## 1, x4 ## 1; \ + pslld $22, x2 ## 1; \ + psrld $(32 - 22), x4 ## 1; \ + por x4 ## 1, x2 ## 1; \ + pxor RK0, x0 ## 1; \ + pxor RK2, x2 ## 1; \ + pxor RK1, x1 ## 2; \ + pxor RK3, x3 ## 2; \ + movdqa x0 ## 2, x4 ## 2; \ + pslld $5, x0 ## 2; \ + psrld $(32 - 5), x4 ## 2; \ + por x4 ## 2, x0 ## 2; \ + movdqa x2 ## 2, x4 ## 2; \ + pslld $22, x2 ## 2; \ + psrld $(32 - 22), x4 ## 2; \ + por x4 ## 2, x2 ## 2; \ + pxor RK0, x0 ## 2; \ + pxor RK2, x2 ## 2; + +#define KL2(x0, x1, x2, x3, x4, i) \ + pxor RK0, x0 ## 1; \ + pxor RK2, x2 ## 1; \ + movdqa x0 ## 1, x4 ## 1; \ + psrld $5, x0 ## 1; \ + pslld $(32 - 5), x4 ## 1; \ + por x4 ## 1, x0 ## 1; \ + pxor RK3, x3 ## 1; \ + pxor RK1, x1 ## 1; \ + movdqa x2 ## 1, x4 ## 1; \ + psrld $22, x2 ## 1; \ + pslld $(32 - 22), x4 ## 1; \ + por x4 ## 1, x2 ## 1; \ + pxor x3 ## 1, x2 ## 1; \ + pxor RK0, x0 ## 2; \ + pxor RK2, x2 ## 2; \ + movdqa x0 ## 2, x4 ## 2; \ + psrld $5, x0 ## 2; \ + pslld $(32 - 5), x4 ## 2; \ + por x4 ## 2, x0 ## 2; \ + pxor RK3, x3 ## 2; \ + pxor RK1, x1 ## 2; \ + movdqa x2 ## 2, x4 ## 2; \ + psrld $22, x2 ## 2; \ + pslld $(32 - 22), x4 ## 2; \ + por x4 ## 2, x2 ## 2; \ + pxor x3 ## 2, x2 ## 2; \ + pxor x3 ## 1, x0 ## 1; \ + movdqa x1 ## 1, x4 ## 1; \ + pslld $7, x4 ## 1; \ + pxor x1 ## 1, x0 ## 1; \ + pxor x4 ## 1, x2 ## 1; \ + movdqa x1 ## 1, x4 ## 1; \ + psrld $1, x1 ## 1; \ + pslld $(32 - 1), x4 ## 1; \ + por x4 ## 1, x1 ## 1; \ + pxor x3 ## 2, x0 ## 2; \ + movdqa x1 ## 2, x4 ## 2; \ + pslld $7, x4 ## 2; \ + pxor x1 ## 2, x0 ## 2; \ + pxor x4 ## 2, x2 ## 2; \ + movdqa x1 ## 2, x4 ## 2; \ + psrld $1, x1 ## 2; \ + pslld $(32 - 1), x4 ## 2; \ + por x4 ## 2, x1 ## 2; \ + movdqa x3 ## 1, x4 ## 1; \ + psrld $7, x3 ## 1; \ + pslld $(32 - 7), x4 ## 1; \ + por x4 ## 1, x3 ## 1; \ + pxor x0 ## 1, x1 ## 1; \ + movdqa x0 ## 1, x4 ## 1; \ + pslld $3, x4 ## 1; \ + pxor x4 ## 1, x3 ## 1; \ + movdqa x0 ## 1, x4 ## 1; \ + movdqa x3 ## 2, x4 ## 2; \ + psrld $7, x3 ## 2; \ + pslld $(32 - 7), x4 ## 2; \ + por x4 ## 2, x3 ## 2; \ + pxor x0 ## 2, x1 ## 2; \ + movdqa x0 ## 2, x4 ## 2; \ + pslld $3, x4 ## 2; \ + pxor x4 ## 2, x3 ## 2; \ + movdqa x0 ## 2, x4 ## 2; \ + psrld $13, x0 ## 1; \ + pslld $(32 - 13), x4 ## 1; \ + por x4 ## 1, x0 ## 1; \ + pxor x2 ## 1, x1 ## 1; \ + pxor x2 ## 1, x3 ## 1; \ + movdqa x2 ## 1, x4 ## 1; \ + psrld $3, x2 ## 1; \ + pslld $(32 - 3), x4 ## 1; \ + por x4 ## 1, x2 ## 1; \ + psrld $13, x0 ## 2; \ + pslld $(32 - 13), x4 ## 2; \ + por x4 ## 2, x0 ## 2; \ + pxor x2 ## 2, x1 ## 2; \ + pxor x2 ## 2, x3 ## 2; \ + movdqa x2 ## 2, x4 ## 2; \ + psrld $3, x2 ## 2; \ + pslld $(32 - 3), x4 ## 2; \ + por x4 ## 2, x2 ## 2; + +#define S(SBOX, x0, x1, x2, x3, x4) \ + SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); + +#define SP(SBOX, x0, x1, x2, x3, x4, i) \ + get_key(i, 0, RK0); \ + SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + get_key(i, 2, RK2); \ + SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + get_key(i, 3, RK3); \ + SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \ + get_key(i, 1, RK1); \ + SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \ + +#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ + movdqa x2, t3; \ + movdqa x0, t1; \ + unpcklps x3, t3; \ + movdqa x0, t2; \ + unpcklps x1, t1; \ + unpckhps x1, t2; \ + movdqa t3, x1; \ + unpckhps x3, x2; \ + movdqa t1, x0; \ + movhlps t1, x1; \ + movdqa t2, t1; \ + movlhps t3, x0; \ + movlhps x2, t1; \ + movhlps t2, x2; \ + movdqa x2, x3; \ + movdqa t1, x2; + +#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ + movdqu (0*4*4)(in), x0; \ + movdqu (1*4*4)(in), x1; \ + movdqu (2*4*4)(in), x2; \ + movdqu (3*4*4)(in), x3; \ + \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) + +#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + movdqu x0, (0*4*4)(out); \ + movdqu x1, (1*4*4)(out); \ + movdqu x2, (2*4*4)(out); \ + movdqu x3, (3*4*4)(out); + +#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + movdqu (0*4*4)(out), t0; \ + pxor t0, x0; \ + movdqu x0, (0*4*4)(out); \ + movdqu (1*4*4)(out), t0; \ + pxor t0, x1; \ + movdqu x1, (1*4*4)(out); \ + movdqu (2*4*4)(out), t0; \ + pxor t0, x2; \ + movdqu x2, (2*4*4)(out); \ + movdqu (3*4*4)(out), t0; \ + pxor t0, x3; \ + movdqu x3, (3*4*4)(out); + +.align 8 +.global __serpent_enc_blk_8way +.type __serpent_enc_blk_8way,@function; + +__serpent_enc_blk_8way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool, if true: xor output + */ + + pcmpeqd RNOT, RNOT; + + leaq (4*4*4)(%rdx), %rax; + read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + K2(RA, RB, RC, RD, RE, 0); + S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); + S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2); + S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3); + S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4); + S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5); + S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6); + S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7); + S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8); + S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9); + S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10); + S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11); + S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12); + S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13); + S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14); + S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15); + S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16); + S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17); + S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18); + S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19); + S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20); + S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21); + S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22); + S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23); + S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24); + S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25); + S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26); + S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27); + S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28); + S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29); + S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30); + S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); + S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); + + leaq (4*4*4)(%rsi), %rax; + + testb %cl, %cl; + jnz __enc_xor8; + + write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + ret; + +__enc_xor8: + xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + ret; + +.align 8 +.global serpent_dec_blk_8way +.type serpent_dec_blk_8way,@function; + +serpent_dec_blk_8way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + pcmpeqd RNOT, RNOT; + + leaq (4*4*4)(%rdx), %rax; + read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); + read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + + K2(RA, RB, RC, RD, RE, 32); + SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); + SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30); + SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29); + SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28); + SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27); + SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26); + SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25); + SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24); + SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23); + SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22); + SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21); + SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20); + SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19); + SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18); + SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17); + SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16); + SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15); + SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14); + SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13); + SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12); + SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11); + SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10); + SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9); + SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8); + SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7); + SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6); + SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5); + SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4); + SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3); + SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2); + SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); + S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); + + leaq (4*4*4)(%rsi), %rax; + write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); + write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); + + ret; diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c new file mode 100644 index 0000000..947cf57 --- /dev/null +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -0,0 +1,719 @@ +/* + * Glue Code for SSE2 assembler versions of Serpent Cipher + * + * Copyright (c) 2011 Jussi Kivilinna + * + * Glue code based on aesni-intel_glue.c by: + * Copyright (C) 2008, Intel Corp. + * Author: Huang Ying + * + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: + * Copyright (c) 2006 Herbert Xu + * CTR part based on code (crypto/ctr.c) by: + * (C) Copyright IBM Corp. 2007 - Joy Latten + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct async_serpent_ctx { + struct cryptd_ablkcipher *cryptd_tfm; +}; + +static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes) +{ + if (fpu_enabled) + return true; + + /* SSE2 is only used when chunk to be processed is large enough, so + * do not enable FPU until it is necessary. + */ + if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS) + return false; + + kernel_fpu_begin(); + return true; +} + +static inline void serpent_fpu_end(bool fpu_enabled) +{ + if (fpu_enabled) + kernel_fpu_end(); +} + +static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, + bool enc) +{ + bool fpu_enabled = false; + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = SERPENT_BLOCK_SIZE; + unsigned int nbytes; + int err; + + err = blkcipher_walk_virt(desc, walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); + + /* Process multi-block batch */ + if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { + do { + if (enc) + serpent_enc_blk_xway(ctx, wdst, wsrc); + else + serpent_dec_blk_xway(ctx, wdst, wsrc); + + wsrc += bsize * SERPENT_PARALLEL_BLOCKS; + wdst += bsize * SERPENT_PARALLEL_BLOCKS; + nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; + } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (enc) + __serpent_encrypt(ctx, wdst, wsrc); + else + __serpent_decrypt(ctx, wdst, wsrc); + + wsrc += bsize; + wdst += bsize; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + serpent_fpu_end(fpu_enabled); + return err; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, true); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, false); +} + +static struct crypto_alg blk_ecb_alg = { + .cra_name = "__ecb-serpent-sse2", + .cra_driver_name = "__driver-ecb-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}; + +static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = SERPENT_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 *iv = (u128 *)walk->iv; + + do { + u128_xor(dst, src, iv); + __serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); + return nbytes; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_encrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = SERPENT_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; + u128 last_iv; + int i; + + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + /* Process multi-block batch */ + if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { + do { + nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1); + src -= SERPENT_PARALLEL_BLOCKS - 1; + dst -= SERPENT_PARALLEL_BLOCKS - 1; + + for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++) + ivs[i] = src[i]; + + serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); + + for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++) + u128_xor(dst + (i + 1), dst + (i + 1), ivs + i); + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + for (;;) { + __serpent_decrypt(ctx, (u8 *)dst, (u8 *)src); + + nbytes -= bsize; + if (nbytes < bsize) + break; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } + +done: + u128_xor(dst, dst, (u128 *)walk->iv); + *(u128 *)walk->iv = last_iv; + + return nbytes; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk.nbytes)) { + fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); + nbytes = __cbc_decrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + serpent_fpu_end(fpu_enabled); + return err; +} + +static struct crypto_alg blk_cbc_alg = { + .cra_name = "__cbc-serpent-sse2", + .cra_driver_name = "__driver-cbc-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = serpent_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}; + +static inline void u128_to_be128(be128 *dst, const u128 *src) +{ + dst->a = cpu_to_be64(src->a); + dst->b = cpu_to_be64(src->b); +} + +static inline void be128_to_u128(u128 *dst, const be128 *src) +{ + dst->a = be64_to_cpu(src->a); + dst->b = be64_to_cpu(src->b); +} + +static inline void u128_inc(u128 *i) +{ + i->b++; + if (!i->b) + i->a++; +} + +static void ctr_crypt_final(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + u8 *ctrblk = walk->iv; + u8 keystream[SERPENT_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + + __serpent_encrypt(ctx, keystream, ctrblk); + crypto_xor(keystream, src, nbytes); + memcpy(dst, keystream, nbytes); + + crypto_inc(ctrblk, SERPENT_BLOCK_SIZE); +} + +static unsigned int __ctr_crypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = SERPENT_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ctrblk; + be128 ctrblocks[SERPENT_PARALLEL_BLOCKS]; + int i; + + be128_to_u128(&ctrblk, (be128 *)walk->iv); + + /* Process multi-block batch */ + if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) { + do { + /* create ctrblks for parallel encrypt */ + for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { + if (dst != src) + dst[i] = src[i]; + + u128_to_be128(&ctrblocks[i], &ctrblk); + u128_inc(&ctrblk); + } + + serpent_enc_blk_xway_xor(ctx, (u8 *)dst, + (u8 *)ctrblocks); + + src += SERPENT_PARALLEL_BLOCKS; + dst += SERPENT_PARALLEL_BLOCKS; + nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; + } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (dst != src) + *dst = *src; + + u128_to_be128(&ctrblocks[0], &ctrblk); + u128_inc(&ctrblk); + + __serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); + u128_xor(dst, dst, (u128 *)ctrblocks); + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + u128_to_be128((be128 *)walk->iv, &ctrblk); + return nbytes; +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) { + fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes); + nbytes = __ctr_crypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + serpent_fpu_end(fpu_enabled); + + if (walk.nbytes) { + ctr_crypt_final(desc, &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} + +static struct crypto_alg blk_ctr_alg = { + .cra_name = "__ctr-serpent-sse2", + .cra_driver_name = "__driver-ctr-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = serpent_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}; + +static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; + int err; + + crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ablkcipher_setkey(child, key, key_len); + crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) + & CRYPTO_TFM_RES_MASK); + return err; +} + +static int __ablk_encrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct blkcipher_desc desc; + + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); + desc.info = req->info; + desc.flags = 0; + + return crypto_blkcipher_crt(desc.tfm)->encrypt( + &desc, req->dst, req->src, req->nbytes); +} + +static int ablk_encrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); + + if (!irq_fpu_usable()) { + struct ablkcipher_request *cryptd_req = + ablkcipher_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + + return crypto_ablkcipher_encrypt(cryptd_req); + } else { + return __ablk_encrypt(req); + } +} + +static int ablk_decrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm); + + if (!irq_fpu_usable()) { + struct ablkcipher_request *cryptd_req = + ablkcipher_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + + return crypto_ablkcipher_decrypt(cryptd_req); + } else { + struct blkcipher_desc desc; + + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); + desc.info = req->info; + desc.flags = 0; + + return crypto_blkcipher_crt(desc.tfm)->decrypt( + &desc, req->dst, req->src, req->nbytes); + } +} + +static void ablk_exit(struct crypto_tfm *tfm) +{ + struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_free_ablkcipher(ctx->cryptd_tfm); +} + +static void ablk_init_common(struct crypto_tfm *tfm, + struct cryptd_ablkcipher *cryptd_tfm) +{ + struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->cryptd_tfm = cryptd_tfm; + tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + + crypto_ablkcipher_reqsize(&cryptd_tfm->base); +} + +static int ablk_ecb_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-serpent-sse2", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_ecb_alg = { + .cra_name = "ecb(serpent)", + .cra_driver_name = "ecb-serpent-sse2", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_ecb_alg.cra_list), + .cra_init = ablk_ecb_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; + +static int ablk_cbc_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_cbc_alg = { + .cra_name = "cbc(serpent)", + .cra_driver_name = "cbc-serpent-sse2", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_cbc_alg.cra_list), + .cra_init = ablk_cbc_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = __ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; + +static int ablk_ctr_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_ctr_alg = { + .cra_name = "ctr(serpent)", + .cra_driver_name = "ctr-serpent-sse2", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list), + .cra_init = ablk_ctr_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_encrypt, + .geniv = "chainiv", + }, + }, +}; + +static int __init serpent_sse2_init(void) +{ + int err; + + if (!cpu_has_xmm2) { + printk(KERN_INFO "SSE2 instructions are not detected.\n"); + return -ENODEV; + } + + err = crypto_register_alg(&blk_ecb_alg); + if (err) + goto blk_ecb_err; + err = crypto_register_alg(&blk_cbc_alg); + if (err) + goto blk_cbc_err; + err = crypto_register_alg(&blk_ctr_alg); + if (err) + goto blk_ctr_err; + err = crypto_register_alg(&ablk_ecb_alg); + if (err) + goto ablk_ecb_err; + err = crypto_register_alg(&ablk_cbc_alg); + if (err) + goto ablk_cbc_err; + err = crypto_register_alg(&ablk_ctr_alg); + if (err) + goto ablk_ctr_err; + return err; + +ablk_ctr_err: + crypto_unregister_alg(&ablk_cbc_alg); +ablk_cbc_err: + crypto_unregister_alg(&ablk_ecb_alg); +ablk_ecb_err: + crypto_unregister_alg(&blk_ctr_alg); +blk_ctr_err: + crypto_unregister_alg(&blk_cbc_alg); +blk_cbc_err: + crypto_unregister_alg(&blk_ecb_alg); +blk_ecb_err: + return err; +} + +static void __exit serpent_sse2_exit(void) +{ + crypto_unregister_alg(&ablk_ctr_alg); + crypto_unregister_alg(&ablk_cbc_alg); + crypto_unregister_alg(&ablk_ecb_alg); + crypto_unregister_alg(&blk_ctr_alg); + crypto_unregister_alg(&blk_cbc_alg); + crypto_unregister_alg(&blk_ecb_alg); +} + +module_init(serpent_sse2_init); +module_exit(serpent_sse2_exit); + +MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("serpent"); diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h new file mode 100644 index 0000000..b7fd3b5 --- /dev/null +++ b/arch/x86/include/asm/serpent.h @@ -0,0 +1,32 @@ +#ifndef ASM_X86_SERPENT_H +#define ASM_X86_SERPENT_H + +#include +#include + +#define SERPENT_PARALLEL_BLOCKS 8 + +asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_8way(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_8way(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + serpent_dec_blk_8way(ctx, dst, src); +} + +#endif -- cgit v1.1 From 251496dbfc1be38bc43b49651f3d33c02faccc47 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Wed, 9 Nov 2011 16:26:31 +0200 Subject: crypto: serpent - add 4-way parallel i586/SSE2 assembler implementation Patch adds i586/SSE2 assembler implementation of serpent cipher. Assembler functions crypt data in four block chunks. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios): Intel Atom N270: size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec 16 0.95x 1.12x 1.02x 1.07x 0.97x 0.98x 64 1.73x 1.82x 1.08x 1.82x 1.72x 1.73x 256 2.08x 2.00x 1.04x 2.07x 1.99x 2.01x 1024 2.28x 2.18x 1.05x 2.23x 2.17x 2.20x 8192 2.28x 2.13x 1.05x 2.23x 2.18x 2.20x Full output: http://koti.mbnet.fi/axh/kernel/crypto/atom-n270/serpent-generic.txt http://koti.mbnet.fi/axh/kernel/crypto/atom-n270/serpent-sse2.txt Userspace test results: Encryption/decryption of sse2-i586 vs generic on Intel Atom N270: encrypt: 2.35x decrypt: 2.54x Encryption/decryption of sse2-i586 vs generic on AMD Phenom II: encrypt: 1.82x decrypt: 2.51x Encryption/decryption of sse2-i586 vs generic on Intel Xeon E7330: encrypt: 2.99x decrypt: 3.48x Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/serpent-sse2-i586-asm_32.S | 638 +++++++++++++++++++++++++++++ arch/x86/include/asm/serpent.h | 31 ++ 3 files changed, 671 insertions(+) create mode 100644 arch/x86/crypto/serpent-sse2-i586-asm_32.S (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 12ebdbd..2b0b963 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o +obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o @@ -21,6 +22,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o +serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S new file mode 100644 index 0000000..4e37677 --- /dev/null +++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S @@ -0,0 +1,638 @@ +/* + * Serpent Cipher 4-way parallel algorithm (i586/SSE2) + * + * Copyright (C) 2011 Jussi Kivilinna + * + * Based on crypto/serpent.c by + * Copyright (C) 2002 Dag Arne Osvik + * 2003 Herbert Valerio Riedel + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "serpent-sse2-i586-asm_32.S" +.text + +#define arg_ctx 4 +#define arg_dst 8 +#define arg_src 12 +#define arg_xor 16 + +/********************************************************************** + 4-way SSE2 serpent + **********************************************************************/ +#define CTX %edx + +#define RA %xmm0 +#define RB %xmm1 +#define RC %xmm2 +#define RD %xmm3 +#define RE %xmm4 + +#define RT0 %xmm5 +#define RT1 %xmm6 + +#define RNOT %xmm7 + +#define get_key(i, j, t) \ + movd (4*(i)+(j))*4(CTX), t; \ + pshufd $0, t, t; + +#define K(x0, x1, x2, x3, x4, i) \ + get_key(i, 0, x4); \ + get_key(i, 1, RT0); \ + get_key(i, 2, RT1); \ + pxor x4, x0; \ + pxor RT0, x1; \ + pxor RT1, x2; \ + get_key(i, 3, x4); \ + pxor x4, x3; + +#define LK(x0, x1, x2, x3, x4, i) \ + movdqa x0, x4; \ + pslld $13, x0; \ + psrld $(32 - 13), x4; \ + por x4, x0; \ + pxor x0, x1; \ + movdqa x2, x4; \ + pslld $3, x2; \ + psrld $(32 - 3), x4; \ + por x4, x2; \ + pxor x2, x1; \ + movdqa x1, x4; \ + pslld $1, x1; \ + psrld $(32 - 1), x4; \ + por x4, x1; \ + movdqa x0, x4; \ + pslld $3, x4; \ + pxor x2, x3; \ + pxor x4, x3; \ + movdqa x3, x4; \ + pslld $7, x3; \ + psrld $(32 - 7), x4; \ + por x4, x3; \ + movdqa x1, x4; \ + pslld $7, x4; \ + pxor x1, x0; \ + pxor x3, x0; \ + pxor x3, x2; \ + pxor x4, x2; \ + movdqa x0, x4; \ + get_key(i, 1, RT0); \ + pxor RT0, x1; \ + get_key(i, 3, RT0); \ + pxor RT0, x3; \ + pslld $5, x0; \ + psrld $(32 - 5), x4; \ + por x4, x0; \ + movdqa x2, x4; \ + pslld $22, x2; \ + psrld $(32 - 22), x4; \ + por x4, x2; \ + get_key(i, 0, RT0); \ + pxor RT0, x0; \ + get_key(i, 2, RT0); \ + pxor RT0, x2; + +#define KL(x0, x1, x2, x3, x4, i) \ + K(x0, x1, x2, x3, x4, i); \ + movdqa x0, x4; \ + psrld $5, x0; \ + pslld $(32 - 5), x4; \ + por x4, x0; \ + movdqa x2, x4; \ + psrld $22, x2; \ + pslld $(32 - 22), x4; \ + por x4, x2; \ + pxor x3, x2; \ + pxor x3, x0; \ + movdqa x1, x4; \ + pslld $7, x4; \ + pxor x1, x0; \ + pxor x4, x2; \ + movdqa x1, x4; \ + psrld $1, x1; \ + pslld $(32 - 1), x4; \ + por x4, x1; \ + movdqa x3, x4; \ + psrld $7, x3; \ + pslld $(32 - 7), x4; \ + por x4, x3; \ + pxor x0, x1; \ + movdqa x0, x4; \ + pslld $3, x4; \ + pxor x4, x3; \ + movdqa x0, x4; \ + psrld $13, x0; \ + pslld $(32 - 13), x4; \ + por x4, x0; \ + pxor x2, x1; \ + pxor x2, x3; \ + movdqa x2, x4; \ + psrld $3, x2; \ + pslld $(32 - 3), x4; \ + por x4, x2; + +#define S0(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + por x0, x3; \ + pxor x4, x0; \ + pxor x2, x4; \ + pxor RNOT, x4; \ + pxor x1, x3; \ + pand x0, x1; \ + pxor x4, x1; \ + pxor x0, x2; \ + pxor x3, x0; \ + por x0, x4; \ + pxor x2, x0; \ + pand x1, x2; \ + pxor x2, x3; \ + pxor RNOT, x1; \ + pxor x4, x2; \ + pxor x2, x1; + +#define S1(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + pxor x0, x1; \ + pxor x3, x0; \ + pxor RNOT, x3; \ + pand x1, x4; \ + por x1, x0; \ + pxor x2, x3; \ + pxor x3, x0; \ + pxor x3, x1; \ + pxor x4, x3; \ + por x4, x1; \ + pxor x2, x4; \ + pand x0, x2; \ + pxor x1, x2; \ + por x0, x1; \ + pxor RNOT, x0; \ + pxor x2, x0; \ + pxor x1, x4; + +#define S2(x0, x1, x2, x3, x4) \ + pxor RNOT, x3; \ + pxor x0, x1; \ + movdqa x0, x4; \ + pand x2, x0; \ + pxor x3, x0; \ + por x4, x3; \ + pxor x1, x2; \ + pxor x1, x3; \ + pand x0, x1; \ + pxor x2, x0; \ + pand x3, x2; \ + por x1, x3; \ + pxor RNOT, x0; \ + pxor x0, x3; \ + pxor x0, x4; \ + pxor x2, x0; \ + por x2, x1; + +#define S3(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + pxor x3, x1; \ + por x0, x3; \ + pand x0, x4; \ + pxor x2, x0; \ + pxor x1, x2; \ + pand x3, x1; \ + pxor x3, x2; \ + por x4, x0; \ + pxor x3, x4; \ + pxor x0, x1; \ + pand x3, x0; \ + pand x4, x3; \ + pxor x2, x3; \ + por x1, x4; \ + pand x1, x2; \ + pxor x3, x4; \ + pxor x3, x0; \ + pxor x2, x3; + +#define S4(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + pand x0, x3; \ + pxor x4, x0; \ + pxor x2, x3; \ + por x4, x2; \ + pxor x1, x0; \ + pxor x3, x4; \ + por x0, x2; \ + pxor x1, x2; \ + pand x0, x1; \ + pxor x4, x1; \ + pand x2, x4; \ + pxor x3, x2; \ + pxor x0, x4; \ + por x1, x3; \ + pxor RNOT, x1; \ + pxor x0, x3; + +#define S5(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + por x0, x1; \ + pxor x1, x2; \ + pxor RNOT, x3; \ + pxor x0, x4; \ + pxor x2, x0; \ + pand x4, x1; \ + por x3, x4; \ + pxor x0, x4; \ + pand x3, x0; \ + pxor x3, x1; \ + pxor x2, x3; \ + pxor x1, x0; \ + pand x4, x2; \ + pxor x2, x1; \ + pand x0, x2; \ + pxor x2, x3; + +#define S6(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + pxor x0, x3; \ + pxor x2, x1; \ + pxor x0, x2; \ + pand x3, x0; \ + por x3, x1; \ + pxor RNOT, x4; \ + pxor x1, x0; \ + pxor x2, x1; \ + pxor x4, x3; \ + pxor x0, x4; \ + pand x0, x2; \ + pxor x1, x4; \ + pxor x3, x2; \ + pand x1, x3; \ + pxor x0, x3; \ + pxor x2, x1; + +#define S7(x0, x1, x2, x3, x4) \ + pxor RNOT, x1; \ + movdqa x1, x4; \ + pxor RNOT, x0; \ + pand x2, x1; \ + pxor x3, x1; \ + por x4, x3; \ + pxor x2, x4; \ + pxor x3, x2; \ + pxor x0, x3; \ + por x1, x0; \ + pand x0, x2; \ + pxor x4, x0; \ + pxor x3, x4; \ + pand x0, x3; \ + pxor x1, x4; \ + pxor x4, x2; \ + pxor x1, x3; \ + por x0, x4; \ + pxor x1, x4; + +#define SI0(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + pxor x0, x1; \ + por x1, x3; \ + pxor x1, x4; \ + pxor RNOT, x0; \ + pxor x3, x2; \ + pxor x0, x3; \ + pand x1, x0; \ + pxor x2, x0; \ + pand x3, x2; \ + pxor x4, x3; \ + pxor x3, x2; \ + pxor x3, x1; \ + pand x0, x3; \ + pxor x0, x1; \ + pxor x2, x0; \ + pxor x3, x4; + +#define SI1(x0, x1, x2, x3, x4) \ + pxor x3, x1; \ + movdqa x0, x4; \ + pxor x2, x0; \ + pxor RNOT, x2; \ + por x1, x4; \ + pxor x3, x4; \ + pand x1, x3; \ + pxor x2, x1; \ + pand x4, x2; \ + pxor x1, x4; \ + por x3, x1; \ + pxor x0, x3; \ + pxor x0, x2; \ + por x4, x0; \ + pxor x4, x2; \ + pxor x0, x1; \ + pxor x1, x4; + +#define SI2(x0, x1, x2, x3, x4) \ + pxor x1, x2; \ + movdqa x3, x4; \ + pxor RNOT, x3; \ + por x2, x3; \ + pxor x4, x2; \ + pxor x0, x4; \ + pxor x1, x3; \ + por x2, x1; \ + pxor x0, x2; \ + pxor x4, x1; \ + por x3, x4; \ + pxor x3, x2; \ + pxor x2, x4; \ + pand x1, x2; \ + pxor x3, x2; \ + pxor x4, x3; \ + pxor x0, x4; + +#define SI3(x0, x1, x2, x3, x4) \ + pxor x1, x2; \ + movdqa x1, x4; \ + pand x2, x1; \ + pxor x0, x1; \ + por x4, x0; \ + pxor x3, x4; \ + pxor x3, x0; \ + por x1, x3; \ + pxor x2, x1; \ + pxor x3, x1; \ + pxor x2, x0; \ + pxor x3, x2; \ + pand x1, x3; \ + pxor x0, x1; \ + pand x2, x0; \ + pxor x3, x4; \ + pxor x0, x3; \ + pxor x1, x0; + +#define SI4(x0, x1, x2, x3, x4) \ + pxor x3, x2; \ + movdqa x0, x4; \ + pand x1, x0; \ + pxor x2, x0; \ + por x3, x2; \ + pxor RNOT, x4; \ + pxor x0, x1; \ + pxor x2, x0; \ + pand x4, x2; \ + pxor x0, x2; \ + por x4, x0; \ + pxor x3, x0; \ + pand x2, x3; \ + pxor x3, x4; \ + pxor x1, x3; \ + pand x0, x1; \ + pxor x1, x4; \ + pxor x3, x0; + +#define SI5(x0, x1, x2, x3, x4) \ + movdqa x1, x4; \ + por x2, x1; \ + pxor x4, x2; \ + pxor x3, x1; \ + pand x4, x3; \ + pxor x3, x2; \ + por x0, x3; \ + pxor RNOT, x0; \ + pxor x2, x3; \ + por x0, x2; \ + pxor x1, x4; \ + pxor x4, x2; \ + pand x0, x4; \ + pxor x1, x0; \ + pxor x3, x1; \ + pand x2, x0; \ + pxor x3, x2; \ + pxor x2, x0; \ + pxor x4, x2; \ + pxor x3, x4; + +#define SI6(x0, x1, x2, x3, x4) \ + pxor x2, x0; \ + movdqa x0, x4; \ + pand x3, x0; \ + pxor x3, x2; \ + pxor x2, x0; \ + pxor x1, x3; \ + por x4, x2; \ + pxor x3, x2; \ + pand x0, x3; \ + pxor RNOT, x0; \ + pxor x1, x3; \ + pand x2, x1; \ + pxor x0, x4; \ + pxor x4, x3; \ + pxor x2, x4; \ + pxor x1, x0; \ + pxor x0, x2; + +#define SI7(x0, x1, x2, x3, x4) \ + movdqa x3, x4; \ + pand x0, x3; \ + pxor x2, x0; \ + por x4, x2; \ + pxor x1, x4; \ + pxor RNOT, x0; \ + por x3, x1; \ + pxor x0, x4; \ + pand x2, x0; \ + pxor x1, x0; \ + pand x2, x1; \ + pxor x2, x3; \ + pxor x3, x4; \ + pand x3, x2; \ + por x0, x3; \ + pxor x4, x1; \ + pxor x4, x3; \ + pand x0, x4; \ + pxor x2, x4; + +#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \ + movdqa x2, t3; \ + movdqa x0, t1; \ + unpcklps x3, t3; \ + movdqa x0, t2; \ + unpcklps x1, t1; \ + unpckhps x1, t2; \ + movdqa t3, x1; \ + unpckhps x3, x2; \ + movdqa t1, x0; \ + movhlps t1, x1; \ + movdqa t2, t1; \ + movlhps t3, x0; \ + movlhps x2, t1; \ + movhlps t2, x2; \ + movdqa x2, x3; \ + movdqa t1, x2; + +#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ + movdqu (0*4*4)(in), x0; \ + movdqu (1*4*4)(in), x1; \ + movdqu (2*4*4)(in), x2; \ + movdqu (3*4*4)(in), x3; \ + \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) + +#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + movdqu x0, (0*4*4)(out); \ + movdqu x1, (1*4*4)(out); \ + movdqu x2, (2*4*4)(out); \ + movdqu x3, (3*4*4)(out); + +#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ + \ + movdqu (0*4*4)(out), t0; \ + pxor t0, x0; \ + movdqu x0, (0*4*4)(out); \ + movdqu (1*4*4)(out), t0; \ + pxor t0, x1; \ + movdqu x1, (1*4*4)(out); \ + movdqu (2*4*4)(out), t0; \ + pxor t0, x2; \ + movdqu x2, (2*4*4)(out); \ + movdqu (3*4*4)(out), t0; \ + pxor t0, x3; \ + movdqu x3, (3*4*4)(out); + +.align 8 +.global __serpent_enc_blk_4way +.type __serpent_enc_blk_4way,@function; + +__serpent_enc_blk_4way: + /* input: + * arg_ctx(%esp): ctx, CTX + * arg_dst(%esp): dst + * arg_src(%esp): src + * arg_xor(%esp): bool, if true: xor output + */ + + pcmpeqd RNOT, RNOT; + + movl arg_ctx(%esp), CTX; + + movl arg_src(%esp), %eax; + read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); + + K(RA, RB, RC, RD, RE, 0); + S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1); + S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2); + S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3); + S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4); + S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5); + S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6); + S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7); + S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8); + S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9); + S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10); + S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11); + S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12); + S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13); + S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14); + S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15); + S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16); + S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17); + S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18); + S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19); + S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20); + S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21); + S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22); + S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23); + S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24); + S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25); + S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26); + S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27); + S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28); + S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29); + S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30); + S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31); + S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32); + + movl arg_dst(%esp), %eax; + + cmpb $0, arg_xor(%esp); + jnz __enc_xor4; + + write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); + + ret; + +__enc_xor4: + xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); + + ret; + +.align 8 +.global serpent_dec_blk_4way +.type serpent_dec_blk_4way,@function; + +serpent_dec_blk_4way: + /* input: + * arg_ctx(%esp): ctx, CTX + * arg_dst(%esp): dst + * arg_src(%esp): src + */ + + pcmpeqd RNOT, RNOT; + + movl arg_ctx(%esp), CTX; + + movl arg_src(%esp), %eax; + read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE); + + K(RA, RB, RC, RD, RE, 32); + SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31); + SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30); + SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29); + SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28); + SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27); + SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26); + SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25); + SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24); + SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23); + SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22); + SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21); + SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20); + SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19); + SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18); + SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17); + SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16); + SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15); + SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14); + SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13); + SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12); + SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11); + SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10); + SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9); + SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8); + SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7); + SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6); + SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5); + SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4); + SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3); + SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2); + SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1); + SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0); + + movl arg_dst(%esp), %eax; + write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA); + + ret; diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h index b7fd3b5..d3ef63f 100644 --- a/arch/x86/include/asm/serpent.h +++ b/arch/x86/include/asm/serpent.h @@ -4,6 +4,35 @@ #include #include +#ifdef CONFIG_X86_32 + +#define SERPENT_PARALLEL_BLOCKS 4 + +asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_4way(ctx, dst, src, false); +} + +static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + __serpent_enc_blk_4way(ctx, dst, src, true); +} + +static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, + const u8 *src) +{ + serpent_dec_blk_4way(ctx, dst, src); +} + +#else + #define SERPENT_PARALLEL_BLOCKS 8 asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst, @@ -30,3 +59,5 @@ static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, } #endif + +#endif -- cgit v1.1 From 18482053f92b099663bd36a10e8f6bd2c8544669 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Wed, 9 Nov 2011 16:26:36 +0200 Subject: crypto: serpent-sse2 - add lrw support Patch adds LRW support for serpent-sse2 by using lrw_crypt(). Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios): Benchmark results with tcrypt: Intel Celeron T1600 (x86_64) (fam:6, model:15, step:13): size lrw-enc lrw-dec 16B 1.00x 0.96x 64B 1.01x 1.01x 256B 3.01x 2.97x 1024B 3.39x 3.33x 8192B 3.35x 3.33x AMD Phenom II 1055T (x86_64) (fam:16, model:10): size lrw-enc lrw-dec 16B 0.98x 1.03x 64B 1.01x 1.04x 256B 2.10x 2.14x 1024B 2.28x 2.33x 8192B 2.30x 2.33x Intel Atom N270 (i586): size lrw-enc lrw-dec 16B 0.97x 0.97x 64B 1.47x 1.50x 256B 1.72x 1.69x 1024B 1.88x 1.81x 8192B 1.84x 1.79x Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent_sse2_glue.c | 211 ++++++++++++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 947cf57..db318e5 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -39,12 +39,17 @@ #include #include #include +#include #include #include #include #include #include +#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE) +#define HAS_LRW +#endif + struct async_serpent_ctx { struct cryptd_ablkcipher *cryptd_tfm; }; @@ -460,6 +465,152 @@ static struct crypto_alg blk_ctr_alg = { }, }; +#ifdef HAS_LRW + +struct crypt_priv { + struct serpent_ctx *ctx; + bool fpu_enabled; +}; + +static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = SERPENT_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { + serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + __serpent_encrypt(ctx->ctx, srcdst, srcdst); +} + +static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) +{ + const unsigned int bsize = SERPENT_BLOCK_SIZE; + struct crypt_priv *ctx = priv; + int i; + + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); + + if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { + serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); + return; + } + + for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) + __serpent_decrypt(ctx->ctx, srcdst, srcdst); +} + +struct serpent_lrw_ctx { + struct lrw_table_ctx lrw_table; + struct serpent_ctx serpent_ctx; +}; + +static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + int err; + + err = __serpent_setkey(&ctx->serpent_ctx, key, keylen - + SERPENT_BLOCK_SIZE); + if (err) + return err; + + return lrw_init_table(&ctx->lrw_table, key + keylen - + SERPENT_BLOCK_SIZE); +} + +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->serpent_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + ret = lrw_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->serpent_ctx, + .fpu_enabled = false, + }; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + ret = lrw_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static void lrw_exit_tfm(struct crypto_tfm *tfm) +{ + struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + + lrw_free_table(&ctx->lrw_table); +} + +static struct crypto_alg blk_lrw_alg = { + .cra_name = "__lrw-serpent-sse2", + .cra_driver_name = "__driver-lrw-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_lrw_alg.cra_list), + .cra_exit = lrw_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = lrw_serpent_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}; + +#endif + static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, unsigned int key_len) { @@ -658,6 +809,48 @@ static struct crypto_alg ablk_ctr_alg = { }, }; +#ifdef HAS_LRW + +static int ablk_lrw_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-lrw-serpent-sse2", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_lrw_alg = { + .cra_name = "lrw(serpent)", + .cra_driver_name = "lrw-serpent-sse2", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list), + .cra_init = ablk_lrw_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .max_keysize = SERPENT_MAX_KEY_SIZE + + SERPENT_BLOCK_SIZE, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; + +#endif + static int __init serpent_sse2_init(void) { int err; @@ -685,8 +878,22 @@ static int __init serpent_sse2_init(void) err = crypto_register_alg(&ablk_ctr_alg); if (err) goto ablk_ctr_err; +#ifdef HAS_LRW + err = crypto_register_alg(&blk_lrw_alg); + if (err) + goto blk_lrw_err; + err = crypto_register_alg(&ablk_lrw_alg); + if (err) + goto ablk_lrw_err; +#endif return err; +#ifdef HAS_LRW +ablk_lrw_err: + crypto_unregister_alg(&blk_lrw_alg); +blk_lrw_err: + crypto_unregister_alg(&ablk_ctr_alg); +#endif ablk_ctr_err: crypto_unregister_alg(&ablk_cbc_alg); ablk_cbc_err: @@ -703,6 +910,10 @@ blk_ecb_err: static void __exit serpent_sse2_exit(void) { +#ifdef HAS_LRW + crypto_unregister_alg(&ablk_lrw_alg); + crypto_unregister_alg(&blk_lrw_alg); +#endif crypto_unregister_alg(&ablk_ctr_alg); crypto_unregister_alg(&ablk_cbc_alg); crypto_unregister_alg(&ablk_ecb_alg); -- cgit v1.1 From 5962f8b66dd040ad89d55b58967ea2dec607f4d3 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Wed, 9 Nov 2011 16:26:41 +0200 Subject: crypto: serpent-sse2 - add xts support Patch adds XTS support for serpent-sse2 by using xts_crypt(). Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios): Intel Celeron T1600 (x86_64) (fam:6, model:15, step:13): size xts-enc xts-dec 16B 0.98x 1.00x 64B 1.00x 1.01x 256B 2.78x 2.75x 1024B 3.30x 3.26x 8192B 3.39x 3.30x AMD Phenom II 1055T (x86_64) (fam:16, model:10): size xts-enc xts-dec 16B 1.05x 1.02x 64B 1.04x 1.03x 256B 2.10x 2.05x 1024B 2.34x 2.35x 8192B 2.34x 2.40x Intel Atom N270 (i586): size xts-enc xts-dec 16B 0.95x 0.96x 64B 1.53x 1.50x 256B 1.72x 1.75x 1024B 1.88x 1.87x 8192B 1.86x 1.83x Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent_sse2_glue.c | 180 +++++++++++++++++++++++++++++++++++- 1 file changed, 178 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index db318e5..2dffc5a 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,10 @@ #define HAS_LRW #endif +#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE) +#define HAS_XTS +#endif + struct async_serpent_ctx { struct cryptd_ablkcipher *cryptd_tfm; }; @@ -465,7 +470,7 @@ static struct crypto_alg blk_ctr_alg = { }, }; -#ifdef HAS_LRW +#if defined(HAS_LRW) || defined(HAS_XTS) struct crypt_priv { struct serpent_ctx *ctx; @@ -506,6 +511,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) __serpent_decrypt(ctx->ctx, srcdst, srcdst); } +#endif + +#ifdef HAS_LRW + struct serpent_lrw_ctx { struct lrw_table_ctx lrw_table; struct serpent_ctx serpent_ctx; @@ -611,6 +620,114 @@ static struct crypto_alg blk_lrw_alg = { #endif +#ifdef HAS_XTS + +struct serpent_xts_ctx { + struct serpent_ctx tweak_ctx; + struct serpent_ctx crypt_ctx; +}; + +static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + int err; + + /* key consists of keys of equal size concatenated, therefore + * the length must be even + */ + if (keylen % 2) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + /* first half of xts-key is for crypt */ + err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2); +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), + .crypt_ctx = &crypt_ctx, + .crypt_fn = encrypt_callback, + }; + int ret; + + ret = xts_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[SERPENT_PARALLEL_BLOCKS]; + struct crypt_priv crypt_ctx = { + .ctx = &ctx->crypt_ctx, + .fpu_enabled = false, + }; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = &ctx->tweak_ctx, + .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt), + .crypt_ctx = &crypt_ctx, + .crypt_fn = decrypt_callback, + }; + int ret; + + ret = xts_crypt(desc, dst, src, nbytes, &req); + serpent_fpu_end(crypt_ctx.fpu_enabled); + + return ret; +} + +static struct crypto_alg blk_xts_alg = { + .cra_name = "__xts-serpent-sse2", + .cra_driver_name = "__driver-xts-serpent-sse2", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct serpent_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_xts_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE * 2, + .max_keysize = SERPENT_MAX_KEY_SIZE * 2, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = xts_serpent_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}; + +#endif + static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, unsigned int key_len) { @@ -851,6 +968,46 @@ static struct crypto_alg ablk_lrw_alg = { #endif +#ifdef HAS_XTS + +static int ablk_xts_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-xts-serpent-sse2", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_xts_alg = { + .cra_name = "xts(serpent)", + .cra_driver_name = "xts-serpent-sse2", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = SERPENT_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_serpent_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list), + .cra_init = ablk_xts_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = SERPENT_MIN_KEY_SIZE * 2, + .max_keysize = SERPENT_MAX_KEY_SIZE * 2, + .ivsize = SERPENT_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}; + +#endif + static int __init serpent_sse2_init(void) { int err; @@ -886,14 +1043,29 @@ static int __init serpent_sse2_init(void) if (err) goto ablk_lrw_err; #endif +#ifdef HAS_XTS + err = crypto_register_alg(&blk_xts_alg); + if (err) + goto blk_xts_err; + err = crypto_register_alg(&ablk_xts_alg); + if (err) + goto ablk_xts_err; +#endif return err; +#ifdef HAS_XTS + crypto_unregister_alg(&ablk_xts_alg); +ablk_xts_err: + crypto_unregister_alg(&blk_xts_alg); +blk_xts_err: +#endif #ifdef HAS_LRW + crypto_unregister_alg(&ablk_lrw_alg); ablk_lrw_err: crypto_unregister_alg(&blk_lrw_alg); blk_lrw_err: - crypto_unregister_alg(&ablk_ctr_alg); #endif + crypto_unregister_alg(&ablk_ctr_alg); ablk_ctr_err: crypto_unregister_alg(&ablk_cbc_alg); ablk_cbc_err: @@ -910,6 +1082,10 @@ blk_ecb_err: static void __exit serpent_sse2_exit(void) { +#ifdef HAS_XTS + crypto_unregister_alg(&ablk_xts_alg); + crypto_unregister_alg(&blk_xts_alg); +#endif #ifdef HAS_LRW crypto_unregister_alg(&ablk_lrw_alg); crypto_unregister_alg(&blk_lrw_alg); -- cgit v1.1 From d35643385628d44a5933a0755b01478eb4df5c65 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Wed, 9 Nov 2011 19:44:12 +0200 Subject: crypto: serpent-sse2 - clear CRYPTO_TFM_REQ_MAY_SLEEP in lrw and xts modes LRW/XTS patches for serpent-sse2 forgot to add this. CRYPTO_TFM_REQ_MAY_SLEEP should be cleared as sleeping between kernel_fpu_begin()/kernel_fpu_end() is not allowed. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent_sse2_glue.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 2dffc5a..2f5c304 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -554,6 +554,7 @@ static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, }; int ret; + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; ret = lrw_crypt(desc, dst, src, nbytes, &req); serpent_fpu_end(crypt_ctx.fpu_enabled); @@ -579,6 +580,7 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, }; int ret; + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; ret = lrw_crypt(desc, dst, src, nbytes, &req); serpent_fpu_end(crypt_ctx.fpu_enabled); @@ -671,6 +673,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, }; int ret; + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; ret = xts_crypt(desc, dst, src, nbytes, &req); serpent_fpu_end(crypt_ctx.fpu_enabled); @@ -697,6 +700,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, }; int ret; + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; ret = xts_crypt(desc, dst, src, nbytes, &req); serpent_fpu_end(crypt_ctx.fpu_enabled); -- cgit v1.1 From d88e4cb67197d007fb778d62fe17360e970d5bfa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Nov 2011 12:32:25 -0800 Subject: freezer: remove now unused TIF_FREEZE Signed-off-by: Tejun Heo Cc: Arnd Bergmann Cc: linux-arch@vger.kernel.org --- arch/x86/include/asm/thread_info.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a1fe5c1..32125af 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -90,7 +90,6 @@ struct thread_info { #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ -#define TIF_FREEZE 23 /* is freezing for suspend */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ @@ -112,7 +111,6 @@ struct thread_info { #define _TIF_FORK (1 << TIF_FORK) #define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) -#define _TIF_FREEZE (1 << TIF_FREEZE) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) -- cgit v1.1 From 80df46494846e857399618c54df30ce294dc1edd Mon Sep 17 00:00:00 2001 From: Maxim Uvarov Date: Fri, 14 Oct 2011 15:36:51 -0700 Subject: xen: Make XEN_MAX_DOMAIN_MEMORY have more sensible defaults Which is that 128GB is not going to happen with 32-bit PV DomU. Lets use something more realistic. Also update the 64-bit to 500GB which is the max a PV guest can do. Signed-off-by: Maxim Uvarov [v1: Updated 128GB->500GB for 64-bit] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Kconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 26c731a..fdce49c 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -29,7 +29,8 @@ config XEN_PVHVM config XEN_MAX_DOMAIN_MEMORY int - default 128 + default 500 if X86_64 + default 64 if X86_32 depends on XEN help This only affects the sizing of some bss arrays, the unused @@ -48,3 +49,4 @@ config XEN_DEBUG_FS help Enable statistics output and various tuning options in debugfs. Enabling this option may incur a significant performance overhead. + -- cgit v1.1 From 0f9f5a9588468cddeccc9146b86798492c7cd4f5 Mon Sep 17 00:00:00 2001 From: Annie Li Date: Tue, 22 Nov 2011 09:58:06 +0800 Subject: xen/granttable: Introducing grant table V2 stucture This patch introduces new structures of grant table V2, grant table V2 is an extension from V1. Grant table is shared between guest and Xen, and Xen is responsible to do corresponding work for grant operations, such as: figure out guest's grant table version, perform different actions based on different grant table version, etc. Although full-page structure of V2 is different from V1, it play the same role as V1. Acked-by: Ian Campbell Signed-off-by: Annie Li Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/grant-table.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 6bbfd7a..c6ab2e7 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -64,10 +64,10 @@ static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, - struct grant_entry **__shared) + void **__shared) { int rc; - struct grant_entry *shared = *__shared; + void *shared = *__shared; if (shared == NULL) { struct vm_struct *area = @@ -83,8 +83,7 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, return rc; } -void arch_gnttab_unmap_shared(struct grant_entry *shared, - unsigned long nr_gframes) +void arch_gnttab_unmap_shared(void *shared, unsigned long nr_gframes) { apply_to_page_range(&init_mm, (unsigned long)shared, PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); -- cgit v1.1 From 85ff6acb075a484780b3d763fdf41596d8fc0970 Mon Sep 17 00:00:00 2001 From: Annie Li Date: Tue, 22 Nov 2011 09:59:21 +0800 Subject: xen/granttable: Grant tables V2 implementation Receiver-side copying of packets is based on this implementation, it gives better performance and better CPU accounting. It totally supports three types: full-page, sub-page and transitive grants. However this patch does not cover sub-page and transitive grants, it mainly focus on Full-page part and implements grant table V2 interfaces corresponding to what already exists in grant table V1, such as: grant table V2 initialization, mapping, releasing and exported interfaces. Each guest can only supports one type of grant table type, every entry in grant table should be the same version. It is necessary to set V1 or V2 version before initializing the grant table. Grant table exported interfaces of V2 are same with those of V1, Xen is responsible to judge what grant table version guests are using in every grant operation. V2 fulfills the same role of V1, and it is totally backwards compitable with V1. If dom0 support grant table V2, the guests runing on it can run with either V1 or V2. Acked-by: Ian Campbell Signed-off-by: Annie Li [v1: Modified alloc_vm_area call (new parameters), indentation, and cleanpatch warnings] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/grant-table.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index c6ab2e7..65160a8 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -54,6 +54,20 @@ static int map_pte_fn(pte_t *pte, struct page *pmd_page, return 0; } +/* + * This function is used to map shared frames to store grant status. It is + * different from map_pte_fn above, the frames type here is uint64_t. + */ +static int map_pte_fn_status(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + uint64_t **frames = (uint64_t **)data; + + set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL)); + (*frames)++; + return 0; +} + static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) { @@ -83,7 +97,30 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, return rc; } -void arch_gnttab_unmap_shared(void *shared, unsigned long nr_gframes) +int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + grant_status_t **__shared) +{ + int rc; + grant_status_t *shared = *__shared; + + if (shared == NULL) { + /* No need to pass in PTE as we are going to do it + * in apply_to_page_range anyhow. */ + struct vm_struct *area = + alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL); + BUG_ON(area == NULL); + shared = area->addr; + *__shared = shared; + } + + rc = apply_to_page_range(&init_mm, (unsigned long)shared, + PAGE_SIZE * nr_gframes, + map_pte_fn_status, &frames); + return rc; +} + +void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) { apply_to_page_range(&init_mm, (unsigned long)shared, PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); -- cgit v1.1 From 282e5aaba2a0cdfde4d2c2e34bc7438cd6f7a00f Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Thu, 17 Nov 2011 11:41:31 +0100 Subject: x86: Kconfig: drop unknown symbol 'APM_MODULE' There's no Kconfig symbol APM_MODULE, so the check for it will always fail. There's no need to append _MODULE to tristate symbols anyhow, because the config tools will do the right thing automagically. Signed-off-by: Paul Bolle Signed-off-by: Jiri Kosina --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6a47bb2..771a185 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1718,7 +1718,7 @@ source "drivers/sfi/Kconfig" config X86_APM_BOOT def_bool y - depends on APM || APM_MODULE + depends on APM menuconfig APM tristate "APM (Advanced Power Management) BIOS support" -- cgit v1.1 From 4673ca8eb3690832e76371371955a8b02e1f59d4 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 24 Nov 2011 14:54:28 +0200 Subject: lib: move GENERIC_IOMAP to lib/Kconfig define GENERIC_IOMAP in a central location instead of all architectures. This will be helpful for the follow-up patch which makes it select other configs. Code is also a bit shorter this way. Signed-off-by: Michael S. Tsirkin --- arch/x86/Kconfig | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cb9a104..08af645 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -75,6 +75,7 @@ config X86 select HAVE_BPF_JIT if (X86_64 && NET) select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG + select GENERIC_IOMAP config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) @@ -140,9 +141,6 @@ config NEED_SG_DMA_LENGTH config GENERIC_ISA_DMA def_bool ISA_DMA_API -config GENERIC_IOMAP - def_bool y - config GENERIC_BUG def_bool y depends on BUG -- cgit v1.1 From 37fe6a42b3433b79a159ceb06a94cd1ef00e279d Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Tue, 29 Nov 2011 15:08:29 +0900 Subject: x86: Check stack overflow in detail Currently, only kernel stack is checked for the overflow, which is not sufficient for systems that need a high reliability. To enhance it, it is required to check the IRQ and exception stacks, as well. This patch checks all the stack types and will cause messages of stacks in detail when free stack space drops below a certain limit except user stack. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Randy Dunlap Link: http://lkml.kernel.org/r/20111129060829.11076.51733.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar Cc: "H. Peter Anvin" --- arch/x86/Kconfig.debug | 7 +++++-- arch/x86/kernel/irq_64.c | 29 +++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index bf56e17..4caec12 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -63,8 +63,11 @@ config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL ---help--- - This option will cause messages to be printed if free stack space - drops below a certain limit. + Say Y here if you want to check the overflows of kernel, IRQ + and exception stacks. This option will cause messages of the + stacks in detail when free stack space drops below a certain + limit. + If in doubt, say "N". config X86_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 69bca46..928a7e9 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -36,18 +36,35 @@ EXPORT_PER_CPU_SYMBOL(irq_regs); static inline void stack_overflow_check(struct pt_regs *regs) { #ifdef CONFIG_DEBUG_STACKOVERFLOW + struct orig_ist *oist; + u64 irq_stack_top, irq_stack_bottom; + u64 estack_top, estack_bottom; u64 curbase = (u64)task_stack_page(current); if (user_mode_vm(regs)) return; - WARN_ONCE(regs->sp >= curbase && - regs->sp <= curbase + THREAD_SIZE && - regs->sp < curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + 128, + if (regs->sp >= curbase && + regs->sp <= curbase + THREAD_SIZE && + regs->sp >= curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128) + return; + + irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack); + irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr); + if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) + return; + + oist = &__get_cpu_var(orig_ist); + estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ; + estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; + if (regs->sp >= estack_top && regs->sp <= estack_bottom) + return; - "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", - current->comm, curbase, regs->sp); + WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", + current->comm, curbase, regs->sp, + irq_stack_top, irq_stack_bottom, + estack_top, estack_bottom); #endif } -- cgit v1.1 From 55af77969fbd7a841838220ea2287432e0da8ae5 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Tue, 29 Nov 2011 15:08:36 +0900 Subject: x86: Panic on detection of stack overflow Currently, messages are just output on the detection of stack overflow, which is not sufficient for systems that need a high reliability. This is because in general the overflow may corrupt data, and the additional corruption may occur due to reading them unless systems stop. This patch adds the sysctl parameter kernel.panic_on_stackoverflow and causes a panic when detecting the overflows of kernel, IRQ and exception stacks except user stack according to the parameter. It is disabled by default. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Randy Dunlap Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 2 ++ arch/x86/kernel/irq_64.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 7209070..e16e99eb 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -43,6 +43,8 @@ static void print_stack_overflow(void) { printk(KERN_WARNING "low stack detected by irq handler\n"); dump_stack(); + if (sysctl_panic_on_stackoverflow) + panic("low stack detected by irq handler - check messages\n"); } #else diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 928a7e9..42552b0 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat); DEFINE_PER_CPU(struct pt_regs *, irq_regs); EXPORT_PER_CPU_SYMBOL(irq_regs); +int sysctl_panic_on_stackoverflow; + /* * Probabilistic stack overflow check: * @@ -65,6 +67,9 @@ static inline void stack_overflow_check(struct pt_regs *regs) current->comm, curbase, regs->sp, irq_stack_top, irq_stack_bottom, estack_top, estack_bottom); + + if (sysctl_panic_on_stackoverflow) + panic("low stack detected by irq handler - check messages\n"); #endif } -- cgit v1.1 From 467e6b7a7c0eb792ebaf322ddb7363742b4ead40 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Tue, 29 Nov 2011 15:08:45 +0900 Subject: x86: Clean up the range of stack overflow checking The overflow checking of kernel stack checks if the stack pointer points to the available kernel stack range, which is derived from the original overflow checking. It is clear that curbase address is always less than low boundary of available kernel stack. So, this patch removes the first condition that checks if the pointer is higher than curbase. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Randy Dunlap Link: http://lkml.kernel.org/r/20111129060845.11076.40916.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar Cc: "H. Peter Anvin" --- arch/x86/kernel/irq_64.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 42552b0..54e2b2b 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -46,10 +46,9 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (user_mode_vm(regs)) return; - if (regs->sp >= curbase && - regs->sp <= curbase + THREAD_SIZE && - regs->sp >= curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + 128) + if (regs->sp >= curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + 128 && + regs->sp <= curbase + THREAD_SIZE) return; irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack); -- cgit v1.1 From 3603a2512f9e69dc87914ba922eb4a0812b21cd6 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 13 Oct 2011 15:14:25 -0400 Subject: x86, reboot: Use NMI instead of REBOOT_VECTOR to stop cpus A recent discussion started talking about the locking on the pstore fs and how it relates to the kmsg infrastructure. We noticed it was possible for userspace to r/w to the pstore fs (grabbing the locks in the process) and block the panic path from r/w to the same fs. The reason was the cpu with the lock could be doing work while the crashing cpu is panic'ing. Busting those spinlocks might cause those cpus to step on each other's data. Fine, fair enough. It was suggested it would be nice to serialize the panic path (ie stop the other cpus) and have only one cpu running. This would allow us to bust the spinlocks and not worry about another cpu stepping on the data. Of course, smp_send_stop() does this in the panic case. kmsg_dump() would have to be moved to be called after it. Easy enough. The only problem is on x86 the smp_send_stop() function calls the REBOOT_VECTOR. Any cpu with irqs disabled (which pstore and its backend ERST would do), block this IPI and thus do not stop. This makes it difficult to reliably log data to the pstore fs. The patch below switches from the REBOOT_VECTOR to NMI (and mimics what kdump does). Switching to NMI allows us to deliver the IPI when irqs are disabled, increasing the reliability of this function. However, Andi carefully noted that on some machines this approach does not work because of broken BIOSes or whatever. To help accomodate this, the next couple of patches will run a selftest and provide a knob to disable. V2: uses atomic ops to serialize the cpu that shuts everyone down V3: comment cleanup Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert Richter Cc: seiji.aguchi@hds.com Cc: vgoyal@redhat.com Cc: mjg@redhat.com Cc: tony.luck@intel.com Cc: gong.chen@intel.com Cc: satoru.moriya@hds.com Cc: avi@redhat.com Cc: Andi Kleen Link: http://lkml.kernel.org/r/1318533267-18880-2-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 16204dc..e72b175 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * Some notes on x86 processor bugs affecting SMP operation: * @@ -148,6 +149,60 @@ void native_send_call_func_ipi(const struct cpumask *mask) free_cpumask_var(allbutself); } +static atomic_t stopping_cpu = ATOMIC_INIT(-1); + +static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) +{ + /* We are registered on stopping cpu too, avoid spurious NMI */ + if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) + return NMI_HANDLED; + + stop_this_cpu(NULL); + + return NMI_HANDLED; +} + +static void native_nmi_stop_other_cpus(int wait) +{ + unsigned long flags; + unsigned long timeout; + + if (reboot_force) + return; + + /* + * Use an own vector here because smp_call_function + * does lots of things not suitable in a panic situation. + */ + if (num_online_cpus() > 1) { + /* did someone beat us here? */ + if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id() != -1)) + return; + + if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, + NMI_FLAG_FIRST, "smp_stop")) + /* Note: we ignore failures here */ + return; + + /* sync above data before sending NMI */ + wmb(); + + apic->send_IPI_allbutself(NMI_VECTOR); + + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. + */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) + udelay(1); + } + + local_irq_save(flags); + disable_local_APIC(); + local_irq_restore(flags); +} + /* * this function calls the 'stop' function on all other CPUs in the system. */ @@ -160,7 +215,7 @@ asmlinkage void smp_reboot_interrupt(void) irq_exit(); } -static void native_stop_other_cpus(int wait) +static void native_irq_stop_other_cpus(int wait) { unsigned long flags; unsigned long timeout; @@ -230,7 +285,7 @@ struct smp_ops smp_ops = { .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .stop_other_cpus = native_stop_other_cpus, + .stop_other_cpus = native_nmi_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, -- cgit v1.1 From 99e8b9ca90d688c3ac7d3a141b701c9694a93925 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 13 Oct 2011 15:14:26 -0400 Subject: x86, NMI: Add NMI IPI selftest The previous patch modified the stop cpus path to use NMI instead of IRQ as the way to communicate to the other cpus to shutdown. There were some concerns that various machines may have problems with using an NMI IPI. This patch creates a selftest to check if NMI is working at boot. The idea is to help catch any issues before the machine panics and we learn the hard way. Loosely based on the locking-selftest.c file, this separate file runs a couple of simple tests and reports the results. The output looks like: ... Brought up 4 CPUs ---------------- | NMI testsuite: -------------------- remote IPI: ok | local IPI: ok | -------------------- Good, all 2 testcases passed! | --------------------------------- Total of 4 processors activated (21330.61 BogoMIPS). ... Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert Richter Cc: seiji.aguchi@hds.com Cc: vgoyal@redhat.com Cc: mjg@redhat.com Cc: tony.luck@intel.com Cc: gong.chen@intel.com Cc: satoru.moriya@hds.com Cc: avi@redhat.com Cc: Andi Kleen Link: http://lkml.kernel.org/r/1318533267-18880-3-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 12 +++ arch/x86/include/asm/smp.h | 6 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/nmi_selftest.c | 179 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/smpboot.c | 1 + 5 files changed, 199 insertions(+) create mode 100644 arch/x86/kernel/nmi_selftest.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 4caec12..97da3c1 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -287,4 +287,16 @@ config DEBUG_STRICT_USER_COPY_CHECKS If unsure, or if you run an older (pre 4.4) gcc, say N. +config DEBUG_NMI_SELFTEST + bool "NMI Selftest" + depends on DEBUG_KERNEL + ---help--- + Enabling this option turns on a quick NMI selftest to verify + that the NMI behaves correctly. + + This might help diagnose strange hangs that rely on NMI to + function properly. + + If unsure, say N. + endmenu diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 73b11bc..0434c40 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -225,5 +225,11 @@ extern int hard_smp_processor_id(void); #endif /* CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_DEBUG_NMI_SELFTEST +extern void nmi_selftest(void); +#else +#define nmi_selftest() do { } while (0) +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_SMP_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8baca3c..02b2f05 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -80,6 +80,7 @@ obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o +obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c new file mode 100644 index 0000000..572adb6 --- /dev/null +++ b/arch/x86/kernel/nmi_selftest.c @@ -0,0 +1,179 @@ +/* + * arch/x86/kernel/nmi-selftest.c + * + * Testsuite for NMI: IPIs + * + * Started by Don Zickus: + * (using lib/locking-selftest.c as a guide) + * + * Copyright (C) 2011 Red Hat, Inc., Don Zickus + */ + +#include +#include +#include + +#include +#include + +#define SUCCESS 0 +#define FAILURE 1 +#define TIMEOUT 2 + +static int nmi_fail; + +/* check to see if NMI IPIs work on this machine */ +static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly; + +static int testcase_total; +static int testcase_successes; +static int expected_testcase_failures; +static int unexpected_testcase_failures; +static int unexpected_testcase_unknowns; + +static int nmi_unk_cb(unsigned int val, struct pt_regs *regs) +{ + unexpected_testcase_unknowns++; + return NMI_HANDLED; +} + +static void init_nmi_testsuite(void) +{ + /* trap all the unknown NMIs we may generate */ + register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); +} + +static void cleanup_nmi_testsuite(void) +{ + unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); +} + +static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) +{ + int cpu = raw_smp_processor_id(); + + if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask))) + return NMI_HANDLED; + + return NMI_DONE; +} + +static void test_nmi_ipi(struct cpumask *mask) +{ + unsigned long timeout; + + if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + NMI_FLAG_FIRST, "nmi_selftest")) { + nmi_fail = FAILURE; + return; + } + + /* sync above data before sending NMI */ + wmb(); + + apic->send_IPI_mask(mask, NMI_VECTOR); + + /* Don't wait longer than a second */ + timeout = USEC_PER_SEC; + while (!cpumask_empty(mask) && timeout--) + udelay(1); + + /* What happens if we timeout, do we still unregister?? */ + unregister_nmi_handler(NMI_LOCAL, "nmi_selftest"); + + if (!timeout) + nmi_fail = TIMEOUT; + return; +} + +static void remote_ipi(void) +{ + cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void local_ipi(void) +{ + cpumask_clear(to_cpumask(nmi_ipi_mask)); + cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void reset_nmi(void) +{ + nmi_fail = 0; +} + +static void dotest(void (*testcase_fn)(void), int expected) +{ + testcase_fn(); + /* + * Filter out expected failures: + */ + if (nmi_fail != expected) { + unexpected_testcase_failures++; + + if (nmi_fail == FAILURE) + printk("FAILED |"); + else if (nmi_fail == TIMEOUT) + printk("TIMEOUT|"); + else + printk("ERROR |"); + dump_stack(); + } else { + testcase_successes++; + printk(" ok |"); + } + testcase_total++; + + reset_nmi(); +} + +static inline void print_testname(const char *testname) +{ + printk("%12s:", testname); +} + +void nmi_selftest(void) +{ + init_nmi_testsuite(); + + /* + * Run the testsuite: + */ + printk("----------------\n"); + printk("| NMI testsuite:\n"); + printk("--------------------\n"); + + print_testname("remote IPI"); + dotest(remote_ipi, SUCCESS); + printk("\n"); + print_testname("local IPI"); + dotest(local_ipi, SUCCESS); + printk("\n"); + + cleanup_nmi_testsuite(); + + if (unexpected_testcase_failures) { + printk("--------------------\n"); + printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", + unexpected_testcase_failures, testcase_total); + printk("-----------------------------------------------------------------\n"); + } else if (expected_testcase_failures && testcase_successes) { + printk("--------------------\n"); + printk("%3d out of %3d testcases failed, as expected. |\n", + expected_testcase_failures, testcase_total); + printk("----------------------------------------------------\n"); + } else if (expected_testcase_failures && !testcase_successes) { + printk("--------------------\n"); + printk("All %3d testcases failed, as expected. |\n", + expected_testcase_failures); + printk("----------------------------------------\n"); + } else { + printk("--------------------\n"); + printk("Good, all %3d testcases passed! |\n", + testcase_successes); + printk("---------------------------------\n"); + } +} diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9f548cb..1927781 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1142,6 +1142,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) { pr_debug("Boot done.\n"); + nmi_selftest(); impress_friends(); #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); -- cgit v1.1 From bda62633983f9db49ce0b1a9235b3709c1cda5f0 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 13 Oct 2011 15:14:27 -0400 Subject: x86, NMI: Add knob to disable using NMI IPIs to stop cpus Some machines may exhibit problems using the NMI to stop other cpus. This knob just allows one to revert back to the original behaviour to help diagnose the problem. V2: make function static Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert Richter Cc: seiji.aguchi@hds.com Cc: vgoyal@redhat.com Cc: mjg@redhat.com Cc: tony.luck@intel.com Cc: gong.chen@intel.com Cc: satoru.moriya@hds.com Cc: avi@redhat.com Cc: Andi Kleen Link: http://lkml.kernel.org/r/1318533267-18880-4-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index e72b175..113acda 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -249,6 +249,11 @@ static void native_irq_stop_other_cpus(int wait) local_irq_restore(flags); } +static void native_smp_disable_nmi_ipi(void) +{ + smp_ops.stop_other_cpus = native_irq_stop_other_cpus; +} + /* * Reschedule call back. */ @@ -280,6 +285,14 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) irq_exit(); } +static int __init nonmi_ipi_setup(char *str) +{ + native_smp_disable_nmi_ipi(); + return 1; +} + +__setup("nonmi_ipi", nonmi_ipi_setup); + struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, -- cgit v1.1 From 53b5650273fea486ac8ac6c1d1e9a6cd17aa31ca Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 5 Dec 2011 12:25:44 +0100 Subject: x86: Fix the 32-bit stackoverflow-debug build The panic_on_stackoverflow variable needs to be avilable on the 32-bit side as well ... Cc: Mitsuo Hayasaka Cc: Randy Dunlap Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/20111129060836.11076.12323.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index e16e99eb..40fc861 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -28,6 +28,9 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs); EXPORT_PER_CPU_SYMBOL(irq_regs); #ifdef CONFIG_DEBUG_STACKOVERFLOW + +int sysctl_panic_on_stackoverflow __read_mostly; + /* Debugging check for stack overflow: is there less than 1KB free? */ static int check_stack_overflow(void) { -- cgit v1.1 From 54b0264ec8c6e90f0413ad30e2f91c65e7844613 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 16 Nov 2011 18:17:40 +0000 Subject: x86/sfi: Kill the IRQ as id hack Nothing should now need it so take it out Signed-off-by: Alan Cox Signed-off-by: Ingo Molnar --- arch/x86/platform/mrst/mrst.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index b1489a0..6a21f60 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -802,8 +802,7 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *entry) if (mrst_has_msic()) return; - /* ID as IRQ is a hack that will go away */ - pdev = platform_device_alloc(entry->name, entry->irq); + pdev = platform_device_alloc(entry->name, 0); if (pdev == NULL) { pr_err("out of memory for SFI platform device '%s'.\n", entry->name); -- cgit v1.1 From 1ea7c6737c8f68453f55c894b3d07d7f48fcbef8 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 10 Nov 2011 13:29:14 +0000 Subject: x86/config: Revamp configuration for MID devices This follows on from the patch applied in 3.2rc1 which creates an INTEL_MID configuration. We can now add the entry for Medfield specific code. After this is merged the final patch will be submitted which moves the rest of the device Kconfig dependancies to MRST/MEDFIELD/INTEL_MID as appropriate. Signed-off-by: Alan Cox Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 17 +++++++++++++++++ arch/x86/Kconfig.debug | 6 +++--- arch/x86/kernel/early_printk.c | 2 +- arch/x86/platform/mrst/Makefile | 2 +- 4 files changed, 22 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cb9a104..9e7a361 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -419,6 +419,23 @@ config X86_MRST nor standard legacy replacement devices/features. e.g. Moorestown does not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. +config X86_MDFLD + bool "Medfield MID platform" + depends on PCI + depends on PCI_GOANY + depends on X86_IO_APIC + select APB_TIMER + select I2C + select SPI + select INTEL_SCU_IPC + select X86_PLATFORM_DEVICES + ---help--- + Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin + Internet Device(MID) platform. + Unlike standard x86 PCs, Medfield does not have many legacy devices + nor standard legacy replacement devices/features. e.g. Medfield does + not contain i8259, i8254, HPET, legacy BIOS, most of the io ports. + endif config X86_RDC321X diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index bf56e17..28c3c73 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -43,9 +43,9 @@ config EARLY_PRINTK with klogd/syslogd or the X server. You should normally N here, unless you want to debug such a crash. -config EARLY_PRINTK_MRST - bool "Early printk for MRST platform support" - depends on EARLY_PRINTK && X86_MRST +config EARLY_PRINTK_INTEL_MID + bool "Early printk for Intel MID platform support" + depends on EARLY_PRINTK && X86_INTEL_MID config EARLY_PRINTK_DBGP bool "Early printk via EHCI debug port" diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index cd28a35..7a53da0 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -240,7 +240,7 @@ static int __init setup_early_printk(char *buf) if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); #endif -#ifdef CONFIG_EARLY_PRINTK_MRST +#ifdef CONFIG_EARLY_PRINTK_INTEL_MID if (!strncmp(buf, "mrst", 4)) { mrst_early_console_init(); early_console_register(&early_mrst_console, keep); diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile index 1ea3877..ddeec73 100644 --- a/arch/x86/platform/mrst/Makefile +++ b/arch/x86/platform/mrst/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_X86_MRST) += mrst.o obj-$(CONFIG_X86_MRST) += vrtc.o -obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o +obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_mrst.o obj-$(CONFIG_X86_MRST) += pmu.o -- cgit v1.1 From 9af0c7a6fa860698d080481f24a342ba74b68982 Mon Sep 17 00:00:00 2001 From: Ludwig Nussel Date: Tue, 15 Nov 2011 14:46:46 -0800 Subject: x86: Fix mmap random address range On x86_32 casting the unsigned int result of get_random_int() to long may result in a negative value. On x86_32 the range of mmap_rnd() therefore was -255 to 255. The 32bit mode on x86_64 used 0 to 255 as intended. The bug was introduced by 675a081 ("x86: unify mmap_{32|64}.c") in January 2008. Signed-off-by: Ludwig Nussel Cc: Linus Torvalds Cc: harvey.harrison@gmail.com Cc: "H. Peter Anvin" Cc: Harvey Harrison Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/201111152246.pAFMklOB028527@wpaz5.hot.corp.google.com Signed-off-by: Ingo Molnar --- arch/x86/mm/mmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 4b5ba85..845df68 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -75,9 +75,9 @@ static unsigned long mmap_rnd(void) */ if (current->flags & PF_RANDOMIZE) { if (mmap_is_ia32()) - rnd = (long)get_random_int() % (1<<8); + rnd = get_random_int() % (1<<8); else - rnd = (long)(get_random_int() % (1<<28)); + rnd = get_random_int() % (1<<28); } return rnd << PAGE_SHIFT; } -- cgit v1.1 From d1bbdd669298b7ca08284ddb29153dfc039dd89d Mon Sep 17 00:00:00 2001 From: Mike Ditto Date: Tue, 15 Nov 2011 14:46:50 -0800 Subject: arch/x86/kernel/e820.c: Eliminate bubble sort from sanitize_e820_map() Replace the bubble sort in sanitize_e820_map() with a call to the generic kernel sort function to avoid pathological performance with large maps. On large (thousands of entries) E820 maps, the previous code took minutes to run; with this change it's now milliseconds. Signed-off-by: Mike Ditto Cc: sassmann@kpanic.de Cc: yuenn@google.com Cc: Stefan Assmann Cc: Nancy Yuen Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 59 ++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 303a0e4..f655f80 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -227,22 +228,38 @@ void __init e820_print_map(char *who) * ____________________33__ * ______________________4_ */ +struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ +}; + +static int __init cpcompare(const void *a, const void *b) +{ + struct change_member * const *app = a, * const *bpp = b; + const struct change_member *ap = *app, *bp = *bpp; + + /* + * Inputs are pointers to two elements of change_point[]. If their + * addresses are unequal, their difference dominates. If the addresses + * are equal, then consider one that represents the end of its region + * to be greater than one that does not. + */ + if (ap->addr != bp->addr) + return ap->addr > bp->addr ? 1 : -1; + + return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr); +} int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map) { - struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ - }; static struct change_member change_point_list[2*E820_X_MAX] __initdata; static struct change_member *change_point[2*E820_X_MAX] __initdata; static struct e820entry *overlap_list[E820_X_MAX] __initdata; static struct e820entry new_bios[E820_X_MAX] __initdata; - struct change_member *change_tmp; unsigned long current_type, last_type; unsigned long long last_addr; - int chgidx, still_changing; + int chgidx; int overlap_entries; int new_bios_entry; int old_nr, new_nr, chg_nr; @@ -279,35 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, chg_nr = chgidx; /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i = 1; i < chg_nr; i++) { - unsigned long long curaddr, lastaddr; - unsigned long long curpbaddr, lastpbaddr; - - curaddr = change_point[i]->addr; - lastaddr = change_point[i - 1]->addr; - curpbaddr = change_point[i]->pbios->addr; - lastpbaddr = change_point[i - 1]->pbios->addr; - - /* - * swap entries, when: - * - * curaddr > lastaddr or - * curaddr == lastaddr and curaddr == curpbaddr and - * lastaddr != lastpbaddr - */ - if (curaddr < lastaddr || - (curaddr == lastaddr && curaddr == curpbaddr && - lastaddr != lastpbaddr)) { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing = 1; - } - } - } + sort(change_point, chg_nr, sizeof *change_point, cpcompare, 0); /* create a new bios memory map, removing overlaps */ overlap_entries = 0; /* number of entries in the overlap table */ -- cgit v1.1 From 706d9a9c8b5758390036b9980a2b12d809599777 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 15 Nov 2011 14:48:56 -0800 Subject: arch/x86/kernel/e820.c: quiet sparse noise about plain integer as NULL pointer The last parameter to sort() is a pointer to the function used to swap items. This parameter should be NULL, not 0, when not used. This quiets the following sparse warning: warning: Using plain integer as NULL pointer Signed-off-by: H Hartley Sweeten Signed-off-by: Andrew Morton Cc: hartleys@visionengravers.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index f655f80..d6bd853 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -296,7 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, chg_nr = chgidx; /* sort change-point list by memory addresses (low -> high) */ - sort(change_point, chg_nr, sizeof *change_point, cpcompare, 0); + sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL); /* create a new bios memory map, removing overlaps */ overlap_entries = 0; /* number of entries in the overlap table */ -- cgit v1.1 From b565201cf75210614903ef2ae5917b4379681647 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 15 Nov 2011 15:33:56 -0800 Subject: x86: Reduce clock calibration time during slave cpu startup Reduce the startup time for slave cpus. Adds hooks for an arch-specific function for clock calibration. These hooks are used on x86. If a newly started cpu has the same phys_proc_id as a core already active, uses the TSC for the delay loop and has a CONSTANT_TSC, use the already-calculated value of loops_per_jiffy. This patch reduces the time required to start slave cpus on a 4096 cpu system from: 465 sec OLD 62 sec NEW This reduces boot time on a 4096p system by almost 7 minutes. Nice... Signed-off-by: Jack Steiner Cc: "H. Peter Anvin" Cc: John Stultz [fix CONFIG_SMP=n build] Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 16 +++++++++++----- arch/x86/kernel/tsc.c | 20 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9f548cb..00eef55 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -207,23 +207,29 @@ static void __cpuinit smp_callin(void) * Need to setup vector mappings before we enable interrupts. */ setup_vector_irq(smp_processor_id()); + + /* + * Save our processor parameters. Note: this information + * is needed for clock calibration. + */ + smp_store_cpu_info(cpuid); + /* * Get our bogomips. + * Update loops_per_jiffy in cpu_data. Previous call to + * smp_store_cpu_info() stored a value that is close but not as + * accurate as the value just calculated. * * Need to enable IRQs because it can take longer and then * the NMI watchdog might kill us. */ local_irq_enable(); calibrate_delay(); + cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; local_irq_disable(); pr_debug("Stack at about %p\n", &cpuid); /* - * Save our processor parameters - */ - smp_store_cpu_info(cpuid); - - /* * This must be done before setting cpu_online_mask * or calling notify_cpu_starting. */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index db48336..490fb33 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -995,3 +995,23 @@ void __init tsc_init(void) check_system_tsc_reliable(); } +#ifdef CONFIG_SMP +/* + * If we have a constant TSC and are using the TSC for the delay loop, + * we can skip clock calibration if another cpu in the same socket has already + * been calibrated. This assumes that CONSTANT_TSC applies to all + * cpus in the socket - this should be a safe assumption. + */ +unsigned long __cpuinit calibrate_delay_is_known(void) +{ + int i, cpu = smp_processor_id(); + + if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) + return 0; + + for_each_online_cpu(i) + if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id) + return cpu_data(i).loops_per_jiffy; + return 0; +} +#endif -- cgit v1.1 From f9b15df466ba923a5832c9121ad8327ccf5483ef Mon Sep 17 00:00:00 2001 From: Alessandro Rubini Date: Sat, 29 Oct 2011 00:48:42 +0200 Subject: x86/Kconfig: Cyclone-timer depends on x86-summit CONFIG_X86_CYCLONE_TIMER depends on CONFIG_X86_32_NON_STANDARD, which forces drivers/clocksource/cyclone.c to be compiled. The file doesn't do anything unless enabled by arch/x86/kernel/apic/summit_32.c Make CONFIG_X86_CYCLONE_TIMER depend by X86_SUMMIT instead, to avoid unnecessary code in other non-standard systems. Signed-off-by: Alessandro Rubini Cc: john stultz Link: http://lkml.kernel.org/r/20111028224842.GA7582@mail.gnudd.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9e7a361..faf39a0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -633,7 +633,7 @@ config X86_SUMMIT_NUMA config X86_CYCLONE_TIMER def_bool y - depends on X86_32_NON_STANDARD + depends on X86_SUMMIT source "arch/x86/Kconfig.cpu" -- cgit v1.1 From 45db1c6176c8171d9ae6fa6d82e07d115a5950ca Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 5 Dec 2011 16:08:49 -0800 Subject: x86, um: Use the same style generated syscall tables as native Now when the native kernel uses a single style of generated system call table, follow suite for UML and implement the same style, all in C. This requires __NR_syscall_max and NR_syscalls to be generated; on native this is done in asm-headers.h but that file is common to all UML architectures; therefore put it in user-headers.h instead which already have accommodations for architecture-specific values. Signed-off-by: H. Peter Anvin --- arch/x86/um/Makefile | 3 ++- arch/x86/um/sys_call_table_32.S | 26 ------------------- arch/x86/um/sys_call_table_32.c | 55 +++++++++++++++++++++++++++++++++++++++++ arch/x86/um/sys_call_table_64.c | 31 +++++++++-------------- arch/x86/um/user-offsets.c | 15 +++++++++++ 5 files changed, 84 insertions(+), 46 deletions(-) delete mode 100644 arch/x86/um/sys_call_table_32.S create mode 100644 arch/x86/um/sys_call_table_32.c (limited to 'arch/x86') diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 8fb5840..5d065b2 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -37,7 +37,8 @@ subarch-$(CONFIG_MODULES) += ../kernel/module.o USER_OBJS := bugs_$(BITS).o ptrace_user.o fault.o extra-y += user-offsets.s -$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) +$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) \ + -Iarch/x86/include/generated UNPROFILE_OBJS := stub_segv.o CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING) diff --git a/arch/x86/um/sys_call_table_32.S b/arch/x86/um/sys_call_table_32.S deleted file mode 100644 index a7ca80d..0000000 --- a/arch/x86/um/sys_call_table_32.S +++ /dev/null @@ -1,26 +0,0 @@ -#include -/* Steal i386 syscall table for our purposes, but with some slight changes.*/ - -#define sys_iopl sys_ni_syscall -#define sys_ioperm sys_ni_syscall - -#define sys_vm86old sys_ni_syscall -#define sys_vm86 sys_ni_syscall - -#define old_mmap sys_old_mmap - -#define ptregs_fork sys_fork -#define ptregs_execve sys_execve -#define ptregs_iopl sys_iopl -#define ptregs_vm86old sys_vm86old -#define ptregs_clone sys_clone -#define ptregs_vm86 sys_vm86 -#define ptregs_sigaltstack sys_sigaltstack -#define ptregs_vfork sys_vfork - -.section .rodata,"a" - -#include "../kernel/syscall_table_32.S" - -ENTRY(syscall_table_size) -.long .-sys_call_table diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c new file mode 100644 index 0000000..b897fca --- /dev/null +++ b/arch/x86/um/sys_call_table_32.c @@ -0,0 +1,55 @@ +/* + * System call table for UML/i386, copied from arch/x86/kernel/syscall_*.c + * with some changes for UML. + */ + +#include +#include +#include +#include + +#define __NO_STUBS + +/* + * Below you can see, in terms of #define's, the differences between the x86-64 + * and the UML syscall table. + */ + +/* Not going to be implemented by UML, since we have no hardware. */ +#define stub_iopl sys_ni_syscall +#define sys_ioperm sys_ni_syscall + +#define sys_vm86old sys_ni_syscall +#define sys_vm86 sys_ni_syscall + +#define old_mmap sys_old_mmap + +#define ptregs_fork sys_fork +#define ptregs_execve sys_execve +#define ptregs_iopl sys_iopl +#define ptregs_vm86old sys_vm86old +#define ptregs_clone sys_clone +#define ptregs_vm86 sys_vm86 +#define ptregs_sigaltstack sys_sigaltstack +#define ptregs_vfork sys_vfork + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#include + +#undef __SYSCALL_I386 +#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, + +typedef void (*sys_call_ptr_t)(void); + +extern void sys_ni_syscall(void); + +sys_call_ptr_t sys_call_table[] __cacheline_aligned = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include +}; + +int syscall_table_size = sizeof(sys_call_table); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 99522f7..797a639 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -1,11 +1,12 @@ /* - * System call table for UML/x86-64, copied from arch/x86_64/kernel/syscall.c + * System call table for UML/x86-64, copied from arch/x86/kernel/syscall_*.c * with some changes for UML. */ #include #include #include +#include #define __NO_STUBS @@ -34,31 +35,23 @@ #define stub_sigaltstack sys_sigaltstack #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_UNISTD_64_H -#include "../../x86/include/asm/unistd_64.h" +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include -#undef __SYSCALL -#define __SYSCALL(nr, sym) [ nr ] = sym, -#undef _ASM_X86_UNISTD_64_H +#undef __SYSCALL_64 +#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, typedef void (*sys_call_ptr_t)(void); extern void sys_ni_syscall(void); -/* - * We used to have a trick here which made sure that holes in the - * x86_64 table were filled in with sys_ni_syscall, but a comment in - * unistd_64.h says that holes aren't allowed, so the trick was - * removed. - * The trick looked like this - * [0 ... UM_NR_syscall_max] = &sys_ni_syscall - * before including unistd_64.h - the later initializations overwrote - * the sys_ni_syscall filler. - */ - sys_call_ptr_t sys_call_table[] __cacheline_aligned = { -#include + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include }; int syscall_table_size = sizeof(sys_call_table); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index ca49be8..5edf4f4 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -8,6 +8,18 @@ #include #include +#ifdef __i386__ +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls[] = { +#include +}; +#else +#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +static char syscalls[] = { +#include +}; +#endif + #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -77,4 +89,7 @@ void foo(void) DEFINE(UM_PROT_READ, PROT_READ); DEFINE(UM_PROT_WRITE, PROT_WRITE); DEFINE(UM_PROT_EXEC, PROT_EXEC); + + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + DEFINE(NR_syscalls, sizeof(syscalls)); } -- cgit v1.1 From a074335a370eca6d72f2ec890e4ae22923a2aea4 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 5 Dec 2011 22:48:49 -0800 Subject: x86, um: Mark system call tables readonly Mark the system call tables readonly, as they already are on native, and the 32-bit UM version was in the previous assembly version. The 32-bit version lost it due to copy and paste from the 64-bit version, which was missing the const. Cc: Jeff Dike Cc: Richard Weinberger Link: http://lkml.kernel.org/r/tip-45db1c6176c8171d9ae6fa6d82e07d115a5950ca@git.kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/um/sys_call_table_32.c | 2 +- arch/x86/um/sys_call_table_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index b897fca..0606aa3 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -43,7 +43,7 @@ typedef void (*sys_call_ptr_t)(void); extern void sys_ni_syscall(void); -sys_call_ptr_t sys_call_table[] __cacheline_aligned = { +const sys_call_ptr_t sys_call_table[] __cacheline_aligned = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 797a639..fe626c3 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -45,7 +45,7 @@ typedef void (*sys_call_ptr_t)(void); extern void sys_ni_syscall(void); -sys_call_ptr_t sys_call_table[] __cacheline_aligned = { +const sys_call_ptr_t sys_call_table[] __cacheline_aligned = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. -- cgit v1.1 From 855c743a27bb58a9a521bdc485ef5acfdb69badc Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 6 Dec 2011 09:08:34 +0100 Subject: x86/mm: Initialize high mem before free_all_bootmem() Patch fixes a boot crash with pagealloc debugging enabled: Initializing HighMem for node 0 (000377fe:0003fff0) BUG: unable to handle kernel paging request at f6fefe80 IP: [] find_range_array+0x5e/0x69 [...] Call Trace: [] __get_free_all_memory_range+0x39/0xb4 [] add_highpages_with_active_regions+0x18/0x9b [] set_highmem_pages_init+0x70/0x90 [] mem_init+0x50/0x21b [] start_kernel+0x1bf/0x31c [] i386_start_kernel+0x65/0x67 The crash happens when memblock wants to allocate big area for temporary "struct range" array and reuses pages from top of low memory, which were already passed to the buddy allocator. Reported-by: Ingo Molnar Signed-off-by: Stanislaw Gruszka Cc: linux-mm@kvack.org Cc: Mel Gorman Link: http://lkml.kernel.org/r/20111206080833.GB3105@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3bebaed..a2fecb1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -744,6 +744,17 @@ void __init mem_init(void) #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif + /* + * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to + * be done before free_all_bootmem(). Memblock use free low memory for + * temporary data (see find_range_array()) and for this purpose can use + * pages that was already passed to the buddy allocator, hence marked as + * not accessible in the page tables when compiled with + * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not + * important here. + */ + set_highmem_pages_init(); + /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); @@ -755,8 +766,6 @@ void __init mem_init(void) if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) reservedpages++; - set_highmem_pages_init(); - codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; -- cgit v1.1 From 54c29c635ae91f5d75ced7bffeaa77ba37ca02bb Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 29 Nov 2011 17:05:11 +0100 Subject: mm, x86: Remove debug_pagealloc_enabled When (no)bootmem finish operation, it pass pages to buddy allocator. Since debug_pagealloc_enabled is not set, we will do not protect pages, what is not what we want with CONFIG_DEBUG_PAGEALLOC=y. To fix remove debug_pagealloc_enabled. That variable was introduced by commit 12d6f21e "x86: do not PSE on CONFIG_DEBUG_PAGEALLOC=y" to get more CPA (change page attribude) code testing. But currently we have CONFIG_CPA_DEBUG, which test CPA. Signed-off-by: Stanislaw Gruszka Acked-by: Mel Gorman Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1322582711-14571-1-git-send-email-sgruszka@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f9e5267..5031eef 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1334,12 +1334,6 @@ void kernel_map_pages(struct page *page, int numpages, int enable) } /* - * If page allocator is not up yet then do not call c_p_a(): - */ - if (!debug_pagealloc_enabled) - return; - - /* * The return value is ignored as the calls cannot fail. * Large pages for identity mappings are not used at boot time * and hence no memory allocations during large page split. -- cgit v1.1 From 565cbc3e934f221369a656b4469a044aa4c3f2a8 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 6 Dec 2011 13:08:59 -0500 Subject: x86, NMI: NMI-selftest should handle the UP case properly If no remote cpus are online, then just quietly skip the remote IPI test for now. Signed-off-by: Don Zickus Cc: andi@firstfloor.org Cc: torvalds@linux-foundation.org Cc: peterz@infradead.org Cc: robert.richter@amd.com Link: http://lkml.kernel.org/r/20111206180859.GR1669@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi_selftest.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 572adb6..1e42a23 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -90,7 +90,8 @@ static void remote_ipi(void) { cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); - test_nmi_ipi(to_cpumask(nmi_ipi_mask)); + if (!cpumask_empty(nmi_ipi_mask)) + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); } static void local_ipi(void) -- cgit v1.1 From d2db6610219cbcadceea6c43ee03d89068b7d759 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Wed, 7 Dec 2011 17:29:10 +0900 Subject: x86: Add stack top margin for stack overflow checking It seems that a margin for stack overflow checking is added to top of a kernel stack but is not added to IRQ and exception stacks in stack_overflow_check(). Therefore, the overflows of IRQ and exception stacks are always detected only after they actually occurred and data corruption might occur due to them. This patch adds the margin to top of IRQ and exception stacks as well as a kernel stack to enhance reliability. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20111207082910.9847.3359.stgit@ltc219.sdl.hitachi.co.jp [ removed the #undef - we typically don't do that for uncommon names ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 54e2b2b..d04d3ecd 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -38,6 +38,7 @@ int sysctl_panic_on_stackoverflow; static inline void stack_overflow_check(struct pt_regs *regs) { #ifdef CONFIG_DEBUG_STACKOVERFLOW +#define STACK_TOP_MARGIN 128 struct orig_ist *oist; u64 irq_stack_top, irq_stack_bottom; u64 estack_top, estack_bottom; @@ -47,17 +48,18 @@ static inline void stack_overflow_check(struct pt_regs *regs) return; if (regs->sp >= curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + 128 && + sizeof(struct pt_regs) + STACK_TOP_MARGIN && regs->sp <= curbase + THREAD_SIZE) return; - irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack); + irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) + + STACK_TOP_MARGIN; irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr); if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) return; oist = &__get_cpu_var(orig_ist); - estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ; + estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; if (regs->sp >= estack_top && regs->sp <= estack_bottom) return; -- cgit v1.1 From 4f941c57fe7e04e38c2401d53516bfd16038c9ab Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 7 Dec 2011 16:06:30 -0500 Subject: x86, NMI: NMI selftest depends on the local apic The selftest doesn't work with out a local apic for now. Reported-by: Randy Durlap Signed-off-by: Don Zickus Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/20111207210630.GI1669@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 97da3c1..aa4158f 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -289,7 +289,7 @@ config DEBUG_STRICT_USER_COPY_CHECKS config DEBUG_NMI_SELFTEST bool "NMI Selftest" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && X86_LOCAL_APIC ---help--- Enabling this option turns on a quick NMI selftest to verify that the NMI behaves correctly. -- cgit v1.1 From 3d6240b53e34e372be007e08f7066e7625910675 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 7 Dec 2011 14:06:12 +0300 Subject: x86, NMI: Add to_cpumask() to silence compile warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gcc complains if we don't cast this to a struct cpumask pointer. arch/x86/kernel/nmi_selftest.c:93:2: warning: passing argument 1 of ‘cpumask_empty’ from incompatible pointer type [enabled by default] Signed-off-by: Dan Carpenter Cc: Don Zickus Link: http://lkml.kernel.org/r/20111207110612.GA3437@mwanda Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 1e42a23..0d01a8e 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -90,7 +90,7 @@ static void remote_ipi(void) { cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); - if (!cpumask_empty(nmi_ipi_mask)) + if (!cpumask_empty(to_cpumask(nmi_ipi_mask))) test_nmi_ipi(to_cpumask(nmi_ipi_mask)); } -- cgit v1.1 From 54eed6cb16ec315565aaaf8e34252ca253a68b7b Mon Sep 17 00:00:00 2001 From: Petr Holasek Date: Thu, 8 Dec 2011 13:16:41 +0100 Subject: x86/numa: Add constraints check for nid parameters This patch adds constraint checks to the numa_set_distance() function. When the check triggers (this should not happen normally) it emits a warning and avoids a store to a negative index in numa_distance[] array - i.e. avoids memory corruption. Negative ids can be passed when the pxm-to-nids mapping is not properly filled while parsing the SRAT. Signed-off-by: Petr Holasek Acked-by: David Rientjes Cc: Anton Arapov Link: http://lkml.kernel.org/r/20111208121640.GA2229@dhcp-27-244.brq.redhat.com Signed-off-by: Ingo Molnar --- arch/x86/mm/numa.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index fbeaaf4..cdc0054 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -430,8 +430,9 @@ static int __init numa_alloc_distance(void) * calls are ignored until the distance table is reset with * numa_reset_distance(). * - * If @from or @to is higher than the highest known node at the time of - * table creation or @distance doesn't make sense, the call is ignored. + * If @from or @to is higher than the highest known node or lower than zero + * at the time of table creation or @distance doesn't make sense, the call + * is ignored. * This is to allow simplification of specific NUMA config implementations. */ void __init numa_set_distance(int from, int to, int distance) @@ -439,8 +440,9 @@ void __init numa_set_distance(int from, int to, int distance) if (!numa_distance && numa_alloc_distance() < 0) return; - if (from >= numa_distance_cnt || to >= numa_distance_cnt) { - printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", + if (from >= numa_distance_cnt || to >= numa_distance_cnt || + from < 0 || to < 0) { + pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n", from, to, distance); return; } -- cgit v1.1 From 819a693b5a503788a7af54a3d95c4857780a7230 Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Thu, 8 Dec 2011 12:00:27 +0800 Subject: typo fixes: aera -> area, exntension -> extension One printk and one comment typo fix. Signed-off-by: Wang YanQing Signed-off-by: Jiri Kosina --- arch/x86/pci/pcbios.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index db0e9a5..da8fe05 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -44,7 +44,7 @@ static inline void set_bios_x(void) pcibios_enabled = 1; set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT); if (__supported_pte_mask & _PAGE_NX) - printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. Use pci=nobios if you want it NX.\n"); + printk(KERN_INFO "PCI : PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n"); } /* -- cgit v1.1 From 47db9e7c808a45b1f86971f25eca5e38fa95ab86 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 9 Dec 2011 11:13:59 -0800 Subject: x86, um: Fix typo in 32-bit system call modifications We override sys_iopl(), not stub_iopl(); the latter is a 64-bitism that doesn't apply to i386 in the first place. Reported-by: Richard Weinberger Signed-off-by: H. Peter Anvin --- arch/x86/um/sys_call_table_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 0606aa3..416bd40 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -16,7 +16,7 @@ */ /* Not going to be implemented by UML, since we have no hardware. */ -#define stub_iopl sys_ni_syscall +#define sys_iopl sys_ni_syscall #define sys_ioperm sys_ni_syscall #define sys_vm86old sys_ni_syscall -- cgit v1.1 From 8af21e7e71d1ac56d9b66fb787a14fd66af7f5f7 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Sat, 27 Aug 2011 09:35:45 +0100 Subject: x86: Add missing bzImage fields to struct setup_header commit 37ba7ab5e33c ("x86, boot: make kernel_alignment adjustable; new bzImage fields") introduced some new fields into the bzImage header but struct setup_header was not updated accordingly. Add the missing 'pref_address' and 'init_size' fields. Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1318848017-12301-1-git-send-email-matt@console-pimps.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/bootparam.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index e020d88..2f90c51 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -64,6 +64,8 @@ struct setup_header { __u32 payload_offset; __u32 payload_length; __u64 setup_data; + __u64 pref_address; + __u32 init_size; } __attribute__((packed)); struct sys_desc_table { -- cgit v1.1 From f7d7d01be53cb47e0ae212c4e968aa28b82d2138 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 15 Nov 2011 12:56:14 +0000 Subject: x86: Don't use magic strings for EFI loader signature Introduce a symbol, EFI_LOADER_SIGNATURE instead of using the magic strings, which also helps to reduce the amount of ifdeffery. Cc: Matthew Garrett Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1318848017-12301-1-git-send-email-matt@console-pimps.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/efi.h | 4 ++++ arch/x86/kernel/setup.c | 7 +------ 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index b8d8bfc..26d8c18 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -3,6 +3,8 @@ #ifdef CONFIG_X86_32 +#define EFI_LOADER_SIGNATURE "EL32" + extern unsigned long asmlinkage efi_call_phys(void *, ...); #define efi_call_phys0(f) efi_call_phys(f) @@ -35,6 +37,8 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #else /* !CONFIG_X86_32 */ +#define EFI_LOADER_SIGNATURE "EL64" + extern u64 efi_call0(void *fp); extern u64 efi_call1(void *fp, u64 arg1); extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9a9e40f..4d5243c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -752,12 +752,7 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, -#ifdef CONFIG_X86_32 - "EL32", -#else - "EL64", -#endif - 4)) { + EFI_LOADER_SIGNATURE, 4)) { efi_enabled = 1; efi_memblock_x86_reserve_range(); } -- cgit v1.1 From 291f36325f9f252bd76ef5f603995f37e453fc60 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 12 Dec 2011 21:27:52 +0000 Subject: x86, efi: EFI boot stub support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is currently a large divide between kernel development and the development of EFI boot loaders. The idea behind this patch is to give the kernel developers full control over the EFI boot process. As H. Peter Anvin put it, "The 'kernel carries its own stub' approach been very successful in dealing with BIOS, and would make a lot of sense to me for EFI as well." This patch introduces an EFI boot stub that allows an x86 bzImage to be loaded and executed by EFI firmware. The bzImage appears to the firmware as an EFI application. Luckily there are enough free bits within the bzImage header so that it can masquerade as an EFI application, thereby coercing the EFI firmware into loading it and jumping to its entry point. The beauty of this masquerading approach is that both BIOS and EFI boot loaders can still load and run the same bzImage, thereby allowing a single kernel image to work in any boot environment. The EFI boot stub supports multiple initrds, but they must exist on the same partition as the bzImage. Command-line arguments for the kernel can be appended after the bzImage name when run from the EFI shell, e.g. Shell> bzImage console=ttyS0 root=/dev/sdb initrd=initrd.img v7: - Fix checkpatch warnings. v6: - Try to allocate initrd memory just below hdr->inird_addr_max. v5: - load_options_size is UTF-16, which needs dividing by 2 to convert to the corresponding ASCII size. v4: - Don't read more than image->load_options_size v3: - Fix following warnings when compiling CONFIG_EFI_STUB=n arch/x86/boot/tools/build.c: In function ‘main’: arch/x86/boot/tools/build.c:138:24: warning: unused variable ‘pe_header’ arch/x86/boot/tools/build.c:138:15: warning: unused variable ‘file_sz’ - As reported by Matthew Garrett, some Apple machines have GOPs that don't have hardware attached. We need to weed these out by searching for ones that handle the PCIIO protocol. - Don't allocate memory if no initrds are on cmdline - Don't trust image->load_options_size Maarten Lankhorst noted: - Don't strip first argument when booted from efibootmgr - Don't allocate too much memory for cmdline - Don't update cmdline_size, the kernel considers it read-only - Don't accept '\n' for initrd names v2: - File alignment was too large, was 8192 should be 512. Reported by Maarten Lankhorst on LKML. - Added UGA support for graphics - Use VIDEO_TYPE_EFI instead of hard-coded number. - Move linelength assignment until after we've assigned depth - Dynamically fill out AddressOfEntryPoint in tools/build.c - Don't use magic number for GDT/TSS stuff. Requested by Andi Kleen - The bzImage may need to be relocated as it may have been loaded at a high address address by the firmware. This was required to get my macbook booting because the firmware loaded it at 0x7cxxxxxx, which triggers this error in decompress_kernel(), if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff)) error("Destination address too large"); Cc: Mike Waychison Cc: Matthew Garrett Tested-by: Henrik Rydberg Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1321383097.2657.9.camel@mfleming-mobl1.ger.corp.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 7 + arch/x86/boot/compressed/Makefile | 10 +- arch/x86/boot/compressed/eboot.c | 1014 ++++++++++++++++++++++++++++++++ arch/x86/boot/compressed/eboot.h | 60 ++ arch/x86/boot/compressed/efi_stub_32.S | 86 +++ arch/x86/boot/compressed/efi_stub_64.S | 1 + arch/x86/boot/compressed/head_32.S | 22 + arch/x86/boot/compressed/head_64.S | 20 + arch/x86/boot/compressed/string.c | 9 + arch/x86/boot/header.S | 158 +++++ arch/x86/boot/string.c | 35 ++ arch/x86/boot/tools/build.c | 39 ++ arch/x86/kernel/asm-offsets.c | 2 + 13 files changed, 1462 insertions(+), 1 deletion(-) create mode 100644 arch/x86/boot/compressed/eboot.c create mode 100644 arch/x86/boot/compressed/eboot.h create mode 100644 arch/x86/boot/compressed/efi_stub_32.S create mode 100644 arch/x86/boot/compressed/efi_stub_64.S (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index efb4294..d71b656 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1478,6 +1478,13 @@ config EFI resultant kernel should continue to boot on existing non-EFI platforms. +config EFI_STUB + bool "EFI stub support" + depends on EFI + ---help--- + This kernel feature allows a bzImage to be loaded directly + by EFI firmware without the use of a bootloader. + config SECCOMP def_bool y prompt "Enable seccomp to safely compute untrusted bytecode" diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 09664ef..b123b9a 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -23,7 +23,15 @@ LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy -$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE +VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ + $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \ + $(obj)/piggy.o + +ifeq ($(CONFIG_EFI_STUB), y) + VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o +endif + +$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE $(call if_changed,ld) @: diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c new file mode 100644 index 0000000..4055e63 --- /dev/null +++ b/arch/x86/boot/compressed/eboot.c @@ -0,0 +1,1014 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2011 Intel Corporation; author Matt Fleming + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + * + * ----------------------------------------------------------------------- */ + +#include +#include +#include +#include + +#include "eboot.h" + +static efi_system_table_t *sys_table; + +static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size, + unsigned long *desc_size) +{ + efi_memory_desc_t *m = NULL; + efi_status_t status; + unsigned long key; + u32 desc_version; + + *map_size = sizeof(*m) * 32; +again: + /* + * Add an additional efi_memory_desc_t because we're doing an + * allocation which may be in a new descriptor region. + */ + *map_size += sizeof(*m); + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, *map_size, (void **)&m); + if (status != EFI_SUCCESS) + goto fail; + + status = efi_call_phys5(sys_table->boottime->get_memory_map, map_size, + m, &key, desc_size, &desc_version); + if (status == EFI_BUFFER_TOO_SMALL) { + efi_call_phys1(sys_table->boottime->free_pool, m); + goto again; + } + + if (status != EFI_SUCCESS) + efi_call_phys1(sys_table->boottime->free_pool, m); + +fail: + *map = m; + return status; +} + +/* + * Allocate at the highest possible address that is not above 'max'. + */ +static efi_status_t high_alloc(unsigned long size, unsigned long align, + unsigned long *addr, unsigned long max) +{ + unsigned long map_size, desc_size; + efi_memory_desc_t *map; + efi_status_t status; + unsigned long nr_pages; + u64 max_addr = 0; + int i; + + status = __get_map(&map, &map_size, &desc_size); + if (status != EFI_SUCCESS) + goto fail; + + nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; +again: + for (i = 0; i < map_size / desc_size; i++) { + efi_memory_desc_t *desc; + unsigned long m = (unsigned long)map; + u64 start, end; + + desc = (efi_memory_desc_t *)(m + (i * desc_size)); + if (desc->type != EFI_CONVENTIONAL_MEMORY) + continue; + + if (desc->num_pages < nr_pages) + continue; + + start = desc->phys_addr; + end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT); + + if ((start + size) > end || (start + size) > max) + continue; + + if (end - size > max) + end = max; + + if (round_down(end - size, align) < start) + continue; + + start = round_down(end - size, align); + + /* + * Don't allocate at 0x0. It will confuse code that + * checks pointers against NULL. + */ + if (start == 0x0) + continue; + + if (start > max_addr) + max_addr = start; + } + + if (!max_addr) + status = EFI_NOT_FOUND; + else { + status = efi_call_phys4(sys_table->boottime->allocate_pages, + EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, + nr_pages, &max_addr); + if (status != EFI_SUCCESS) { + max = max_addr; + max_addr = 0; + goto again; + } + + *addr = max_addr; + } + +free_pool: + efi_call_phys1(sys_table->boottime->free_pool, map); + +fail: + return status; +} + +/* + * Allocate at the lowest possible address. + */ +static efi_status_t low_alloc(unsigned long size, unsigned long align, + unsigned long *addr) +{ + unsigned long map_size, desc_size; + efi_memory_desc_t *map; + efi_status_t status; + unsigned long nr_pages; + int i; + + status = __get_map(&map, &map_size, &desc_size); + if (status != EFI_SUCCESS) + goto fail; + + nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; + for (i = 0; i < map_size / desc_size; i++) { + efi_memory_desc_t *desc; + unsigned long m = (unsigned long)map; + u64 start, end; + + desc = (efi_memory_desc_t *)(m + (i * desc_size)); + + if (desc->type != EFI_CONVENTIONAL_MEMORY) + continue; + + if (desc->num_pages < nr_pages) + continue; + + start = desc->phys_addr; + end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT); + + /* + * Don't allocate at 0x0. It will confuse code that + * checks pointers against NULL. Skip the first 8 + * bytes so we start at a nice even number. + */ + if (start == 0x0) + start += 8; + + start = round_up(start, align); + if ((start + size) > end) + continue; + + status = efi_call_phys4(sys_table->boottime->allocate_pages, + EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, + nr_pages, &start); + if (status == EFI_SUCCESS) { + *addr = start; + break; + } + } + + if (i == map_size / desc_size) + status = EFI_NOT_FOUND; + +free_pool: + efi_call_phys1(sys_table->boottime->free_pool, map); +fail: + return status; +} + +static void low_free(unsigned long size, unsigned long addr) +{ + unsigned long nr_pages; + + nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; + efi_call_phys2(sys_table->boottime->free_pages, addr, size); +} + +static void find_bits(unsigned long mask, u8 *pos, u8 *size) +{ + u8 first, len; + + first = 0; + len = 0; + + if (mask) { + while (!(mask & 0x1)) { + mask = mask >> 1; + first++; + } + + while (mask & 0x1) { + mask = mask >> 1; + len++; + } + } + + *pos = first; + *size = len; +} + +/* + * See if we have Graphics Output Protocol + */ +static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto, + unsigned long size) +{ + struct efi_graphics_output_protocol *gop, *first_gop; + struct efi_pixel_bitmask pixel_info; + unsigned long nr_gops; + efi_status_t status; + void **gop_handle; + u16 width, height; + u32 fb_base, fb_size; + u32 pixels_per_scan_line; + int pixel_format; + int i; + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, size, &gop_handle); + if (status != EFI_SUCCESS) + return status; + + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, proto, + NULL, &size, gop_handle); + if (status != EFI_SUCCESS) + goto free_handle; + + first_gop = NULL; + + nr_gops = size / sizeof(void *); + for (i = 0; i < nr_gops; i++) { + struct efi_graphics_output_mode_info *info; + efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; + void *pciio; + void *h = gop_handle[i]; + + status = efi_call_phys3(sys_table->boottime->handle_protocol, + h, proto, &gop); + if (status != EFI_SUCCESS) + continue; + + efi_call_phys3(sys_table->boottime->handle_protocol, + h, &pciio_proto, &pciio); + + status = efi_call_phys4(gop->query_mode, gop, + gop->mode->mode, &size, &info); + if (status == EFI_SUCCESS && (!first_gop || pciio)) { + /* + * Apple provide GOPs that are not backed by + * real hardware (they're used to handle + * multiple displays). The workaround is to + * search for a GOP implementing the PCIIO + * protocol, and if one isn't found, to just + * fallback to the first GOP. + */ + width = info->horizontal_resolution; + height = info->vertical_resolution; + fb_base = gop->mode->frame_buffer_base; + fb_size = gop->mode->frame_buffer_size; + pixel_format = info->pixel_format; + pixel_info = info->pixel_information; + pixels_per_scan_line = info->pixels_per_scan_line; + + /* + * Once we've found a GOP supporting PCIIO, + * don't bother looking any further. + */ + if (pciio) + break; + + first_gop = gop; + } + } + + /* Did we find any GOPs? */ + if (!first_gop) + goto free_handle; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_width = width; + si->lfb_height = height; + si->lfb_base = fb_base; + si->lfb_size = fb_size; + si->pages = 1; + + if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 0; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 16; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) { + si->lfb_depth = 32; + si->lfb_linelength = pixels_per_scan_line * 4; + si->red_size = 8; + si->red_pos = 16; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 0; + si->rsvd_size = 8; + si->rsvd_pos = 24; + } else if (pixel_format == PIXEL_BIT_MASK) { + find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size); + find_bits(pixel_info.green_mask, &si->green_pos, + &si->green_size); + find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size); + find_bits(pixel_info.reserved_mask, &si->rsvd_pos, + &si->rsvd_size); + si->lfb_depth = si->red_size + si->green_size + + si->blue_size + si->rsvd_size; + si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8; + } else { + si->lfb_depth = 4; + si->lfb_linelength = si->lfb_width / 2; + si->red_size = 0; + si->red_pos = 0; + si->green_size = 0; + si->green_pos = 0; + si->blue_size = 0; + si->blue_pos = 0; + si->rsvd_size = 0; + si->rsvd_pos = 0; + } + +free_handle: + efi_call_phys1(sys_table->boottime->free_pool, gop_handle); + return status; +} + +/* + * See if we have Universal Graphics Adapter (UGA) protocol + */ +static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto, + unsigned long size) +{ + struct efi_uga_draw_protocol *uga, *first_uga; + unsigned long nr_ugas; + efi_status_t status; + u32 width, height; + void **uga_handle = NULL; + int i; + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, size, &uga_handle); + if (status != EFI_SUCCESS) + return status; + + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, uga_proto, + NULL, &size, uga_handle); + if (status != EFI_SUCCESS) + goto free_handle; + + first_uga = NULL; + + nr_ugas = size / sizeof(void *); + for (i = 0; i < nr_ugas; i++) { + efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; + void *handle = uga_handle[i]; + u32 w, h, depth, refresh; + void *pciio; + + status = efi_call_phys3(sys_table->boottime->handle_protocol, + handle, uga_proto, &uga); + if (status != EFI_SUCCESS) + continue; + + efi_call_phys3(sys_table->boottime->handle_protocol, + handle, &pciio_proto, &pciio); + + status = efi_call_phys5(uga->get_mode, uga, &w, &h, + &depth, &refresh); + if (status == EFI_SUCCESS && (!first_uga || pciio)) { + width = w; + height = h; + + /* + * Once we've found a UGA supporting PCIIO, + * don't bother looking any further. + */ + if (pciio) + break; + + first_uga = uga; + } + } + + if (!first_uga) + goto free_handle; + + /* EFI framebuffer */ + si->orig_video_isVGA = VIDEO_TYPE_EFI; + + si->lfb_depth = 32; + si->lfb_width = width; + si->lfb_height = height; + + si->red_size = 8; + si->red_pos = 16; + si->green_size = 8; + si->green_pos = 8; + si->blue_size = 8; + si->blue_pos = 0; + si->rsvd_size = 8; + si->rsvd_pos = 24; + + +free_handle: + efi_call_phys1(sys_table->boottime->free_pool, uga_handle); + return status; +} + +void setup_graphics(struct boot_params *boot_params) +{ + efi_guid_t graphics_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID; + struct screen_info *si; + efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; + efi_status_t status; + unsigned long size; + void **gop_handle = NULL; + void **uga_handle = NULL; + + si = &boot_params->screen_info; + memset(si, 0, sizeof(*si)); + + size = 0; + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, &graphics_proto, + NULL, &size, gop_handle); + if (status == EFI_BUFFER_TOO_SMALL) + status = setup_gop(si, &graphics_proto, size); + + if (status != EFI_SUCCESS) { + size = 0; + status = efi_call_phys5(sys_table->boottime->locate_handle, + EFI_LOCATE_BY_PROTOCOL, &uga_proto, + NULL, &size, uga_handle); + if (status == EFI_BUFFER_TOO_SMALL) + setup_uga(si, &uga_proto, size); + } +} + +struct initrd { + efi_file_handle_t *handle; + u64 size; +}; + +/* + * Check the cmdline for a LILO-style initrd= arguments. + * + * We only support loading an initrd from the same filesystem as the + * kernel image. + */ +static efi_status_t handle_ramdisks(efi_loaded_image_t *image, + struct setup_header *hdr) +{ + struct initrd *initrds; + unsigned long initrd_addr; + efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID; + u64 initrd_total; + efi_file_io_interface_t *io; + efi_file_handle_t *fh; + efi_status_t status; + int nr_initrds; + char *str; + int i, j, k; + + initrd_addr = 0; + initrd_total = 0; + + str = (char *)(unsigned long)hdr->cmd_line_ptr; + + j = 0; /* See close_handles */ + + if (!str || !*str) + return EFI_SUCCESS; + + for (nr_initrds = 0; *str; nr_initrds++) { + str = strstr(str, "initrd="); + if (!str) + break; + + str += 7; + + /* Skip any leading slashes */ + while (*str == '/' || *str == '\\') + str++; + + while (*str && *str != ' ' && *str != '\n') + str++; + } + + if (!nr_initrds) + return EFI_SUCCESS; + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, + nr_initrds * sizeof(*initrds), + &initrds); + if (status != EFI_SUCCESS) + goto fail; + + str = (char *)(unsigned long)hdr->cmd_line_ptr; + for (i = 0; i < nr_initrds; i++) { + struct initrd *initrd; + efi_file_handle_t *h; + efi_file_info_t *info; + efi_char16_t filename[256]; + unsigned long info_sz; + efi_guid_t info_guid = EFI_FILE_INFO_ID; + efi_char16_t *p; + u64 file_sz; + + str = strstr(str, "initrd="); + if (!str) + break; + + str += 7; + + initrd = &initrds[i]; + p = filename; + + /* Skip any leading slashes */ + while (*str == '/' || *str == '\\') + str++; + + while (*str && *str != ' ' && *str != '\n') { + if (p >= filename + sizeof(filename)) + break; + + *p++ = *str++; + } + + *p = '\0'; + + /* Only open the volume once. */ + if (!i) { + efi_boot_services_t *boottime; + + boottime = sys_table->boottime; + + status = efi_call_phys3(boottime->handle_protocol, + image->device_handle, &fs_proto, &io); + if (status != EFI_SUCCESS) + goto free_initrds; + + status = efi_call_phys2(io->open_volume, io, &fh); + if (status != EFI_SUCCESS) + goto free_initrds; + } + + status = efi_call_phys5(fh->open, fh, &h, filename, + EFI_FILE_MODE_READ, (u64)0); + if (status != EFI_SUCCESS) + goto close_handles; + + initrd->handle = h; + + info_sz = 0; + status = efi_call_phys4(h->get_info, h, &info_guid, + &info_sz, NULL); + if (status != EFI_BUFFER_TOO_SMALL) + goto close_handles; + +grow: + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, info_sz, &info); + if (status != EFI_SUCCESS) + goto close_handles; + + status = efi_call_phys4(h->get_info, h, &info_guid, + &info_sz, info); + if (status == EFI_BUFFER_TOO_SMALL) { + efi_call_phys1(sys_table->boottime->free_pool, info); + goto grow; + } + + file_sz = info->file_size; + efi_call_phys1(sys_table->boottime->free_pool, info); + + if (status != EFI_SUCCESS) + goto close_handles; + + initrd->size = file_sz; + initrd_total += file_sz; + } + + if (initrd_total) { + unsigned long addr; + + /* + * Multiple initrd's need to be at consecutive + * addresses in memory, so allocate enough memory for + * all the initrd's. + */ + status = high_alloc(initrd_total, 0x1000, + &initrd_addr, hdr->initrd_addr_max); + if (status != EFI_SUCCESS) + goto close_handles; + + /* We've run out of free low memory. */ + if (initrd_addr > hdr->initrd_addr_max) { + status = EFI_INVALID_PARAMETER; + goto free_initrd_total; + } + + addr = initrd_addr; + for (j = 0; j < nr_initrds; j++) { + u64 size; + + size = initrds[j].size; + status = efi_call_phys3(fh->read, initrds[j].handle, + &size, addr); + if (status != EFI_SUCCESS) + goto free_initrd_total; + + efi_call_phys1(fh->close, initrds[j].handle); + + addr += size; + } + + } + + efi_call_phys1(sys_table->boottime->free_pool, initrds); + + hdr->ramdisk_image = initrd_addr; + hdr->ramdisk_size = initrd_total; + + return status; + +free_initrd_total: + low_free(initrd_total, initrd_addr); + +close_handles: + for (k = j; k < nr_initrds; k++) + efi_call_phys1(fh->close, initrds[k].handle); +free_initrds: + efi_call_phys1(sys_table->boottime->free_pool, initrds); +fail: + hdr->ramdisk_image = 0; + hdr->ramdisk_size = 0; + + return status; +} + +/* + * Because the x86 boot code expects to be passed a boot_params we + * need to create one ourselves (usually the bootloader would create + * one for us). + */ +static efi_status_t make_boot_params(struct boot_params *boot_params, + efi_loaded_image_t *image, + void *handle) +{ + struct efi_info *efi = &boot_params->efi_info; + struct apm_bios_info *bi = &boot_params->apm_bios_info; + struct sys_desc_table *sdt = &boot_params->sys_desc_table; + struct e820entry *e820_map = &boot_params->e820_map[0]; + struct e820entry *prev = NULL; + struct setup_header *hdr = &boot_params->hdr; + unsigned long size, key, desc_size, _size; + efi_memory_desc_t *mem_map; + void *options = image->load_options; + u32 load_options_size = image->load_options_size / 2; /* ASCII */ + int options_size = 0; + efi_status_t status; + __u32 desc_version; + unsigned long cmdline; + u8 nr_entries; + u16 *s2; + u8 *s1; + int i; + + hdr->type_of_loader = 0x21; + + /* Convert unicode cmdline to ascii */ + cmdline = 0; + s2 = (u16 *)options; + + if (s2) { + while (*s2 && *s2 != '\n' && options_size < load_options_size) { + s2++; + options_size++; + } + + if (options_size) { + if (options_size > hdr->cmdline_size) + options_size = hdr->cmdline_size; + + options_size++; /* NUL termination */ + + status = low_alloc(options_size, 1, &cmdline); + if (status != EFI_SUCCESS) + goto fail; + + s1 = (u8 *)(unsigned long)cmdline; + s2 = (u16 *)options; + + for (i = 0; i < options_size - 1; i++) + *s1++ = *s2++; + + *s1 = '\0'; + } + } + + hdr->cmd_line_ptr = cmdline; + + hdr->ramdisk_image = 0; + hdr->ramdisk_size = 0; + + status = handle_ramdisks(image, hdr); + if (status != EFI_SUCCESS) + goto free_cmdline; + + setup_graphics(boot_params); + + /* Clear APM BIOS info */ + memset(bi, 0, sizeof(*bi)); + + memset(sdt, 0, sizeof(*sdt)); + + memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32)); + + size = sizeof(*mem_map) * 32; + +again: + size += sizeof(*mem_map); + _size = size; + status = low_alloc(size, 1, (unsigned long *)&mem_map); + if (status != EFI_SUCCESS) + goto free_cmdline; + + status = efi_call_phys5(sys_table->boottime->get_memory_map, &size, + mem_map, &key, &desc_size, &desc_version); + if (status == EFI_BUFFER_TOO_SMALL) { + low_free(_size, (unsigned long)mem_map); + goto again; + } + + if (status != EFI_SUCCESS) + goto free_mem_map; + + efi->efi_systab = (unsigned long)sys_table; + efi->efi_memdesc_size = desc_size; + efi->efi_memdesc_version = desc_version; + efi->efi_memmap = (unsigned long)mem_map; + efi->efi_memmap_size = size; + +#ifdef CONFIG_X86_64 + efi->efi_systab_hi = (unsigned long)sys_table >> 32; + efi->efi_memmap_hi = (unsigned long)mem_map >> 32; +#endif + + /* Might as well exit boot services now */ + status = efi_call_phys2(sys_table->boottime->exit_boot_services, + handle, key); + if (status != EFI_SUCCESS) + goto free_mem_map; + + /* Historic? */ + boot_params->alt_mem_k = 32 * 1024; + + /* + * Convert the EFI memory map to E820. + */ + nr_entries = 0; + for (i = 0; i < size / desc_size; i++) { + efi_memory_desc_t *d; + unsigned int e820_type = 0; + unsigned long m = (unsigned long)mem_map; + + d = (efi_memory_desc_t *)(m + (i * desc_size)); + switch (d->type) { + case EFI_RESERVED_TYPE: + case EFI_RUNTIME_SERVICES_CODE: + case EFI_RUNTIME_SERVICES_DATA: + case EFI_MEMORY_MAPPED_IO: + case EFI_MEMORY_MAPPED_IO_PORT_SPACE: + case EFI_PAL_CODE: + e820_type = E820_RESERVED; + break; + + case EFI_UNUSABLE_MEMORY: + e820_type = E820_UNUSABLE; + break; + + case EFI_ACPI_RECLAIM_MEMORY: + e820_type = E820_ACPI; + break; + + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + e820_type = E820_RAM; + break; + + case EFI_ACPI_MEMORY_NVS: + e820_type = E820_NVS; + break; + + default: + continue; + } + + /* Merge adjacent mappings */ + if (prev && prev->type == e820_type && + (prev->addr + prev->size) == d->phys_addr) + prev->size += d->num_pages << 12; + else { + e820_map->addr = d->phys_addr; + e820_map->size = d->num_pages << 12; + e820_map->type = e820_type; + prev = e820_map++; + nr_entries++; + } + } + + boot_params->e820_entries = nr_entries; + + return EFI_SUCCESS; + +free_mem_map: + low_free(_size, (unsigned long)mem_map); +free_cmdline: + if (options_size) + low_free(options_size, hdr->cmd_line_ptr); +fail: + return status; +} + +/* + * On success we return a pointer to a boot_params structure, and NULL + * on failure. + */ +struct boot_params *efi_main(void *handle, efi_system_table_t *_table) +{ + struct boot_params *boot_params; + unsigned long start, nr_pages; + struct desc_ptr *gdt, *idt; + efi_loaded_image_t *image; + struct setup_header *hdr; + efi_status_t status; + efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID; + struct desc_struct *desc; + + sys_table = _table; + + /* Check if we were booted by the EFI firmware */ + if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + goto fail; + + status = efi_call_phys3(sys_table->boottime->handle_protocol, + handle, &proto, (void *)&image); + if (status != EFI_SUCCESS) + goto fail; + + status = low_alloc(0x4000, 1, (unsigned long *)&boot_params); + if (status != EFI_SUCCESS) + goto fail; + + memset(boot_params, 0x0, 0x4000); + + /* Copy first two sectors to boot_params */ + memcpy(boot_params, image->image_base, 1024); + + hdr = &boot_params->hdr; + + /* + * The EFI firmware loader could have placed the kernel image + * anywhere in memory, but the kernel has various restrictions + * on the max physical address it can run at. Attempt to move + * the kernel to boot_params.pref_address, or as low as + * possible. + */ + start = hdr->pref_address; + nr_pages = round_up(hdr->init_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE; + + status = efi_call_phys4(sys_table->boottime->allocate_pages, + EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA, + nr_pages, &start); + if (status != EFI_SUCCESS) { + status = low_alloc(hdr->init_size, hdr->kernel_alignment, + &start); + if (status != EFI_SUCCESS) + goto fail; + } + + hdr->code32_start = (__u32)start; + hdr->pref_address = (__u64)(unsigned long)image->image_base; + + memcpy((void *)start, image->image_base, image->image_size); + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, sizeof(*gdt), + (void **)&gdt); + if (status != EFI_SUCCESS) + goto fail; + + gdt->size = 0x800; + status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address); + if (status != EFI_SUCCESS) + goto fail; + + status = efi_call_phys3(sys_table->boottime->allocate_pool, + EFI_LOADER_DATA, sizeof(*idt), + (void **)&idt); + if (status != EFI_SUCCESS) + goto fail; + + idt->size = 0; + idt->address = 0; + + status = make_boot_params(boot_params, image, handle); + if (status != EFI_SUCCESS) + goto fail; + + memset((char *)gdt->address, 0x0, gdt->size); + desc = (struct desc_struct *)gdt->address; + + /* The first GDT is a dummy and the second is unused. */ + desc += 2; + + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit = 0xf; + desc->avl = 0; + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; + + desc++; + desc->limit0 = 0xffff; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE; + desc->s = DESC_TYPE_CODE_DATA; + desc->dpl = 0; + desc->p = 1; + desc->limit = 0xf; + desc->avl = 0; + desc->l = 0; + desc->d = SEG_OP_SIZE_32BIT; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; + +#ifdef CONFIG_X86_64 + /* Task segment value */ + desc++; + desc->limit0 = 0x0000; + desc->base0 = 0x0000; + desc->base1 = 0x0000; + desc->type = SEG_TYPE_TSS; + desc->s = 0; + desc->dpl = 0; + desc->p = 1; + desc->limit = 0x0; + desc->avl = 0; + desc->l = 0; + desc->d = 0; + desc->g = SEG_GRANULARITY_4KB; + desc->base2 = 0x00; +#endif /* CONFIG_X86_64 */ + + asm volatile ("lidt %0" : : "m" (*idt)); + asm volatile ("lgdt %0" : : "m" (*gdt)); + + asm volatile("cli"); + + return boot_params; +fail: + return NULL; +} diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h new file mode 100644 index 0000000..f66d023 --- /dev/null +++ b/arch/x86/boot/compressed/eboot.h @@ -0,0 +1,60 @@ +#ifndef BOOT_COMPRESSED_EBOOT_H +#define BOOT_COMPRESSED_EBOOT_H + +#define SEG_TYPE_DATA (0 << 3) +#define SEG_TYPE_READ_WRITE (1 << 1) +#define SEG_TYPE_CODE (1 << 3) +#define SEG_TYPE_EXEC_READ (1 << 1) +#define SEG_TYPE_TSS ((1 << 3) | (1 << 0)) +#define SEG_OP_SIZE_32BIT (1 << 0) +#define SEG_GRANULARITY_4KB (1 << 0) + +#define DESC_TYPE_CODE_DATA (1 << 0) + +#define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT) + +#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 +#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 +#define PIXEL_BIT_MASK 2 +#define PIXEL_BLT_ONLY 3 +#define PIXEL_FORMAT_MAX 4 + +struct efi_pixel_bitmask { + u32 red_mask; + u32 green_mask; + u32 blue_mask; + u32 reserved_mask; +}; + +struct efi_graphics_output_mode_info { + u32 version; + u32 horizontal_resolution; + u32 vertical_resolution; + int pixel_format; + struct efi_pixel_bitmask pixel_information; + u32 pixels_per_scan_line; +} __packed; + +struct efi_graphics_output_protocol_mode { + u32 max_mode; + u32 mode; + unsigned long info; + unsigned long size_of_info; + u64 frame_buffer_base; + unsigned long frame_buffer_size; +} __packed; + +struct efi_graphics_output_protocol { + void *query_mode; + unsigned long set_mode; + unsigned long blt; + struct efi_graphics_output_protocol_mode *mode; +}; + +struct efi_uga_draw_protocol { + void *get_mode; + void *set_mode; + void *blt; +}; + +#endif /* BOOT_COMPRESSED_EBOOT_H */ diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S new file mode 100644 index 0000000..a53440e --- /dev/null +++ b/arch/x86/boot/compressed/efi_stub_32.S @@ -0,0 +1,86 @@ +/* + * EFI call stub for IA32. + * + * This stub allows us to make EFI calls in physical mode with interrupts + * turned off. Note that this implementation is different from the one in + * arch/x86/platform/efi/efi_stub_32.S because we're _already_ in physical + * mode at this point. + */ + +#include +#include + +/* + * efi_call_phys(void *, ...) is a function with variable parameters. + * All the callers of this function assure that all the parameters are 4-bytes. + */ + +/* + * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save. + * So we'd better save all of them at the beginning of this function and restore + * at the end no matter how many we use, because we can not assure EFI runtime + * service functions will comply with gcc calling convention, too. + */ + +.text +ENTRY(efi_call_phys) + /* + * 0. The function can only be called in Linux kernel. So CS has been + * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found + * the values of these registers are the same. And, the corresponding + * GDT entries are identical. So I will do nothing about segment reg + * and GDT, but change GDT base register in prelog and epilog. + */ + + /* + * 1. Because we haven't been relocated by this point we need to + * use relative addressing. + */ + call 1f +1: popl %edx + subl $1b, %edx + + /* + * 2. Now on the top of stack is the return + * address in the caller of efi_call_phys(), then parameter 1, + * parameter 2, ..., param n. To make things easy, we save the return + * address of efi_call_phys in a global variable. + */ + popl %ecx + movl %ecx, saved_return_addr(%edx) + /* get the function pointer into ECX*/ + popl %ecx + movl %ecx, efi_rt_function_ptr(%edx) + + /* + * 3. Call the physical function. + */ + call *%ecx + + /* + * 4. Balance the stack. And because EAX contain the return value, + * we'd better not clobber it. We need to calculate our address + * again because %ecx and %edx are not preserved across EFI function + * calls. + */ + call 1f +1: popl %edx + subl $1b, %edx + + movl efi_rt_function_ptr(%edx), %ecx + pushl %ecx + + /* + * 10. Push the saved return address onto the stack and return. + */ + movl saved_return_addr(%edx), %ecx + pushl %ecx + ret +ENDPROC(efi_call_phys) +.previous + +.data +saved_return_addr: + .long 0 +efi_rt_function_ptr: + .long 0 diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S new file mode 100644 index 0000000..cedc60d --- /dev/null +++ b/arch/x86/boot/compressed/efi_stub_64.S @@ -0,0 +1 @@ +#include "../../platform/efi/efi_stub_64.S" diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 67a655a..a055993 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -32,6 +32,28 @@ __HEAD ENTRY(startup_32) +#ifdef CONFIG_EFI_STUB + /* + * We don't need the return address, so set up the stack so + * efi_main() can find its arugments. + */ + add $0x4, %esp + + call efi_main + cmpl $0, %eax + je preferred_addr + movl %eax, %esi + call 1f +1: + popl %eax + subl $1b, %eax + subl BP_pref_address(%esi), %eax + add BP_code32_start(%esi), %eax + leal preferred_addr(%eax), %eax + jmp *%eax + +preferred_addr: +#endif cld /* * Test KEEP_SEGMENTS flag to see if the bootloader is asking diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 35af09d..558d76c 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -199,6 +199,26 @@ ENTRY(startup_64) * an identity mapped page table being provied that maps our * entire text+data+bss and hopefully all of memory. */ +#ifdef CONFIG_EFI_STUB + pushq %rsi + mov %rcx, %rdi + mov %rdx, %rsi + call efi_main + popq %rsi + cmpq $0,%rax + je preferred_addr + movq %rax,%rsi + call 1f +1: + popq %rax + subq $1b, %rax + subq BP_pref_address(%rsi), %rax + add BP_code32_start(%esi), %eax + leaq preferred_addr(%rax), %rax + jmp *%rax + +preferred_addr: +#endif /* Setup data segments. */ xorl %eax, %eax diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 19b3e69..ffb9c5c 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,2 +1,11 @@ #include "misc.h" + +int memcmp(const void *s1, const void *s2, size_t len) +{ + u8 diff; + asm("repe; cmpsb; setnz %0" + : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); + return diff; +} + #include "../string.c" diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index bdb4d45..f1bbeeb 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -45,6 +45,11 @@ SYSSEG = 0x1000 /* historical load address >> 4 */ .global bootsect_start bootsect_start: +#ifdef CONFIG_EFI_STUB + # "MZ", MS-DOS header + .byte 0x4d + .byte 0x5a +#endif # Normalize the start address ljmp $BOOTSEG, $start2 @@ -79,6 +84,14 @@ bs_die: # invoke the BIOS reset code... ljmp $0xf000,$0xfff0 +#ifdef CONFIG_EFI_STUB + .org 0x3c + # + # Offset to the PE header. + # + .long pe_header +#endif /* CONFIG_EFI_STUB */ + .section ".bsdata", "a" bugger_off_msg: .ascii "Direct booting from floppy is no longer supported.\r\n" @@ -87,6 +100,141 @@ bugger_off_msg: .ascii "Remove disk and press any key to reboot . . .\r\n" .byte 0 +#ifdef CONFIG_EFI_STUB +pe_header: + .ascii "PE" + .word 0 + +coff_header: +#ifdef CONFIG_X86_32 + .word 0x14c # i386 +#else + .word 0x8664 # x86-64 +#endif + .word 2 # nr_sections + .long 0 # TimeDateStamp + .long 0 # PointerToSymbolTable + .long 1 # NumberOfSymbols + .word section_table - optional_header # SizeOfOptionalHeader +#ifdef CONFIG_X86_32 + .word 0x306 # Characteristics. + # IMAGE_FILE_32BIT_MACHINE | + # IMAGE_FILE_DEBUG_STRIPPED | + # IMAGE_FILE_EXECUTABLE_IMAGE | + # IMAGE_FILE_LINE_NUMS_STRIPPED +#else + .word 0x206 # Characteristics + # IMAGE_FILE_DEBUG_STRIPPED | + # IMAGE_FILE_EXECUTABLE_IMAGE | + # IMAGE_FILE_LINE_NUMS_STRIPPED +#endif + +optional_header: +#ifdef CONFIG_X86_32 + .word 0x10b # PE32 format +#else + .word 0x20b # PE32+ format +#endif + .byte 0x02 # MajorLinkerVersion + .byte 0x14 # MinorLinkerVersion + + # Filled in by build.c + .long 0 # SizeOfCode + + .long 0 # SizeOfInitializedData + .long 0 # SizeOfUninitializedData + + # Filled in by build.c + .long 0x0000 # AddressOfEntryPoint + + .long 0x0000 # BaseOfCode +#ifdef CONFIG_X86_32 + .long 0 # data +#endif + +extra_header_fields: +#ifdef CONFIG_X86_32 + .long 0 # ImageBase +#else + .quad 0 # ImageBase +#endif + .long 0x1000 # SectionAlignment + .long 0x200 # FileAlignment + .word 0 # MajorOperatingSystemVersion + .word 0 # MinorOperatingSystemVersion + .word 0 # MajorImageVersion + .word 0 # MinorImageVersion + .word 0 # MajorSubsystemVersion + .word 0 # MinorSubsystemVersion + .long 0 # Win32VersionValue + + # + # The size of the bzImage is written in tools/build.c + # + .long 0 # SizeOfImage + + .long 0x200 # SizeOfHeaders + .long 0 # CheckSum + .word 0xa # Subsystem (EFI application) + .word 0 # DllCharacteristics +#ifdef CONFIG_X86_32 + .long 0 # SizeOfStackReserve + .long 0 # SizeOfStackCommit + .long 0 # SizeOfHeapReserve + .long 0 # SizeOfHeapCommit +#else + .quad 0 # SizeOfStackReserve + .quad 0 # SizeOfStackCommit + .quad 0 # SizeOfHeapReserve + .quad 0 # SizeOfHeapCommit +#endif + .long 0 # LoaderFlags + .long 0x1 # NumberOfRvaAndSizes + + .quad 0 # ExportTable + .quad 0 # ImportTable + .quad 0 # ResourceTable + .quad 0 # ExceptionTable + .quad 0 # CertificationTable + .quad 0 # BaseRelocationTable + + # Section table +section_table: + .ascii ".text" + .byte 0 + .byte 0 + .byte 0 + .long 0 + .long 0x0 # startup_{32,64} + .long 0 # Size of initialized data + # on disk + .long 0x0 # startup_{32,64} + .long 0 # PointerToRelocations + .long 0 # PointerToLineNumbers + .word 0 # NumberOfRelocations + .word 0 # NumberOfLineNumbers + .long 0x60500020 # Characteristics (section flags) + + # + # The EFI application loader requires a relocation section + # because EFI applications are relocatable and not having + # this section seems to confuse it. But since we don't need + # the loader to fixup any relocs for us just fill it with a + # single dummy reloc. + # + .ascii ".reloc" + .byte 0 + .byte 0 + .long reloc_end - reloc_start + .long reloc_start + .long reloc_end - reloc_start # SizeOfRawData + .long reloc_start # PointerToRawData + .long 0 # PointerToRelocations + .long 0 # PointerToLineNumbers + .word 0 # NumberOfRelocations + .word 0 # NumberOfLineNumbers + .long 0x42100040 # Characteristics (section flags) +#endif /* CONFIG_EFI_STUB */ # Kernel attributes; used by setup. This is part 1 of the # header, from the old boot sector. @@ -318,3 +466,13 @@ die: setup_corrupt: .byte 7 .string "No setup signature found...\n" + + .data +dummy: .long 0 + + .section .reloc +reloc_start: + .long dummy - reloc_start + .long 10 + .word 0 +reloc_end: diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 3cbc405..574dedf 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -111,3 +111,38 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas return result; } + +/** + * strlen - Find the length of a string + * @s: The string to be sized + */ +size_t strlen(const char *s) +{ + const char *sc; + + for (sc = s; *sc != '\0'; ++sc) + /* nothing */; + return sc - s; +} + +/** + * strstr - Find the first substring in a %NUL terminated string + * @s1: The string to be searched + * @s2: The string to search for + */ +char *strstr(const char *s1, const char *s2) +{ + size_t l1, l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + l1 = strlen(s1); + while (l1 >= l2) { + l1--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index fdc60a0..4e9bd6b 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -135,6 +135,9 @@ static void usage(void) int main(int argc, char ** argv) { +#ifdef CONFIG_EFI_STUB + unsigned int file_sz, pe_header; +#endif unsigned int i, sz, setup_sectors; int c; u32 sys_size; @@ -194,6 +197,42 @@ int main(int argc, char ** argv) buf[0x1f6] = sys_size >> 16; buf[0x1f7] = sys_size >> 24; +#ifdef CONFIG_EFI_STUB + file_sz = sz + i + ((sys_size * 16) - sz); + + pe_header = *(unsigned int *)&buf[0x3c]; + + /* Size of code */ + *(unsigned int *)&buf[pe_header + 0x1c] = file_sz; + + /* Size of image */ + *(unsigned int *)&buf[pe_header + 0x50] = file_sz; + +#ifdef CONFIG_X86_32 + /* Address of entry point */ + *(unsigned int *)&buf[pe_header + 0x28] = i; + + /* .text size */ + *(unsigned int *)&buf[pe_header + 0xb0] = file_sz; + + /* .text size of initialised data */ + *(unsigned int *)&buf[pe_header + 0xb8] = file_sz; +#else + /* + * Address of entry point. startup_32 is at the beginning and + * the 64-bit entry point (startup_64) is always 512 bytes + * after. + */ + *(unsigned int *)&buf[pe_header + 0x28] = i + 512; + + /* .text size */ + *(unsigned int *)&buf[pe_header + 0xc0] = file_sz; + + /* .text size of initialised data */ + *(unsigned int *)&buf[pe_header + 0xc8] = file_sz; +#endif /* CONFIG_X86_32 */ +#endif /* CONFIG_EFI_STUB */ + crc = partial_crc32(buf, i, crc); if (fwrite(buf, 1, i, stdout) != i) die("Writing setup failed"); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 4f13faf..68de2dc 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -67,4 +67,6 @@ void common(void) { OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); + OFFSET(BP_pref_address, boot_params, hdr.pref_address); + OFFSET(BP_code32_start, boot_params, hdr.code32_start); } -- cgit v1.1 From 3e8f9451d3db669d7c0d8b330d4f5770149d90d5 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 15 Dec 2011 22:19:41 +0000 Subject: x86: Fix INTEL_MID silly Doh.. pass the brown paper bags - preferably filled with mince pies.. This fixes occasional build failures. Signed-off-by: Alan Cox Link: http://lkml.kernel.org/n/tip-r0oc1knlvzuqr69artaeq8s8@git.kernel.org [ extended the changelog a bit ] Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index faf39a0..2b54a2f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -429,6 +429,7 @@ config X86_MDFLD select SPI select INTEL_SCU_IPC select X86_PLATFORM_DEVICES + select X86_INTEL_MID ---help--- Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. -- cgit v1.1 From 2d2da60fb40a80cc59383121ccf763e0e0e8a42a Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Fri, 16 Dec 2011 13:30:58 +0100 Subject: x86, efi: Break up large initrd reads The efi boot stub tries to read the entire initrd in 1 go, however some efi implementations hang if too much if asked to read too much data at the same time. After some experimentation I found out that my asrock p67 board will hang if asked to read chunks of 4MiB, so use a safe value. elilo reads in chunks of 16KiB, but since that requires many read calls I use a value of 1 MiB. hpa suggested adding individual blacklists for when systems are found where this value causes a crash. Signed-off-by: Maarten Lankhorst Link: http://lkml.kernel.org/r/4EEB3A02.3090201@gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/eboot.c | 20 ++++++++++++++------ arch/x86/boot/compressed/eboot.h | 1 + 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 4055e63..fec216f 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -643,14 +643,22 @@ grow: u64 size; size = initrds[j].size; - status = efi_call_phys3(fh->read, initrds[j].handle, - &size, addr); - if (status != EFI_SUCCESS) - goto free_initrd_total; + while (size) { + u64 chunksize; + if (size > EFI_READ_CHUNK_SIZE) + chunksize = EFI_READ_CHUNK_SIZE; + else + chunksize = size; + status = efi_call_phys3(fh->read, + initrds[j].handle, + &chunksize, addr); + if (status != EFI_SUCCESS) + goto free_initrd_total; + addr += chunksize; + size -= chunksize; + } efi_call_phys1(fh->close, initrds[j].handle); - - addr += size; } } diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index f66d023..3925166 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -12,6 +12,7 @@ #define DESC_TYPE_CODE_DATA (1 << 0) #define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT) +#define EFI_READ_CHUNK_SIZE (1024 * 1024) #define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0 #define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1 -- cgit v1.1 From a0c3832a578c84d4a93c61e22cb09c99fa9447ea Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Sat, 17 Dec 2011 21:57:25 +0000 Subject: x86/apb: Fix configuration constraints The APB timer requires SFI, SCU and MID support Reported-by: Ingo Molnar Signed-off-by: Alan Cox Link: http://lkml.kernel.org/r/20111217215719.3743.93550.stgit@bob.linux.org.uk Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2b54a2f..ca4ee76 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -665,6 +665,7 @@ config APB_TIMER def_bool y if MRST prompt "Langwell APB Timer Support" if X86_MRST select DW_APB_TIMER + depends on X86_INTEL_MID && SFI help APB timer is the replacement for 8254, HPET on X86 MID platforms. The APBT provides a stable time base on SMP -- cgit v1.1 From 933b9463a0ef75da681747b2dac06c1754465372 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Sat, 17 Dec 2011 17:43:40 +0000 Subject: x86/intel config: Revamp configuration to allow for Moorestown and Medfield This sets all up the other bits that need to be INTEL_MID specific rather than Moorestown specific. Signed-off-by: Alan Cox Link: http://lkml.kernel.org/r/20111217174318.7207.91543.stgit@bob.linux.org.uk Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++-- arch/x86/include/asm/fixmap.h | 2 +- arch/x86/include/asm/setup.h | 2 +- arch/x86/pci/Makefile | 2 +- arch/x86/platform/mrst/Makefile | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ca4ee76..c3c9343 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -662,8 +662,8 @@ config HPET_EMULATE_RTC depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) config APB_TIMER - def_bool y if MRST - prompt "Langwell APB Timer Support" if X86_MRST + def_bool y if X86_INTEL_MID + prompt "Intel MID APB Timer Support" if X86_INTEL_MID select DW_APB_TIMER depends on X86_INTEL_MID && SFI help diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 460c74e..4da3c0c 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -117,7 +117,7 @@ enum fixed_addresses { #endif FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ -#ifdef CONFIG_X86_MRST +#ifdef CONFIG_X86_INTEL_MID FIX_LNW_VRTC, #endif __end_of_permanent_fixed_addresses, diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 9756551..d0f19f9 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -47,7 +47,7 @@ extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern void setup_default_timer_irq(void); -#ifdef CONFIG_X86_MRST +#ifdef CONFIG_X86_INTEL_MID extern void x86_mrst_early_setup(void); #else static inline void x86_mrst_early_setup(void) { } diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 6b8759f..75b06f3 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -15,7 +15,7 @@ obj-$(CONFIG_X86_VISWS) += visws.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o -obj-$(CONFIG_X86_MRST) += mrst.o +obj-$(CONFIG_X86_INTEL_MID) += mrst.o obj-y += common.o early.o obj-y += amd_bus.o bus_numa.o diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile index ddeec73..7baed51 100644 --- a/arch/x86/platform/mrst/Makefile +++ b/arch/x86/platform/mrst/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_X86_MRST) += mrst.o -obj-$(CONFIG_X86_MRST) += vrtc.o +obj-$(CONFIG_X86_INTEL_MID) += mrst.o +obj-$(CONFIG_X86_INTEL_MID) += vrtc.o obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_mrst.o obj-$(CONFIG_X86_MRST) += pmu.o -- cgit v1.1 From d79a8869d8a4b565b12a88faeff834b09a36368c Mon Sep 17 00:00:00 2001 From: Michael Demeter Date: Thu, 15 Dec 2011 22:31:23 +0000 Subject: x86/mrst: Add additional debug prints for pb_keys Added additional debug output that we always seem to add during power ons to validate firmware operation. Signed-off-by: Michael Demeter Signed-off-by: Kirill A. Shutemov Signed-off-by: Alan Cox Link: http://lkml.kernel.org/r/20111215223116.10166.50803.stgit@bob.linux.org.uk [ fixed line breaks, formatting and commit title. ] Signed-off-by: Ingo Molnar --- arch/x86/platform/mrst/mrst.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 6a21f60..b6a33d2 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -983,6 +983,7 @@ static int __init pb_keys_init(void) num = sizeof(gpio_button) / sizeof(struct gpio_keys_button); for (i = 0; i < num; i++) { gb[i].gpio = get_gpio_by_name(gb[i].desc); + pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, gb[i].gpio); if (gb[i].gpio == -1) continue; -- cgit v1.1 From 88715b9ade718564fd8b1318735826370481366b Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 13 Dec 2011 12:53:07 +0200 Subject: crypto: twofish-x86_64-3way - remove unneeded LRW/XTS #ifdefs Since LRW & XTS are selected by twofish-x86_64-3way, we don't need these #ifdefs anymore. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/twofish_glue_3way.c | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 954f59e..7fee8c1 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -35,14 +35,6 @@ #include #include -#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE) -#define HAS_LRW -#endif - -#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE) -#define HAS_XTS -#endif - /* regular block cipher functions from twofish_x86_64 module */ asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, const u8 *src); @@ -442,8 +434,6 @@ static struct crypto_alg blk_ctr_alg = { }, }; -#if defined(HAS_LRW) || defined(HAS_XTS) - static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) { const unsigned int bsize = TF_BLOCK_SIZE; @@ -474,10 +464,6 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) twofish_dec_blk(ctx, srcdst, srcdst); } -#endif - -#ifdef HAS_LRW - struct twofish_lrw_ctx { struct lrw_table_ctx lrw_table; struct twofish_ctx twofish_ctx; @@ -562,10 +548,6 @@ static struct crypto_alg blk_lrw_alg = { }, }; -#endif - -#ifdef HAS_XTS - struct twofish_xts_ctx { struct twofish_ctx tweak_ctx; struct twofish_ctx crypt_ctx; @@ -655,8 +637,6 @@ static struct crypto_alg blk_xts_alg = { }, }; -#endif - int __init init(void) { int err; @@ -670,27 +650,19 @@ int __init init(void) err = crypto_register_alg(&blk_ctr_alg); if (err) goto ctr_err; -#ifdef HAS_LRW err = crypto_register_alg(&blk_lrw_alg); if (err) goto blk_lrw_err; -#endif -#ifdef HAS_XTS err = crypto_register_alg(&blk_xts_alg); if (err) goto blk_xts_err; -#endif return 0; -#ifdef HAS_XTS crypto_unregister_alg(&blk_xts_alg); blk_xts_err: -#endif -#ifdef HAS_LRW crypto_unregister_alg(&blk_lrw_alg); blk_lrw_err: -#endif crypto_unregister_alg(&blk_ctr_alg); ctr_err: crypto_unregister_alg(&blk_cbc_alg); @@ -702,12 +674,8 @@ ecb_err: void __exit fini(void) { -#ifdef HAS_XTS crypto_unregister_alg(&blk_xts_alg); -#endif -#ifdef HAS_LRW crypto_unregister_alg(&blk_lrw_alg); -#endif crypto_unregister_alg(&blk_ctr_alg); crypto_unregister_alg(&blk_cbc_alg); crypto_unregister_alg(&blk_ecb_alg); -- cgit v1.1 From 7ba8babf84fa4e9b648e247223043785f596dd23 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Tue, 13 Dec 2011 12:53:17 +0200 Subject: crypto: serpent-sse2 - remove unneeded LRW/XTS #ifdefs Since LRW & XTS are selected by serpent-sse2, we don't need these #ifdefs anymore. Signed-off-by: Jussi Kivilinna Signed-off-by: Herbert Xu --- arch/x86/crypto/serpent_sse2_glue.c | 40 ------------------------------------- 1 file changed, 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 2f5c304..7955a9b 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -47,14 +47,6 @@ #include #include -#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE) -#define HAS_LRW -#endif - -#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE) -#define HAS_XTS -#endif - struct async_serpent_ctx { struct cryptd_ablkcipher *cryptd_tfm; }; @@ -470,8 +462,6 @@ static struct crypto_alg blk_ctr_alg = { }, }; -#if defined(HAS_LRW) || defined(HAS_XTS) - struct crypt_priv { struct serpent_ctx *ctx; bool fpu_enabled; @@ -511,10 +501,6 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) __serpent_decrypt(ctx->ctx, srcdst, srcdst); } -#endif - -#ifdef HAS_LRW - struct serpent_lrw_ctx { struct lrw_table_ctx lrw_table; struct serpent_ctx serpent_ctx; @@ -620,10 +606,6 @@ static struct crypto_alg blk_lrw_alg = { }, }; -#endif - -#ifdef HAS_XTS - struct serpent_xts_ctx { struct serpent_ctx tweak_ctx; struct serpent_ctx crypt_ctx; @@ -730,8 +712,6 @@ static struct crypto_alg blk_xts_alg = { }, }; -#endif - static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, unsigned int key_len) { @@ -930,8 +910,6 @@ static struct crypto_alg ablk_ctr_alg = { }, }; -#ifdef HAS_LRW - static int ablk_lrw_init(struct crypto_tfm *tfm) { struct cryptd_ablkcipher *cryptd_tfm; @@ -970,10 +948,6 @@ static struct crypto_alg ablk_lrw_alg = { }, }; -#endif - -#ifdef HAS_XTS - static int ablk_xts_init(struct crypto_tfm *tfm) { struct cryptd_ablkcipher *cryptd_tfm; @@ -1010,8 +984,6 @@ static struct crypto_alg ablk_xts_alg = { }, }; -#endif - static int __init serpent_sse2_init(void) { int err; @@ -1039,36 +1011,28 @@ static int __init serpent_sse2_init(void) err = crypto_register_alg(&ablk_ctr_alg); if (err) goto ablk_ctr_err; -#ifdef HAS_LRW err = crypto_register_alg(&blk_lrw_alg); if (err) goto blk_lrw_err; err = crypto_register_alg(&ablk_lrw_alg); if (err) goto ablk_lrw_err; -#endif -#ifdef HAS_XTS err = crypto_register_alg(&blk_xts_alg); if (err) goto blk_xts_err; err = crypto_register_alg(&ablk_xts_alg); if (err) goto ablk_xts_err; -#endif return err; -#ifdef HAS_XTS crypto_unregister_alg(&ablk_xts_alg); ablk_xts_err: crypto_unregister_alg(&blk_xts_alg); blk_xts_err: -#endif -#ifdef HAS_LRW crypto_unregister_alg(&ablk_lrw_alg); ablk_lrw_err: crypto_unregister_alg(&blk_lrw_alg); blk_lrw_err: -#endif crypto_unregister_alg(&ablk_ctr_alg); ablk_ctr_err: crypto_unregister_alg(&ablk_cbc_alg); @@ -1086,14 +1050,10 @@ blk_ecb_err: static void __exit serpent_sse2_exit(void) { -#ifdef HAS_XTS crypto_unregister_alg(&ablk_xts_alg); crypto_unregister_alg(&blk_xts_alg); -#endif -#ifdef HAS_LRW crypto_unregister_alg(&ablk_lrw_alg); crypto_unregister_alg(&blk_lrw_alg); -#endif crypto_unregister_alg(&ablk_ctr_alg); crypto_unregister_alg(&ablk_cbc_alg); crypto_unregister_alg(&ablk_ecb_alg); -- cgit v1.1 From 549c89b98c4530b278dde1a3f68ce5ebbb1e6304 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 29 Nov 2011 12:44:55 -0800 Subject: x86: Do not schedule while still in NMI context The NMI handler uses the paranoid_exit routine that checks the NEED_RESCHED flag, and if it is set and the return is for userspace, then interrupts are enabled, the stack is swapped to the thread's stack, and schedule is called. The problem with this is that we are still in an NMI context until an iret is executed. This means that any new NMIs are now starved until an interrupt or exception occurs and does the iret. As NMIs can not be masked and can interrupt any location, they are treated as a special case. NEED_RESCHED should not be set in an NMI handler. The interruption by the NMI should not disturb the work flow for scheduling. Any IPI sent to a processor after sending the NEED_RESCHED would have to wait for the NMI anyway, and after the IPI finishes the schedule would be called as required. There is no reason to do anything special leaving an NMI. Remove the call to paranoid_exit and do a simple return. This not only fixes the bug of starved NMIs, but it also cleans up the code. Link: http://lkml.kernel.org/r/CA+55aFzgM55hXTs4griX5e9=v_O+=ue+7Rj0PTD=M7hFYpyULQ@mail.gmail.com Acked-by: Andi Kleen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "H. Peter Anvin" Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Paul Turner Signed-off-by: Linus Torvalds Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index faf8d5e..3819ea9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1489,46 +1489,14 @@ ENTRY(nmi) movq %rsp,%rdi movq $-1,%rsi call do_nmi -#ifdef CONFIG_TRACE_IRQFLAGS - /* paranoidexit; without TRACE_IRQS_OFF */ - /* ebx: no swapgs flag */ - DISABLE_INTERRUPTS(CLBR_NONE) testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore - testl $3,CS(%rsp) - jnz nmi_userspace nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_ALL 8 jmp irq_return -nmi_userspace: - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz nmi_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz nmi_schedule - movl %ebx,%edx /* arg3: thread flags */ - ENABLE_INTERRUPTS(CLBR_NONE) - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - DISABLE_INTERRUPTS(CLBR_NONE) - jmp nmi_userspace -nmi_schedule: - ENABLE_INTERRUPTS(CLBR_ANY) - call schedule - DISABLE_INTERRUPTS(CLBR_ANY) - jmp nmi_userspace CFI_ENDPROC -#else - jmp paranoid_exit - CFI_ENDPROC -#endif END(nmi) ENTRY(ignore_sysret) -- cgit v1.1 From 1fd466efc88c48f50e5ee29f4dbb4e210a889172 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 8 Dec 2011 12:32:27 -0500 Subject: x86: Document the NMI handler about not using paranoid_exit Linus cleaned up the NMI handler but it still needs some comments to explain why it uses save_paranoid but not paranoid_exit. Just to keep others from adding that in the future, document why it's not used. Cc: Linus Torvalds Cc: Andi Kleen Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3819ea9..d1d5434 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1480,9 +1480,16 @@ END(error_exit) ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq_cfi $-1 + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + /* + * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit + * as we should not be calling schedule in NMI context. + * Even with normal interrupts enabled. An NMI should not be + * setting NEED_RESCHED or anything that normal interrupts and + * exceptions might do. + */ call save_paranoid DEFAULT_FRAME 0 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ -- cgit v1.1 From 3f3c8b8c4b2a34776c3470142a7c8baafcda6eb0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 8 Dec 2011 12:36:23 -0500 Subject: x86: Add workaround to NMI iret woes In x86, when an NMI goes off, the CPU goes into an NMI context that prevents other NMIs to trigger on that CPU. If an NMI is suppose to trigger, it has to wait till the previous NMI leaves NMI context. At that time, the next NMI can trigger (note, only one more NMI will trigger, as only one can be latched at a time). The way x86 gets out of NMI context is by calling iret. The problem with this is that this causes problems if the NMI handle either triggers an exception, or a breakpoint. Both the exception and the breakpoint handlers will finish with an iret. If this happens while in NMI context, the CPU will leave NMI context and a new NMI may come in. As NMI handlers are not made to be re-entrant, this can cause havoc with the system, not to mention, the nested NMI will write all over the previous NMI's stack. Linus Torvalds proposed the following workaround to this problem: https://lkml.org/lkml/2010/7/14/264 "In fact, I wonder if we couldn't just do a software NMI disable instead? Hav ea per-cpu variable (in the _core_ percpu areas that get allocated statically) that points to the NMI stack frame, and just make the NMI code itself do something like NMI entry: - load percpu NMI stack frame pointer - if non-zero we know we're nested, and should ignore this NMI: - we're returning to kernel mode, so return immediately by using "popf/ret", which also keeps NMI's disabled in the hardware until the "real" NMI iret happens. - before the popf/iret, use the NMI stack pointer to make the NMI return stack be invalid and cause a fault - set the NMI stack pointer to the current stack pointer NMI exit (not the above "immediate exit because we nested"): clear the percpu NMI stack pointer Just do the iret. Now, the thing is, now the "iret" is atomic. If we had a nested NMI, we'll take a fault, and that re-does our "delayed" NMI - and NMI's will stay masked. And if we didn't have a nested NMI, that iret will now unmask NMI's, and everything is happy." I first tried to follow this advice but as I started implementing this code, a few gotchas showed up. One, is accessing per-cpu variables in the NMI handler. The problem is that per-cpu variables use the %gs register to get the variable for the given CPU. But as the NMI may happen in userspace, we must first perform a SWAPGS to get to it. The NMI handler already does this later in the code, but its too late as we have saved off all the registers and we don't want to do that for a disabled NMI. Peter Zijlstra suggested to keep all variables on the stack. This simplifies things greatly and it has the added benefit of cache locality. Two, faulting on the iret. I really wanted to make this work, but it was becoming very hacky, and I never got it to be stable. The iret already had a fault handler for userspace faulting with bad segment registers, and getting NMI to trigger a fault and detect it was very tricky. But for strange reasons, the system would usually take a double fault and crash. I never figured out why and decided to go with a simple "jmp" approach. The new approach I took also simplified things. Finally, the last problem with Linus's approach was to have the nested NMI handler do a ret instead of an iret to give the first NMI NMI-context again. The problem is that ret is much more limited than an iret. I couldn't figure out how to get the stack back where it belonged. I could have copied the current stack, pushed the return onto it, but my fear here is that there may be some place that writes data below the stack pointer. I know that is not something code should depend on, but I don't want to chance it. I may add this feature later, but for now, an NMI handler that loses NMI context will not get it back. Here's what is done: When an NMI comes in, the HW pushes the interrupt stack frame onto the per cpu NMI stack that is selected by the IST. A special location on the NMI stack holds a variable that is set when the first NMI handler runs. If this variable is set then we know that this is a nested NMI and we process the nested NMI code. There is still a race when this variable is cleared and an NMI comes in just before the first NMI does the return. For this case, if the variable is cleared, we also check if the interrupted stack is the NMI stack. If it is, then we process the nested NMI code. Why the two tests and not just test the interrupted stack? If the first NMI hits a breakpoint and loses NMI context, and then it hits another breakpoint and while processing that breakpoint we get a nested NMI. When processing a breakpoint, the stack changes to the breakpoint stack. If another NMI comes in here we can't rely on the interrupted stack to be the NMI stack. If the variable is not set and the interrupted task's stack is not the NMI stack, then we know this is the first NMI and we can process things normally. But in order to do so, we need to do a few things first. 1) Set the stack variable that tells us that we are in an NMI handler 2) Make two copies of the interrupt stack frame. One copy is used to return on iret The other is used to restore the first one if we have a nested NMI. This is what the stack will look like: +-------------------------+ | original SS | | original Return RSP | | original RFLAGS | | original CS | | original RIP | +-------------------------+ | temp storage for rdx | +-------------------------+ | NMI executing variable | +-------------------------+ | Saved SS | | Saved Return RSP | | Saved RFLAGS | | Saved CS | | Saved RIP | +-------------------------+ | copied SS | | copied Return RSP | | copied RFLAGS | | copied CS | | copied RIP | +-------------------------+ | pt_regs | +-------------------------+ The original stack frame contains what the HW put in when we entered the NMI. We store %rdx as a temp variable to use. Both the original HW stack frame and this %rdx storage will be clobbered by nested NMIs so we can not rely on them later in the first NMI handler. The next item is the special stack variable that is set when we execute the rest of the NMI handler. Then we have two copies of the interrupt stack. The second copy is modified by any nested NMIs to let the first NMI know that we triggered a second NMI (latched) and that we should repeat the NMI handler. If the first NMI hits an exception or breakpoint that takes it out of NMI context, if a second NMI comes in before the first one finishes, it will update the copied interrupt stack to point to a fix up location to trigger another NMI. When the first NMI calls iret, it will instead jump to the fix up location. This fix up location will copy the saved interrupt stack back to the copy and execute the nmi handler again. Note, the nested NMI knows enough to check if it preempted a previous NMI handler while it is in the fixup location. If it has, it will not modify the copied interrupt stack and will just leave as if nothing happened. As the NMI handle is about to execute again, there's no reason to latch now. To test all this, I forced the NMI handler to call iret and take itself out of NMI context. I also added assemble code to write to the serial to make sure that it hits the nested path as well as the fix up path. Everything seems to be working fine. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Paul Turner Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 177 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index d1d5434..b62aa29 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1475,11 +1475,166 @@ ENTRY(error_exit) CFI_ENDPROC END(error_exit) +/* + * Test if a given stack is an NMI stack or not. + */ + .macro test_in_nmi reg stack nmi_ret normal_ret + cmpq %\reg, \stack + ja \normal_ret + subq $EXCEPTION_STKSZ, %\reg + cmpq %\reg, \stack + jb \normal_ret + jmp \nmi_ret + .endm /* runs on exception stack */ ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME + /* + * We allow breakpoints in NMIs. If a breakpoint occurs, then + * the iretq it performs will take us out of NMI context. + * This means that we can have nested NMIs where the next + * NMI is using the top of the stack of the previous NMI. We + * can't let it execute because the nested NMI will corrupt the + * stack of the previous NMI. NMI handlers are not re-entrant + * anyway. + * + * To handle this case we do the following: + * Check the a special location on the stack that contains + * a variable that is set when NMIs are executing. + * The interrupted task's stack is also checked to see if it + * is an NMI stack. + * If the variable is not set and the stack is not the NMI + * stack then: + * o Set the special variable on the stack + * o Copy the interrupt frame into a "saved" location on the stack + * o Copy the interrupt frame into a "copy" location on the stack + * o Continue processing the NMI + * If the variable is set or the previous stack is the NMI stack: + * o Modify the "copy" location to jump to the repeate_nmi + * o return back to the first NMI + * + * Now on exit of the first NMI, we first clear the stack variable + * The NMI stack will tell any nested NMIs at that point that it is + * nested. Then we pop the stack normally with iret, and if there was + * a nested NMI that updated the copy interrupt stack frame, a + * jump will be made to the repeat_nmi code that will handle the second + * NMI. + */ + + /* Use %rdx as out temp variable throughout */ + pushq_cfi %rdx + + /* + * Check the special variable on the stack to see if NMIs are + * executing. + */ + cmp $1, -8(%rsp) + je nested_nmi + + /* + * Now test if the previous stack was an NMI stack. + * We need the double check. We check the NMI stack to satisfy the + * race when the first NMI clears the variable before returning. + * We check the variable because the first NMI could be in a + * breakpoint routine using a breakpoint stack. + */ + lea 6*8(%rsp), %rdx + test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + +nested_nmi: + /* + * Do nothing if we interrupted the fixup in repeat_nmi. + * It's about to repeat the NMI handler, so we are fine + * with ignoring this one. + */ + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out + +1: + /* Set up the interrupted NMIs stack to jump to repeat_nmi */ + leaq -6*8(%rsp), %rdx + movq %rdx, %rsp + CFI_ADJUST_CFA_OFFSET 6*8 + pushq_cfi $__KERNEL_DS + pushq_cfi %rdx + pushfq_cfi + pushq_cfi $__KERNEL_CS + pushq_cfi $repeat_nmi + + /* Put stack back */ + addq $(11*8), %rsp + CFI_ADJUST_CFA_OFFSET -11*8 + +nested_nmi_out: + popq_cfi %rdx + + /* No need to check faults here */ + INTERRUPT_RETURN + +first_nmi: + /* + * Because nested NMIs will use the pushed location that we + * stored in rdx, we must keep that space available. + * Here's what our stack frame will look like: + * +-------------------------+ + * | original SS | + * | original Return RSP | + * | original RFLAGS | + * | original CS | + * | original RIP | + * +-------------------------+ + * | temp storage for rdx | + * +-------------------------+ + * | NMI executing variable | + * +-------------------------+ + * | Saved SS | + * | Saved Return RSP | + * | Saved RFLAGS | + * | Saved CS | + * | Saved RIP | + * +-------------------------+ + * | copied SS | + * | copied Return RSP | + * | copied RFLAGS | + * | copied CS | + * | copied RIP | + * +-------------------------+ + * | pt_regs | + * +-------------------------+ + * + * The saved RIP is used to fix up the copied RIP that a nested + * NMI may zero out. The original stack frame and the temp storage + * is also used by nested NMIs and can not be trusted on exit. + */ + /* Set the NMI executing variable on the stack. */ + pushq_cfi $1 + + /* Copy the stack frame to the Saved frame */ + .rept 5 + pushq_cfi 6*8(%rsp) + .endr + + /* Make another copy, this one may be modified by nested NMIs */ + .rept 5 + pushq_cfi 4*8(%rsp) + .endr + + /* Do not pop rdx, nested NMIs will corrupt it */ + movq 11*8(%rsp), %rdx + + /* + * Everything below this point can be preempted by a nested + * NMI if the first NMI took an exception. Repeated NMIs + * caused by an exception and nested NMI will start here, and + * can still be preempted by another NMI. + */ +restart_nmi: pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1502,10 +1657,32 @@ nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_ALL 8 + /* Clear the NMI executing stack variable */ + movq $0, 10*8(%rsp) jmp irq_return CFI_ENDPROC END(nmi) + /* + * If an NMI hit an iret because of an exception or breakpoint, + * it can lose its NMI context, and a nested NMI may come in. + * In that case, the nested NMI will change the preempted NMI's + * stack to jump to here when it does the final iret. + */ +repeat_nmi: + INTR_FRAME + /* Update the stack variable to say we are still in NMI */ + movq $1, 5*8(%rsp) + + /* copy the saved stack back to copy stack */ + .rept 5 + pushq_cfi 4*8(%rsp) + .endr + + jmp restart_nmi + CFI_ENDPROC +end_repeat_nmi: + ENTRY(ignore_sysret) CFI_STARTPROC mov $-ENOSYS,%eax -- cgit v1.1 From 228bdaa95fb830e08b6acd1afd4d2c55093cabfa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 9 Dec 2011 03:02:19 -0500 Subject: x86: Keep current stack in NMI breakpoints We want to allow NMI handlers to have breakpoints to be able to remove stop_machine from ftrace, kprobes and jump_labels. But if an NMI interrupts a current breakpoint, and then it triggers a breakpoint itself, it will switch to the breakpoint stack and corrupt the data on it for the breakpoint processing that it interrupted. Instead, have the NMI check if it interrupted breakpoint processing by checking if the stack that is currently used is a breakpoint stack. If it is, then load a special IDT that changes the IST for the debug exception to keep the same stack in kernel context. When the NMI is done, it puts it back. This way, if the NMI does trigger a breakpoint, it will keep using the same stack and not stomp on the breakpoint data for the breakpoint it interrupted. Suggested-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- arch/x86/include/asm/desc.h | 12 ++++++++++++ arch/x86/include/asm/processor.h | 6 ++++++ arch/x86/kernel/cpu/common.c | 22 ++++++++++++++++++++++ arch/x86/kernel/head_64.S | 4 ++++ arch/x86/kernel/nmi.c | 15 +++++++++++++++ arch/x86/kernel/traps.c | 6 ++++++ 6 files changed, 65 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 41935fa..e95822d 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in extern struct desc_ptr idt_descr; extern gate_desc idt_table[]; +extern struct desc_ptr nmi_idt_descr; +extern gate_desc nmi_idt_table[]; struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; @@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit = (limit >> 16) & 0xf; } +#ifdef CONFIG_X86_64 +static inline void set_nmi_gate(int gate, void *addr) +{ + gate_desc s; + + pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); + write_idt_entry(nmi_idt_table, gate, &s); +} +#endif + static inline void _set_gate(int gate, unsigned type, void *addr, unsigned dpl, unsigned ist, unsigned seg) { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b650435..4b39d6d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -402,6 +402,9 @@ DECLARE_PER_CPU(char *, irq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern unsigned long kernel_eflags; extern asmlinkage void ignore_sysret(void); +int is_debug_stack(unsigned long addr); +void debug_stack_set_zero(void); +void debug_stack_reset(void); #else /* X86_64 */ #ifdef CONFIG_CC_STACKPROTECTOR /* @@ -416,6 +419,9 @@ struct stack_canary { }; DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif +static inline int is_debug_stack(unsigned long addr) { return 0; } +static inline void debug_stack_set_zero(void) { } +static inline void debug_stack_reset(void) { } #endif /* X86_64 */ extern unsigned int xstate_size; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index aa003b1..caa4045 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1026,6 +1026,8 @@ __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) nmi_idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); @@ -1090,6 +1092,24 @@ unsigned long kernel_eflags; */ DEFINE_PER_CPU(struct orig_ist, orig_ist); +static DEFINE_PER_CPU(unsigned long, debug_stack_addr); + +int is_debug_stack(unsigned long addr) +{ + return addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ); +} + +void debug_stack_set_zero(void) +{ + load_idt((const struct desc_ptr *)&nmi_idt_descr); +} + +void debug_stack_reset(void) +{ + load_idt((const struct desc_ptr *)&idt_descr); +} + #else /* CONFIG_X86_64 */ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; @@ -1208,6 +1228,8 @@ void __cpuinit cpu_init(void) estacks += exception_stack_sizes[v]; oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; + if (v == DEBUG_STACK-1) + per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; } } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e11e394..40f4eb3 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -417,6 +417,10 @@ ENTRY(phys_base) ENTRY(idt_table) .skip IDT_ENTRIES * 16 + .align L1_CACHE_BYTES +ENTRY(nmi_idt_table) + .skip IDT_ENTRIES * 16 + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index e88f37b..de8d4b3 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -408,6 +408,18 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) dotraplinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { + int update_debug_stack = 0; + + /* + * If we interrupted a breakpoint, it is possible that + * the nmi handler will have breakpoints too. We need to + * change the IDT such that breakpoints that happen here + * continue to use the NMI stack. + */ + if (unlikely(is_debug_stack(regs->sp))) { + debug_stack_set_zero(); + update_debug_stack = 1; + } nmi_enter(); inc_irq_stat(__nmi_count); @@ -416,6 +428,9 @@ do_nmi(struct pt_regs *regs, long error_code) default_do_nmi(regs); nmi_exit(); + + if (unlikely(update_debug_stack)) + debug_stack_reset(); } void stop_nmi(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a8e3eb8..a93c5ca 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -723,4 +723,10 @@ void __init trap_init(void) cpu_init(); x86_init.irqs.trap_init(); + +#ifdef CONFIG_X86_64 + memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); + set_nmi_gate(1, &debug); + set_nmi_gate(3, &int3); +#endif } -- cgit v1.1 From ccd49c2391773ffbf52bb80d75c4a92b16972517 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 13 Dec 2011 16:44:16 -0500 Subject: x86: Allow NMIs to hit breakpoints in i386 With i386, NMIs and breakpoints use the current stack and they do not reset the stack pointer to a fix point that might corrupt a previous NMI or breakpoint (as it does in x86_64). But NMIs are still not made to be re-entrant, and need to prevent the case that an NMI hitting a breakpoint (which does an iret), doesn't allow another NMI to run. The fix is to let the NMI be in 3 different states: 1) not running 2) executing 3) latched When no NMI is executing on a given CPU, the state is "not running". When the first NMI comes in, the state is switched to "executing". On exit of that NMI, a cmpxchg is performed to switch the state back to "not running" and if that fails, the NMI is restarted. If a breakpoint is hit and does an iret, which re-enables NMIs, and another NMI comes in before the first NMI finished, it will detect that the state is not in the "not running" state and the current NMI is nested. In this case, the state is switched to "latched" to let the interrupted NMI know to restart the NMI handler, and the nested NMI exits without doing anything. Cc: Linus Torvalds Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Paul Turner Cc: Frederic Weisbecker Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- arch/x86/kernel/nmi.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 94 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index de8d4b3..47acaf3 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -405,11 +405,84 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) unknown_nmi_error(reason, regs); } -dotraplinkage notrace __kprobes void -do_nmi(struct pt_regs *regs, long error_code) -{ - int update_debug_stack = 0; +/* + * NMIs can hit breakpoints which will cause it to lose its + * NMI context with the CPU when the breakpoint does an iret. + */ +#ifdef CONFIG_X86_32 +/* + * For i386, NMIs use the same stack as the kernel, and we can + * add a workaround to the iret problem in C. Simply have 3 states + * the NMI can be in. + * + * 1) not running + * 2) executing + * 3) latched + * + * When no NMI is in progress, it is in the "not running" state. + * When an NMI comes in, it goes into the "executing" state. + * Normally, if another NMI is triggered, it does not interrupt + * the running NMI and the HW will simply latch it so that when + * the first NMI finishes, it will restart the second NMI. + * (Note, the latch is binary, thus multiple NMIs triggering, + * when one is running, are ignored. Only one NMI is restarted.) + * + * If an NMI hits a breakpoint that executes an iret, another + * NMI can preempt it. We do not want to allow this new NMI + * to run, but we want to execute it when the first one finishes. + * We set the state to "latched", and the first NMI will perform + * an cmpxchg on the state, and if it doesn't successfully + * reset the state to "not running" it will restart the next + * NMI. + */ +enum nmi_states { + NMI_NOT_RUNNING, + NMI_EXECUTING, + NMI_LATCHED, +}; +static DEFINE_PER_CPU(enum nmi_states, nmi_state); + +#define nmi_nesting_preprocess(regs) \ + do { \ + if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ + __get_cpu_var(nmi_state) = NMI_LATCHED; \ + return; \ + } \ + nmi_restart: \ + __get_cpu_var(nmi_state) = NMI_EXECUTING; \ + } while (0) + +#define nmi_nesting_postprocess() \ + do { \ + if (cmpxchg(&__get_cpu_var(nmi_state), \ + NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ + goto nmi_restart; \ + } while (0) +#else /* x86_64 */ +/* + * In x86_64 things are a bit more difficult. This has the same problem + * where an NMI hitting a breakpoint that calls iret will remove the + * NMI context, allowing a nested NMI to enter. What makes this more + * difficult is that both NMIs and breakpoints have their own stack. + * When a new NMI or breakpoint is executed, the stack is set to a fixed + * point. If an NMI is nested, it will have its stack set at that same + * fixed address that the first NMI had, and will start corrupting the + * stack. This is handled in entry_64.S, but the same problem exists with + * the breakpoint stack. + * + * If a breakpoint is being processed, and the debug stack is being used, + * if an NMI comes in and also hits a breakpoint, the stack pointer + * will be set to the same fixed address as the breakpoint that was + * interrupted, causing that stack to be corrupted. To handle this case, + * check if the stack that was interrupted is the debug stack, and if + * so, change the IDT so that new breakpoints will use the current stack + * and not switch to the fixed address. On return of the NMI, switch back + * to the original IDT. + */ +static DEFINE_PER_CPU(int, update_debug_stack); +static inline void nmi_nesting_preprocess(struct pt_regs *regs) +{ /* * If we interrupted a breakpoint, it is possible that * the nmi handler will have breakpoints too. We need to @@ -418,8 +491,22 @@ do_nmi(struct pt_regs *regs, long error_code) */ if (unlikely(is_debug_stack(regs->sp))) { debug_stack_set_zero(); - update_debug_stack = 1; + __get_cpu_var(update_debug_stack) = 1; } +} + +static inline void nmi_nesting_postprocess(void) +{ + if (unlikely(__get_cpu_var(update_debug_stack))) + debug_stack_reset(); +} +#endif + +dotraplinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) +{ + nmi_nesting_preprocess(regs); + nmi_enter(); inc_irq_stat(__nmi_count); @@ -429,8 +516,8 @@ do_nmi(struct pt_regs *regs, long error_code) nmi_exit(); - if (unlikely(update_debug_stack)) - debug_stack_reset(); + /* On i386, may loop back to preprocess */ + nmi_nesting_postprocess(); } void stop_nmi(void) -- cgit v1.1 From 42181186ad4db986fcaa40ca95c6e407e9e79372 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 11:43:02 -0500 Subject: x86: Add counter when debug stack is used with interrupts enabled Mathieu Desnoyers pointed out a case that can cause issues with NMIs running on the debug stack: int3 -> interrupt -> NMI -> int3 Because the interrupt changes the stack, the NMI will not see that it preempted the debug stack. Looking deeper at this case, interrupts only happen when the int3 is from userspace or in an a location in the exception table (fixup). userspace -> int3 -> interurpt -> NMI -> int3 All other int3s that happen in the kernel should be processed without ever enabling interrupts, as the do_trap() call will panic the kernel if it is called to process any other location within the kernel. Adding a counter around the sections that enable interrupts while using the debug stack allows the NMI to also check that case. If the NMI sees that it either interrupted a task using the debug stack or the debug counter is non-zero, then it will have to change the IDT table to make the int3 not change stacks (which will corrupt the stack if it does). Note, I had to move the debug_usage functions out of processor.h and into debugreg.h because of the static inlined functions to inc and dec the debug_usage counter. __get_cpu_var() requires smp.h which includes processor.h, and would fail to build. Link: http://lkml.kernel.org/r/1323976535.23971.112.camel@gandalf.stny.rr.com Reported-by: Mathieu Desnoyers Cc: Linus Torvalds Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Paul Turner Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- arch/x86/include/asm/debugreg.h | 22 ++++++++++++++++++++++ arch/x86/include/asm/processor.h | 6 ------ arch/x86/kernel/cpu/common.c | 6 ++++-- arch/x86/kernel/traps.c | 14 ++++++++++++++ 4 files changed, 40 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 078ad0c..b903d5e 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump); extern void hw_breakpoint_restore(void); +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(int, debug_stack_usage); +static inline void debug_stack_usage_inc(void) +{ + __get_cpu_var(debug_stack_usage)++; +} +static inline void debug_stack_usage_dec(void) +{ + __get_cpu_var(debug_stack_usage)--; +} +int is_debug_stack(unsigned long addr); +void debug_stack_set_zero(void); +void debug_stack_reset(void); +#else /* !X86_64 */ +static inline int is_debug_stack(unsigned long addr) { return 0; } +static inline void debug_stack_set_zero(void) { } +static inline void debug_stack_reset(void) { } +static inline void debug_stack_usage_inc(void) { } +static inline void debug_stack_usage_dec(void) { } +#endif /* X86_64 */ + + #endif /* __KERNEL__ */ #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 4b39d6d..b650435 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -402,9 +402,6 @@ DECLARE_PER_CPU(char *, irq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern unsigned long kernel_eflags; extern asmlinkage void ignore_sysret(void); -int is_debug_stack(unsigned long addr); -void debug_stack_set_zero(void); -void debug_stack_reset(void); #else /* X86_64 */ #ifdef CONFIG_CC_STACKPROTECTOR /* @@ -419,9 +416,6 @@ struct stack_canary { }; DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif -static inline int is_debug_stack(unsigned long addr) { return 0; } -static inline void debug_stack_set_zero(void) { } -static inline void debug_stack_reset(void) { } #endif /* X86_64 */ extern unsigned int xstate_size; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index caa4045..266e464 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1093,11 +1093,13 @@ unsigned long kernel_eflags; DEFINE_PER_CPU(struct orig_ist, orig_ist); static DEFINE_PER_CPU(unsigned long, debug_stack_addr); +DEFINE_PER_CPU(int, debug_stack_usage); int is_debug_stack(unsigned long addr) { - return addr <= __get_cpu_var(debug_stack_addr) && - addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ); + return __get_cpu_var(debug_stack_usage) || + (addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); } void debug_stack_set_zero(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a93c5ca..0072b38 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -316,9 +316,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) return; #endif + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); + debug_stack_usage_dec(); } #ifdef CONFIG_X86_64 @@ -411,6 +417,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) SIGTRAP) == NOTIFY_STOP) return; + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); + /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); @@ -418,6 +430,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); preempt_conditional_cli(regs); + debug_stack_usage_dec(); return; } @@ -437,6 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); + debug_stack_usage_dec(); return; } -- cgit v1.1 From 8a25a2fd126c621f44f3aeaef80d51f00fc11639 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 21 Dec 2011 14:29:42 -0800 Subject: cpu: convert 'cpu' and 'machinecheck' sysdev_class to a regular subsystem This moves the 'cpu sysdev_class' over to a regular 'cpu' subsystem and converts the devices to regular devices. The sysdev drivers are implemented as subsystem interfaces now. After all sysdev classes are ported to regular driver core entities, the sysdev implementation will be entirely removed from the kernel. Userspace relies on events and generic sysfs subsystem infrastructure from sysdev devices, which are made available with this conversion. Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Borislav Petkov Cc: Tigran Aivazian Cc: Len Brown Cc: Zhang Rui Cc: Dave Jones Cc: Peter Zijlstra Cc: Russell King Cc: Andrew Morton Cc: Arjan van de Ven Cc: "Rafael J. Wysocki" Cc: "Srivatsa S. Bhat" Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/intel_cacheinfo.c | 25 +++--- arch/x86/kernel/cpu/mcheck/mce-internal.h | 4 +- arch/x86/kernel/cpu/mcheck/mce.c | 128 +++++++++++++++--------------- arch/x86/kernel/cpu/mcheck/mce_amd.c | 11 ++- arch/x86/kernel/cpu/mcheck/therm_throt.c | 63 ++++++++------- arch/x86/kernel/microcode_core.c | 58 +++++++------- 7 files changed, 145 insertions(+), 146 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c9321f3..0b05fb4 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -149,7 +149,7 @@ static inline void enable_p5_mce(void) {} void mce_setup(struct mce *m); void mce_log(struct mce *m); -DECLARE_PER_CPU(struct sys_device, mce_sysdev); +DECLARE_PER_CPU(struct device, mce_device); /* * Maximum banks number. diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index a3b0811..6b45e5e 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -844,8 +844,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) #include #include - -extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ +#include /* pointer to kobject for cpuX/cache */ static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); @@ -1073,9 +1072,9 @@ err_out: static DECLARE_BITMAP(cache_dev_map, NR_CPUS); /* Add/Remove cache interface for CPU device */ -static int __cpuinit cache_add_dev(struct sys_device * sys_dev) +static int __cpuinit cache_add_dev(struct device *dev) { - unsigned int cpu = sys_dev->id; + unsigned int cpu = dev->id; unsigned long i, j; struct _index_kobject *this_object; struct _cpuid4_info *this_leaf; @@ -1087,7 +1086,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), &ktype_percpu_entry, - &sys_dev->kobj, "%s", "cache"); + &dev->kobj, "%s", "cache"); if (retval < 0) { cpuid4_cache_sysfs_exit(cpu); return retval; @@ -1124,9 +1123,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) return 0; } -static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) +static void __cpuinit cache_remove_dev(struct device *dev) { - unsigned int cpu = sys_dev->id; + unsigned int cpu = dev->id; unsigned long i; if (per_cpu(ici_cpuid4_info, cpu) == NULL) @@ -1145,17 +1144,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - cache_add_dev(sys_dev); + cache_add_dev(dev); break; case CPU_DEAD: case CPU_DEAD_FROZEN: - cache_remove_dev(sys_dev); + cache_remove_dev(dev); break; } return NOTIFY_OK; @@ -1174,9 +1173,9 @@ static int __cpuinit cache_sysfs_init(void) for_each_online_cpu(i) { int err; - struct sys_device *sys_dev = get_cpu_sysdev(i); + struct device *dev = get_cpu_device(i); - err = cache_add_dev(sys_dev); + err = cache_add_dev(dev); if (err) return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index fefcc69..ed44c8a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,4 +1,4 @@ -#include +#include #include enum severity_level { @@ -17,7 +17,7 @@ enum severity_level { struct mce_bank { u64 ctl; /* subevents to enable */ unsigned char init; /* initialise bank? */ - struct sysdev_attribute attr; /* sysdev attribute */ + struct device_attribute attr; /* device attribute */ char attrname[ATTR_LEN]; /* attribute name */ }; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 362056a..0156c6f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -1751,7 +1751,7 @@ static struct syscore_ops mce_syscore_ops = { }; /* - * mce_sysdev: Sysfs support + * mce_device: Sysfs support */ static void mce_cpu_restart(void *data) @@ -1787,27 +1787,28 @@ static void mce_enable_ce(void *all) __mcheck_cpu_init_timer(); } -static struct sysdev_class mce_sysdev_class = { +static struct bus_type mce_subsys = { .name = "machinecheck", + .dev_name = "machinecheck", }; -DEFINE_PER_CPU(struct sys_device, mce_sysdev); +DEFINE_PER_CPU(struct device, mce_device); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) { return container_of(attr, struct mce_bank, attr); } -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t show_bank(struct device *s, struct device_attribute *attr, char *buf) { return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_bank(struct device *s, struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1822,14 +1823,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, } static ssize_t -show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) +show_trigger(struct device *s, struct device_attribute *attr, char *buf) { strcpy(buf, mce_helper); strcat(buf, "\n"); return strlen(mce_helper) + 1; } -static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, const char *buf, size_t siz) { char *p; @@ -1844,8 +1845,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, return strlen(mce_helper) + !!p; } -static ssize_t set_ignore_ce(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t set_ignore_ce(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1868,8 +1869,8 @@ static ssize_t set_ignore_ce(struct sys_device *s, return size; } -static ssize_t set_cmci_disabled(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t set_cmci_disabled(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1891,108 +1892,107 @@ static ssize_t set_cmci_disabled(struct sys_device *s, return size; } -static ssize_t store_int_with_restart(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t store_int_with_restart(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { - ssize_t ret = sysdev_store_int(s, attr, buf, size); + ssize_t ret = device_store_int(s, attr, buf, size); mce_restart(); return ret; } -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); -static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); +static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); +static DEVICE_INT_ATTR(tolerant, 0644, tolerant); +static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout); +static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); -static struct sysdev_ext_attribute attr_check_interval = { - _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, - store_int_with_restart), +static struct dev_ext_attribute dev_attr_check_interval = { + __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), &check_interval }; -static struct sysdev_ext_attribute attr_ignore_ce = { - _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), +static struct dev_ext_attribute dev_attr_ignore_ce = { + __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce), &mce_ignore_ce }; -static struct sysdev_ext_attribute attr_cmci_disabled = { - _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), +static struct dev_ext_attribute dev_attr_cmci_disabled = { + __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled), &mce_cmci_disabled }; -static struct sysdev_attribute *mce_sysdev_attrs[] = { - &attr_tolerant.attr, - &attr_check_interval.attr, - &attr_trigger, - &attr_monarch_timeout.attr, - &attr_dont_log_ce.attr, - &attr_ignore_ce.attr, - &attr_cmci_disabled.attr, +static struct device_attribute *mce_device_attrs[] = { + &dev_attr_tolerant.attr, + &dev_attr_check_interval.attr, + &dev_attr_trigger, + &dev_attr_monarch_timeout.attr, + &dev_attr_dont_log_ce.attr, + &dev_attr_ignore_ce.attr, + &dev_attr_cmci_disabled.attr, NULL }; -static cpumask_var_t mce_sysdev_initialized; +static cpumask_var_t mce_device_initialized; -/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ -static __cpuinit int mce_sysdev_create(unsigned int cpu) +/* Per cpu device init. All of the cpus still share the same ctrl bank: */ +static __cpuinit int mce_device_create(unsigned int cpu) { - struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); + struct device *dev = &per_cpu(mce_device, cpu); int err; int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&sysdev->kobj, 0, sizeof(struct kobject)); - sysdev->id = cpu; - sysdev->cls = &mce_sysdev_class; + memset(&dev->kobj, 0, sizeof(struct kobject)); + dev->id = cpu; + dev->bus = &mce_subsys; - err = sysdev_register(sysdev); + err = device_register(dev); if (err) return err; - for (i = 0; mce_sysdev_attrs[i]; i++) { - err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]); + for (i = 0; mce_device_attrs[i]; i++) { + err = device_create_file(dev, mce_device_attrs[i]); if (err) goto error; } for (j = 0; j < banks; j++) { - err = sysdev_create_file(sysdev, &mce_banks[j].attr); + err = device_create_file(dev, &mce_banks[j].attr); if (err) goto error2; } - cpumask_set_cpu(cpu, mce_sysdev_initialized); + cpumask_set_cpu(cpu, mce_device_initialized); return 0; error2: while (--j >= 0) - sysdev_remove_file(sysdev, &mce_banks[j].attr); + device_remove_file(dev, &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); + device_remove_file(dev, mce_device_attrs[i]); - sysdev_unregister(sysdev); + device_unregister(dev); return err; } -static __cpuinit void mce_sysdev_remove(unsigned int cpu) +static __cpuinit void mce_device_remove(unsigned int cpu) { - struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); + struct device *dev = &per_cpu(mce_device, cpu); int i; - if (!cpumask_test_cpu(cpu, mce_sysdev_initialized)) + if (!cpumask_test_cpu(cpu, mce_device_initialized)) return; - for (i = 0; mce_sysdev_attrs[i]; i++) - sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); + for (i = 0; mce_device_attrs[i]; i++) + device_remove_file(dev, mce_device_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(sysdev, &mce_banks[i].attr); + device_remove_file(dev, &mce_banks[i].attr); - sysdev_unregister(sysdev); - cpumask_clear_cpu(cpu, mce_sysdev_initialized); + device_unregister(dev); + cpumask_clear_cpu(cpu, mce_device_initialized); } /* Make sure there are no machine checks on offlined CPUs. */ @@ -2042,7 +2042,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - mce_sysdev_create(cpu); + mce_device_create(cpu); if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); break; @@ -2050,7 +2050,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD_FROZEN: if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); - mce_sysdev_remove(cpu); + mce_device_remove(cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -2084,7 +2084,7 @@ static __init void mce_init_banks(void) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; - struct sysdev_attribute *a = &b->attr; + struct device_attribute *a = &b->attr; sysfs_attr_init(&a->attr); a->attr.name = b->attrname; @@ -2104,16 +2104,16 @@ static __init int mcheck_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL); + zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); mce_init_banks(); - err = sysdev_class_register(&mce_sysdev_class); + err = subsys_system_register(&mce_subsys, NULL); if (err) return err; for_each_online_cpu(i) { - err = mce_sysdev_create(i); + err = mce_device_create(i); if (err) return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f547421..56d2aa1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -548,7 +547,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (!b) goto out; - err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj, + err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj, b->kobj, name); if (err) goto out; @@ -571,7 +570,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj); + b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj); if (!b->kobj) goto out_free; @@ -591,7 +590,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj, + err = sysfs_create_link(&per_cpu(mce_device, i).kobj, b->kobj, name); if (err) goto out; @@ -669,7 +668,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name); + sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -681,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name); + sysfs_remove_link(&per_cpu(mce_device, i).kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 787e06c..59e3f6e 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -69,16 +68,16 @@ static atomic_t therm_throt_en = ATOMIC_INIT(0); static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, \ - therm_throt_sysdev_show_##_name, \ +#define define_therm_throt_device_one_ro(_name) \ + static DEVICE_ATTR(_name, 0444, \ + therm_throt_device_show_##_name, \ NULL) \ -#define define_therm_throt_sysdev_show_func(event, name) \ +#define define_therm_throt_device_show_func(event, name) \ \ -static ssize_t therm_throt_sysdev_show_##event##_##name( \ - struct sys_device *dev, \ - struct sysdev_attribute *attr, \ +static ssize_t therm_throt_device_show_##event##_##name( \ + struct device *dev, \ + struct device_attribute *attr, \ char *buf) \ { \ unsigned int cpu = dev->id; \ @@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name( \ return ret; \ } -define_therm_throt_sysdev_show_func(core_throttle, count); -define_therm_throt_sysdev_one_ro(core_throttle_count); +define_therm_throt_device_show_func(core_throttle, count); +define_therm_throt_device_one_ro(core_throttle_count); -define_therm_throt_sysdev_show_func(core_power_limit, count); -define_therm_throt_sysdev_one_ro(core_power_limit_count); +define_therm_throt_device_show_func(core_power_limit, count); +define_therm_throt_device_one_ro(core_power_limit_count); -define_therm_throt_sysdev_show_func(package_throttle, count); -define_therm_throt_sysdev_one_ro(package_throttle_count); +define_therm_throt_device_show_func(package_throttle, count); +define_therm_throt_device_one_ro(package_throttle_count); -define_therm_throt_sysdev_show_func(package_power_limit, count); -define_therm_throt_sysdev_one_ro(package_power_limit_count); +define_therm_throt_device_show_func(package_power_limit, count); +define_therm_throt_device_one_ro(package_power_limit_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_core_throttle_count.attr, + &dev_attr_core_throttle_count.attr, NULL }; @@ -223,36 +222,36 @@ static int thresh_event_valid(int event) #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, +static __cpuinit int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) { int err; struct cpuinfo_x86 *c = &cpu_data(cpu); - err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); + err = sysfs_create_group(&dev->kobj, &thermal_attr_group); if (err) return err; if (cpu_has(c, X86_FEATURE_PLN)) - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_core_power_limit_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_core_power_limit_count.attr, thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PTS)) { - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_package_throttle_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_count.attr, thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PLN)) - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_package_power_limit_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_power_limit_count.attr, thermal_attr_group.name); } return err; } -static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) +static __cpuinit void thermal_throttle_remove_dev(struct device *dev) { - sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); + sysfs_remove_group(&dev->kobj, &thermal_attr_group); } /* Mutex protecting device creation against CPU hotplug: */ @@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; int err = 0; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: mutex_lock(&therm_cpu_lock); - err = thermal_throttle_add_dev(sys_dev, cpu); + err = thermal_throttle_add_dev(dev, cpu); mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; @@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, case CPU_DEAD: case CPU_DEAD_FROZEN: mutex_lock(&therm_cpu_lock); - thermal_throttle_remove_dev(sys_dev); + thermal_throttle_remove_dev(dev); mutex_unlock(&therm_cpu_lock); break; } @@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void) #endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); + err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); WARN_ON(err); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index f2d2a66..cf88f2a 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu) return err; } -static ssize_t reload_store(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t reload_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t size) { unsigned long val; @@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev, return ret; } -static ssize_t version_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); } -static ssize_t pf_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) +static ssize_t pf_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); } -static SYSDEV_ATTR(reload, 0200, NULL, reload_store); -static SYSDEV_ATTR(version, 0400, version_show, NULL); -static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); +static DEVICE_ATTR(reload, 0200, NULL, reload_store); +static DEVICE_ATTR(version, 0400, version_show, NULL); +static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); static struct attribute *mc_default_attrs[] = { - &attr_reload.attr, - &attr_version.attr, - &attr_processor_flags.attr, + &dev_attr_reload.attr, + &dev_attr_version.attr, + &dev_attr_processor_flags.attr, NULL }; @@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu) return ustate; } -static int mc_sysdev_add(struct sys_device *sys_dev) +static int mc_device_add(struct device *dev, struct subsys_interface *sif) { - int err, cpu = sys_dev->id; + int err, cpu = dev->id; if (!cpu_online(cpu)) return 0; pr_debug("CPU%d added\n", cpu); - err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); + err = sysfs_create_group(&dev->kobj, &mc_attr_group); if (err) return err; if (microcode_init_cpu(cpu) == UCODE_ERROR) { - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + sysfs_remove_group(&dev->kobj, &mc_attr_group); return -EINVAL; } return err; } -static int mc_sysdev_remove(struct sys_device *sys_dev) +static int mc_device_remove(struct device *dev, struct subsys_interface *sif) { - int cpu = sys_dev->id; + int cpu = dev->id; if (!cpu_online(cpu)) return 0; pr_debug("CPU%d removed\n", cpu); microcode_fini_cpu(cpu); - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + sysfs_remove_group(&dev->kobj, &mc_attr_group); return 0; } -static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, +static struct subsys_interface mc_cpu_interface = { + .name = "microcode", + .subsys = &cpu_subsys, + .add_dev = mc_device_add, + .remove_dev = mc_device_remove, }; /** @@ -464,9 +466,9 @@ static __cpuinit int mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: @@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: pr_debug("CPU%d added\n", cpu); - if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) + if (sysfs_create_group(&dev->kobj, &mc_attr_group)) pr_err("Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: /* Suspend is in progress, only remove the interface */ - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + sysfs_remove_group(&dev->kobj, &mc_attr_group); pr_debug("CPU%d removed\n", cpu); break; @@ -527,7 +529,7 @@ static int __init microcode_init(void) get_online_cpus(); mutex_lock(µcode_mutex); - error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + error = subsys_interface_register(&mc_cpu_interface); mutex_unlock(µcode_mutex); put_online_cpus(); @@ -561,7 +563,7 @@ static void __exit microcode_exit(void) get_online_cpus(); mutex_lock(µcode_mutex); - sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + subsys_interface_unregister(&mc_cpu_interface); mutex_unlock(µcode_mutex); put_online_cpus(); -- cgit v1.1 From edbaa603eb801655e80808a9cf3d3b622e8ac66b Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 21 Dec 2011 16:26:03 -0800 Subject: driver-core: remove sysdev.h usage. The sysdev.h file should not be needed by any in-kernel code, so remove the .h file from these random files that seem to still want to include it. The sysdev code will be going away soon, so this include needs to be removed no matter what. Cc: Jiandong Zheng Cc: Scott Branden Cc: Russell King Cc: Kukjin Kim Cc: David Brown Cc: Daniel Walker Cc: Bryan Huntsman Cc: Ben Dooks Cc: Wan ZongShun Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Guan Xuetao Cc: "Venkatesh Pallipadi Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Grant Likely Cc: Richard Purdie Cc: Matthew Garrett Signed-off-by: Kay Sievers --- arch/x86/kernel/hpet.c | 1 - arch/x86/kernel/irqinit.c | 2 +- arch/x86/platform/uv/uv_sysfs.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index b946a9e..98094a9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index b3300e6..313fb5c 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c index 309c70f..5d4ba30 100644 --- a/arch/x86/platform/uv/uv_sysfs.c +++ b/arch/x86/platform/uv/uv_sysfs.c @@ -19,7 +19,7 @@ * Copyright (c) Russ Anderson */ -#include +#include #include #include -- cgit v1.1 From 933393f58fef9963eac61db8093689544e29a600 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 22 Dec 2011 11:58:51 -0600 Subject: percpu: Remove irqsafe_cpu_xxx variants We simply say that regular this_cpu use must be safe regardless of preemption and interrupt state. That has no material change for x86 and s390 implementations of this_cpu operations. However, arches that do not provide their own implementation for this_cpu operations will now get code generated that disables interrupts instead of preemption. -tj: This is part of on-going percpu API cleanup. For detailed discussion of the subject, please refer to the following thread. http://thread.gmane.org/gmane.linux.kernel/1222078 Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo LKML-Reference: --- arch/x86/include/asm/percpu.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 3470c9d..562ccb5 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -414,22 +414,6 @@ do { \ #define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) #define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) -#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val) -#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val) -#define irqsafe_cpu_add_4(pcp, val) percpu_add_op((pcp), val) -#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) -#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) -#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) -#define irqsafe_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) -#define irqsafe_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) -#define irqsafe_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) -#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) -#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) -#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) -#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) -#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) -#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) - #ifndef CONFIG_M386 #define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) #define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) @@ -445,9 +429,6 @@ do { \ #define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #endif /* !CONFIG_M386 */ #ifdef CONFIG_X86_CMPXCHG64 @@ -467,7 +448,6 @@ do { \ #define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) #define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) -#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) #endif /* CONFIG_X86_CMPXCHG64 */ /* @@ -495,13 +475,6 @@ do { \ #define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) #define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) -#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) -#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) - /* * Pretty complex macro to generate cmpxchg16 instruction. The instruction * is not supported on early AMD64 processors so we must be able to emulate @@ -532,7 +505,6 @@ do { \ #define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) #define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) -#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) #endif -- cgit v1.1 From d6185f20a0efbf175e12831d0de330e4f21725aa Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Thu, 22 Sep 2011 13:52:56 +0300 Subject: KVM: nVMX: Add KVM_REQ_IMMEDIATE_EXIT This patch adds a new vcpu->requests bit, KVM_REQ_IMMEDIATE_EXIT. This bit requests that when next entering the guest, we should run it only for as little as possible, and exit again. We use this new option in nested VMX: When L1 launches L2, but L0 wishes L1 to continue running so it can inject an event to it, we unfortunately cannot just pretend to have run L2 for a little while - We must really launch L2, otherwise certain one-off vmcs12 parameters (namely, L1 injection into L2) will be lost. So the existing code runs L2 in this case. But L2 could potentially run for a long time until it exits, and the injection into L1 will be delayed. The new KVM_REQ_IMMEDIATE_EXIT allows us to request that L2 will be entered, as necessary, but will exit as soon as possible after entry. Our implementation of this request uses smp_send_reschedule() to send a self-IPI, with interrupts disabled. The interrupts remain disabled until the guest is entered, and then, after the entry is complete (often including processing an injection and jumping to the relevant handler), the physical interrupt is noticed and causes an exit. On recent Intel processors, we could have achieved the same goal by using MTF instead of a self-IPI. Another technique worth considering in the future is to use VM_EXIT_ACK_INTR_ON_EXIT and a highest-priority vector IPI - to slightly improve performance by avoiding the useless interrupt handler which ends up being called when smp_send_reschedule() is used. Signed-off-by: Nadav Har'El Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 11 +++++++---- arch/x86/kvm/x86.c | 7 ++++++- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 579a0b5..d75d914 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3945,12 +3945,15 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) static void enable_irq_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; - if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) - /* We can get here when nested_run_pending caused - * vmx_interrupt_allowed() to return false. In this case, do - * nothing - the interrupt will be injected later. + if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { + /* + * We get here if vmx_interrupt_allowed() said we can't + * inject to L1 now because L2 must run. Ask L2 to exit + * right after entry, so we can inject to L1 more promptly. */ + kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); return; + } cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4c938da..e24edbc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5648,6 +5648,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) int r; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; + bool req_immediate_exit = 0; if (vcpu->requests) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) @@ -5687,7 +5688,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) record_steal_time(vcpu); if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); - + req_immediate_exit = + kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); } r = kvm_mmu_reload(vcpu); @@ -5738,6 +5740,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (req_immediate_exit) + smp_send_reschedule(vcpu->cpu); + kvm_guest_enter(); if (unlikely(vcpu->arch.switch_db_regs)) { -- cgit v1.1 From 51cfe38ea50aa631f58ed8c340ed6f0143c325a8 Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Thu, 22 Sep 2011 13:53:26 +0300 Subject: KVM: nVMX: Fix warning-causing idt-vectoring-info behavior When L0 wishes to inject an interrupt while L2 is running, it emulates an exit to L1 with EXIT_REASON_EXTERNAL_INTERRUPT. This was explained in the original nVMX patch 23, titled "Correct handling of interrupt injection". Unfortunately, it is possible (though rare) that at this point there is valid idt_vectoring_info in vmcs02. For example, L1 injected some interrupt to L2, and when L2 tried to run this interrupt's handler, it got a page fault - so it returns the original interrupt vector in idt_vectoring_info. The problem is that if this is the case, we cannot exit to L1 with EXTERNAL_INTERRUPT like we wished to, because the VMX spec guarantees that idt_vectoring_info and exit_reason_external_interrupt can never happen together. This is not just specified in the spec - a KVM L1 actually prints a kernel warning "unexpected, valid vectoring info" if we violate this guarantee, and some users noticed these warnings in L1's logs. In order to better emulate a processor, which would never return the external interrupt and the idt-vectoring-info together, we need to separate the two injection steps: First, complete L1's injection into L2 (i.e., enter L2, injecting to it the idt-vectoring-info); Second, after entry into L2 succeeds and it exits back to L0, exit to L1 with the EXIT_REASON_EXTERNAL_INTERRUPT. Most of this is already in the code - the only change we need is to remain in L2 (and not exit to L1) in this case. Note that the previous patch ensures (by using KVM_REQ_IMMEDIATE_EXIT) that although we do enter L2 first, it will exit immediately after processing its injection, allowing us to promptly inject to L1. Note how we test vmcs12->idt_vectoring_info_field; This isn't really the vmcs12 value (we haven't exited to L1 yet, so vmcs12 hasn't been updated), but rather the place we save, at the end of vmx_vcpu_run, the vmcs02 value of this field. This was explained in patch 25 ("Correct handling of idt vectoring info") of the original nVMX patch series. Thanks to Dave Allan and to Federico Simoncelli for reporting this bug, to Abel Gordon for helping me figure out the solution, and to Avi Kivity for helping to improve it. Signed-off-by: Nadav Har'El Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d75d914..6e28d58 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4080,11 +4080,12 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) { - struct vmcs12 *vmcs12; - if (to_vmx(vcpu)->nested.nested_run_pending) + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + if (to_vmx(vcpu)->nested.nested_run_pending || + (vmcs12->idt_vectoring_info_field & + VECTORING_INFO_VALID_MASK)) return 0; nested_vmx_vmexit(vcpu); - vmcs12 = get_vmcs12(vcpu); vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT; vmcs12->vm_exit_intr_info = 0; /* fall through to normal code, but now in L1, not L2 */ -- cgit v1.1 From f759e2b4c728cee82e4bc1132d0e41177b79a0b1 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 22 Sep 2011 16:53:17 +0800 Subject: KVM: MMU: avoid pte_list_desc running out in kvm_mmu_pte_write kvm_mmu_pte_write is unsafe since we need to alloc pte_list_desc in the function when spte is prefetched, unfortunately, we can not know how many spte need to be prefetched on this path, that means we can use out of the free pte_list_desc object in the cache, and BUG_ON() is triggered, also some path does not fill the cache, such as INS instruction emulated that does not trigger page fault Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f1b36cf..232c5a3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -593,6 +593,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, return 0; } +static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache) +{ + return cache->nobjs; +} + static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, struct kmem_cache *cache) { @@ -970,6 +975,14 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) return &linfo->rmap_pde; } +static bool rmap_can_add(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_memory_cache *cache; + + cache = &vcpu->arch.mmu_pte_list_desc_cache; + return mmu_memory_cache_free_objects(cache); +} + static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { struct kvm_mmu_page *sp; @@ -3586,6 +3599,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, break; } + /* + * No need to care whether allocation memory is successful + * or not since pte prefetch is skiped if it does not have + * enough objects in the cache. + */ + mmu_topup_memory_caches(vcpu); spin_lock(&vcpu->kvm->mmu_lock); if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) gentry = 0; @@ -3656,7 +3675,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mmu_page_zap_pte(vcpu->kvm, sp, spte); if (gentry && !((sp->role.word ^ vcpu->arch.mmu.base_role.word) - & mask.word)) + & mask.word) && rmap_can_add(vcpu)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (!remote_flush && need_remote_flush(entry, *spte)) remote_flush = true; @@ -3717,10 +3736,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, goto out; } - r = mmu_topup_memory_caches(vcpu); - if (r) - goto out; - er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len); switch (er) { -- cgit v1.1 From d5ae7ce835cc89556dc18e2070e754f026402efa Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 22 Sep 2011 16:53:46 +0800 Subject: KVM: x86: tag the instructions which are used to write page table The idea is from Avi: | tag instructions that are typically used to modify the page tables, and | drop shadow if any other instruction is used. | The list would include, I'd guess, and, or, bts, btc, mov, xchg, cmpxchg, | and cmpxchg8b. This patch is used to tag the instructions and in the later path, shadow page is dropped if it is written by other instructions Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f1e3be18..a10950a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -125,8 +125,9 @@ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) +#define PageTable (1 << 29) /* instruction used to write page table */ /* Source 2 operand type */ -#define Src2Shift (29) +#define Src2Shift (30) #define Src2None (OpNone << Src2Shift) #define Src2CL (OpCL << Src2Shift) #define Src2ImmByte (OpImmByte << Src2Shift) @@ -3033,10 +3034,10 @@ static struct opcode group7_rm7[] = { static struct opcode group1[] = { I(Lock, em_add), - I(Lock, em_or), + I(Lock | PageTable, em_or), I(Lock, em_adc), I(Lock, em_sbb), - I(Lock, em_and), + I(Lock | PageTable, em_and), I(Lock, em_sub), I(Lock, em_xor), I(0, em_cmp), @@ -3096,18 +3097,21 @@ static struct group_dual group7 = { { static struct opcode group8[] = { N, N, N, N, - D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), - D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), + D(DstMem | SrcImmByte | ModRM), + D(DstMem | SrcImmByte | ModRM | Lock | PageTable), + D(DstMem | SrcImmByte | ModRM | Lock), + D(DstMem | SrcImmByte | ModRM | Lock | PageTable), }; static struct group_dual group9 = { { - N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, + N, D(DstMem64 | ModRM | Lock | PageTable), N, N, N, N, N, N, }, { N, N, N, N, N, N, N, N, } }; static struct opcode group11[] = { - I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), + I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov), + X7(D(Undefined)), }; static struct gprefix pfx_0f_6f_0f_7f = { @@ -3120,7 +3124,7 @@ static struct opcode opcode_table[256] = { I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), /* 0x08 - 0x0F */ - I6ALU(Lock, em_or), + I6ALU(Lock | PageTable, em_or), I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), N, /* 0x10 - 0x17 */ @@ -3132,7 +3136,7 @@ static struct opcode opcode_table[256] = { I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), /* 0x20 - 0x27 */ - I6ALU(Lock, em_and), N, N, + I6ALU(Lock | PageTable, em_and), N, N, /* 0x28 - 0x2F */ I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ @@ -3165,11 +3169,11 @@ static struct opcode opcode_table[256] = { G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), G(DstMem | SrcImmByte | ModRM | Group, group1), I2bv(DstMem | SrcReg | ModRM, em_test), - I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg), + I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), /* 0x88 - 0x8F */ - I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), + I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov), I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), - I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg), + I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg), D(ModRM | SrcMem | NoAccess | DstReg), I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm), G(0, group1A), @@ -3182,7 +3186,7 @@ static struct opcode opcode_table[256] = { II(ImplicitOps | Stack, em_popf, popf), N, N, /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), - I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), + I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), I2bv(SrcSI | DstDI | Mov | String, em_mov), I2bv(SrcSI | DstDI | String, em_cmp), /* 0xA8 - 0xAF */ @@ -3280,12 +3284,13 @@ static struct opcode twobyte_table[256] = { D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), - DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), + DI(ImplicitOps, rsm), + D(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ - D2bv(DstMem | SrcReg | ModRM | Lock), + D2bv(DstMem | SrcReg | ModRM | Lock | PageTable), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), D(DstMem | SrcReg | ModRM | BitOp | Lock), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), @@ -3293,7 +3298,7 @@ static struct opcode twobyte_table[256] = { D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, - G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), + G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable), D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ -- cgit v1.1 From 1cb3f3ae5a3855ba430430706da4201ace1d6ec4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 22 Sep 2011 17:02:48 +0800 Subject: KVM: x86: retry non-page-table writing instructions If the emulation is caused by #PF and it is non-page_table writing instruction, it means the VM-EXIT is caused by shadow page protected, we can zap the shadow page and retry this instruction directly The idea is from Avi Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/include/asm/kvm_host.h | 5 ++++ arch/x86/kvm/emulate.c | 5 ++++ arch/x86/kvm/mmu.c | 25 +++++++++++++++----- arch/x86/kvm/x86.c | 47 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 77 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index a026507..9a4acf4 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -364,6 +364,7 @@ enum x86_intercept { #endif int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); +bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_FAILED -1 #define EMULATION_OK 0 #define EMULATION_RESTART 1 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b4973f4..4ceefa9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -444,6 +444,9 @@ struct kvm_vcpu_arch { cpumask_var_t wbinvd_dirty_mask; + unsigned long last_retry_eip; + unsigned long last_retry_addr; + struct { bool halted; gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; @@ -692,6 +695,7 @@ enum emulation_result { #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) +#define EMULTYPE_RETRY (1 << 3) int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, void *insn, int insn_len); @@ -756,6 +760,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes, bool guest_initiated); +int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a10950a..8547958 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3702,6 +3702,11 @@ done: return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; } +bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt) +{ + return ctxt->d & PageTable; +} + static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) { /* The second termination condition only applies for REPE diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 232c5a3..7a22eb8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1998,7 +1998,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; } -static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) +int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; struct hlist_node *node; @@ -2007,7 +2007,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; - + spin_lock(&kvm->mmu_lock); for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); @@ -2015,8 +2015,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); + return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { @@ -3698,9 +3701,8 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); - spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); - spin_unlock(&vcpu->kvm->mmu_lock); + return r; } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); @@ -3721,10 +3723,18 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } +static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr) +{ + if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu)) + return vcpu_match_mmio_gpa(vcpu, addr); + + return vcpu_match_mmio_gva(vcpu, addr); +} + int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, void *insn, int insn_len) { - int r; + int r, emulation_type = EMULTYPE_RETRY; enum emulation_result er; r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); @@ -3736,7 +3746,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, goto out; } - er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len); + if (is_mmio_page_fault(vcpu, cr2)) + emulation_type = 0; + + er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); switch (er) { case EMULATE_DONE: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e24edbc..7ba1ab7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4836,6 +4836,50 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) return false; } +static bool retry_instruction(struct x86_emulate_ctxt *ctxt, + unsigned long cr2, int emulation_type) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + unsigned long last_retry_eip, last_retry_addr, gpa = cr2; + + last_retry_eip = vcpu->arch.last_retry_eip; + last_retry_addr = vcpu->arch.last_retry_addr; + + /* + * If the emulation is caused by #PF and it is non-page_table + * writing instruction, it means the VM-EXIT is caused by shadow + * page protected, we can zap the shadow page and retry this + * instruction directly. + * + * Note: if the guest uses a non-page-table modifying instruction + * on the PDE that points to the instruction, then we will unmap + * the instruction and go to an infinite loop. So, we cache the + * last retried eip and the last fault address, if we meet the eip + * and the address again, we can break out of the potential infinite + * loop. + */ + vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; + + if (!(emulation_type & EMULTYPE_RETRY)) + return false; + + if (x86_page_table_writing_insn(ctxt)) + return false; + + if (ctxt->eip == last_retry_eip && last_retry_addr == cr2) + return false; + + vcpu->arch.last_retry_eip = ctxt->eip; + vcpu->arch.last_retry_addr = cr2; + + if (!vcpu->arch.mmu.direct_map) + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); + + kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); + + return true; +} + int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, int emulation_type, @@ -4877,6 +4921,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } + if (retry_instruction(ctxt, cr2, emulation_type)) + return EMULATE_DONE; + /* this is needed for vmware backdoor interface to work since it changes registers values during IO operation */ if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { -- cgit v1.1 From 6f6fbe98c3a9f3e9d69cd354a0459989e594e707 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 22 Sep 2011 16:55:10 +0800 Subject: KVM: x86: cleanup port-in/port-out emulated Remove the same code between emulator_pio_in_emulated and emulator_pio_out_emulated Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 59 ++++++++++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7ba1ab7..a215448 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4349,32 +4349,24 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) return r; } - -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, void *val, - unsigned int count) +static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, + unsigned int count, bool in) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - if (vcpu->arch.pio.count) - goto data_avail; - - trace_kvm_pio(0, port, size, count); + trace_kvm_pio(!in, port, size, count); vcpu->arch.pio.port = port; - vcpu->arch.pio.in = 1; + vcpu->arch.pio.in = in; vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - data_avail: - memcpy(val, vcpu->arch.pio_data, size * count); vcpu->arch.pio.count = 0; return 1; } vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = KVM_EXIT_IO_IN; + vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = size; vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; @@ -4383,36 +4375,37 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, return 0; } -static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, - const void *val, unsigned int count) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int ret; - trace_kvm_pio(1, port, size, count); - - vcpu->arch.pio.port = port; - vcpu->arch.pio.in = 0; - vcpu->arch.pio.count = count; - vcpu->arch.pio.size = size; - - memcpy(vcpu->arch.pio_data, val, size * count); + if (vcpu->arch.pio.count) + goto data_avail; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { + ret = emulator_pio_in_out(vcpu, size, port, val, count, true); + if (ret) { +data_avail: + memcpy(val, vcpu->arch.pio_data, size * count); vcpu->arch.pio.count = 0; return 1; } - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = KVM_EXIT_IO_OUT; - vcpu->run->io.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = count; - vcpu->run->io.port = port; - return 0; } +static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, + const void *val, unsigned int count) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + memcpy(vcpu->arch.pio_data, val, size * count); + return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); +} + static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { return kvm_x86_ops->get_segment_base(vcpu, seg); -- cgit v1.1 From d01f8d5e02cc79998e3160f7ad545f77891b00e5 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 22 Sep 2011 16:55:36 +0800 Subject: KVM: MMU: do not mark accessed bit on pte write path In current code, the accessed bit is always set when page fault occurred, do not need to set it on pte write path Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu.c | 22 +--------------------- 2 files changed, 1 insertion(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4ceefa9..f8ab0d7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -356,7 +356,6 @@ struct kvm_vcpu_arch { gfn_t last_pt_write_gfn; int last_pt_write_count; u64 *last_pte_updated; - gfn_t last_pte_gfn; struct fpu guest_fpu; u64 xcr0; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7a22eb8..b432a71 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2207,11 +2207,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_mmio_spte(sptep, gfn, pfn, pte_access)) return 0; - /* - * We don't set the accessed bit, since we sometimes want to see - * whether the guest actually used the pte (in order to detect - * demand paging). - */ spte = PT_PRESENT_MASK; if (!speculative) spte |= shadow_accessed_mask; @@ -2362,10 +2357,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } kvm_release_pfn_clean(pfn); - if (speculative) { + if (speculative) vcpu->arch.last_pte_updated = sptep; - vcpu->arch.last_pte_gfn = gfn; - } } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) @@ -3533,18 +3526,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) return !!(spte && (*spte & shadow_accessed_mask)); } -static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u64 *spte = vcpu->arch.last_pte_updated; - - if (spte - && vcpu->arch.last_pte_gfn == gfn - && shadow_accessed_mask - && !(*spte & shadow_accessed_mask) - && is_shadow_present_pte(*spte)) - set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); -} - void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes, bool guest_initiated) @@ -3615,7 +3596,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++vcpu->kvm->stat.mmu_pte_write; trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); if (guest_initiated) { - kvm_mmu_access_page(vcpu, gfn); if (gfn == vcpu->arch.last_pt_write_gfn && !last_updated_pte_accessed(vcpu)) { ++vcpu->arch.last_pt_write_count; -- cgit v1.1 From 505aef8f30a95f7e4abf2c07e54ded1521587ba0 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 22 Sep 2011 16:56:06 +0800 Subject: KVM: MMU: cleanup FNAME(invlpg) Directly Use mmu_page_zap_pte to zap spte in FNAME(invlpg), also remove the same code between FNAME(invlpg) and FNAME(sync_page) Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 16 ++++++++++------ arch/x86/kvm/paging_tmpl.h | 44 +++++++++++++++++--------------------------- 2 files changed, 27 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b432a71..d15f908 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1809,7 +1809,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } -static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, +static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *spte) { u64 pte; @@ -1817,17 +1817,21 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, pte = *spte; if (is_shadow_present_pte(pte)) { - if (is_last_spte(pte, sp->role.level)) + if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); - else { + if (is_large_pte(pte)) + --kvm->stat.lpages; + } else { child = page_header(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); } - } else if (is_mmio_spte(pte)) + return true; + } + + if (is_mmio_spte(pte)) mmu_spte_clear_no_track(spte); - if (is_large_pte(pte)) - --kvm->stat.lpages; + return false; } static void kvm_mmu_page_unlink_children(struct kvm *kvm, diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9299410..d8d3906 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -656,6 +656,18 @@ out_unlock: return 0; } +static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) +{ + int offset = 0; + + WARN_ON(sp->role.level != 1); + + if (PTTYPE == 32) + offset = sp->role.quadrant << PT64_LEVEL_BITS; + + return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); +} + static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; @@ -663,7 +675,6 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) gpa_t pte_gpa = -1; int level; u64 *sptep; - int need_flush = 0; vcpu_clear_mmio_info(vcpu, gva); @@ -675,36 +686,20 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) sp = page_header(__pa(sptep)); if (is_last_spte(*sptep, level)) { - int offset, shift; - if (!sp->unsync) break; - shift = PAGE_SHIFT - - (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; - offset = sp->role.quadrant << shift; - - pte_gpa = (sp->gfn << PAGE_SHIFT) + offset; + pte_gpa = FNAME(get_level1_sp_gpa)(sp); pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); - if (is_shadow_present_pte(*sptep)) { - if (is_large_pte(*sptep)) - --vcpu->kvm->stat.lpages; - drop_spte(vcpu->kvm, sptep); - need_flush = 1; - } else if (is_mmio_spte(*sptep)) - mmu_spte_clear_no_track(sptep); - - break; + if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) + kvm_flush_remote_tlbs(vcpu->kvm); } if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) break; } - if (need_flush) - kvm_flush_remote_tlbs(vcpu->kvm); - atomic_inc(&vcpu->kvm->arch.invlpg_counter); spin_unlock(&vcpu->kvm->mmu_lock); @@ -769,19 +764,14 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - int i, offset, nr_present; + int i, nr_present = 0; bool host_writable; gpa_t first_pte_gpa; - offset = nr_present = 0; - /* direct kvm_mmu_page can not be unsync. */ BUG_ON(sp->role.direct); - if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; - - first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); + first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { unsigned pte_access; -- cgit v1.1 From f57f2ef58f6703e6df70ed52a198920cb3e8edba Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 22 Sep 2011 16:56:39 +0800 Subject: KVM: MMU: fast prefetch spte on invlpg path Fast prefetch spte for the unsync shadow page on invlpg path Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 4 +--- arch/x86/kvm/mmu.c | 38 +++++++++++++++----------------------- arch/x86/kvm/paging_tmpl.h | 30 ++++++++++++++++++------------ arch/x86/kvm/x86.c | 4 ++-- 4 files changed, 36 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f8ab0d7..3c9ea26 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -461,7 +461,6 @@ struct kvm_arch { unsigned int n_requested_mmu_pages; unsigned int n_max_mmu_pages; unsigned int indirect_shadow_pages; - atomic_t invlpg_counter; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. @@ -757,8 +756,7 @@ int fx_init(struct kvm_vcpu *vcpu); void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes, - bool guest_initiated); + const u8 *new, int bytes); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d15f908..c01137f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3531,8 +3531,7 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) } void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes, - bool guest_initiated) + const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; union kvm_mmu_page_role mask = { .word = 0 }; @@ -3541,7 +3540,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, LIST_HEAD(invalid_list); u64 entry, gentry, *spte; unsigned pte_size, page_offset, misaligned, quadrant, offset; - int level, npte, invlpg_counter, r, flooded = 0; + int level, npte, r, flooded = 0; bool remote_flush, local_flush, zap_page; /* @@ -3556,19 +3555,16 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter); - /* * Assume that the pte write on a page table of the same type * as the current vcpu paging mode since we update the sptes only * when they have the same mode. */ - if ((is_pae(vcpu) && bytes == 4) || !new) { + if (is_pae(vcpu) && bytes == 4) { /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ - if (is_pae(vcpu)) { - gpa &= ~(gpa_t)7; - bytes = 8; - } + gpa &= ~(gpa_t)7; + bytes = 8; + r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8)); if (r) gentry = 0; @@ -3594,22 +3590,18 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ mmu_topup_memory_caches(vcpu); spin_lock(&vcpu->kvm->mmu_lock); - if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) - gentry = 0; kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); - if (guest_initiated) { - if (gfn == vcpu->arch.last_pt_write_gfn - && !last_updated_pte_accessed(vcpu)) { - ++vcpu->arch.last_pt_write_count; - if (vcpu->arch.last_pt_write_count >= 3) - flooded = 1; - } else { - vcpu->arch.last_pt_write_gfn = gfn; - vcpu->arch.last_pt_write_count = 1; - vcpu->arch.last_pte_updated = NULL; - } + if (gfn == vcpu->arch.last_pt_write_gfn + && !last_updated_pte_accessed(vcpu)) { + ++vcpu->arch.last_pt_write_count; + if (vcpu->arch.last_pt_write_count >= 3) + flooded = 1; + } else { + vcpu->arch.last_pt_write_gfn = gfn; + vcpu->arch.last_pt_write_count = 1; + vcpu->arch.last_pte_updated = NULL; } mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index d8d3906..9efb860 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -672,20 +672,27 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - gpa_t pte_gpa = -1; int level; u64 *sptep; vcpu_clear_mmio_info(vcpu, gva); - spin_lock(&vcpu->kvm->mmu_lock); + /* + * No need to check return value here, rmap_can_add() can + * help us to skip pte prefetch later. + */ + mmu_topup_memory_caches(vcpu); + spin_lock(&vcpu->kvm->mmu_lock); for_each_shadow_entry(vcpu, gva, iterator) { level = iterator.level; sptep = iterator.sptep; sp = page_header(__pa(sptep)); if (is_last_spte(*sptep, level)) { + pt_element_t gpte; + gpa_t pte_gpa; + if (!sp->unsync) break; @@ -694,22 +701,21 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (mmu_page_zap_pte(vcpu->kvm, sp, sptep)) kvm_flush_remote_tlbs(vcpu->kvm); + + if (!rmap_can_add(vcpu)) + break; + + if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, + sizeof(pt_element_t))) + break; + + FNAME(update_pte)(vcpu, sp, sptep, &gpte); } if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) break; } - - atomic_inc(&vcpu->kvm->arch.invlpg_counter); - spin_unlock(&vcpu->kvm->mmu_lock); - - if (pte_gpa == -1) - return; - - if (mmu_topup_memory_caches(vcpu)) - return; - kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0); } static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a215448..9c980ce 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4087,7 +4087,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); if (ret < 0) return 0; - kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); + kvm_mmu_pte_write(vcpu, gpa, val, bytes); return 1; } @@ -4324,7 +4324,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (!exchanged) return X86EMUL_CMPXCHG_FAILED; - kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1); + kvm_mmu_pte_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; -- cgit v1.1 From f8734352c6f9c4f3d85f0c97b7731b7f925c62fd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 22 Sep 2011 16:56:58 +0800 Subject: KVM: MMU: remove unnecessary kvm_mmu_free_some_pages In kvm_mmu_pte_write, we do not need to alloc shadow page, so calling kvm_mmu_free_some_pages is really unnecessary Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c01137f..7e57938 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3590,7 +3590,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ mmu_topup_memory_caches(vcpu); spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); if (gfn == vcpu->arch.last_pt_write_gfn -- cgit v1.1 From 889e5cbced6c191bb7e25c1b30b43e59a12561f9 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 22 Sep 2011 16:57:23 +0800 Subject: KVM: MMU: split kvm_mmu_pte_write function kvm_mmu_pte_write is too long, we split it for better readable Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 194 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 119 insertions(+), 75 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7e57938..986aea5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3530,48 +3530,28 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) return !!(spte && (*spte & shadow_accessed_mask)); } -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) +static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, + const u8 *new, int *bytes) { - gfn_t gfn = gpa >> PAGE_SHIFT; - union kvm_mmu_page_role mask = { .word = 0 }; - struct kvm_mmu_page *sp; - struct hlist_node *node; - LIST_HEAD(invalid_list); - u64 entry, gentry, *spte; - unsigned pte_size, page_offset, misaligned, quadrant, offset; - int level, npte, r, flooded = 0; - bool remote_flush, local_flush, zap_page; - - /* - * If we don't have indirect shadow pages, it means no page is - * write-protected, so we can exit simply. - */ - if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) - return; - - zap_page = remote_flush = local_flush = false; - offset = offset_in_page(gpa); - - pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); + u64 gentry; + int r; /* * Assume that the pte write on a page table of the same type * as the current vcpu paging mode since we update the sptes only * when they have the same mode. */ - if (is_pae(vcpu) && bytes == 4) { + if (is_pae(vcpu) && *bytes == 4) { /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ - gpa &= ~(gpa_t)7; - bytes = 8; - - r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8)); + *gpa &= ~(gpa_t)7; + *bytes = 8; + r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8)); if (r) gentry = 0; new = (const u8 *)&gentry; } - switch (bytes) { + switch (*bytes) { case 4: gentry = *(const u32 *)new; break; @@ -3583,71 +3563,135 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, break; } - /* - * No need to care whether allocation memory is successful - * or not since pte prefetch is skiped if it does not have - * enough objects in the cache. - */ - mmu_topup_memory_caches(vcpu); - spin_lock(&vcpu->kvm->mmu_lock); - ++vcpu->kvm->stat.mmu_pte_write; - trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); + return gentry; +} + +/* + * If we're seeing too many writes to a page, it may no longer be a page table, + * or we may be forking, in which case it is better to unmap the page. + */ +static bool detect_write_flooding(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + bool flooded = false; + if (gfn == vcpu->arch.last_pt_write_gfn && !last_updated_pte_accessed(vcpu)) { ++vcpu->arch.last_pt_write_count; if (vcpu->arch.last_pt_write_count >= 3) - flooded = 1; + flooded = true; } else { vcpu->arch.last_pt_write_gfn = gfn; vcpu->arch.last_pt_write_count = 1; vcpu->arch.last_pte_updated = NULL; } + return flooded; +} + +/* + * Misaligned accesses are too much trouble to fix up; also, they usually + * indicate a page is not used as a page table. + */ +static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, + int bytes) +{ + unsigned offset, pte_size, misaligned; + + pgprintk("misaligned: gpa %llx bytes %d role %x\n", + gpa, bytes, sp->role.word); + + offset = offset_in_page(gpa); + pte_size = sp->role.cr4_pae ? 8 : 4; + misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); + misaligned |= bytes < 4; + + return misaligned; +} + +static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) +{ + unsigned page_offset, quadrant; + u64 *spte; + int level; + + page_offset = offset_in_page(gpa); + level = sp->role.level; + *nspte = 1; + if (!sp->role.cr4_pae) { + page_offset <<= 1; /* 32->64 */ + /* + * A 32-bit pde maps 4MB while the shadow pdes map + * only 2MB. So we need to double the offset again + * and zap two pdes instead of one. + */ + if (level == PT32_ROOT_LEVEL) { + page_offset &= ~7; /* kill rounding error */ + page_offset <<= 1; + *nspte = 2; + } + quadrant = page_offset >> PAGE_SHIFT; + page_offset &= ~PAGE_MASK; + if (quadrant != sp->role.quadrant) + return NULL; + } + + spte = &sp->spt[page_offset / sizeof(*spte)]; + return spte; +} + +void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) +{ + gfn_t gfn = gpa >> PAGE_SHIFT; + union kvm_mmu_page_role mask = { .word = 0 }; + struct kvm_mmu_page *sp; + struct hlist_node *node; + LIST_HEAD(invalid_list); + u64 entry, gentry, *spte; + int npte; + bool remote_flush, local_flush, zap_page, flooded, misaligned; + + /* + * If we don't have indirect shadow pages, it means no page is + * write-protected, so we can exit simply. + */ + if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) + return; + + zap_page = remote_flush = local_flush = false; + + pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); + + gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes); + + /* + * No need to care whether allocation memory is successful + * or not since pte prefetch is skiped if it does not have + * enough objects in the cache. + */ + mmu_topup_memory_caches(vcpu); + + spin_lock(&vcpu->kvm->mmu_lock); + ++vcpu->kvm->stat.mmu_pte_write; + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); + + flooded = detect_write_flooding(vcpu, gfn); mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { - pte_size = sp->role.cr4_pae ? 8 : 4; - misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); - misaligned |= bytes < 4; + misaligned = detect_write_misaligned(sp, gpa, bytes); + if (misaligned || flooded) { - /* - * Misaligned accesses are too much trouble to fix - * up; also, they usually indicate a page is not used - * as a page table. - * - * If we're seeing too many writes to a page, - * it may no longer be a page table, or we may be - * forking, in which case it is better to unmap the - * page. - */ - pgprintk("misaligned: gpa %llx bytes %d role %x\n", - gpa, bytes, sp->role.word); zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; } - page_offset = offset; - level = sp->role.level; - npte = 1; - if (!sp->role.cr4_pae) { - page_offset <<= 1; /* 32->64 */ - /* - * A 32-bit pde maps 4MB while the shadow pdes map - * only 2MB. So we need to double the offset again - * and zap two pdes instead of one. - */ - if (level == PT32_ROOT_LEVEL) { - page_offset &= ~7; /* kill rounding error */ - page_offset <<= 1; - npte = 2; - } - quadrant = page_offset >> PAGE_SHIFT; - page_offset &= ~PAGE_MASK; - if (quadrant != sp->role.quadrant) - continue; - } + + spte = get_written_sptes(sp, gpa, &npte); + if (!spte) + continue; + local_flush = true; - spte = &sp->spt[page_offset / sizeof(*spte)]; while (npte--) { entry = *spte; mmu_page_zap_pte(vcpu->kvm, sp, spte); -- cgit v1.1 From 5d9ca30e96f567b67a36727aa4ebb34911a2b84a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 22 Sep 2011 16:57:55 +0800 Subject: KVM: MMU: fix detecting misaligned accessed Sometimes, we only modify the last one byte of a pte to update status bit, for example, clear_bit is used to clear r/w bit in linux kernel and 'andb' instruction is used in this function, in this case, kvm_mmu_pte_write will treat it as misaligned access, and the shadow page table is zapped Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 986aea5..ca6f72a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3602,6 +3602,14 @@ static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, offset = offset_in_page(gpa); pte_size = sp->role.cr4_pae ? 8 : 4; + + /* + * Sometimes, the OS only writes the last one bytes to update status + * bits, for example, in linux, andb instruction is used in clear_bit(). + */ + if (!(offset & (pte_size - 1)) && bytes == 1) + return false; + misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; -- cgit v1.1 From a30f47cb150dd8d109923eeb65fe73e8b3e09046 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 22 Sep 2011 16:58:36 +0800 Subject: KVM: MMU: improve write flooding detected Detecting write-flooding does not work well, when we handle page written, if the last speculative spte is not accessed, we treat the page is write-flooding, however, we can speculative spte on many path, such as pte prefetch, page synced, that means the last speculative spte may be not point to the written page and the written page can be accessed via other sptes, so depends on the Accessed bit of the last speculative spte is not enough Instead of detected page accessed, we can detect whether the spte is accessed after it is written, if the spte is not accessed but it is written frequently, we treat is not a page table or it not used for a long time Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 6 ++-- arch/x86/kvm/mmu.c | 62 +++++++++++++++++------------------------ arch/x86/kvm/paging_tmpl.h | 12 ++++---- 3 files changed, 32 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3c9ea26..c1f19de 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -239,6 +239,8 @@ struct kvm_mmu_page { int clear_spte_count; #endif + int write_flooding_count; + struct rcu_head rcu; }; @@ -353,10 +355,6 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; - gfn_t last_pt_write_gfn; - int last_pt_write_count; - u64 *last_pte_updated; - struct fpu guest_fpu; u64 xcr0; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ca6f72a..e9534ce 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1653,6 +1653,18 @@ static void init_shadow_page_table(struct kvm_mmu_page *sp) sp->spt[i] = 0ull; } +static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) +{ + sp->write_flooding_count = 0; +} + +static void clear_sp_write_flooding_count(u64 *spte) +{ + struct kvm_mmu_page *sp = page_header(__pa(spte)); + + __clear_sp_write_flooding_count(sp); +} + static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -1696,6 +1708,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, } else if (sp->unsync) kvm_mmu_mark_parents_unsync(sp); + __clear_sp_write_flooding_count(sp); trace_kvm_mmu_get_page(sp, false); return sp; } @@ -1848,15 +1861,6 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) mmu_page_remove_parent_pte(sp, parent_pte); } -static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) -{ - int i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) - vcpu->arch.last_pte_updated = NULL; -} - static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *parent_pte; @@ -1916,7 +1920,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, } sp->role.invalid = 1; - kvm_mmu_reset_last_pte_updated(kvm); return ret; } @@ -2361,8 +2364,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } kvm_release_pfn_clean(pfn); - if (speculative) - vcpu->arch.last_pte_updated = sptep; } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) @@ -3523,13 +3524,6 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, kvm_mmu_flush_tlb(vcpu); } -static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) -{ - u64 *spte = vcpu->arch.last_pte_updated; - - return !!(spte && (*spte & shadow_accessed_mask)); -} - static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, const u8 *new, int *bytes) { @@ -3570,22 +3564,16 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, * If we're seeing too many writes to a page, it may no longer be a page table, * or we may be forking, in which case it is better to unmap the page. */ -static bool detect_write_flooding(struct kvm_vcpu *vcpu, gfn_t gfn) +static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte) { - bool flooded = false; - - if (gfn == vcpu->arch.last_pt_write_gfn - && !last_updated_pte_accessed(vcpu)) { - ++vcpu->arch.last_pt_write_count; - if (vcpu->arch.last_pt_write_count >= 3) - flooded = true; - } else { - vcpu->arch.last_pt_write_gfn = gfn; - vcpu->arch.last_pt_write_count = 1; - vcpu->arch.last_pte_updated = NULL; - } + /* + * Skip write-flooding detected for the sp whose level is 1, because + * it can become unsync, then the guest page is not write-protected. + */ + if (sp->role.level == 1) + return false; - return flooded; + return ++sp->write_flooding_count >= 3; } /* @@ -3657,7 +3645,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; - bool remote_flush, local_flush, zap_page, flooded, misaligned; + bool remote_flush, local_flush, zap_page; /* * If we don't have indirect shadow pages, it means no page is @@ -3683,12 +3671,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++vcpu->kvm->stat.mmu_pte_write; trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); - flooded = detect_write_flooding(vcpu, gfn); mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { - misaligned = detect_write_misaligned(sp, gpa, bytes); + spte = get_written_sptes(sp, gpa, &npte); - if (misaligned || flooded) { + if (detect_write_misaligned(sp, gpa, bytes) || + detect_write_flooding(sp, spte)) { zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9efb860..52e9d58 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -497,6 +497,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, shadow_walk_next(&it)) { gfn_t table_gfn; + clear_sp_write_flooding_count(it.sptep); drop_large_spte(vcpu, it.sptep); sp = NULL; @@ -522,6 +523,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, shadow_walk_next(&it)) { gfn_t direct_gfn; + clear_sp_write_flooding_count(it.sptep); validate_direct_spte(vcpu, it.sptep, direct_access); drop_large_spte(vcpu, it.sptep); @@ -536,6 +538,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, link_shadow_page(it.sptep, sp); } + clear_sp_write_flooding_count(it.sptep); mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, user_fault, write_fault, emulate, it.level, gw->gfn, pfn, prefault, map_writable); @@ -599,11 +602,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, */ if (!r) { pgprintk("%s: guest page fault\n", __func__); - if (!prefault) { + if (!prefault) inject_page_fault(vcpu, &walker.fault); - /* reset fork detector */ - vcpu->arch.last_pt_write_count = 0; - } + return 0; } @@ -641,9 +642,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, sptep, *sptep, emulate); - if (!emulate) - vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ - ++vcpu->stat.pf_fixed; trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); -- cgit v1.1 From 3f2e5260f5a17d37be3e3c83aca2f335b9bf3893 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 14 Sep 2011 09:58:32 +0200 Subject: KVM: x86: Simplify kvm timer handler The vcpu reference of a kvm_timer can't become NULL while the timer is valid, so drop this redundant test. This also makes it pointless to carry a separate __kvm_timer_fn, fold it into kvm_timer_fn. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/timer.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index ae432ea..6b85cc6 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -18,9 +18,10 @@ #include #include "kvm_timer.h" -static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) +enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) { - int restart_timer = 0; + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + struct kvm_vcpu *vcpu = ktimer->vcpu; wait_queue_head_t *q = &vcpu->wq; /* @@ -40,26 +41,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) if (ktimer->t_ops->is_periodic(ktimer)) { hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); - restart_timer = 1; - } - - return restart_timer; -} - -enum hrtimer_restart kvm_timer_fn(struct hrtimer *data) -{ - int restart_timer; - struct kvm_vcpu *vcpu; - struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); - - vcpu = ktimer->vcpu; - if (!vcpu) - return HRTIMER_NORESTART; - - restart_timer = __kvm_timer_fn(vcpu, ktimer); - if (restart_timer) return HRTIMER_RESTART; - else + } else return HRTIMER_NORESTART; } - -- cgit v1.1 From 5202397df819d3c5a3f201bd4af6b86542115fb6 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Tue, 1 Nov 2011 17:28:47 -0700 Subject: KVM guest: remove KVM guest pv mmu support This has not been used for some years now. It's time to remove it. Signed-off-by: Chris Wright Signed-off-by: Avi Kivity --- arch/x86/kernel/kvm.c | 181 -------------------------------------------------- 1 file changed, 181 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a9c2116..f0c6fd6 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -39,8 +39,6 @@ #include #include -#define MMU_QUEUE_SIZE 1024 - static int kvmapf = 1; static int parse_no_kvmapf(char *arg) @@ -60,21 +58,10 @@ static int parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); -struct kvm_para_state { - u8 mmu_queue[MMU_QUEUE_SIZE]; - int mmu_queue_len; -}; - -static DEFINE_PER_CPU(struct kvm_para_state, para_state); static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; -static struct kvm_para_state *kvm_para_state(void) -{ - return &per_cpu(para_state, raw_smp_processor_id()); -} - /* * No need for any "IO delay" on KVM */ @@ -271,151 +258,6 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) } } -static void kvm_mmu_op(void *buffer, unsigned len) -{ - int r; - unsigned long a1, a2; - - do { - a1 = __pa(buffer); - a2 = 0; /* on i386 __pa() always returns <4G */ - r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2); - buffer += r; - len -= r; - } while (len); -} - -static void mmu_queue_flush(struct kvm_para_state *state) -{ - if (state->mmu_queue_len) { - kvm_mmu_op(state->mmu_queue, state->mmu_queue_len); - state->mmu_queue_len = 0; - } -} - -static void kvm_deferred_mmu_op(void *buffer, int len) -{ - struct kvm_para_state *state = kvm_para_state(); - - if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) { - kvm_mmu_op(buffer, len); - return; - } - if (state->mmu_queue_len + len > sizeof state->mmu_queue) - mmu_queue_flush(state); - memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len); - state->mmu_queue_len += len; -} - -static void kvm_mmu_write(void *dest, u64 val) -{ - __u64 pte_phys; - struct kvm_mmu_op_write_pte wpte; - -#ifdef CONFIG_HIGHPTE - struct page *page; - unsigned long dst = (unsigned long) dest; - - page = kmap_atomic_to_page(dest); - pte_phys = page_to_pfn(page); - pte_phys <<= PAGE_SHIFT; - pte_phys += (dst & ~(PAGE_MASK)); -#else - pte_phys = (unsigned long)__pa(dest); -#endif - wpte.header.op = KVM_MMU_OP_WRITE_PTE; - wpte.pte_val = val; - wpte.pte_phys = pte_phys; - - kvm_deferred_mmu_op(&wpte, sizeof wpte); -} - -/* - * We only need to hook operations that are MMU writes. We hook these so that - * we can use lazy MMU mode to batch these operations. We could probably - * improve the performance of the host code if we used some of the information - * here to simplify processing of batched writes. - */ -static void kvm_set_pte(pte_t *ptep, pte_t pte) -{ - kvm_mmu_write(ptep, pte_val(pte)); -} - -static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - kvm_mmu_write(ptep, pte_val(pte)); -} - -static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd) -{ - kvm_mmu_write(pmdp, pmd_val(pmd)); -} - -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE -static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - kvm_mmu_write(ptep, pte_val(pte)); -} - -static void kvm_pte_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - kvm_mmu_write(ptep, 0); -} - -static void kvm_pmd_clear(pmd_t *pmdp) -{ - kvm_mmu_write(pmdp, 0); -} -#endif - -static void kvm_set_pud(pud_t *pudp, pud_t pud) -{ - kvm_mmu_write(pudp, pud_val(pud)); -} - -#if PAGETABLE_LEVELS == 4 -static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd) -{ - kvm_mmu_write(pgdp, pgd_val(pgd)); -} -#endif -#endif /* PAGETABLE_LEVELS >= 3 */ - -static void kvm_flush_tlb(void) -{ - struct kvm_mmu_op_flush_tlb ftlb = { - .header.op = KVM_MMU_OP_FLUSH_TLB, - }; - - kvm_deferred_mmu_op(&ftlb, sizeof ftlb); -} - -static void kvm_release_pt(unsigned long pfn) -{ - struct kvm_mmu_op_release_pt rpt = { - .header.op = KVM_MMU_OP_RELEASE_PT, - .pt_phys = (u64)pfn << PAGE_SHIFT, - }; - - kvm_mmu_op(&rpt, sizeof rpt); -} - -static void kvm_enter_lazy_mmu(void) -{ - paravirt_enter_lazy_mmu(); -} - -static void kvm_leave_lazy_mmu(void) -{ - struct kvm_para_state *state = kvm_para_state(); - - mmu_queue_flush(state); - paravirt_leave_lazy_mmu(); -} - static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; @@ -424,29 +266,6 @@ static void __init paravirt_ops_setup(void) if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; - if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) { - pv_mmu_ops.set_pte = kvm_set_pte; - pv_mmu_ops.set_pte_at = kvm_set_pte_at; - pv_mmu_ops.set_pmd = kvm_set_pmd; -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE - pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; - pv_mmu_ops.pte_clear = kvm_pte_clear; - pv_mmu_ops.pmd_clear = kvm_pmd_clear; -#endif - pv_mmu_ops.set_pud = kvm_set_pud; -#if PAGETABLE_LEVELS == 4 - pv_mmu_ops.set_pgd = kvm_set_pgd; -#endif -#endif - pv_mmu_ops.flush_tlb_user = kvm_flush_tlb; - pv_mmu_ops.release_pte = kvm_release_pt; - pv_mmu_ops.release_pmd = kvm_release_pt; - pv_mmu_ops.release_pud = kvm_release_pt; - - pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; - } #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; #endif -- cgit v1.1 From fb92045843a8cd99c7b843d9b567a680a3854ba1 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Tue, 1 Nov 2011 17:31:18 -0700 Subject: KVM: MMU: remove KVM host pv mmu support The host side pv mmu support has been marked for feature removal in January 2011. It's not in use, is slower than shadow or hardware assisted paging, and a maintenance burden. It's November 2011, time to remove it. Signed-off-by: Chris Wright Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 13 ---- arch/x86/kvm/mmu.c | 135 ---------------------------------------- arch/x86/kvm/x86.c | 12 ---- 3 files changed, 160 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c1f19de..6d83264 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -244,13 +244,6 @@ struct kvm_mmu_page { struct rcu_head rcu; }; -struct kvm_pv_mmu_op_buffer { - void *ptr; - unsigned len; - unsigned processed; - char buf[512] __aligned(sizeof(long)); -}; - struct kvm_pio_request { unsigned long count; int in; @@ -347,10 +340,6 @@ struct kvm_vcpu_arch { */ struct kvm_mmu *walk_mmu; - /* only needed in kvm_pv_mmu_op() path, but it's hot so - * put it here to avoid allocation */ - struct kvm_pv_mmu_op_buffer mmu_op_buffer; - struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; struct kvm_mmu_memory_cache mmu_page_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; @@ -667,8 +656,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); -int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, - gpa_t addr, unsigned long *ret); u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); extern bool tdp_enabled; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e9534ce..a9b3a32 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2028,20 +2028,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); -static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) -{ - struct kvm_mmu_page *sp; - struct hlist_node *node; - LIST_HEAD(invalid_list); - - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { - pgprintk("%s: zap %llx %x\n", - __func__, gfn, sp->role.word); - kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - } - kvm_mmu_commit_zap_page(kvm, &invalid_list); -} - static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) { int slot = memslot_id(kvm, gfn); @@ -4004,127 +3990,6 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) return nr_mmu_pages; } -static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer, - unsigned len) -{ - if (len > buffer->len) - return NULL; - return buffer->ptr; -} - -static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer, - unsigned len) -{ - void *ret; - - ret = pv_mmu_peek_buffer(buffer, len); - if (!ret) - return ret; - buffer->ptr += len; - buffer->len -= len; - buffer->processed += len; - return ret; -} - -static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, - gpa_t addr, gpa_t value) -{ - int bytes = 8; - int r; - - if (!is_long_mode(vcpu) && !is_pae(vcpu)) - bytes = 4; - - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - - if (!emulator_write_phys(vcpu, addr, &value, bytes)) - return -EFAULT; - - return 1; -} - -static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) -{ - (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu)); - return 1; -} - -static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr) -{ - spin_lock(&vcpu->kvm->mmu_lock); - mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT); - spin_unlock(&vcpu->kvm->mmu_lock); - return 1; -} - -static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu, - struct kvm_pv_mmu_op_buffer *buffer) -{ - struct kvm_mmu_op_header *header; - - header = pv_mmu_peek_buffer(buffer, sizeof *header); - if (!header) - return 0; - switch (header->op) { - case KVM_MMU_OP_WRITE_PTE: { - struct kvm_mmu_op_write_pte *wpte; - - wpte = pv_mmu_read_buffer(buffer, sizeof *wpte); - if (!wpte) - return 0; - return kvm_pv_mmu_write(vcpu, wpte->pte_phys, - wpte->pte_val); - } - case KVM_MMU_OP_FLUSH_TLB: { - struct kvm_mmu_op_flush_tlb *ftlb; - - ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb); - if (!ftlb) - return 0; - return kvm_pv_mmu_flush_tlb(vcpu); - } - case KVM_MMU_OP_RELEASE_PT: { - struct kvm_mmu_op_release_pt *rpt; - - rpt = pv_mmu_read_buffer(buffer, sizeof *rpt); - if (!rpt) - return 0; - return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys); - } - default: return 0; - } -} - -int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, - gpa_t addr, unsigned long *ret) -{ - int r; - struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer; - - buffer->ptr = buffer->buf; - buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf); - buffer->processed = 0; - - r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len); - if (r) - goto out; - - while (buffer->len) { - r = kvm_pv_mmu_op_one(vcpu, buffer); - if (r < 0) - goto out; - if (r == 0) - break; - } - - r = 1; -out: - *ret = buffer->processed; - return r; -} - int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) { struct kvm_shadow_walk_iterator iterator; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9c980ce..a3b25a5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5273,15 +5273,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); -static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, - unsigned long a1) -{ - if (is_long_mode(vcpu)) - return a0; - else - return a0 | ((gpa_t)a1 << 32); -} - int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { u64 param, ingpa, outgpa, ret; @@ -5377,9 +5368,6 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; - case KVM_HC_MMU_OP: - r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); - break; default: ret = -KVM_ENOSYS; break; -- cgit v1.1 From d6eebf8b80316ea61718dc115cd6a20c16195327 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 14 Nov 2011 18:21:34 +0900 Subject: KVM: MMU: Clean up BUG_ON() conditions in rmap_write_protect() Remove redundant checks and use is_large_pte() macro. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a9b3a32..973f254 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1027,7 +1027,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) spte = rmap_next(kvm, rmapp, NULL); while (spte) { - BUG_ON(!spte); BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); if (is_writable_pte(*spte)) { @@ -1043,9 +1042,8 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) rmapp = gfn_to_rmap(kvm, gfn, i); spte = rmap_next(kvm, rmapp, NULL); while (spte) { - BUG_ON(!spte); BUG_ON(!(*spte & PT_PRESENT_MASK)); - BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); + BUG_ON(!is_large_pte(*spte)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); if (is_writable_pte(*spte)) { drop_spte(kvm, spte); -- cgit v1.1 From 9b9b1492364758de82c19c36f07baa9ae162c7e5 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 14 Nov 2011 18:22:28 +0900 Subject: KVM: MMU: Split gfn_to_rmap() into two functions rmap_write_protect() calls gfn_to_rmap() for each level with gfn fixed. This results in calling gfn_to_memslot() repeatedly with that gfn. This patch introduces __gfn_to_rmap() which takes the slot as an argument to avoid this. This is also needed for the following dirty logging optimization. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 973f254..fa71085 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -958,23 +958,29 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) } } -/* - * Take gfn and return the reverse mapping to it. - */ -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) +static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level, + struct kvm_memory_slot *slot) { - struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; - slot = gfn_to_memslot(kvm, gfn); if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; linfo = lpage_info_slot(gfn, slot, level); - return &linfo->rmap_pde; } +/* + * Take gfn and return the reverse mapping to it. + */ +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) +{ + struct kvm_memory_slot *slot; + + slot = gfn_to_memslot(kvm, gfn); + return __gfn_to_rmap(kvm, gfn, level, slot); +} + static bool rmap_can_add(struct kvm_vcpu *vcpu) { struct kvm_mmu_memory_cache *cache; @@ -1019,12 +1025,14 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) static int rmap_write_protect(struct kvm *kvm, u64 gfn) { + struct kvm_memory_slot *slot; unsigned long *rmapp; u64 *spte; int i, write_protected = 0; - rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); + slot = gfn_to_memslot(kvm, gfn); + rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot); spte = rmap_next(kvm, rmapp, NULL); while (spte) { BUG_ON(!(*spte & PT_PRESENT_MASK)); @@ -1039,7 +1047,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) /* check for huge page mappings */ for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - rmapp = gfn_to_rmap(kvm, gfn, i); + rmapp = __gfn_to_rmap(kvm, gfn, i, slot); spte = rmap_next(kvm, rmapp, NULL); while (spte) { BUG_ON(!(*spte & PT_PRESENT_MASK)); -- cgit v1.1 From 7850ac5420803996e2960d15b924021f28e0dffc Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 14 Nov 2011 18:23:34 +0900 Subject: KVM: Count the number of dirty pages for dirty logging Needed for the next patch which uses this number to decide how to write protect a slot. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a3b25a5..220c83b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3466,10 +3466,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r, i; + int r; struct kvm_memory_slot *memslot; unsigned long n; - unsigned long is_dirty = 0; mutex_lock(&kvm->slots_lock); @@ -3484,11 +3483,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, n = kvm_dirty_bitmap_bytes(memslot); - for (i = 0; !is_dirty && i < n/sizeof(long); i++) - is_dirty = memslot->dirty_bitmap[i]; - /* If nothing is dirty, don't bother messing with page tables. */ - if (is_dirty) { + if (memslot->nr_dirty_pages) { struct kvm_memslots *slots, *old_slots; unsigned long *dirty_bitmap; @@ -3503,6 +3499,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, goto out; memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + slots->memslots[log->slot].nr_dirty_pages = 0; slots->generation++; old_slots = kvm->memslots; -- cgit v1.1 From 95d4c16ce78cb6b7549a09159c409d52ddd18dae Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 14 Nov 2011 18:24:50 +0900 Subject: KVM: Optimize dirty logging by rmap_write_protect() Currently, write protecting a slot needs to walk all the shadow pages and checks ones which have a pte mapping a page in it. The walk is overly heavy when dirty pages in that slot are not so many and checking the shadow pages would result in unwanted cache pollution. To mitigate this problem, we use rmap_write_protect() and check only the sptes which can be reached from gfns marked in the dirty bitmap when the number of dirty pages are less than that of shadow pages. This criterion is reasonable in its meaning and worked well in our test: write protection became some times faster than before when the ratio of dirty pages are low and was not worse even when the ratio was near the criterion. Note that the locking for this write protection becomes fine grained. The reason why this is safe is descripted in the comments. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu.c | 14 +++++++--- arch/x86/kvm/x86.c | 58 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 63 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6d83264..69b6525 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -648,6 +648,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, + struct kvm_memory_slot *slot); void kvm_mmu_zap_all(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fa71085..aecdea2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1023,15 +1023,13 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -static int rmap_write_protect(struct kvm *kvm, u64 gfn) +int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, + struct kvm_memory_slot *slot) { - struct kvm_memory_slot *slot; unsigned long *rmapp; u64 *spte; int i, write_protected = 0; - slot = gfn_to_memslot(kvm, gfn); - rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot); spte = rmap_next(kvm, rmapp, NULL); while (spte) { @@ -1066,6 +1064,14 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) return write_protected; } +static int rmap_write_protect(struct kvm *kvm, u64 gfn) +{ + struct kvm_memory_slot *slot; + + slot = gfn_to_memslot(kvm, gfn); + return kvm_mmu_rmap_write_protect(kvm, gfn, slot); +} + static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 220c83b..af546b7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3460,6 +3460,50 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, return 0; } +/** + * write_protect_slot - write protect a slot for dirty logging + * @kvm: the kvm instance + * @memslot: the slot we protect + * @dirty_bitmap: the bitmap indicating which pages are dirty + * @nr_dirty_pages: the number of dirty pages + * + * We have two ways to find all sptes to protect: + * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and + * checks ones that have a spte mapping a page in the slot. + * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap. + * + * Generally speaking, if there are not so many dirty pages compared to the + * number of shadow pages, we should use the latter. + * + * Note that letting others write into a page marked dirty in the old bitmap + * by using the remaining tlb entry is not a problem. That page will become + * write protected again when we flush the tlb and then be reported dirty to + * the user space by copying the old bitmap. + */ +static void write_protect_slot(struct kvm *kvm, + struct kvm_memory_slot *memslot, + unsigned long *dirty_bitmap, + unsigned long nr_dirty_pages) +{ + /* Not many dirty pages compared to # of shadow pages. */ + if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { + unsigned long gfn_offset; + + for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { + unsigned long gfn = memslot->base_gfn + gfn_offset; + + spin_lock(&kvm->mmu_lock); + kvm_mmu_rmap_write_protect(kvm, gfn, memslot); + spin_unlock(&kvm->mmu_lock); + } + kvm_flush_remote_tlbs(kvm); + } else { + spin_lock(&kvm->mmu_lock); + kvm_mmu_slot_remove_write_access(kvm, memslot->id); + spin_unlock(&kvm->mmu_lock); + } +} + /* * Get (and clear) the dirty memory log for a memory slot. */ @@ -3468,7 +3512,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, { int r; struct kvm_memory_slot *memslot; - unsigned long n; + unsigned long n, nr_dirty_pages; mutex_lock(&kvm->slots_lock); @@ -3482,9 +3526,10 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, goto out; n = kvm_dirty_bitmap_bytes(memslot); + nr_dirty_pages = memslot->nr_dirty_pages; /* If nothing is dirty, don't bother messing with page tables. */ - if (memslot->nr_dirty_pages) { + if (nr_dirty_pages) { struct kvm_memslots *slots, *old_slots; unsigned long *dirty_bitmap; @@ -3498,8 +3543,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, if (!slots) goto out; memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); - slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; - slots->memslots[log->slot].nr_dirty_pages = 0; + memslot = &slots->memslots[log->slot]; + memslot->dirty_bitmap = dirty_bitmap; + memslot->nr_dirty_pages = 0; slots->generation++; old_slots = kvm->memslots; @@ -3508,9 +3554,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; kfree(old_slots); - spin_lock(&kvm->mmu_lock); - kvm_mmu_slot_remove_write_access(kvm, log->slot); - spin_unlock(&kvm->mmu_lock); + write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages); r = -EFAULT; if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) -- cgit v1.1 From 46199f33c29533e7ad2a7d2128dc30175d1d4157 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 17 Nov 2011 10:56:09 +0200 Subject: KVM: VMX: remove unneeded vmx_load_host_state() calls. vmx_load_host_state() does not handle msrs switching (except MSR_KERNEL_GS_BASE) since commit 26bb0981b3f. Remove call to it where it is no longer make sense. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6e28d58..ba24022 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1747,7 +1747,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) int save_nmsrs, index; unsigned long *msr_bitmap; - vmx_load_host_state(vmx); save_nmsrs = 0; #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) { @@ -2142,12 +2141,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 1; /* Otherwise falls through */ default: - vmx_load_host_state(to_vmx(vcpu)); if (vmx_get_vmx_msr(vcpu, msr_index, pdata)) return 0; msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { - vmx_load_host_state(to_vmx(vcpu)); data = msr->data; break; } @@ -2171,7 +2168,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) switch (msr_index) { case MSR_EFER: - vmx_load_host_state(vmx); ret = kvm_set_msr_common(vcpu, msr_index, data); break; #ifdef CONFIG_X86_64 @@ -2220,7 +2216,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) break; msr = find_msr_entry(vmx, msr_index); if (msr) { - vmx_load_host_state(vmx); msr->data = data; break; } -- cgit v1.1 From d7841a4b1b6e8509881e1ec21c024c82ccf565a6 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 22 Nov 2011 15:16:54 +0900 Subject: KVM: x86 emulator: Use opcode::execute for IN/OUT IN : E4, E5, EC, ED OUT: E6, E7, EE, EF Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 54 ++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8547958..8ba4ea8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2776,6 +2776,24 @@ static int em_jcxz(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_in(struct x86_emulate_ctxt *ctxt) +{ + if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val, + &ctxt->dst.val)) + return X86EMUL_IO_NEEDED; + + return X86EMUL_CONTINUE; +} + +static int em_out(struct x86_emulate_ctxt *ctxt) +{ + ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val, + &ctxt->src.val, 1); + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + static int em_cli(struct x86_emulate_ctxt *ctxt) { if (emulator_bad_iopl(ctxt)) @@ -3004,6 +3022,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define D2bv(_f) D((_f) | ByteOp), D(_f) #define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) +#define I2bvIP(_f, _e, _i, _p) \ + IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p) #define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ @@ -3217,13 +3237,13 @@ static struct opcode opcode_table[256] = { /* 0xE0 - 0xE7 */ X3(I(SrcImmByte, em_loop)), I(SrcImmByte, em_jcxz), - D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), - D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), + I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in), + I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out), /* 0xE8 - 0xEF */ D(SrcImm | Stack), D(SrcImm | ImplicitOps), I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps), - D2bvIP(SrcDX | DstAcc, in, check_perm_in), - D2bvIP(SrcAcc | DstDX, out, check_perm_out), + I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in), + I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out), /* 0xF0 - 0xF7 */ N, DI(ImplicitOps, icebp), N, N, DI(ImplicitOps | Priv, hlt), D(ImplicitOps), @@ -3325,6 +3345,7 @@ static struct opcode twobyte_table[256] = { #undef D2bv #undef D2bvIP #undef I2bv +#undef I2bvIP #undef I6ALU static unsigned imm_size(struct x86_emulate_ctxt *ctxt) @@ -3867,11 +3888,12 @@ special_insn: case 0x6c: /* insb */ case 0x6d: /* insw/insd */ ctxt->src.val = ctxt->regs[VCPU_REGS_RDX]; - goto do_io_in; + rc = em_in(ctxt); + break; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX]; - goto do_io_out; + rc = em_out(ctxt); break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(ctxt->b, ctxt->eflags)) @@ -3915,12 +3937,6 @@ special_insn: ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; rc = em_grp2(ctxt); break; - case 0xe4: /* inb */ - case 0xe5: /* in */ - goto do_io_in; - case 0xe6: /* outb */ - case 0xe7: /* out */ - goto do_io_out; case 0xe8: /* call (near) */ { long int rel = ctxt->src.val; ctxt->src.val = (unsigned long) ctxt->_eip; @@ -3933,20 +3949,6 @@ special_insn: jmp_rel(ctxt, ctxt->src.val); ctxt->dst.type = OP_NONE; /* Disable writeback. */ break; - case 0xec: /* in al,dx */ - case 0xed: /* in (e/r)ax,dx */ - do_io_in: - if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val, - &ctxt->dst.val)) - goto done; /* IO is needed */ - break; - case 0xee: /* out dx,al */ - case 0xef: /* out dx,(e/r)ax */ - do_io_out: - ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val, - &ctxt->src.val, 1); - ctxt->dst.type = OP_NONE; /* Disable writeback. */ - break; case 0xf4: /* hlt */ ctxt->ops->halt(ctxt); break; -- cgit v1.1 From ce7faab24fbfb0b5207636ee4795e924bcf97e8a Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 22 Nov 2011 15:17:48 +0900 Subject: KVM: x86 emulator: Use opcode::execute for BT family BT : 0F A3 BTS: 0F AB BTR: 0F B3 BTC: 0F BB Group 8: 0F BA Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 77 +++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8ba4ea8..7a9ce6d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2813,6 +2813,35 @@ static int em_sti(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_bt(struct x86_emulate_ctxt *ctxt) +{ + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + /* only subword offset */ + ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; + + emulate_2op_SrcV_nobyte(ctxt, "bt"); + return X86EMUL_CONTINUE; +} + +static int em_bts(struct x86_emulate_ctxt *ctxt) +{ + emulate_2op_SrcV_nobyte(ctxt, "bts"); + return X86EMUL_CONTINUE; +} + +static int em_btr(struct x86_emulate_ctxt *ctxt) +{ + emulate_2op_SrcV_nobyte(ctxt, "btr"); + return X86EMUL_CONTINUE; +} + +static int em_btc(struct x86_emulate_ctxt *ctxt) +{ + emulate_2op_SrcV_nobyte(ctxt, "btc"); + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3117,10 +3146,10 @@ static struct group_dual group7 = { { static struct opcode group8[] = { N, N, N, N, - D(DstMem | SrcImmByte | ModRM), - D(DstMem | SrcImmByte | ModRM | Lock | PageTable), - D(DstMem | SrcImmByte | ModRM | Lock), - D(DstMem | SrcImmByte | ModRM | Lock | PageTable), + I(DstMem | SrcImmByte | ModRM, em_bt), + I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts), + I(DstMem | SrcImmByte | ModRM | Lock, em_btr), + I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc), }; static struct group_dual group9 = { { @@ -3299,26 +3328,27 @@ static struct opcode twobyte_table[256] = { X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), - DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), + DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), DI(ImplicitOps, rsm), - D(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable), + I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ D2bv(DstMem | SrcReg | ModRM | Lock | PageTable), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), - D(DstMem | SrcReg | ModRM | BitOp | Lock), + I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, - G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable), + G(BitOp, group8), + I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ @@ -4103,21 +4133,10 @@ twobyte_insn: case 0x90 ... 0x9f: /* setcc r/m8 */ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; - case 0xa3: - bt: /* bt */ - ctxt->dst.type = OP_NONE; - /* only subword offset */ - ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte(ctxt, "bt"); - break; case 0xa4: /* shld imm8, r, r/m */ case 0xa5: /* shld cl, r, r/m */ emulate_2op_cl(ctxt, "shld"); break; - case 0xab: - bts: /* bts */ - emulate_2op_SrcV_nobyte(ctxt, "bts"); - break; case 0xac: /* shrd imm8, r, r/m */ case 0xad: /* shrd cl, r, r/m */ emulate_2op_cl(ctxt, "shrd"); @@ -4141,31 +4160,11 @@ twobyte_insn: ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; } break; - case 0xb3: - btr: /* btr */ - emulate_2op_SrcV_nobyte(ctxt, "btr"); - break; case 0xb6 ... 0xb7: /* movzx */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val : (u16) ctxt->src.val; break; - case 0xba: /* Grp8 */ - switch (ctxt->modrm_reg & 3) { - case 0: - goto bt; - case 1: - goto bts; - case 2: - goto btr; - case 3: - goto btc; - } - break; - case 0xbb: - btc: /* btc */ - emulate_2op_SrcV_nobyte(ctxt, "btc"); - break; case 0xbc: { /* bsf */ u8 zf; __asm__ ("bsf %2, %0; setz %1" -- cgit v1.1 From d4ddafcdf2201326ec9717172767cfad0ede1472 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 22 Nov 2011 15:18:35 +0900 Subject: KVM: x86 emulator: Use opcode::execute for CALL CALL: E8 Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7a9ce6d..6b7a03b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2482,6 +2482,15 @@ static int em_das(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_call(struct x86_emulate_ctxt *ctxt) +{ + long rel = ctxt->src.val; + + ctxt->src.val = (unsigned long)ctxt->_eip; + jmp_rel(ctxt, rel); + return em_push(ctxt); +} + static int em_call_far(struct x86_emulate_ctxt *ctxt) { u16 sel, old_cs; @@ -3269,7 +3278,7 @@ static struct opcode opcode_table[256] = { I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out), /* 0xE8 - 0xEF */ - D(SrcImm | Stack), D(SrcImm | ImplicitOps), + I(SrcImm | Stack, em_call), D(SrcImm | ImplicitOps), I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps), I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out), @@ -3967,13 +3976,6 @@ special_insn: ctxt->src.val = ctxt->regs[VCPU_REGS_RCX]; rc = em_grp2(ctxt); break; - case 0xe8: /* call (near) */ { - long int rel = ctxt->src.val; - ctxt->src.val = (unsigned long) ctxt->_eip; - jmp_rel(ctxt, rel); - rc = em_push(ctxt); - break; - } case 0xe9: /* jmp rel */ case 0xeb: /* jmp rel short */ jmp_rel(ctxt, ctxt->src.val); -- cgit v1.1 From bc00f8d2c2df33c6e7fc8a12c94c0c74d491e566 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 22 Nov 2011 15:19:19 +0900 Subject: KVM: x86 emulator: Use opcode::execute for MOV to cr/dr MOV: 0F 22 (move to control registers) MOV: 0F 23 (move to debug registers) Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 52 +++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6b7a03b..7fe5ed1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2638,6 +2638,34 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_cr_write(struct x86_emulate_ctxt *ctxt) +{ + if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) + return emulate_gp(ctxt, 0); + + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_dr_write(struct x86_emulate_ctxt *ctxt) +{ + unsigned long val; + + if (ctxt->mode == X86EMUL_MODE_PROT64) + val = ctxt->src.val & ~0ULL; + else + val = ctxt->src.val & ~0U; + + /* #UD condition is already handled. */ + if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0) + return emulate_gp(ctxt, 0); + + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) { if (ctxt->modrm_reg > VCPU_SREG_GS) @@ -3304,8 +3332,8 @@ static struct opcode twobyte_table[256] = { /* 0x20 - 0x2F */ DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), - DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write), - DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write), + IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write), + IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write), N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ @@ -4080,26 +4108,6 @@ twobyte_insn: case 0x21: /* mov from dr to reg */ ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); break; - case 0x22: /* mov reg, cr */ - if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) { - emulate_gp(ctxt, 0); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - ctxt->dst.type = OP_NONE; - break; - case 0x23: /* mov from reg to dr */ - if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val & - ((ctxt->mode == X86EMUL_MODE_PROT64) ? - ~0ULL : ~0U)) < 0) { - /* #UD condition is already handled by the code above */ - emulate_gp(ctxt, 0); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - - ctxt->dst.type = OP_NONE; /* no writeback */ - break; case 0x30: /* wrmsr */ msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] -- cgit v1.1 From e1e210b0a7f7b4d14577bdc76719963f7facc0e7 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 22 Nov 2011 15:20:03 +0900 Subject: KVM: x86 emulator: Use opcode::execute for WRMSR/RDMSR WRMSR: 0F 30 RDMSR: 0F 32 Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 52 +++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7fe5ed1..906c5eb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2666,6 +2666,30 @@ static int em_dr_write(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_wrmsr(struct x86_emulate_ctxt *ctxt) +{ + u64 msr_data; + + msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] + | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); + if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + +static int em_rdmsr(struct x86_emulate_ctxt *ctxt) +{ + u64 msr_data; + + if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) + return emulate_gp(ctxt, 0); + + ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; + ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; + return X86EMUL_CONTINUE; +} + static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) { if (ctxt->modrm_reg > VCPU_SREG_GS) @@ -3337,9 +3361,9 @@ static struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ - DI(ImplicitOps | Priv, wrmsr), + II(ImplicitOps | Priv, em_wrmsr, wrmsr), IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), - DI(ImplicitOps | Priv, rdmsr), + II(ImplicitOps | Priv, em_rdmsr, rdmsr), DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), I(ImplicitOps | VendorSpecific, em_sysenter), I(ImplicitOps | Priv | VendorSpecific, em_sysexit), @@ -3818,7 +3842,6 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { struct x86_emulate_ops *ops = ctxt->ops; - u64 msr_data; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; @@ -4108,29 +4131,6 @@ twobyte_insn: case 0x21: /* mov from dr to reg */ ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); break; - case 0x30: - /* wrmsr */ - msr_data = (u32)ctxt->regs[VCPU_REGS_RAX] - | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32); - if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) { - emulate_gp(ctxt, 0); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - rc = X86EMUL_CONTINUE; - break; - case 0x32: - /* rdmsr */ - if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) { - emulate_gp(ctxt, 0); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } else { - ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data; - ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32; - } - rc = X86EMUL_CONTINUE; - break; case 0x40 ... 0x4f: /* cmov */ ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val; if (!test_cc(ctxt->b, ctxt->eflags)) -- cgit v1.1 From e940b5c20f89282fe826c5e2237932ab280497cf Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 22 Nov 2011 15:20:47 +0900 Subject: KVM: x86 emulator: Use opcode::execute for CMPXCHG CMPXCHG: 0F B0, 0F B1 Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 906c5eb..799000d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1832,6 +1832,24 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) return rc; } +static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) +{ + /* Save real source value, then compare EAX against destination. */ + ctxt->src.orig_val = ctxt->src.val; + ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; + emulate_2op_SrcV(ctxt, "cmp"); + + if (ctxt->eflags & EFLG_ZF) { + /* Success: write back to memory. */ + ctxt->dst.val = ctxt->src.orig_val; + } else { + /* Failure: write the value we saw to EAX. */ + ctxt->dst.type = OP_REG; + ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; + } + return X86EMUL_CONTINUE; +} + static int em_lseg(struct x86_emulate_ctxt *ctxt) { int seg = ctxt->src2.val; @@ -3400,7 +3418,7 @@ static struct opcode twobyte_table[256] = { D(DstMem | SrcReg | Src2CL | ModRM), D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ - D2bv(DstMem | SrcReg | ModRM | Lock | PageTable), + I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), @@ -4153,23 +4171,6 @@ twobyte_insn: break; case 0xae: /* clflush */ break; - case 0xb0 ... 0xb1: /* cmpxchg */ - /* - * Save real source value, then compare EAX against - * destination. - */ - ctxt->src.orig_val = ctxt->src.val; - ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; - emulate_2op_SrcV(ctxt, "cmp"); - if (ctxt->eflags & EFLG_ZF) { - /* Success: write back to memory. */ - ctxt->dst.val = ctxt->src.orig_val; - } else { - /* Failure: write the value we saw to EAX. */ - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; - } - break; case 0xb6 ... 0xb7: /* movzx */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val -- cgit v1.1 From ff227392cd7b858b2b04732e02697122fd1b35b0 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 22 Nov 2011 15:21:33 +0900 Subject: KVM: x86 emulator: Use opcode::execute for BSF/BSR BSF: 0F BC BSR: 0F BD Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 60 +++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 799000d..4cd3313 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2921,6 +2921,40 @@ static int em_btc(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_bsf(struct x86_emulate_ctxt *ctxt) +{ + u8 zf; + + __asm__ ("bsf %2, %0; setz %1" + : "=r"(ctxt->dst.val), "=q"(zf) + : "r"(ctxt->src.val)); + + ctxt->eflags &= ~X86_EFLAGS_ZF; + if (zf) { + ctxt->eflags |= X86_EFLAGS_ZF; + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + } + return X86EMUL_CONTINUE; +} + +static int em_bsr(struct x86_emulate_ctxt *ctxt) +{ + u8 zf; + + __asm__ ("bsr %2, %0; setz %1" + : "=r"(ctxt->dst.val), "=q"(zf) + : "r"(ctxt->src.val)); + + ctxt->eflags &= ~X86_EFLAGS_ZF; + if (zf) { + ctxt->eflags |= X86_EFLAGS_ZF; + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + } + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3428,7 +3462,7 @@ static struct opcode twobyte_table[256] = { N, N, G(BitOp, group8), I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), - D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ D2bv(DstMem | SrcReg | ModRM | Lock), @@ -4176,30 +4210,6 @@ twobyte_insn: ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val : (u16) ctxt->src.val; break; - case 0xbc: { /* bsf */ - u8 zf; - __asm__ ("bsf %2, %0; setz %1" - : "=r"(ctxt->dst.val), "=q"(zf) - : "r"(ctxt->src.val)); - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - ctxt->dst.type = OP_NONE; /* Disable writeback. */ - } - break; - } - case 0xbd: { /* bsr */ - u8 zf; - __asm__ ("bsr %2, %0; setz %1" - : "=r"(ctxt->dst.val), "=q"(zf) - : "r"(ctxt->src.val)); - ctxt->eflags &= ~X86_EFLAGS_ZF; - if (zf) { - ctxt->eflags |= X86_EFLAGS_ZF; - ctxt->dst.type = OP_NONE; /* Disable writeback. */ - } - break; - } case 0xbe ... 0xbf: /* movsx */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : -- cgit v1.1 From 93a5cef07d686a0341d056b0f930a762c7174a13 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 24 Nov 2011 17:37:48 +0800 Subject: KVM: introduce KVM_MEM_SLOTS_NUM macro Introduce KVM_MEM_SLOTS_NUM macro to instead of KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 4 +++- arch/x86/kvm/mmu.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 69b6525..1769f3d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -31,6 +31,8 @@ #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 +#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) + #define KVM_MMIO_SIZE 16 #define KVM_PIO_PAGE_OFFSET 1 @@ -228,7 +230,7 @@ struct kvm_mmu_page { * One bit set per slot which has memory * in this shadow page. */ - DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); + DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM); bool unsync; int root_count; /* Currently serving as active root */ unsigned int unsync_children; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aecdea2..715dcb4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1349,7 +1349,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); + bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); sp->parent_ptes = 0; mmu_page_add_parent_pte(vcpu, sp, parent_pte); kvm_mod_used_mmu_pages(vcpu->kvm, +1); -- cgit v1.1 From be593d6286075801bba6d60fa466a39c24cc7616 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 24 Nov 2011 17:38:24 +0800 Subject: KVM: introduce update_memslots function Introduce update_memslots to update slot which will be update to kvm->memslots Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index af546b7..917a287 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3546,7 +3546,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, memslot = &slots->memslots[log->slot]; memslot->dirty_bitmap = dirty_bitmap; memslot->nr_dirty_pages = 0; - slots->generation++; + update_memslots(slots, NULL); old_slots = kvm->memslots; rcu_assign_pointer(kvm->memslots, slots); -- cgit v1.1 From be6ba0f0962a39091c52eb9167ddea201fe80716 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 24 Nov 2011 17:39:18 +0800 Subject: KVM: introduce kvm_for_each_memslot macro Introduce kvm_for_each_memslot to walk all valid memslot Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 715dcb4..d737443 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1128,15 +1128,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int (*handler)(struct kvm *kvm, unsigned long *rmapp, unsigned long data)) { - int i, j; + int j; int ret; int retval = 0; struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; slots = kvm_memslots(kvm); - for (i = 0; i < slots->nmemslots; i++) { - struct kvm_memory_slot *memslot = &slots->memslots[i]; + kvm_for_each_memslot(memslot, slots) { unsigned long start = memslot->userspace_addr; unsigned long end; @@ -3985,15 +3985,15 @@ nomem: */ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) { - int i; unsigned int nr_mmu_pages; unsigned int nr_pages = 0; struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; slots = kvm_memslots(kvm); - for (i = 0; i < slots->nmemslots; i++) - nr_pages += slots->memslots[i].npages; + kvm_for_each_memslot(memslot, slots) + nr_pages += memslot->npages; nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, -- cgit v1.1 From 28a37544fb0223eb9805d2567b88f7360edec52a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 24 Nov 2011 19:04:35 +0800 Subject: KVM: introduce id_to_memslot function Introduce id_to_memslot to get memslot by slot id Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 6 ++++-- arch/x86/kvm/x86.c | 18 +++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ba24022..8f19d91 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2711,11 +2711,13 @@ static gva_t rmode_tss_base(struct kvm *kvm) { if (!kvm->arch.tss_addr) { struct kvm_memslots *slots; + struct kvm_memory_slot *slot; gfn_t base_gfn; slots = kvm_memslots(kvm); - base_gfn = slots->memslots[0].base_gfn + - kvm->memslots->memslots[0].npages - 3; + slot = id_to_memslot(slots, 0); + base_gfn = slot->base_gfn + slot->npages - 3; + return base_gfn << PAGE_SHIFT; } return kvm->arch.tss_addr; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 917a287..b6776c6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3520,7 +3520,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, if (log->slot >= KVM_MEMORY_SLOTS) goto out; - memslot = &kvm->memslots->memslots[log->slot]; + memslot = id_to_memslot(kvm->memslots, log->slot); r = -ENOENT; if (!memslot->dirty_bitmap) goto out; @@ -3531,27 +3531,27 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, /* If nothing is dirty, don't bother messing with page tables. */ if (nr_dirty_pages) { struct kvm_memslots *slots, *old_slots; - unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap, *dirty_bitmap_head; - dirty_bitmap = memslot->dirty_bitmap_head; - if (memslot->dirty_bitmap == dirty_bitmap) - dirty_bitmap += n / sizeof(long); - memset(dirty_bitmap, 0, n); + dirty_bitmap = memslot->dirty_bitmap; + dirty_bitmap_head = memslot->dirty_bitmap_head; + if (dirty_bitmap == dirty_bitmap_head) + dirty_bitmap_head += n / sizeof(long); + memset(dirty_bitmap_head, 0, n); r = -ENOMEM; slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!slots) goto out; memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); - memslot = &slots->memslots[log->slot]; - memslot->dirty_bitmap = dirty_bitmap; + memslot = id_to_memslot(slots, log->slot); memslot->nr_dirty_pages = 0; + memslot->dirty_bitmap = dirty_bitmap_head; update_memslots(slots, NULL); old_slots = kvm->memslots; rcu_assign_pointer(kvm->memslots, slots); synchronize_srcu_expedited(&kvm->srcu); - dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; kfree(old_slots); write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages); -- cgit v1.1 From 2b5e97e1fadf1ade87558f2a2003616879f9e228 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 23 Nov 2011 12:27:39 +0900 Subject: KVM: x86 emulator: Use opcode::execute for INS/OUTS from/to port in DX INSB : 6C INSW/INSD : 6D OUTSB : 6E OUTSW/OUTSD: 6F The I/O port address is read from the DX register when we decode the operand because we see the SrcDX/DstDX flag is set. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4cd3313..ac8e5ed 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3321,8 +3321,8 @@ static struct opcode opcode_table[256] = { I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), I(SrcImmByte | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), - D2bvIP(DstDI | SrcDX | Mov | String, ins, check_perm_in), /* insb, insw/insd */ - D2bvIP(SrcSI | DstDX | String, outs, check_perm_out), /* outsb, outsw/outsd */ + I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */ + I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ X16(D(SrcImmByte)), /* 0x80 - 0x87 */ @@ -4027,16 +4027,6 @@ special_insn: goto cannot_emulate; ctxt->dst.val = (s32) ctxt->src.val; break; - case 0x6c: /* insb */ - case 0x6d: /* insw/insd */ - ctxt->src.val = ctxt->regs[VCPU_REGS_RDX]; - rc = em_in(ctxt); - break; - case 0x6e: /* outsb */ - case 0x6f: /* outsw/outsd */ - ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX]; - rc = em_out(ctxt); - break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(ctxt->b, ctxt->eflags)) jmp_rel(ctxt, ctxt->src.val); -- cgit v1.1 From 00b27a3efb116062ca5a276ad5cb01ea1b80b5f6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 23 Nov 2011 16:30:32 +0200 Subject: KVM: Move cpuid code to new file The cpuid code has grown; put it into a separate file. Signed-off-by: Avi Kivity --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/cpuid.c | 625 +++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/cpuid.h | 46 ++++ arch/x86/kvm/lapic.c | 1 + arch/x86/kvm/vmx.c | 1 + arch/x86/kvm/x86.c | 634 +------------------------------------------------- arch/x86/kvm/x86.h | 5 +- 7 files changed, 679 insertions(+), 635 deletions(-) create mode 100644 arch/x86/kvm/cpuid.c create mode 100644 arch/x86/kvm/cpuid.h (limited to 'arch/x86') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f43..161b76a 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o timer.o + i8254.o timer.o cpuid.o kvm-intel-y += vmx.o kvm-amd-y += svm.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c new file mode 100644 index 0000000..0a332ec --- /dev/null +++ b/arch/x86/kvm/cpuid.c @@ -0,0 +1,625 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * cpuid support routines + * + * derived from arch/x86/kvm/x86.c + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * Copyright IBM Corporation, 2008 + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include +#include +#include +#include +#include "cpuid.h" +#include "lapic.h" +#include "mmu.h" +#include "trace.h" + +void kvm_update_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + struct kvm_lapic *apic = vcpu->arch.apic; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (!best) + return; + + /* Update OSXSAVE bit */ + if (cpu_has_xsave && best->function == 0x1) { + best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) + best->ecx |= bit(X86_FEATURE_OSXSAVE); + } + + if (apic) { + if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) + apic->lapic_timer.timer_mode_mask = 3 << 17; + else + apic->lapic_timer.timer_mode_mask = 1 << 17; + } +} + +static int is_efer_nx(void) +{ + unsigned long long efer = 0; + + rdmsrl_safe(MSR_EFER, &efer); + return efer & EFER_NX; +} + +static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_cpuid_entry2 *e, *entry; + + entry = NULL; + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + e = &vcpu->arch.cpuid_entries[i]; + if (e->function == 0x80000001) { + entry = e; + break; + } + } + if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { + entry->edx &= ~(1 << 20); + printk(KERN_INFO "kvm: guest NX capability removed\n"); + } +} + +/* when an old userspace process fills a new kernel module */ +int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries) +{ + int r, i; + struct kvm_cpuid_entry *cpuid_entries; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -ENOMEM; + cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); + if (!cpuid_entries) + goto out; + r = -EFAULT; + if (copy_from_user(cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry))) + goto out_free; + for (i = 0; i < cpuid->nent; i++) { + vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; + vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; + vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; + vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; + vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; + vcpu->arch.cpuid_entries[i].index = 0; + vcpu->arch.cpuid_entries[i].flags = 0; + vcpu->arch.cpuid_entries[i].padding[0] = 0; + vcpu->arch.cpuid_entries[i].padding[1] = 0; + vcpu->arch.cpuid_entries[i].padding[2] = 0; + } + vcpu->arch.cpuid_nent = cpuid->nent; + cpuid_fix_nx_cap(vcpu); + r = 0; + kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); + kvm_update_cpuid(vcpu); + +out_free: + vfree(cpuid_entries); +out: + return r; +} + +int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + goto out; + r = -EFAULT; + if (copy_from_user(&vcpu->arch.cpuid_entries, entries, + cpuid->nent * sizeof(struct kvm_cpuid_entry2))) + goto out; + vcpu->arch.cpuid_nent = cpuid->nent; + kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); + kvm_update_cpuid(vcpu); + return 0; + +out: + return r; +} + +int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + int r; + + r = -E2BIG; + if (cpuid->nent < vcpu->arch.cpuid_nent) + goto out; + r = -EFAULT; + if (copy_to_user(entries, &vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) + goto out; + return 0; + +out: + cpuid->nent = vcpu->arch.cpuid_nent; + return r; +} + +static void cpuid_mask(u32 *word, int wordnum) +{ + *word &= boot_cpu_data.x86_capability[wordnum]; +} + +static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index) +{ + entry->function = function; + entry->index = index; + cpuid_count(entry->function, entry->index, + &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); + entry->flags = 0; +} + +static bool supported_xcr0_bit(unsigned bit) +{ + u64 mask = ((u64)1 << bit); + + return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; +} + +#define F(x) bit(X86_FEATURE_##x) + +static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, + u32 index, int *nent, int maxnent) +{ + unsigned f_nx = is_efer_nx() ? F(NX) : 0; +#ifdef CONFIG_X86_64 + unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) + ? F(GBPAGES) : 0; + unsigned f_lm = F(LM); +#else + unsigned f_gbpages = 0; + unsigned f_lm = 0; +#endif + unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; + + /* cpuid 1.edx */ + const u32 kvm_supported_word0_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | + 0 /* Reserved, DS, ACPI */ | F(MMX) | + F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | + 0 /* HTT, TM, Reserved, PBE */; + /* cpuid 0x80000001.edx */ + const u32 kvm_supported_word1_x86_features = + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* Reserved */ | + f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | + 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); + /* cpuid 1.ecx */ + const u32 kvm_supported_word4_x86_features = + F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | + 0 /* DS-CPL, VMX, SMX, EST */ | + 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | + 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | + 0 /* Reserved, DCA */ | F(XMM4_1) | + F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | + 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | + F(F16C) | F(RDRAND); + /* cpuid 0x80000001.ecx */ + const u32 kvm_supported_word6_x86_features = + F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | + F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + + /* cpuid 0xC0000001.edx */ + const u32 kvm_supported_word5_x86_features = + F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | + F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | + F(PMM) | F(PMM_EN); + + /* cpuid 7.0.ebx */ + const u32 kvm_supported_word9_x86_features = + F(SMEP) | F(FSGSBASE) | F(ERMS); + + /* all calls to cpuid_count() should be made on the same cpu */ + get_cpu(); + do_cpuid_1_ent(entry, function, index); + ++*nent; + + switch (function) { + case 0: + entry->eax = min(entry->eax, (u32)0xd); + break; + case 1: + entry->edx &= kvm_supported_word0_x86_features; + cpuid_mask(&entry->edx, 0); + entry->ecx &= kvm_supported_word4_x86_features; + cpuid_mask(&entry->ecx, 4); + /* we support x2apic emulation even if host does not support + * it since we emulate x2apic in software */ + entry->ecx |= F(X2APIC); + break; + /* function 2 entries are STATEFUL. That is, repeated cpuid commands + * may return different values. This forces us to get_cpu() before + * issuing the first command, and also to emulate this annoying behavior + * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ + case 2: { + int t, times = entry->eax & 0xff; + + entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + for (t = 1; t < times && *nent < maxnent; ++t) { + do_cpuid_1_ent(&entry[t], function, 0); + entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + ++*nent; + } + break; + } + /* function 4 has additional index. */ + case 4: { + int i, cache_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until cache_type is zero */ + for (i = 1; *nent < maxnent; ++i) { + cache_type = entry[i - 1].eax & 0x1f; + if (!cache_type) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 7: { + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* Mask ebx against host capbability word 9 */ + if (index == 0) { + entry->ebx &= kvm_supported_word9_x86_features; + cpuid_mask(&entry->ebx, 9); + } else + entry->ebx = 0; + entry->eax = 0; + entry->ecx = 0; + entry->edx = 0; + break; + } + case 9: + break; + /* function 0xb has additional index. */ + case 0xb: { + int i, level_type; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* read more entries until level_type is zero */ + for (i = 1; *nent < maxnent; ++i) { + level_type = entry[i - 1].ecx & 0xff00; + if (!level_type) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } + case 0xd: { + int idx, i; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) { + do_cpuid_1_ent(&entry[i], function, idx); + if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) + continue; + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + ++i; + } + break; + } + case KVM_CPUID_SIGNATURE: { + char signature[12] = "KVMKVMKVM\0\0"; + u32 *sigptr = (u32 *)signature; + entry->eax = 0; + entry->ebx = sigptr[0]; + entry->ecx = sigptr[1]; + entry->edx = sigptr[2]; + break; + } + case KVM_CPUID_FEATURES: + entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | + (1 << KVM_FEATURE_NOP_IO_DELAY) | + (1 << KVM_FEATURE_CLOCKSOURCE2) | + (1 << KVM_FEATURE_ASYNC_PF) | + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + + if (sched_info_on()) + entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); + + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + break; + case 0x80000000: + entry->eax = min(entry->eax, 0x8000001a); + break; + case 0x80000001: + entry->edx &= kvm_supported_word1_x86_features; + cpuid_mask(&entry->edx, 1); + entry->ecx &= kvm_supported_word6_x86_features; + cpuid_mask(&entry->ecx, 6); + break; + case 0x80000008: { + unsigned g_phys_as = (entry->eax >> 16) & 0xff; + unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); + unsigned phys_as = entry->eax & 0xff; + + if (!g_phys_as) + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); + entry->ebx = entry->edx = 0; + break; + } + case 0x80000019: + entry->ecx = entry->edx = 0; + break; + case 0x8000001a: + break; + case 0x8000001d: + break; + /*Add support for Centaur's CPUID instruction*/ + case 0xC0000000: + /*Just support up to 0xC0000004 now*/ + entry->eax = min(entry->eax, 0xC0000004); + break; + case 0xC0000001: + entry->edx &= kvm_supported_word5_x86_features; + cpuid_mask(&entry->edx, 5); + break; + case 3: /* Processor serial number */ + case 5: /* MONITOR/MWAIT */ + case 6: /* Thermal management */ + case 0xA: /* Architectural Performance Monitoring */ + case 0x80000007: /* Advanced power management */ + case 0xC0000002: + case 0xC0000003: + case 0xC0000004: + default: + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + + kvm_x86_ops->set_supported_cpuid(function, entry); + + put_cpu(); +} + +#undef F + +int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries) +{ + struct kvm_cpuid_entry2 *cpuid_entries; + int limit, nent = 0, r = -E2BIG; + u32 func; + + if (cpuid->nent < 1) + goto out; + if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) + cpuid->nent = KVM_MAX_CPUID_ENTRIES; + r = -ENOMEM; + cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); + if (!cpuid_entries) + goto out; + + do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); + limit = cpuid_entries[0].eax; + for (func = 1; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); + limit = cpuid_entries[nent - 1].eax; + for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); + + + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + /* Add support for Centaur's CPUID instruction. */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) { + do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0, + &nent, cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + limit = cpuid_entries[nent - 1].eax; + for (func = 0xC0000001; + func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + } + + do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, + cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent, + cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + r = -EFAULT; + if (copy_to_user(entries, cpuid_entries, + nent * sizeof(struct kvm_cpuid_entry2))) + goto out_free; + cpuid->nent = nent; + r = 0; + +out_free: + vfree(cpuid_entries); +out: + return r; +} + +static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) +{ + struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; + int j, nent = vcpu->arch.cpuid_nent; + + e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; + /* when no next entry is found, the current entry[i] is reselected */ + for (j = i + 1; ; j = (j + 1) % nent) { + struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; + if (ej->function == e->function) { + ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; + return j; + } + } + return 0; /* silence gcc, even though control never reaches here */ +} + +/* find an entry with matching function, matching index (if needed), and that + * should be read next (if it's stateful) */ +static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, + u32 function, u32 index) +{ + if (e->function != function) + return 0; + if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) + return 0; + if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && + !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) + return 0; + return 1; +} + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index) +{ + int i; + struct kvm_cpuid_entry2 *best = NULL; + + for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { + struct kvm_cpuid_entry2 *e; + + e = &vcpu->arch.cpuid_entries[i]; + if (is_matching_cpuid_entry(e, function, index)) { + if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) + move_to_next_stateful_cpuid_entry(vcpu, i); + best = e; + break; + } + } + return best; +} +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); + +int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); + if (!best || best->eax < 0x80000008) + goto not_found; + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + if (best) + return best->eax & 0xff; +not_found: + return 36; +} + +/* + * If no match is found, check whether we exceed the vCPU's limit + * and return the content of the highest valid _standard_ leaf instead. + * This is to satisfy the CPUID specification. + */ +static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, + u32 function, u32 index) +{ + struct kvm_cpuid_entry2 *maxlevel; + + maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + if (!maxlevel || maxlevel->eax >= function) + return NULL; + if (function & 0x80000000) { + maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); + if (!maxlevel) + return NULL; + } + return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); +} + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 function, index; + struct kvm_cpuid_entry2 *best; + + function = kvm_register_read(vcpu, VCPU_REGS_RAX); + index = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_register_write(vcpu, VCPU_REGS_RAX, 0); + kvm_register_write(vcpu, VCPU_REGS_RBX, 0); + kvm_register_write(vcpu, VCPU_REGS_RCX, 0); + kvm_register_write(vcpu, VCPU_REGS_RDX, 0); + best = kvm_find_cpuid_entry(vcpu, function, index); + + if (!best) + best = check_cpuid_limit(vcpu, function, index); + + if (best) { + kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); + kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); + kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); + } + kvm_x86_ops->skip_emulated_instruction(vcpu); + trace_kvm_cpuid(function, + kvm_register_read(vcpu, VCPU_REGS_RAX), + kvm_register_read(vcpu, VCPU_REGS_RBX), + kvm_register_read(vcpu, VCPU_REGS_RCX), + kvm_register_read(vcpu, VCPU_REGS_RDX)); +} +EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h new file mode 100644 index 0000000..5b97e17 --- /dev/null +++ b/arch/x86/kvm/cpuid.h @@ -0,0 +1,46 @@ +#ifndef ARCH_X86_KVM_CPUID_H +#define ARCH_X86_KVM_CPUID_H + +#include "x86.h" + +void kvm_update_cpuid(struct kvm_vcpu *vcpu); +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index); +int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries); +int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid *cpuid, + struct kvm_cpuid_entry __user *entries); +int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries); +int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 *cpuid, + struct kvm_cpuid_entry2 __user *entries); + + +static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_XSAVE)); +} + +static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_SMEP)); +} + +static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); +} + +#endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 54abb40..a7f3e65 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -38,6 +38,7 @@ #include "irq.h" #include "trace.h" #include "x86.h" +#include "cpuid.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8f19d91..4ceced2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -18,6 +18,7 @@ #include "irq.h" #include "mmu.h" +#include "cpuid.h" #include #include diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b6776c6..4e533d2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -26,6 +26,7 @@ #include "tss.h" #include "kvm_cache_regs.h" #include "x86.h" +#include "cpuid.h" #include #include @@ -82,8 +83,6 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU static void update_cr8_intercept(struct kvm_vcpu *vcpu); -static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries); static void process_nmi(struct kvm_vcpu *vcpu); struct kvm_x86_ops *kvm_x86_ops; @@ -574,54 +573,6 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); -static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 1, 0); - return best && (best->ecx & bit(X86_FEATURE_XSAVE)); -} - -static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_SMEP)); -} - -static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); -} - -static void update_cpuid(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - struct kvm_lapic *apic = vcpu->arch.apic; - - best = kvm_find_cpuid_entry(vcpu, 1, 0); - if (!best) - return; - - /* Update OSXSAVE bit */ - if (cpu_has_xsave && best->function == 0x1) { - best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) - best->ecx |= bit(X86_FEATURE_OSXSAVE); - } - - if (apic) { - if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) - apic->lapic_timer.timer_mode_mask = 3 << 17; - else - apic->lapic_timer.timer_mode_mask = 1 << 17; - } -} - int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); @@ -655,7 +606,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_mmu_reset_context(vcpu); if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) - update_cpuid(vcpu); + kvm_update_cpuid(vcpu); return 0; } @@ -2265,466 +2216,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); } -static int is_efer_nx(void) -{ - unsigned long long efer = 0; - - rdmsrl_safe(MSR_EFER, &efer); - return efer & EFER_NX; -} - -static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_cpuid_entry2 *e, *entry; - - entry = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - if (e->function == 0x80000001) { - entry = e; - break; - } - } - if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) { - entry->edx &= ~(1 << 20); - printk(KERN_INFO "kvm: guest NX capability removed\n"); - } -} - -/* when an old userspace process fills a new kernel module */ -static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, - struct kvm_cpuid *cpuid, - struct kvm_cpuid_entry __user *entries) -{ - int r, i; - struct kvm_cpuid_entry *cpuid_entries; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent); - if (!cpuid_entries) - goto out; - r = -EFAULT; - if (copy_from_user(cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry))) - goto out_free; - for (i = 0; i < cpuid->nent; i++) { - vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; - vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; - vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx; - vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx; - vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx; - vcpu->arch.cpuid_entries[i].index = 0; - vcpu->arch.cpuid_entries[i].flags = 0; - vcpu->arch.cpuid_entries[i].padding[0] = 0; - vcpu->arch.cpuid_entries[i].padding[1] = 0; - vcpu->arch.cpuid_entries[i].padding[2] = 0; - } - vcpu->arch.cpuid_nent = cpuid->nent; - cpuid_fix_nx_cap(vcpu); - r = 0; - kvm_apic_set_version(vcpu); - kvm_x86_ops->cpuid_update(vcpu); - update_cpuid(vcpu); - -out_free: - vfree(cpuid_entries); -out: - return r; -} - -static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) -{ - int r; - - r = -E2BIG; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - goto out; - r = -EFAULT; - if (copy_from_user(&vcpu->arch.cpuid_entries, entries, - cpuid->nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - vcpu->arch.cpuid_nent = cpuid->nent; - kvm_apic_set_version(vcpu); - kvm_x86_ops->cpuid_update(vcpu); - update_cpuid(vcpu); - return 0; - -out: - return r; -} - -static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, - struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) -{ - int r; - - r = -E2BIG; - if (cpuid->nent < vcpu->arch.cpuid_nent) - goto out; - r = -EFAULT; - if (copy_to_user(entries, &vcpu->arch.cpuid_entries, - vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - return 0; - -out: - cpuid->nent = vcpu->arch.cpuid_nent; - return r; -} - -static void cpuid_mask(u32 *word, int wordnum) -{ - *word &= boot_cpu_data.x86_capability[wordnum]; -} - -static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index) -{ - entry->function = function; - entry->index = index; - cpuid_count(entry->function, entry->index, - &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - entry->flags = 0; -} - -static bool supported_xcr0_bit(unsigned bit) -{ - u64 mask = ((u64)1 << bit); - - return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; -} - -#define F(x) bit(X86_FEATURE_##x) - -static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index, int *nent, int maxnent) -{ - unsigned f_nx = is_efer_nx() ? F(NX) : 0; -#ifdef CONFIG_X86_64 - unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) - ? F(GBPAGES) : 0; - unsigned f_lm = F(LM); -#else - unsigned f_gbpages = 0; - unsigned f_lm = 0; -#endif - unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; - - /* cpuid 1.edx */ - const u32 kvm_supported_word0_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) | - 0 /* Reserved, DS, ACPI */ | F(MMX) | - F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | - 0 /* HTT, TM, Reserved, PBE */; - /* cpuid 0x80000001.edx */ - const u32 kvm_supported_word1_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* Reserved */ | - f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | - 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); - /* cpuid 1.ecx */ - const u32 kvm_supported_word4_x86_features = - F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | - 0 /* DS-CPL, VMX, SMX, EST */ | - 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | - 0 /* Reserved, DCA */ | F(XMM4_1) | - F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | - F(F16C) | F(RDRAND); - /* cpuid 0x80000001.ecx */ - const u32 kvm_supported_word6_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | - F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | - 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); - - /* cpuid 0xC0000001.edx */ - const u32 kvm_supported_word5_x86_features = - F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | - F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | - F(PMM) | F(PMM_EN); - - /* cpuid 7.0.ebx */ - const u32 kvm_supported_word9_x86_features = - F(SMEP) | F(FSGSBASE) | F(ERMS); - - /* all calls to cpuid_count() should be made on the same cpu */ - get_cpu(); - do_cpuid_1_ent(entry, function, index); - ++*nent; - - switch (function) { - case 0: - entry->eax = min(entry->eax, (u32)0xd); - break; - case 1: - entry->edx &= kvm_supported_word0_x86_features; - cpuid_mask(&entry->edx, 0); - entry->ecx &= kvm_supported_word4_x86_features; - cpuid_mask(&entry->ecx, 4); - /* we support x2apic emulation even if host does not support - * it since we emulate x2apic in software */ - entry->ecx |= F(X2APIC); - break; - /* function 2 entries are STATEFUL. That is, repeated cpuid commands - * may return different values. This forces us to get_cpu() before - * issuing the first command, and also to emulate this annoying behavior - * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ - case 2: { - int t, times = entry->eax & 0xff; - - entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - for (t = 1; t < times && *nent < maxnent; ++t) { - do_cpuid_1_ent(&entry[t], function, 0); - entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - ++*nent; - } - break; - } - /* function 4 has additional index. */ - case 4: { - int i, cache_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until cache_type is zero */ - for (i = 1; *nent < maxnent; ++i) { - cache_type = entry[i - 1].eax & 0x1f; - if (!cache_type) - break; - do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 7: { - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* Mask ebx against host capbability word 9 */ - if (index == 0) { - entry->ebx &= kvm_supported_word9_x86_features; - cpuid_mask(&entry->ebx, 9); - } else - entry->ebx = 0; - entry->eax = 0; - entry->ecx = 0; - entry->edx = 0; - break; - } - case 9: - break; - /* function 0xb has additional index. */ - case 0xb: { - int i, level_type; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - /* read more entries until level_type is zero */ - for (i = 1; *nent < maxnent; ++i) { - level_type = entry[i - 1].ecx & 0xff00; - if (!level_type) - break; - do_cpuid_1_ent(&entry[i], function, i); - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - } - break; - } - case 0xd: { - int idx, i; - - entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) { - do_cpuid_1_ent(&entry[i], function, idx); - if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) - continue; - entry[i].flags |= - KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - ++*nent; - ++i; - } - break; - } - case KVM_CPUID_SIGNATURE: { - char signature[12] = "KVMKVMKVM\0\0"; - u32 *sigptr = (u32 *)signature; - entry->eax = 0; - entry->ebx = sigptr[0]; - entry->ecx = sigptr[1]; - entry->edx = sigptr[2]; - break; - } - case KVM_CPUID_FEATURES: - entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | - (1 << KVM_FEATURE_NOP_IO_DELAY) | - (1 << KVM_FEATURE_CLOCKSOURCE2) | - (1 << KVM_FEATURE_ASYNC_PF) | - (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); - - if (sched_info_on()) - entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); - - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - break; - case 0x80000000: - entry->eax = min(entry->eax, 0x8000001a); - break; - case 0x80000001: - entry->edx &= kvm_supported_word1_x86_features; - cpuid_mask(&entry->edx, 1); - entry->ecx &= kvm_supported_word6_x86_features; - cpuid_mask(&entry->ecx, 6); - break; - case 0x80000008: { - unsigned g_phys_as = (entry->eax >> 16) & 0xff; - unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); - unsigned phys_as = entry->eax & 0xff; - - if (!g_phys_as) - g_phys_as = phys_as; - entry->eax = g_phys_as | (virt_as << 8); - entry->ebx = entry->edx = 0; - break; - } - case 0x80000019: - entry->ecx = entry->edx = 0; - break; - case 0x8000001a: - break; - case 0x8000001d: - break; - /*Add support for Centaur's CPUID instruction*/ - case 0xC0000000: - /*Just support up to 0xC0000004 now*/ - entry->eax = min(entry->eax, 0xC0000004); - break; - case 0xC0000001: - entry->edx &= kvm_supported_word5_x86_features; - cpuid_mask(&entry->edx, 5); - break; - case 3: /* Processor serial number */ - case 5: /* MONITOR/MWAIT */ - case 6: /* Thermal management */ - case 0xA: /* Architectural Performance Monitoring */ - case 0x80000007: /* Advanced power management */ - case 0xC0000002: - case 0xC0000003: - case 0xC0000004: - default: - entry->eax = entry->ebx = entry->ecx = entry->edx = 0; - break; - } - - kvm_x86_ops->set_supported_cpuid(function, entry); - - put_cpu(); -} - -#undef F - -static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, - struct kvm_cpuid_entry2 __user *entries) -{ - struct kvm_cpuid_entry2 *cpuid_entries; - int limit, nent = 0, r = -E2BIG; - u32 func; - - if (cpuid->nent < 1) - goto out; - if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) - cpuid->nent = KVM_MAX_CPUID_ENTRIES; - r = -ENOMEM; - cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); - if (!cpuid_entries) - goto out; - - do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); - limit = cpuid_entries[0].eax; - for (func = 1; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); - limit = cpuid_entries[nent - 1].eax; - for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); - - - - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - /* Add support for Centaur's CPUID instruction. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) { - do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0, - &nent, cpuid->nent); - - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - limit = cpuid_entries[nent - 1].eax; - for (func = 0xC0000001; - func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); - - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - } - - do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, - cpuid->nent); - - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent, - cpuid->nent); - - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - r = -EFAULT; - if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) - goto out_free; - cpuid->nent = nent; - r = 0; - -out_free: - vfree(cpuid_entries); -out: - return r; -} - static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { @@ -5438,125 +4929,6 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) return emulator_write_emulated(ctxt, rip, instruction, 3, NULL); } -static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) -{ - struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; - int j, nent = vcpu->arch.cpuid_nent; - - e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; - /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; ; j = (j + 1) % nent) { - struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; - if (ej->function == e->function) { - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - return j; - } - } - return 0; /* silence gcc, even though control never reaches here */ -} - -/* find an entry with matching function, matching index (if needed), and that - * should be read next (if it's stateful) */ -static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, - u32 function, u32 index) -{ - if (e->function != function) - return 0; - if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) - return 0; - if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) - return 0; - return 1; -} - -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index) -{ - int i; - struct kvm_cpuid_entry2 *best = NULL; - - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - struct kvm_cpuid_entry2 *e; - - e = &vcpu->arch.cpuid_entries[i]; - if (is_matching_cpuid_entry(e, function, index)) { - if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) - move_to_next_stateful_cpuid_entry(vcpu, i); - best = e; - break; - } - } - return best; -} -EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); - -int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); - if (!best || best->eax < 0x80000008) - goto not_found; - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); - if (best) - return best->eax & 0xff; -not_found: - return 36; -} - -/* - * If no match is found, check whether we exceed the vCPU's limit - * and return the content of the highest valid _standard_ leaf instead. - * This is to satisfy the CPUID specification. - */ -static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, - u32 function, u32 index) -{ - struct kvm_cpuid_entry2 *maxlevel; - - maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); - if (!maxlevel || maxlevel->eax >= function) - return NULL; - if (function & 0x80000000) { - maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); - if (!maxlevel) - return NULL; - } - return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); -} - -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) -{ - u32 function, index; - struct kvm_cpuid_entry2 *best; - - function = kvm_register_read(vcpu, VCPU_REGS_RAX); - index = kvm_register_read(vcpu, VCPU_REGS_RCX); - kvm_register_write(vcpu, VCPU_REGS_RAX, 0); - kvm_register_write(vcpu, VCPU_REGS_RBX, 0); - kvm_register_write(vcpu, VCPU_REGS_RCX, 0); - kvm_register_write(vcpu, VCPU_REGS_RDX, 0); - best = kvm_find_cpuid_entry(vcpu, function, index); - - if (!best) - best = check_cpuid_limit(vcpu, function, index); - - if (best) { - kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); - kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); - kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); - kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); - } - kvm_x86_ops->skip_emulated_instruction(vcpu); - trace_kvm_cpuid(function, - kvm_register_read(vcpu, VCPU_REGS_RAX), - kvm_register_read(vcpu, VCPU_REGS_RBX), - kvm_register_read(vcpu, VCPU_REGS_RCX), - kvm_register_read(vcpu, VCPU_REGS_RDX)); -} -EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); - /* * Check if userspace requested an interrupt window, and that the * interrupt window is open. @@ -6222,7 +5594,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (sregs->cr4 & X86_CR4_OSXSAVE) - update_cpuid(vcpu); + kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); if (!is_long_mode(vcpu) && is_pae(vcpu)) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index d36fe23..cb80c29 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -33,9 +33,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr) return (nr == BP_VECTOR) || (nr == OF_VECTOR); } -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); - static inline bool is_protmode(struct kvm_vcpu *vcpu) { return kvm_read_cr0_bits(vcpu, X86_CR0_PE); @@ -125,4 +122,6 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); +extern u64 host_xcr0; + #endif -- cgit v1.1 From fb215366b3c7320ac25dca766a0152df16534932 Mon Sep 17 00:00:00 2001 From: "Liu, Jinsong" Date: Mon, 28 Nov 2011 03:55:19 -0800 Subject: KVM: expose latest Intel cpu new features (BMI1/BMI2/FMA/AVX2) to guest Intel latest cpu add 6 new features, refer http://software.intel.com/file/36945 The new feature cpuid listed as below: 1. FMA CPUID.EAX=01H:ECX.FMA[bit 12] 2. MOVBE CPUID.EAX=01H:ECX.MOVBE[bit 22] 3. BMI1 CPUID.EAX=07H,ECX=0H:EBX.BMI1[bit 3] 4. AVX2 CPUID.EAX=07H,ECX=0H:EBX.AVX2[bit 5] 5. BMI2 CPUID.EAX=07H,ECX=0H:EBX.BMI2[bit 8] 6. LZCNT CPUID.EAX=80000001H:ECX.LZCNT[bit 5] This patch expose these features to guest. Among them, FMA/MOVBE/LZCNT has already been defined, MOVBE/LZCNT has already been exposed. This patch defines BMI1/AVX2/BMI2, and exposes FMA/BMI1/AVX2/BMI2 to guest. Signed-off-by: Liu, Jinsong Signed-off-by: Avi Kivity --- arch/x86/include/asm/cpufeature.h | 3 +++ arch/x86/kvm/cpuid.c | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index f3444f7..17c5d4b 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -197,7 +197,10 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ #define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */ #define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0a332ec..47be763 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -222,7 +222,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | + F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | @@ -242,7 +242,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = - F(SMEP) | F(FSGSBASE) | F(ERMS); + F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); -- cgit v1.1 From 831bf664e9c1fc08fc6b3984d00d275cac82f5e9 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 28 Nov 2011 11:20:29 +0200 Subject: KVM: Refactor and simplify kvm_dev_ioctl_get_supported_cpuid This patch cleans and simplifies kvm_dev_ioctl_get_supported_cpuid by using a table instead of duplicating code as Avi suggested. This patch also fixes a bug where kvm_dev_ioctl_get_supported_cpuid would return -E2BIG when amount of entries passed was just right. Signed-off-by: Sasha Levin Signed-off-by: Avi Kivity --- arch/x86/kvm/cpuid.c | 113 ++++++++++++++++++++++++++++----------------------- 1 file changed, 63 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 47be763..52593e8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -183,9 +183,10 @@ static bool supported_xcr0_bit(unsigned bit) #define F(x) bit(X86_FEATURE_##x) -static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, +static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { + int r; unsigned f_nx = is_efer_nx() ? F(NX) : 0; #ifdef CONFIG_X86_64 unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) @@ -246,6 +247,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); + + r = -E2BIG; + + if (*nent >= maxnent) + goto out; + do_cpuid_1_ent(entry, function, index); ++*nent; @@ -271,7 +278,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - for (t = 1; t < times && *nent < maxnent; ++t) { + for (t = 1; t < times; ++t) { + if (*nent >= maxnent) + goto out; + do_cpuid_1_ent(&entry[t], function, 0); entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; ++*nent; @@ -284,7 +294,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until cache_type is zero */ - for (i = 1; *nent < maxnent; ++i) { + for (i = 1; ; ++i) { + if (*nent >= maxnent) + goto out; + cache_type = entry[i - 1].eax & 0x1f; if (!cache_type) break; @@ -316,7 +329,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until level_type is zero */ - for (i = 1; *nent < maxnent; ++i) { + for (i = 1; ; ++i) { + if (*nent >= maxnent) + goto out; + level_type = entry[i - 1].ecx & 0xff00; if (!level_type) break; @@ -331,7 +347,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, int idx, i; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) { + for (idx = 1, i = 1; idx < 64; ++idx) { + if (*nent >= maxnent) + goto out; + do_cpuid_1_ent(&entry[i], function, idx); if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) continue; @@ -416,17 +435,41 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, kvm_x86_ops->set_supported_cpuid(function, entry); + r = 0; + +out: put_cpu(); + + return r; } #undef F +struct kvm_cpuid_param { + u32 func; + u32 idx; + bool has_leaf_count; + bool (*qualifier)(struct kvm_cpuid_param *param); +}; + +static bool is_centaur_cpu(struct kvm_cpuid_param *param) +{ + return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; +} + int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { struct kvm_cpuid_entry2 *cpuid_entries; - int limit, nent = 0, r = -E2BIG; + int limit, nent = 0, r = -E2BIG, i; u32 func; + static struct kvm_cpuid_param param[] = { + { .func = 0, .has_leaf_count = true }, + { .func = 0x80000000, .has_leaf_count = true }, + { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true }, + { .func = KVM_CPUID_SIGNATURE }, + { .func = KVM_CPUID_FEATURES }, + }; if (cpuid->nent < 1) goto out; @@ -437,61 +480,31 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, if (!cpuid_entries) goto out; - do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent); - limit = cpuid_entries[0].eax; - for (func = 1; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent); - limit = cpuid_entries[nent - 1].eax; - for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); - - + r = 0; + for (i = 0; i < ARRAY_SIZE(param); i++) { + struct kvm_cpuid_param *ent = ¶m[i]; - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; + if (ent->qualifier && !ent->qualifier(ent)) + continue; - /* Add support for Centaur's CPUID instruction. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) { - do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0, + r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx, &nent, cpuid->nent); - r = -E2BIG; - if (nent >= cpuid->nent) + if (r) goto out_free; + if (!ent->has_leaf_count) + continue; + limit = cpuid_entries[nent - 1].eax; - for (func = 0xC0000001; - func <= limit && nent < cpuid->nent; ++func) - do_cpuid_ent(&cpuid_entries[nent], func, 0, - &nent, cpuid->nent); + for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) + r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx, + &nent, cpuid->nent); - r = -E2BIG; - if (nent >= cpuid->nent) + if (r) goto out_free; } - do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, - cpuid->nent); - - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - - do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent, - cpuid->nent); - - r = -E2BIG; - if (nent >= cpuid->nent) - goto out_free; - r = -EFAULT; if (copy_to_user(entries, cpuid_entries, nent * sizeof(struct kvm_cpuid_entry2))) -- cgit v1.1 From 0375f7fad904b59502341ccecfc8faea70b34c91 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 28 Nov 2011 20:41:00 +0800 Subject: KVM: MMU: audit: replace mmu audit tracepoint with jump-label The tracepoint is only used to audit mmu code, it should not be exposed to user, let us replace it with jump-label. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 16 +++++++++++----- arch/x86/kvm/mmu_audit.c | 28 +++++++++++++--------------- arch/x86/kvm/mmutrace.h | 19 ------------------- arch/x86/kvm/paging_tmpl.h | 4 ++-- 4 files changed, 26 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d737443..62f69db 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -68,6 +68,12 @@ char *audit_point_name[] = { "post sync" }; +#ifdef CONFIG_KVM_MMU_AUDIT +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point); +#else +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } +#endif + #undef MMU_DEBUG #ifdef MMU_DEBUG @@ -2852,12 +2858,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) return; vcpu_clear_mmio_info(vcpu, ~0ul); - trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); mmu_sync_children(vcpu, sp); - trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); + kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); return; } for (i = 0; i < 4; ++i) { @@ -2869,7 +2875,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_children(vcpu, sp); } } - trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); + kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) @@ -3667,7 +3673,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, spin_lock(&vcpu->kvm->mmu_lock); ++vcpu->kvm->stat.mmu_pte_write; - trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); + kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { @@ -3700,7 +3706,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); + kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); } diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 746ec25..5df6736 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -224,30 +224,29 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) mmu_spte_walk(vcpu, audit_spte); } -static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) +static bool mmu_audit; +static struct jump_label_key mmu_audit_key; + +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); - if (!__ratelimit(&ratelimit_state)) - return; + if (static_branch((&mmu_audit_key))) { + if (!__ratelimit(&ratelimit_state)) + return; - vcpu->kvm->arch.audit_point = point; - audit_all_active_sps(vcpu->kvm); - audit_vcpu_spte(vcpu); + vcpu->kvm->arch.audit_point = point; + audit_all_active_sps(vcpu->kvm); + audit_vcpu_spte(vcpu); + } } -static bool mmu_audit; - static void mmu_audit_enable(void) { - int ret; - if (mmu_audit) return; - ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); - WARN_ON(ret); - + jump_label_inc(&mmu_audit_key); mmu_audit = true; } @@ -256,8 +255,7 @@ static void mmu_audit_disable(void) if (!mmu_audit) return; - unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); - tracepoint_synchronize_unregister(); + jump_label_dec(&mmu_audit_key); mmu_audit = false; } diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index eed67f3..89fb0e8 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -243,25 +243,6 @@ TRACE_EVENT( TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, __entry->access) ); - -TRACE_EVENT( - kvm_mmu_audit, - TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), - TP_ARGS(vcpu, audit_point), - - TP_STRUCT__entry( - __field(struct kvm_vcpu *, vcpu) - __field(int, audit_point) - ), - - TP_fast_assign( - __entry->vcpu = vcpu; - __entry->audit_point = audit_point; - ), - - TP_printk("vcpu:%d %s", __entry->vcpu->cpu, - audit_point_name[__entry->audit_point]) -); #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 52e9d58..1561028 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -632,7 +632,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; - trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); + kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); if (!force_pt_level) transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); @@ -643,7 +643,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, sptep, *sptep, emulate); ++vcpu->stat.pf_fixed; - trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); + kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); return emulate; -- cgit v1.1 From 9edb17d55f3ea4943f9654f2aad7a99b4c55840a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 28 Nov 2011 20:41:38 +0800 Subject: KVM: x86: remove the dead code of KVM_EXIT_HYPERCALL KVM_EXIT_HYPERCALL is not used anymore, so remove the code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4e533d2..4650531 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5389,10 +5389,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (r <= 0) goto out; - if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) - kvm_register_write(vcpu, VCPU_REGS_RAX, - kvm_run->hypercall.ret); - r = __vcpu_run(vcpu); out: -- cgit v1.1 From e459e3228dc57f7160e564ce0f09edb5bee656d3 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 28 Nov 2011 20:42:16 +0800 Subject: KVM: MMU: move the relevant mmu code to mmu.c Move the mmu code in kvm_arch_vcpu_init() to kvm_mmu_create() Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/mmu.c | 6 +++++- arch/x86/kvm/x86.c | 11 +---------- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1769f3d..020413a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -752,6 +752,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, @@ -773,6 +774,11 @@ void kvm_disable_tdp(void); int complete_pio(struct kvm_vcpu *vcpu); bool kvm_check_iopl(struct kvm_vcpu *vcpu); +static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +{ + return gpa; +} + static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 62f69db..262a3af 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3839,7 +3839,11 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + vcpu->arch.walk_mmu = &vcpu->arch.mmu; + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + vcpu->arch.mmu.translate_gpa = translate_gpa; + vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; return alloc_mmu_pages(vcpu); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4650531..d99976e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3430,12 +3430,7 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, kvm_x86_ops->get_segment(vcpu, var, seg); } -static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) -{ - return gpa; -} - -static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) { gpa_t t_gpa; struct x86_exception exception; @@ -5915,10 +5910,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm = vcpu->kvm; vcpu->arch.emulate_ctxt.ops = &emulate_ops; - vcpu->arch.walk_mmu = &vcpu->arch.mmu; - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - vcpu->arch.mmu.translate_gpa = translate_gpa; - vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else -- cgit v1.1 From d750ea28865dbff6a73444358d189dd811c68c50 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 28 Nov 2011 20:43:18 +0800 Subject: KVM: MMU: remove oos_shadow parameter The unsync code should be stable now, maybe it is the time to remove this parameter to cleanup the code a little bit Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 262a3af..b1178d1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -93,9 +93,6 @@ static int dbg = 0; module_param(dbg, bool, 0644); #endif -static int oos_shadow = 1; -module_param(oos_shadow, bool, 0644); - #ifndef MMU_DEBUG #define ASSERT(x) do { } while (0) #else @@ -2196,8 +2193,6 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return 1; if (!need_unsync && !s->unsync) { - if (!oos_shadow) - return 1; need_unsync = true; } } -- cgit v1.1 From e37fa7853c276403da2fea8792c579e8bfd75042 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 30 Nov 2011 17:43:24 +0800 Subject: KVM: MMU: audit: inline audit function inline audit function and little cleanup Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 28 +++++++--------------------- arch/x86/kvm/mmu_audit.c | 29 +++++++++++++++++++++-------- 2 files changed, 28 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b1178d1..7a8e99c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -59,21 +59,6 @@ enum { AUDIT_POST_SYNC }; -char *audit_point_name[] = { - "pre page fault", - "post page fault", - "pre pte write", - "post pte write", - "pre sync", - "post sync" -}; - -#ifdef CONFIG_KVM_MMU_AUDIT -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point); -#else -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } -#endif - #undef MMU_DEBUG #ifdef MMU_DEBUG @@ -1539,6 +1524,13 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, return ret; } +#ifdef CONFIG_KVM_MMU_AUDIT +#include "mmu_audit.c" +#else +static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } +static void mmu_audit_disable(void) { } +#endif + static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { @@ -4035,12 +4027,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) mmu_free_memory_caches(vcpu); } -#ifdef CONFIG_KVM_MMU_AUDIT -#include "mmu_audit.c" -#else -static void mmu_audit_disable(void) { } -#endif - void kvm_mmu_module_exit(void) { mmu_destroy_caches(); diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 5df6736..fe15dcc 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -19,6 +19,15 @@ #include +char const *audit_point_name[] = { + "pre page fault", + "post page fault", + "pre pte write", + "post pte write", + "pre sync", + "post sync" +}; + #define audit_printk(kvm, fmt, args...) \ printk(KERN_ERR "audit: (%s) error: " \ fmt, audit_point_name[kvm->arch.audit_point], ##args) @@ -227,18 +236,22 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) static bool mmu_audit; static struct jump_label_key mmu_audit_key; -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) +static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); - if (static_branch((&mmu_audit_key))) { - if (!__ratelimit(&ratelimit_state)) - return; + if (!__ratelimit(&ratelimit_state)) + return; - vcpu->kvm->arch.audit_point = point; - audit_all_active_sps(vcpu->kvm); - audit_vcpu_spte(vcpu); - } + vcpu->kvm->arch.audit_point = point; + audit_all_active_sps(vcpu->kvm); + audit_vcpu_spte(vcpu); +} + +static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) +{ + if (static_branch((&mmu_audit_key))) + __kvm_mmu_audit(vcpu, point); } static void mmu_audit_enable(void) -- cgit v1.1 From 086c9855019935854311b477b33498a6ea357ef6 Mon Sep 17 00:00:00 2001 From: "Alex,Shi" Date: Thu, 20 Oct 2011 15:34:01 +0800 Subject: KVM: use this_cpu_xxx replace percpu_xxx funcs percpu_xxx funcs are duplicated with this_cpu_xxx funcs, so replace them for further code clean up. And in preempt safe scenario, __this_cpu_xxx funcs has a bit better performance since __this_cpu_xxx has no redundant preempt_disable() Signed-off-by: Alex Shi Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d99976e..d55a94f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4664,15 +4664,15 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); static int kvm_is_in_guest(void) { - return percpu_read(current_vcpu) != NULL; + return __this_cpu_read(current_vcpu) != NULL; } static int kvm_is_user_mode(void) { int user_mode = 3; - if (percpu_read(current_vcpu)) - user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu)); + if (__this_cpu_read(current_vcpu)) + user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu)); return user_mode != 0; } @@ -4681,8 +4681,8 @@ static unsigned long kvm_get_guest_ip(void) { unsigned long ip = 0; - if (percpu_read(current_vcpu)) - ip = kvm_rip_read(percpu_read(current_vcpu)); + if (__this_cpu_read(current_vcpu)) + ip = kvm_rip_read(__this_cpu_read(current_vcpu)); return ip; } @@ -4695,13 +4695,13 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) { - percpu_write(current_vcpu, vcpu); + __this_cpu_write(current_vcpu, vcpu); } EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) { - percpu_write(current_vcpu, NULL); + __this_cpu_write(current_vcpu, NULL); } EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); -- cgit v1.1 From 3d56cbdf359c953f8bfcab68aa5cf766e4480799 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 2 Dec 2011 18:35:24 +0100 Subject: KVM: MMU: Drop unused return value of kvm_mmu_remove_some_alloc_mmu_pages freed_pages is never evaluated, so remove it as well as the return code kvm_mmu_remove_some_alloc_mmu_pages so far delivered to its only user. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7a8e99c..2a2a9b4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3890,14 +3890,14 @@ restart: spin_unlock(&kvm->mmu_lock); } -static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, - struct list_head *invalid_list) +static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, + struct list_head *invalid_list) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); + kvm_mmu_prepare_zap_page(kvm, page, invalid_list); } static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) @@ -3912,15 +3912,15 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int idx, freed_pages; + int idx; LIST_HEAD(invalid_list); idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); if (!kvm_freed && nr_to_scan > 0 && kvm->arch.n_used_mmu_pages > 0) { - freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, - &invalid_list); + kvm_mmu_remove_some_alloc_mmu_pages(kvm, + &invalid_list); kvm_freed = kvm; } nr_to_scan--; -- cgit v1.1 From 234b639206a7d9d5ca362cff64ceddd4f27e4a46 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 2 Dec 2011 18:26:28 +0100 Subject: KVM: x86 emulator: Remove set-but-unused cr4 from check_cr_write This was probably copy&pasted from the cr0 case, but it's unneeded here. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ac8e5ed..f641201 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3028,9 +3028,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) break; } case 4: { - u64 cr4; - - cr4 = ctxt->ops->get_cr(ctxt, 4); ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE)) -- cgit v1.1 From cdfca7b346e6dbab1ba33260c28ccb8333485a5b Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Sun, 4 Dec 2011 19:36:28 +0200 Subject: KVM: Use kmemdup() instead of kmalloc/memcpy Switch to kmemdup() in two places to shorten the code and avoid possible bugs. Signed-off-by: Sasha Levin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d55a94f..03042d6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3031,10 +3031,10 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, memset(dirty_bitmap_head, 0, n); r = -ENOMEM; - slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL); if (!slots) goto out; - memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + memslot = id_to_memslot(slots, log->slot); memslot->nr_dirty_pages = 0; memslot->dirty_bitmap = dirty_bitmap_head; -- cgit v1.1 From ff5c2c0316ff0e3e2dba3ca14167d994453df093 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Sun, 4 Dec 2011 19:36:29 +0200 Subject: KVM: Use memdup_user instead of kmalloc/copy_from_user Switch to using memdup_user when possible. This makes code more smaller and compact, and prevents errors. Signed-off-by: Sasha Levin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 82 +++++++++++++++++++++++------------------------------- 1 file changed, 35 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03042d6..0a646e2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1309,12 +1309,11 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) if (page_num >= blob_size) goto out; r = -ENOMEM; - page = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!page) + page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); + if (IS_ERR(page)) { + r = PTR_ERR(page); goto out; - r = -EFAULT; - if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE)) - goto out_free; + } if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) goto out_free; r = 0; @@ -1988,15 +1987,12 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, if (msrs.nmsrs >= MAX_IO_MSRS) goto out; - r = -ENOMEM; size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; - entries = kmalloc(size, GFP_KERNEL); - if (!entries) + entries = memdup_user(user_msrs->entries, size); + if (IS_ERR(entries)) { + r = PTR_ERR(entries); goto out; - - r = -EFAULT; - if (copy_from_user(entries, user_msrs->entries, size)) - goto out_free; + } r = n = __msr_io(vcpu, &msrs, entries, do_msr); if (r < 0) @@ -2533,13 +2529,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!vcpu->arch.apic) goto out; - u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); - r = -ENOMEM; - if (!u.lapic) - goto out; - r = -EFAULT; - if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state))) + u.lapic = memdup_user(argp, sizeof(*u.lapic)); + if (IS_ERR(u.lapic)) { + r = PTR_ERR(u.lapic); goto out; + } + r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); if (r) goto out; @@ -2718,14 +2713,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XSAVE: { - u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); - r = -ENOMEM; - if (!u.xsave) - break; - - r = -EFAULT; - if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) - break; + u.xsave = memdup_user(argp, sizeof(*u.xsave)); + if (IS_ERR(u.xsave)) { + r = PTR_ERR(u.xsave); + goto out; + } r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; @@ -2746,15 +2738,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XCRS: { - u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); - r = -ENOMEM; - if (!u.xcrs) - break; - - r = -EFAULT; - if (copy_from_user(u.xcrs, argp, - sizeof(struct kvm_xcrs))) - break; + u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); + if (IS_ERR(u.xcrs)) { + r = PTR_ERR(u.xcrs); + goto out; + } r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; @@ -3190,14 +3178,14 @@ long kvm_arch_vm_ioctl(struct file *filp, } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); + struct kvm_irqchip *chip; - r = -ENOMEM; - if (!chip) + chip = memdup_user(argp, sizeof(*chip)); + if (IS_ERR(chip)) { + r = PTR_ERR(chip); goto out; - r = -EFAULT; - if (copy_from_user(chip, argp, sizeof *chip)) - goto get_irqchip_out; + } + r = -ENXIO; if (!irqchip_in_kernel(kvm)) goto get_irqchip_out; @@ -3216,14 +3204,14 @@ long kvm_arch_vm_ioctl(struct file *filp, } case KVM_SET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); + struct kvm_irqchip *chip; - r = -ENOMEM; - if (!chip) + chip = memdup_user(argp, sizeof(*chip)); + if (IS_ERR(chip)) { + r = PTR_ERR(chip); goto out; - r = -EFAULT; - if (copy_from_user(chip, argp, sizeof *chip)) - goto set_irqchip_out; + } + r = -ENXIO; if (!irqchip_in_kernel(kvm)) goto set_irqchip_out; -- cgit v1.1 From 43771ebfc9d34ab1f74095d052d225a82ae0898c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 14 Dec 2011 12:27:47 +0200 Subject: KVM: Make KVM_INTEL depend on CPU_SUP_INTEL PMU virtualization needs to talk to Intel-specific bits of perf; these are only available when CPU_SUP_INTEL=y. Fixes arch/x86/built-in.o: In function `atomic_switch_perf_msrs': vmx.c:(.text+0x6b1d4): undefined reference to `perf_guest_get_msrs' Reported-by: Ingo Molnar Reported-by: Randy Dunlap Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ff5790d..ca4d49e 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -52,6 +52,8 @@ config KVM config KVM_INTEL tristate "KVM for Intel processors support" depends on KVM + # for perf_guest_get_msrs(): + depends on CPU_SUP_INTEL ---help--- Provides support for KVM on Intel processors equipped with the VT extensions. -- cgit v1.1 From bb5a798ad58996e4d666ead1016705854d5ca616 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 14 Dec 2011 17:58:18 +0100 Subject: KVM: x86: Do not rely on implicit inclusions Works so far by change, but it is not guaranteed to stay like this. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/cpuid.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 52593e8..ca63032 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -14,6 +14,8 @@ #include #include +#include +#include #include #include #include "cpuid.h" -- cgit v1.1 From a647795efbedeedf8a1dc6deded26defa23562bd Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 14 Dec 2011 19:25:33 +0100 Subject: KVM: x86: Consolidate PIT legacy test Move the test for KVM_PIT_FLAGS_HPET_LEGACY into create_pit_timer instead of replicating it on the caller site. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 405f262..d68f99d 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -344,7 +344,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) struct kvm_timer *pt = &ps->pit_timer; s64 interval; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) return; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); @@ -397,15 +397,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) case 1: /* FIXME: enhance mode 4 precision */ case 4: - if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { - create_pit_timer(kvm, val, 0); - } + create_pit_timer(kvm, val, 0); break; case 2: case 3: - if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ - create_pit_timer(kvm, val, 1); - } + create_pit_timer(kvm, val, 1); break; default: destroy_pit_timer(kvm->arch.vpit); -- cgit v1.1 From d546cb406ea0d83e2d39ec14221957a24f88a622 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 15 Dec 2011 12:38:40 +0200 Subject: KVM: drop bsp_vcpu pointer from kvm struct Drop bsp_vcpu pointer from kvm struct since its only use is incorrect anyway. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8259.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index cac4746..b6a7353 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -262,9 +262,10 @@ int kvm_pic_read_irq(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s) { - int irq; - struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu; + int irq, i; + struct kvm_vcpu *vcpu; u8 irr = s->irr, isr = s->imr; + bool found = false; s->last_irr = 0; s->irr = 0; @@ -281,12 +282,19 @@ void kvm_pic_reset(struct kvm_kpic_state *s) s->special_fully_nested_mode = 0; s->init4 = 0; - for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { - if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) - if (irr & (1 << irq) || isr & (1 << irq)) { - pic_clear_isr(s, irq); - } - } + kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm) + if (kvm_apic_accept_pic_intr(vcpu)) { + found = true; + break; + } + + + if (!found) + return; + + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) + if (irr & (1 << irq) || isr & (1 << irq)) + pic_clear_isr(s, irq); } static void pic_ioport_write(void *opaque, u32 addr, u32 val) -- cgit v1.1 From c15af35f54631b9e9b7ad1981016cc6e73cec794 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 6 Dec 2011 18:06:02 +0900 Subject: KVM: x86 emulator: Use opcode::execute for Group 1A instruction Group 1A: 8F Register em_pop() directly and remove em_grp1a(). Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f641201..cd49774 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1675,11 +1675,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_grp1a(struct x86_emulate_ctxt *ctxt) -{ - return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes); -} - static int em_grp2(struct x86_emulate_ctxt *ctxt) { switch (ctxt->modrm_reg) { @@ -3203,7 +3198,7 @@ static struct opcode group1[] = { }; static struct opcode group1A[] = { - D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, + I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N, }; static struct opcode group3[] = { @@ -4031,9 +4026,6 @@ special_insn: case 0x8d: /* lea r16/r32, m */ ctxt->dst.val = ctxt->src.addr.mem.ea; break; - case 0x8f: /* pop (sole member of Grp1a) */ - rc = em_grp1a(ctxt); - break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX]) break; -- cgit v1.1 From c04ec8393f3815e0f60dde1d6b29040bf1875d52 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 6 Dec 2011 18:06:44 +0900 Subject: KVM: x86 emulator: Use opcode::execute for Group 4/5 instructions Group 4: FE Group 5: FF Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index cd49774..5b78785 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3213,16 +3213,19 @@ static struct opcode group3[] = { }; static struct opcode group4[] = { - D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), + I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), + I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45), N, N, N, N, N, N, }; static struct opcode group5[] = { - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - D(SrcMem | ModRM | Stack), + I(DstMem | SrcNone | ModRM | Lock, em_grp45), + I(DstMem | SrcNone | ModRM | Lock, em_grp45), + I(SrcMem | ModRM | Stack, em_grp45), I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), - D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), - D(SrcMem | ModRM | Stack), N, + I(SrcMem | ModRM | Stack, em_grp45), + I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45), + I(SrcMem | ModRM | Stack, em_grp45), N, }; static struct opcode group6[] = { @@ -4082,12 +4085,6 @@ special_insn: case 0xfd: /* std */ ctxt->eflags |= EFLG_DF; break; - case 0xfe: /* Grp4 */ - rc = em_grp45(ctxt); - break; - case 0xff: /* Grp5 */ - rc = em_grp45(ctxt); - break; default: goto cannot_emulate; } -- cgit v1.1 From e0dac408d08c2a5e1bed2a6a9da7f3af3f7a9827 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 6 Dec 2011 18:07:27 +0900 Subject: KVM: x86 emulator: Use opcode::execute for Group 9 instruction Group 9: 0F C7 Rename em_grp9() to em_cmpxchg8b() and register it. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5b78785..de7be77 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1784,7 +1784,7 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) return rc; } -static int em_grp9(struct x86_emulate_ctxt *ctxt) +static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) { u64 old = ctxt->dst.orig_val64; @@ -3261,7 +3261,7 @@ static struct opcode group8[] = { }; static struct group_dual group9 = { { - N, D(DstMem64 | ModRM | Lock | PageTable), N, N, N, N, N, N, + N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, }, { N, N, N, N, N, N, N, N, } }; @@ -4202,9 +4202,6 @@ twobyte_insn: ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : (u64) ctxt->src.val; break; - case 0xc7: /* Grp9 (cmpxchg8b) */ - rc = em_grp9(ctxt); - break; default: goto cannot_emulate; } -- cgit v1.1 From 893420822192f717af6fde927c9e78c9b82f8327 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Nov 2011 14:57:21 +0200 Subject: KVM: Expose kvm_lapic_local_deliver() Needed to deliver performance monitoring interrupts. Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/lapic.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a7f3e65..cfdc6e0 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1121,7 +1121,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) return 0; } -static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) +int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { u32 reg = apic_get_reg(apic, lvt_type); int vector, mode, trig_mode; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 138e8cc..6f4ce25 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -34,6 +34,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); +int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); -- cgit v1.1 From f5132b01386b5a67f1ff673bb2b96a507a3f7e41 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 10 Nov 2011 14:57:22 +0200 Subject: KVM: Expose a version 2 architectural PMU to a guests Use perf_events to emulate an architectural PMU, version 2. Based on PMU version 1 emulation by Avi Kivity. [avi: adjust for cpuid.c] [jan: fix anonymous field initialization for older gcc] Signed-off-by: Gleb Natapov Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 48 ++++ arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/cpuid.c | 2 + arch/x86/kvm/pmu.c | 533 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 22 +- 6 files changed, 598 insertions(+), 10 deletions(-) create mode 100644 arch/x86/kvm/pmu.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 020413a..fb60ffd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -16,10 +16,12 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -291,6 +293,37 @@ struct kvm_mmu { u64 pdptrs[4]; /* pae */ }; +enum pmc_type { + KVM_PMC_GP = 0, + KVM_PMC_FIXED, +}; + +struct kvm_pmc { + enum pmc_type type; + u8 idx; + u64 counter; + u64 eventsel; + struct perf_event *perf_event; + struct kvm_vcpu *vcpu; +}; + +struct kvm_pmu { + unsigned nr_arch_gp_counters; + unsigned nr_arch_fixed_counters; + unsigned available_event_types; + u64 fixed_ctr_ctrl; + u64 global_ctrl; + u64 global_status; + u64 global_ovf_ctrl; + u64 counter_bitmask[2]; + u64 global_ctrl_mask; + u8 version; + struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC]; + struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED]; + struct irq_work irq_work; + u64 reprogram_pmi; +}; + struct kvm_vcpu_arch { /* * rip and regs accesses must go through @@ -424,6 +457,8 @@ struct kvm_vcpu_arch { unsigned access; gfn_t mmio_gfn; + struct kvm_pmu pmu; + /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; @@ -891,4 +926,17 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); +int kvm_is_in_guest(void); + +void kvm_pmu_init(struct kvm_vcpu *vcpu); +void kvm_pmu_destroy(struct kvm_vcpu *vcpu); +void kvm_pmu_reset(struct kvm_vcpu *vcpu); +void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); +bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); +void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); +void kvm_deliver_pmi(struct kvm_vcpu *vcpu); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ca4d49e..1a7fe86 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -35,6 +35,7 @@ config KVM select KVM_MMIO select TASKSTATS select TASK_DELAY_ACCT + select PERF_EVENTS ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 161b76a..4f579e8 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o timer.o cpuid.o + i8254.o timer.o cpuid.o pmu.o kvm-intel-y += vmx.o kvm-amd-y += svm.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ca63032..e70be46 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -45,6 +45,8 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu) else apic->lapic_timer.timer_mode_mask = 1 << 17; } + + kvm_pmu_cpuid_update(vcpu); } static int is_efer_nx(void) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c new file mode 100644 index 0000000..7aad544 --- /dev/null +++ b/arch/x86/kvm/pmu.c @@ -0,0 +1,533 @@ +/* + * Kernel-based Virtual Machine -- Performane Monitoring Unit support + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity + * Gleb Natapov + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include +#include +#include +#include "x86.h" +#include "cpuid.h" +#include "lapic.h" + +static struct kvm_arch_event_perf_mapping { + u8 eventsel; + u8 unit_mask; + unsigned event_type; + bool inexact; +} arch_events[] = { + /* Index must match CPUID 0x0A.EBX bit vector */ + [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, + [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, + [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, + [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, +}; + +/* mapping between fixed pmc index and arch_events array */ +int fixed_pmc_events[] = {1, 0, 2}; + +static bool pmc_is_gp(struct kvm_pmc *pmc) +{ + return pmc->type == KVM_PMC_GP; +} + +static inline u64 pmc_bitmask(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + + return pmu->counter_bitmask[pmc->type]; +} + +static inline bool pmc_enabled(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); +} + +static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, + u32 base) +{ + if (msr >= base && msr < base + pmu->nr_arch_gp_counters) + return &pmu->gp_counters[msr - base]; + return NULL; +} + +static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) +{ + int base = MSR_CORE_PERF_FIXED_CTR0; + if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) + return &pmu->fixed_counters[msr - base]; + return NULL; +} + +static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx) +{ + return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx); +} + +static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) +{ + if (idx < X86_PMC_IDX_FIXED) + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); + else + return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED); +} + +void kvm_deliver_pmi(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.apic) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); +} + +static void trigger_pmi(struct irq_work *irq_work) +{ + struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, + irq_work); + struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu, + arch.pmu); + + kvm_deliver_pmi(vcpu); +} + +static void kvm_perf_overflow(struct perf_event *perf_event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct kvm_pmc *pmc = perf_event->overflow_handler_context; + struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); +} + +static void kvm_perf_overflow_intr(struct perf_event *perf_event, + struct perf_sample_data *data, struct pt_regs *regs) +{ + struct kvm_pmc *pmc = perf_event->overflow_handler_context; + struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { + kvm_perf_overflow(perf_event, data, regs); + kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + /* + * Inject PMI. If vcpu was in a guest mode during NMI PMI + * can be ejected on a guest mode re-entry. Otherwise we can't + * be sure that vcpu wasn't executing hlt instruction at the + * time of vmexit and is not going to re-enter guest mode until, + * woken up. So we should wake it, but this is impossible from + * NMI context. Do it from irq work instead. + */ + if (!kvm_is_in_guest()) + irq_work_queue(&pmc->vcpu->arch.pmu.irq_work); + else + kvm_make_request(KVM_REQ_PMI, pmc->vcpu); + } +} + +static u64 read_pmc(struct kvm_pmc *pmc) +{ + u64 counter, enabled, running; + + counter = pmc->counter; + + if (pmc->perf_event) + counter += perf_event_read_value(pmc->perf_event, + &enabled, &running); + + /* FIXME: Scaling needed? */ + + return counter & pmc_bitmask(pmc); +} + +static void stop_counter(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + pmc->counter = read_pmc(pmc); + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + } +} + +static void reprogram_counter(struct kvm_pmc *pmc, u32 type, + unsigned config, bool exclude_user, bool exclude_kernel, + bool intr) +{ + struct perf_event *event; + struct perf_event_attr attr = { + .type = type, + .size = sizeof(attr), + .pinned = true, + .exclude_idle = true, + .exclude_host = 1, + .exclude_user = exclude_user, + .exclude_kernel = exclude_kernel, + .config = config, + }; + + attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); + + event = perf_event_create_kernel_counter(&attr, -1, current, + intr ? kvm_perf_overflow_intr : + kvm_perf_overflow, pmc); + if (IS_ERR(event)) { + printk_once("kvm: pmu event creation failed %ld\n", + PTR_ERR(event)); + return; + } + + pmc->perf_event = event; + clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi); +} + +static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select, + u8 unit_mask) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(arch_events); i++) + if (arch_events[i].eventsel == event_select + && arch_events[i].unit_mask == unit_mask + && (pmu->available_event_types & (1 << i))) + break; + + if (i == ARRAY_SIZE(arch_events)) + return PERF_COUNT_HW_MAX; + + return arch_events[i].event_type; +} + +static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) +{ + unsigned config, type = PERF_TYPE_RAW; + u8 event_select, unit_mask; + + pmc->eventsel = eventsel; + + stop_counter(pmc); + + if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc)) + return; + + event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; + unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + + if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE | + ARCH_PERFMON_EVENTSEL_INV | + ARCH_PERFMON_EVENTSEL_CMASK))) { + config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, + unit_mask); + if (config != PERF_COUNT_HW_MAX) + type = PERF_TYPE_HARDWARE; + } + + if (type == PERF_TYPE_RAW) + config = eventsel & X86_RAW_EVENT_MASK; + + reprogram_counter(pmc, type, config, + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT); +} + +static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) +{ + unsigned en = en_pmi & 0x3; + bool pmi = en_pmi & 0x8; + + stop_counter(pmc); + + if (!en || !pmc_enabled(pmc)) + return; + + reprogram_counter(pmc, PERF_TYPE_HARDWARE, + arch_events[fixed_pmc_events[idx]].event_type, + !(en & 0x2), /* exclude user */ + !(en & 0x1), /* exclude kernel */ + pmi); +} + +static inline u8 fixed_en_pmi(u64 ctrl, int idx) +{ + return (ctrl >> (idx * 4)) & 0xf; +} + +static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) +{ + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + u8 en_pmi = fixed_en_pmi(data, i); + struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i); + + if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi) + continue; + + reprogram_fixed_counter(pmc, en_pmi, i); + } + + pmu->fixed_ctr_ctrl = data; +} + +static void reprogram_idx(struct kvm_pmu *pmu, int idx) +{ + struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx); + + if (!pmc) + return; + + if (pmc_is_gp(pmc)) + reprogram_gp_counter(pmc, pmc->eventsel); + else { + int fidx = idx - X86_PMC_IDX_FIXED; + reprogram_fixed_counter(pmc, + fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); + } +} + +static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) +{ + int bit; + u64 diff = pmu->global_ctrl ^ data; + + pmu->global_ctrl = data; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + reprogram_idx(pmu, bit); +} + +bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + int ret; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + ret = pmu->version > 1; + break; + default: + ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) + || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) + || get_fixed_pmc(pmu, msr); + break; + } + return ret; +} + +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + + switch (index) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + *data = pmu->fixed_ctr_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_STATUS: + *data = pmu->global_status; + return 0; + case MSR_CORE_PERF_GLOBAL_CTRL: + *data = pmu->global_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + *data = pmu->global_ovf_ctrl; + return 0; + default: + if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || + (pmc = get_fixed_pmc(pmu, index))) { + *data = read_pmc(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { + *data = pmc->eventsel; + return 0; + } + } + return 1; +} + +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + + switch (index) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + if (pmu->fixed_ctr_ctrl == data) + return 0; + if (!(data & 0xfffffffffffff444)) { + reprogram_fixed_counters(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_STATUS: + break; /* RO MSR */ + case MSR_CORE_PERF_GLOBAL_CTRL: + if (pmu->global_ctrl == data) + return 0; + if (!(data & pmu->global_ctrl_mask)) { + global_ctrl_changed(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { + pmu->global_status &= ~data; + pmu->global_ovf_ctrl = data; + return 0; + } + break; + default: + if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || + (pmc = get_fixed_pmc(pmu, index))) { + data = (s64)(s32)data; + pmc->counter += data - read_pmc(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { + if (data == pmc->eventsel) + return 0; + if (!(data & 0xffffffff00200000ull)) { + reprogram_gp_counter(pmc, data); + return 0; + } + } + } + return 1; +} + +int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + bool fast_mode = pmc & (1u << 31); + bool fixed = pmc & (1u << 30); + struct kvm_pmc *counters; + u64 ctr; + + pmc &= (3u << 30) - 1; + if (!fixed && pmc >= pmu->nr_arch_gp_counters) + return 1; + if (fixed && pmc >= pmu->nr_arch_fixed_counters) + return 1; + counters = fixed ? pmu->fixed_counters : pmu->gp_counters; + ctr = read_pmc(&counters[pmc]); + if (fast_mode) + ctr = (u32)ctr; + *data = ctr; + + return 0; +} + +void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_cpuid_entry2 *entry; + unsigned bitmap_len; + + pmu->nr_arch_gp_counters = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->counter_bitmask[KVM_PMC_GP] = 0; + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->version = 0; + + entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + if (!entry) + return; + + pmu->version = entry->eax & 0xff; + if (!pmu->version) + return; + + pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, + X86_PMC_MAX_GENERIC); + pmu->counter_bitmask[KVM_PMC_GP] = + ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; + bitmap_len = (entry->eax >> 24) & 0xff; + pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1); + + if (pmu->version == 1) { + pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1; + return; + } + + pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), + X86_PMC_MAX_FIXED); + pmu->counter_bitmask[KVM_PMC_FIXED] = + ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; + pmu->global_ctrl_mask = ~(((1 << pmu->nr_arch_gp_counters) - 1) + | (((1ull << pmu->nr_arch_fixed_counters) - 1) + << X86_PMC_IDX_FIXED)); +} + +void kvm_pmu_init(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + + memset(pmu, 0, sizeof(*pmu)); + for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { + pmu->gp_counters[i].type = KVM_PMC_GP; + pmu->gp_counters[i].vcpu = vcpu; + pmu->gp_counters[i].idx = i; + } + for (i = 0; i < X86_PMC_MAX_FIXED; i++) { + pmu->fixed_counters[i].type = KVM_PMC_FIXED; + pmu->fixed_counters[i].vcpu = vcpu; + pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED; + } + init_irq_work(&pmu->irq_work, trigger_pmi); + kvm_pmu_cpuid_update(vcpu); +} + +void kvm_pmu_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + int i; + + irq_work_sync(&pmu->irq_work); + for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { + struct kvm_pmc *pmc = &pmu->gp_counters[i]; + stop_counter(pmc); + pmc->counter = pmc->eventsel = 0; + } + + for (i = 0; i < X86_PMC_MAX_FIXED; i++) + stop_counter(&pmu->fixed_counters[i]); + + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = + pmu->global_ovf_ctrl = 0; +} + +void kvm_pmu_destroy(struct kvm_vcpu *vcpu) +{ + kvm_pmu_reset(vcpu); +} + +void kvm_handle_pmu_event(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + u64 bitmask; + int bit; + + bitmask = pmu->reprogram_pmi; + + for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { + struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit); + + if (unlikely(!pmc || !pmc->perf_event)) { + clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); + continue; + } + + reprogram_idx(pmu, bit); + } +} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0a646e2..08ae951 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1602,8 +1602,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) * which we perfectly emulate ;-). Any other value should be at least * reported, some guests depend on them. */ - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: case MSR_K7_EVNTSEL0: case MSR_K7_EVNTSEL1: case MSR_K7_EVNTSEL2: @@ -1615,8 +1613,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* at least RHEL 4 unconditionally writes to the perfctr registers, * so we ignore writes to make it happy. */ - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: case MSR_K7_PERFCTR0: case MSR_K7_PERFCTR1: case MSR_K7_PERFCTR2: @@ -1653,6 +1649,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); + if (kvm_pmu_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr, data); if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); @@ -1815,10 +1813,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_K8_SYSCFG: case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: case MSR_K7_EVNTSEL0: case MSR_K7_PERFCTR0: case MSR_K8_INT_PENDING_MSG: @@ -1929,6 +1923,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = 0xbe702111; break; default: + if (kvm_pmu_msr(vcpu, msr)) + return kvm_pmu_get_msr(vcpu, msr, pdata); if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; @@ -4650,7 +4646,7 @@ static void kvm_timer_init(void) static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); -static int kvm_is_in_guest(void) +int kvm_is_in_guest(void) { return __this_cpu_read(current_vcpu) != NULL; } @@ -5114,6 +5110,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) process_nmi(vcpu); req_immediate_exit = kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu); + if (kvm_check_request(KVM_REQ_PMU, vcpu)) + kvm_handle_pmu_event(vcpu); + if (kvm_check_request(KVM_REQ_PMI, vcpu)) + kvm_deliver_pmi(vcpu); } r = kvm_mmu_reload(vcpu); @@ -5850,6 +5850,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; + kvm_pmu_reset(vcpu); + return kvm_x86_ops->vcpu_reset(vcpu); } @@ -5934,6 +5936,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_mce_banks; kvm_async_pf_hash_reset(vcpu); + kvm_pmu_init(vcpu); return 0; fail_free_mce_banks: @@ -5952,6 +5955,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { int idx; + kvm_pmu_destroy(vcpu); kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); -- cgit v1.1 From 022cd0e84020eec8b589bc119699c935c7b29584 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Nov 2011 14:57:23 +0200 Subject: KVM: Add generic RDPMC support Add a helper function that emulates the RDPMC instruction operation. Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fb60ffd..52d6640 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -760,6 +760,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); +bool kvm_rdpmc(struct kvm_vcpu *vcpu); void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 08ae951..27d18b7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -760,6 +760,21 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) } EXPORT_SYMBOL_GPL(kvm_get_dr); +bool kvm_rdpmc(struct kvm_vcpu *vcpu) +{ + u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + u64 data; + int err; + + err = kvm_pmu_read_pmc(vcpu, ecx, &data); + if (err) + return err; + kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); + kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32); + return err; +} +EXPORT_SYMBOL_GPL(kvm_rdpmc); + /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. -- cgit v1.1 From 332b56e4841ef62db4dbf1b4b92195575e1c7338 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Nov 2011 14:57:24 +0200 Subject: KVM: SVM: Intercept RDPMC Intercept RDPMC and forward it to the PMU emulation code. Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e32243e..5fa553b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1014,6 +1014,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_NMI); set_intercept(svm, INTERCEPT_SMI); set_intercept(svm, INTERCEPT_SELECTIVE_CR0); + set_intercept(svm, INTERCEPT_RDPMC); set_intercept(svm, INTERCEPT_CPUID); set_intercept(svm, INTERCEPT_INVD); set_intercept(svm, INTERCEPT_HLT); @@ -2770,6 +2771,19 @@ static int emulate_on_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; } +static int rdpmc_interception(struct vcpu_svm *svm) +{ + int err; + + if (!static_cpu_has(X86_FEATURE_NRIPS)) + return emulate_on_interception(svm); + + err = kvm_rdpmc(&svm->vcpu); + kvm_complete_insn_gp(&svm->vcpu, err); + + return 1; +} + bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) { unsigned long cr0 = svm->vcpu.arch.cr0; @@ -3190,6 +3204,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_SMI] = nop_on_interception, [SVM_EXIT_INIT] = nop_on_interception, [SVM_EXIT_VINTR] = interrupt_window_interception, + [SVM_EXIT_RDPMC] = rdpmc_interception, [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, -- cgit v1.1 From fee84b079d5ddee2247b5c1f53162c330c622902 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Nov 2011 14:57:25 +0200 Subject: KVM: VMX: Intercept RDPMC Intercept RDPMC and forward it to the PMU emulation code. Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4ceced2..906a7e8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1956,6 +1956,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) #endif CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | + CPU_BASED_RDPMC_EXITING | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; /* * We can allow some features even when not supported by the @@ -2410,7 +2411,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING | - CPU_BASED_INVLPG_EXITING; + CPU_BASED_INVLPG_EXITING | + CPU_BASED_RDPMC_EXITING; if (yield_on_hlt) min |= CPU_BASED_HLT_EXITING; @@ -4613,6 +4615,16 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) return 1; } +static int handle_rdpmc(struct kvm_vcpu *vcpu) +{ + int err; + + err = kvm_rdpmc(vcpu); + kvm_complete_insn_gp(vcpu, err); + + return 1; +} + static int handle_wbinvd(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); @@ -5563,6 +5575,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_HLT] = handle_halt, [EXIT_REASON_INVD] = handle_invd, [EXIT_REASON_INVLPG] = handle_invlpg, + [EXIT_REASON_RDPMC] = handle_rdpmc, [EXIT_REASON_VMCALL] = handle_vmcall, [EXIT_REASON_VMCLEAR] = handle_vmclear, [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, -- cgit v1.1 From a6c06ed1a60aff77b27ba558c315c3fed4e35565 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 10 Nov 2011 14:57:28 +0200 Subject: KVM: Expose the architectural performance monitoring CPUID leaf Provide a CPUID leaf that describes the emulated PMU. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/cpuid.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e70be46..89b02bf 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -327,6 +327,35 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } case 9: break; + case 0xa: { /* Architectural Performance Monitoring */ + struct x86_pmu_capability cap; + union cpuid10_eax eax; + union cpuid10_edx edx; + + perf_get_x86_pmu_capability(&cap); + + /* + * Only support guest architectural pmu on a host + * with architectural pmu. + */ + if (!cap.version) + memset(&cap, 0, sizeof(cap)); + + eax.split.version_id = min(cap.version, 2); + eax.split.num_counters = cap.num_counters_gp; + eax.split.bit_width = cap.bit_width_gp; + eax.split.mask_length = cap.events_mask_len; + + edx.split.num_counters_fixed = cap.num_counters_fixed; + edx.split.bit_width_fixed = cap.bit_width_fixed; + edx.split.reserved = 0; + + entry->eax = eax.full; + entry->ebx = cap.events_mask; + entry->ecx = 0; + entry->edx = edx.full; + break; + } /* function 0xb has additional index. */ case 0xb: { int i, level_type; @@ -427,7 +456,6 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ case 6: /* Thermal management */ - case 0xA: /* Architectural Performance Monitoring */ case 0x80000007: /* Advanced power management */ case 0xC0000002: case 0xC0000003: -- cgit v1.1 From 80bdec64c05b645708b0dd97919783ad077fcdc8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Nov 2011 14:57:29 +0200 Subject: KVM: x86 emulator: fix RDPMC privilege check RDPMC is only privileged if CR4.PCE=0. check_rdpmc() already implements this, so all we need to do is drop the Priv flag. Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index de7be77..d270f1a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3411,7 +3411,7 @@ static struct opcode twobyte_table[256] = { II(ImplicitOps | Priv, em_wrmsr, wrmsr), IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), II(ImplicitOps | Priv, em_rdmsr, rdmsr), - DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), + DIP(ImplicitOps, rdpmc, check_rdpmc), I(ImplicitOps | VendorSpecific, em_sysenter), I(ImplicitOps | Priv | VendorSpecific, em_sysexit), N, N, -- cgit v1.1 From 222d21aa070a4885ce3c7125a1b7ce07429ea4a1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Nov 2011 14:57:30 +0200 Subject: KVM: x86 emulator: implement RDPMC (0F 33) Signed-off-by: Avi Kivity Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 13 ++++++++++++- arch/x86/kvm/x86.c | 7 +++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 9a4acf4..ab4092e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -181,6 +181,7 @@ struct x86_emulate_ops { int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); + int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata); void (*halt)(struct x86_emulate_ctxt *ctxt); void (*wbinvd)(struct x86_emulate_ctxt *ctxt); int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d270f1a..05a562b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2645,6 +2645,17 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_rdpmc(struct x86_emulate_ctxt *ctxt) +{ + u64 pmc; + + if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc)) + return emulate_gp(ctxt, 0); + ctxt->regs[VCPU_REGS_RAX] = (u32)pmc; + ctxt->regs[VCPU_REGS_RDX] = pmc >> 32; + return X86EMUL_CONTINUE; +} + static int em_mov(struct x86_emulate_ctxt *ctxt) { ctxt->dst.val = ctxt->src.val; @@ -3411,7 +3422,7 @@ static struct opcode twobyte_table[256] = { II(ImplicitOps | Priv, em_wrmsr, wrmsr), IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), II(ImplicitOps | Priv, em_rdmsr, rdmsr), - DIP(ImplicitOps, rdpmc, check_rdpmc), + IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), I(ImplicitOps | VendorSpecific, em_sysenter), I(ImplicitOps | Priv | VendorSpecific, em_sysexit), N, N, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 27d18b7..1171def 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4146,6 +4146,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); } +static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, + u32 pmc, u64 *pdata) +{ + return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata); +} + static void emulator_halt(struct x86_emulate_ctxt *ctxt) { emul_to_vcpu(ctxt)->arch.halt_request = 1; @@ -4198,6 +4204,7 @@ static struct x86_emulate_ops emulate_ops = { .set_dr = emulator_set_dr, .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, + .read_pmc = emulator_read_pmc, .halt = emulator_halt, .wbinvd = emulator_wbinvd, .fix_hypercall = emulator_fix_hypercall, -- cgit v1.1 From 7c9c3a1e5fc8728e948b8fa3cbcfcfb86db3afda Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 29 Dec 2011 14:43:16 +0000 Subject: x86/intel config: Fix the APB_TIMER selection Seems Kconfig SELECT isn't selecting things hierarchically when selected. config APB_TIMER def_bool y if X86_INTEL_MID prompt "Intel MID APB Timer Support" if X86_INTEL_MID select DW_APB_TIMER depends on X86_INTEL_MID && SFI when we select APB_TIMER doesn't select DW_APB_TIMER so do it by hand. Signed-off-by: Alan Cox Link: http://lkml.kernel.org/n/tip-kpnaimplltk6d1lolusqj3ae@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 07620bc91..78fbb34 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -409,12 +409,14 @@ config X86_MRST depends on PCI depends on PCI_GOANY depends on X86_IO_APIC + select X86_INTEL_MID + select SFI + select DW_APB_TIMER select APB_TIMER select I2C select SPI select INTEL_SCU_IPC select X86_PLATFORM_DEVICES - select X86_INTEL_MID ---help--- Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. Moorestown consists of two chips: @@ -428,12 +430,14 @@ config X86_MDFLD depends on PCI depends on PCI_GOANY depends on X86_IO_APIC + select X86_INTEL_MID + select SFI + select DW_APB_TIMER select APB_TIMER select I2C select SPI select INTEL_SCU_IPC select X86_PLATFORM_DEVICES - select X86_INTEL_MID ---help--- Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. -- cgit v1.1 From 2c9ede55ecec58099b72e4bb8eab719f32f72c31 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Jul 2011 20:24:48 -0400 Subject: switch device_get_devnode() and ->devnode() to umode_t * both callers of device_get_devnode() are only interested in lower 16bits and nobody tries to return anything wider than 16bit anyway. Signed-off-by: Al Viro --- arch/x86/kernel/cpuid.c | 2 +- arch/x86/kernel/msr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 212a6a4..a524353 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier = .notifier_call = cpuid_class_cpu_callback, }; -static char *cpuid_devnode(struct device *dev, mode_t *mode) +static char *cpuid_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 12fcbe2..9635676 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -236,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; -static char *msr_devnode(struct device *dev, mode_t *mode) +static char *msr_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); } -- cgit v1.1 From f4ae40a6a50a98ac23d4b285f739455e926a473e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Jul 2011 04:33:43 -0400 Subject: switch debugfs to umode_t Signed-off-by: Al Viro --- arch/x86/xen/debugfs.c | 2 +- arch/x86/xen/debugfs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index 7c0fedd..ef1db19 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -109,7 +109,7 @@ static const struct file_operations u32_array_fops = { .llseek = no_llseek, }; -struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, +struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode, struct dentry *parent, u32 *array, unsigned elements) { diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h index e281320..78d2549 100644 --- a/arch/x86/xen/debugfs.h +++ b/arch/x86/xen/debugfs.h @@ -3,7 +3,7 @@ struct dentry * __init xen_init_debugfs(void); -struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, +struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode, struct dentry *parent, u32 *array, unsigned elements); -- cgit v1.1 From 28c3c05d337f6fdf84faf69374e6325b80cbf9ad Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 30 Dec 2011 14:37:05 -0500 Subject: PCI: add set_nouse_crs for use by a pci=nocrs blacklist Some machines don't boot unless passed pci=nocrs. (See https://bugzilla.redhat.com/show_bug.cgi?id=770308 for details of one report. Waiting on dmidecode output for others). Currently there is a DMI whitelist, even though the default is on. v2: drop the 1536 blacklist entry, superceded by the PNP/MMCONFIG changes from Bjorn Acked-by: Bjorn Helgaas Acked-by: Ingo Molnar Signed-off-by: Dave Jones Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 404f21a..f5ccf29 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -24,6 +24,12 @@ static int __init set_use_crs(const struct dmi_system_id *id) return 0; } +static int __init set_nouse_crs(const struct dmi_system_id *id) +{ + pci_use_crs = false; + return 0; +} + static const struct dmi_system_id pci_use_crs_table[] __initconst = { /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */ { @@ -54,6 +60,7 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), }, }, + {} }; -- cgit v1.1 From e702781fa846dd726b73e673f91ffbd3b0e8d114 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 4 Jan 2012 11:33:12 -0500 Subject: PCI: Add Dell Studio 1557 to pci=nocrs blacklist The Dell Studio 1557 also doesn't suspend correctly when CRS is enabled. Details at https://bugzilla.redhat.com/show_bug.cgi?id=769657 Reported-by: Gregory S. Hoerner Signed-off-by: Dave Jones Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index f5ccf29..0d9329f 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -61,6 +61,18 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { }, }, + /* Now for the blacklist.. */ + + /* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */ + { + .callback = set_nouse_crs, + .ident = "Dell Studio 1557", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Studio 1557"), + DMI_MATCH(DMI_BIOS_VERSION, "A09"), + }, + }, {} }; -- cgit v1.1 From 8b6a5af92c03b363df050da906480085b6cd6e00 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 4 Jan 2012 11:30:52 -0500 Subject: PCI: Add Thinkpad SL510 to pci=nocrs blacklist Enabling CRS by default breaks suspend on the Thinkpad SL510. Details in https://bugzilla.redhat.com/show_bug.cgi?id=769657 Reported-by: Stefan Kirrmann Signed-off-by: Dave Jones Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 0d9329f..e662cee 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -73,6 +73,16 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { DMI_MATCH(DMI_BIOS_VERSION, "A09"), }, }, + /* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */ + { + .callback = set_nouse_crs, + .ident = "Thinkpad SL510", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), + DMI_MATCH(DMI_BOARD_NAME, "2847DFG"), + DMI_MATCH(DMI_BIOS_VERSION, "6JET85WW (1.43 )"), + }, + }, {} }; -- cgit v1.1 From ae5cd86455381282ece162966183d3f208c6fad7 Mon Sep 17 00:00:00 2001 From: Gary Hade Date: Mon, 14 Nov 2011 15:42:16 -0800 Subject: x86/PCI: Ignore CPU non-addressable _CRS reserved memory resources This assures that a _CRS reserved host bridge window or window region is not used if it is not addressable by the CPU. The new code either trims the window to exclude the non-addressable portion or totally ignores the window if the entire window is non-addressable. The current code has been shown to be problematic with 32-bit non-PAE kernels on systems where _CRS reserves resources above 4GB. Signed-off-by: Gary Hade Reviewed-by: Bjorn Helgaas Cc: Thomas Renninger Cc: linux-kernel@vger.kernel.org Cc: stable@kernel.org Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index e662cee..425500b 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -178,7 +178,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data) struct acpi_resource_address64 addr; acpi_status status; unsigned long flags; - u64 start, end; + u64 start, orig_end, end; status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) @@ -194,7 +194,21 @@ setup_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; start = addr.minimum + addr.translation_offset; - end = addr.maximum + addr.translation_offset; + orig_end = end = addr.maximum + addr.translation_offset; + + /* Exclude non-addressable range or non-addressable portion of range */ + end = min(end, (u64)iomem_resource.end); + if (end <= start) { + dev_info(&info->bridge->dev, + "host bridge window [%#llx-%#llx] " + "(ignored, not CPU addressable)\n", start, orig_end); + return AE_OK; + } else if (orig_end != end) { + dev_info(&info->bridge->dev, + "host bridge window [%#llx-%#llx] " + "([%#llx-%#llx] ignored, not CPU addressable)\n", + start, orig_end, end + 1, orig_end); + } res = &info->res[info->res_num]; res->name = info->name; -- cgit v1.1 From 96c5590058d7fded14f43af2ab521436cecf3125 Mon Sep 17 00:00:00 2001 From: Myron Stowe Date: Fri, 28 Oct 2011 15:48:38 -0600 Subject: PCI: Pull PCI 'latency timer' setup up into the core The 'latency timer' of PCI devices, both Type 0 and Type 1, is setup in architecture-specific code [see: 'pcibios_set_master()']. There are two approaches being taken by all the architectures - check if the 'latency timer' is currently set between 16 and 255 and if not bring it within bounds, or, do nothing (and then there is the gratuitously different PA-RISC implementation). There is nothing architecture-specific about PCI's 'latency timer' so this patch pulls its setup functionality up into the PCI core by creating a generic 'pcibios_set_master()' function using the '__weak' attribute which can be used by all architectures as a default which, if necessary, can then be over-ridden by architecture-specific code. No functional change. Signed-off-by: Myron Stowe Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci_x86.h | 2 -- arch/x86/pci/i386.c | 6 ------ 2 files changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index e381978..b3a5317 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -44,8 +44,6 @@ enum pci_bf_sort_state { /* pci-i386.c */ -extern unsigned int pcibios_max_latency; - void pcibios_resource_survey(void); void pcibios_set_cache_line_size(void); diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 794b092..dd5806b 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -254,12 +254,6 @@ void __init pcibios_resource_survey(void) */ fs_initcall(pcibios_assign_resources); -/* - * If we set up a device for bus mastering, we need to check the latency - * timer as certain crappy BIOSes forget to set it properly. - */ -unsigned int pcibios_max_latency = 255; - void pcibios_set_master(struct pci_dev *dev) { u8 lat; -- cgit v1.1 From b9a276ad262815d88f4dd232d578864949aab3b9 Mon Sep 17 00:00:00 2001 From: Myron Stowe Date: Fri, 28 Oct 2011 15:49:13 -0600 Subject: PCI: x86: use generic pcibios_set_master() This patch removes x86's architecture-specific 'pcibios_set_master()' routine and lets the default PCI core based implementation handle PCI device 'latency timer' setup. No functional change. Signed-off-by: Myron Stowe Signed-off-by: Jesse Barnes --- arch/x86/pci/i386.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index dd5806b..91821a1 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -254,20 +254,6 @@ void __init pcibios_resource_survey(void) */ fs_initcall(pcibios_assign_resources); -void pcibios_set_master(struct pci_dev *dev) -{ - u8 lat; - pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat); - if (lat < 16) - lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency; - else if (lat > pcibios_max_latency) - lat = pcibios_max_latency; - else - return; - dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat); - pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); -} - static const struct vm_operations_struct pci_mmap_ops = { .access = generic_access_phys, }; -- cgit v1.1 From ca3671a83389eea1458929d22c66a69e955bfb07 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Mon, 5 Dec 2011 18:12:28 +0100 Subject: x86/PCI: amd: Kill misleading message about enablement of IO access to PCI ECS] Commit 24d9b70b8c679264756a6980e668b96b3f964826 (x86: Use PCI method for enabling AMD extended config space before MSR method) added a message when IO access to PCI ECS was enabled via access to the NB_CFG PCI register. This can lead to a bogus message like [ 0.365177] Extended Config Space enabled on 0 nodes which is misleading because IO ECS access is subsequently enabled for AMD CPUs (that support this) by modifying the corresponding NB_CFG MSR. Furthermore it's not "Extended Config Space" that is enabled by this register setting. It's the IO access that is enabled for extended configruation space. IMHO the ambiguous message needs to be cancelled. Cc: Jan Beulich Cc: Robert Richter Signed-off-by: Andreas Herrmann Signed-off-by: Jesse Barnes --- arch/x86/pci/amd_bus.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 026e493..7b7a897 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -403,7 +403,6 @@ static void __init pci_enable_pci_io_ecs(void) ++n; } } - pr_info("Extended Config Space enabled on %u nodes\n", n); #endif } -- cgit v1.1 From 6361d72b04d1f77736142bc3911a32b814370729 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 28 Oct 2011 16:28:03 -0600 Subject: x86/PCI: read Broadcom CNB20LE host bridge info before PCI scan We currently read the CNB20LE aperture information in a PCI quirk, which happens after we've already created the root bus. This patch changes it to read the apertures earlier so we can create the root bus with the correct resources. I believe the CNB20LE lives at "pci 0000:00:00" based on https://lkml.org/lkml/2010/8/13/220 CC: Ira W. Snyder CC: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/broadcom_bus.c | 62 ++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c index ab8269b..f3a7c56 100644 --- a/arch/x86/pci/broadcom_bus.c +++ b/arch/x86/pci/broadcom_bus.c @@ -15,10 +15,11 @@ #include #include #include +#include #include "bus_numa.h" -static void __devinit cnb20le_res(struct pci_dev *dev) +static void __init cnb20le_res(u8 bus, u8 slot, u8 func) { struct pci_root_info *info; struct resource res; @@ -26,21 +27,12 @@ static void __devinit cnb20le_res(struct pci_dev *dev) u8 fbus, lbus; int i; -#ifdef CONFIG_ACPI - /* - * We should get host bridge information from ACPI unless the BIOS - * doesn't support it. - */ - if (acpi_os_get_root_pointer()) - return; -#endif - info = &pci_root_info[pci_root_num]; pci_root_num++; /* read the PCI bus numbers */ - pci_read_config_byte(dev, 0x44, &fbus); - pci_read_config_byte(dev, 0x45, &lbus); + fbus = read_pci_config_byte(bus, slot, func, 0x44); + lbus = read_pci_config_byte(bus, slot, func, 0x45); info->bus_min = fbus; info->bus_max = lbus; @@ -59,8 +51,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev) } /* read the non-prefetchable memory window */ - pci_read_config_word(dev, 0xc0, &word1); - pci_read_config_word(dev, 0xc2, &word2); + word1 = read_pci_config_16(bus, slot, func, 0xc0); + word2 = read_pci_config_16(bus, slot, func, 0xc2); if (word1 != word2) { res.start = (word1 << 16) | 0x0000; res.end = (word2 << 16) | 0xffff; @@ -69,8 +61,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev) } /* read the prefetchable memory window */ - pci_read_config_word(dev, 0xc4, &word1); - pci_read_config_word(dev, 0xc6, &word2); + word1 = read_pci_config_16(bus, slot, func, 0xc4); + word2 = read_pci_config_16(bus, slot, func, 0xc6); if (word1 != word2) { res.start = (word1 << 16) | 0x0000; res.end = (word2 << 16) | 0xffff; @@ -79,8 +71,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev) } /* read the IO port window */ - pci_read_config_word(dev, 0xd0, &word1); - pci_read_config_word(dev, 0xd2, &word2); + word1 = read_pci_config_16(bus, slot, func, 0xd0); + word2 = read_pci_config_16(bus, slot, func, 0xd2); if (word1 != word2) { res.start = word1; res.end = word2; @@ -92,13 +84,37 @@ static void __devinit cnb20le_res(struct pci_dev *dev) res.start = fbus; res.end = lbus; res.flags = IORESOURCE_BUS; - dev_info(&dev->dev, "CNB20LE PCI Host Bridge (domain %04x %pR)\n", - pci_domain_nr(dev->bus), &res); + printk(KERN_INFO "CNB20LE PCI Host Bridge (domain 0000 %pR)\n", &res); for (i = 0; i < info->res_num; i++) - dev_info(&dev->dev, "host bridge window %pR\n", &info->res[i]); + printk(KERN_INFO "host bridge window %pR\n", &info->res[i]); } -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE, - cnb20le_res); +static int __init broadcom_postcore_init(void) +{ + u8 bus = 0, slot = 0; + u32 id; + u16 vendor, device; + +#ifdef CONFIG_ACPI + /* + * We should get host bridge information from ACPI unless the BIOS + * doesn't support it. + */ + if (acpi_os_get_root_pointer()) + return 0; +#endif + + id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); + vendor = id & 0xffff; + device = (id >> 16) & 0xffff; + + if (vendor == PCI_VENDOR_ID_SERVERWORKS && + device == PCI_DEVICE_ID_SERVERWORKS_LE) { + cnb20le_res(bus, slot, 0); + cnb20le_res(bus, slot, 1); + } + return 0; +} +postcore_initcall(broadcom_postcore_init); -- cgit v1.1 From 46fbade05ca0784ca3c959bd7bf2aae7d81306c2 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 28 Oct 2011 16:28:08 -0600 Subject: x86/PCI: use pci_scan_bus() instead of pci_scan_bus_parented() This doesn't change any functionality, but it makes a subsequent patch slightly simpler. pci_scan_bus(NULL, ...) and pci_scan_bus_parented() are identical except that pci_scan_bus() also calls pci_bus_add_devices(): pci_scan_bus_parented pci_create_bus pci_scan_child_bus pci_scan_bus pci_create_bus pci_scan_child_bus pci_bus_add_devices All callers of pcibios_scan_root() call pci_bus_add_devices() explicitly, and we don't pass a parent device, so we might as well use pci_scan_bus(). Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/common.c | 2 +- arch/x86/pci/legacy.c | 3 --- arch/x86/pci/numaq_32.c | 2 -- 3 files changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 7962ccb..07c55ce 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -456,7 +456,7 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) sd->node = get_mp_bus_to_node(busnum); printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum); - bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); + bus = pci_scan_bus(busnum, &pci_root_ops, sd); if (!bus) kfree(sd); diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 2c2aeab..a1df191 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -31,9 +31,6 @@ int __init pci_legacy_init(void) printk("PCI: Probing PCI hardware\n"); pci_root_bus = pcibios_scan_root(0); - if (pci_root_bus) - pci_bus_add_devices(pci_root_bus); - return 0; } diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 51abf02..83e125b 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -153,8 +153,6 @@ int __init pci_numaq_init(void) raw_pci_ops = &pci_direct_conf1_mq; pci_root_bus = pcibios_scan_root(0); - if (pci_root_bus) - pci_bus_add_devices(pci_root_bus); if (num_online_nodes() > 1) for_each_online_node(quad) { if (quad == 0) -- cgit v1.1 From 2cd6975a4ff92a75e46240109d01c1daf4682e5d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 28 Oct 2011 16:28:14 -0600 Subject: x86/PCI: convert to pci_create_root_bus() and pci_scan_root_bus() x86 has two kinds of PCI root bus scanning: (1) ACPI-based, using _CRS resources. This used pci_create_bus(), not pci_scan_bus(), because ACPI hotplug needed to split the pci_bus_add_devices() into a separate host bridge .start() method. This patch parses the _CRS resources earlier, so we can build a list of resources and pass it to pci_create_root_bus(). Note that as before, we parse the _CRS even if we aren't going to use it so we can print it for debugging purposes. (2) All other, which used either default resources (ioport_resource and iomem_resource) or information read from the hardware via amd_bus.c or similar. This used pci_scan_bus(). This patch converts x86_pci_root_bus_res_quirks() (previously called from pcibios_fixup_bus()) to x86_pci_root_bus_resources(), which builds a list of resources before we call pci_scan_root_bus(). We also use x86_pci_root_bus_resources() if we have ACPI but are ignoring _CRS. CC: Yinghai Lu Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/include/asm/topology.h | 2 +- arch/x86/pci/acpi.c | 28 ++++++++++++++-------------- arch/x86/pci/bus_numa.c | 31 ++++++++++++++++++------------- arch/x86/pci/common.c | 19 ++++++++++++------- 4 files changed, 45 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index c006924..5f83b13 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -174,7 +174,7 @@ static inline void arch_fix_phys_package_id(int num, u32 slot) } struct pci_bus; -void x86_pci_root_bus_res_quirks(struct pci_bus *b); +void x86_pci_root_bus_resources(int bus, struct list_head *resources); #ifdef CONFIG_SMP #define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \ diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 425500b..a312e76 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -12,7 +12,7 @@ struct pci_root_info { char *name; unsigned int res_num; struct resource *res; - struct pci_bus *bus; + struct list_head *resources; int busnum; }; @@ -304,23 +304,20 @@ static void add_resources(struct pci_root_info *info) "ignoring host bridge window %pR (conflicts with %s %pR)\n", res, conflict->name, conflict); else - pci_bus_add_resource(info->bus, res, 0); + pci_add_resource(info->resources, res); } } static void get_current_resources(struct acpi_device *device, int busnum, - int domain, struct pci_bus *bus) + int domain, struct list_head *resources) { struct pci_root_info info; size_t size; - if (pci_use_crs) - pci_bus_remove_resources(bus); - info.bridge = device; - info.bus = bus; info.res_num = 0; + info.resources = resources; acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, &info); if (!info.res_num) @@ -329,7 +326,7 @@ get_current_resources(struct acpi_device *device, int busnum, size = sizeof(*info.res) * info.res_num; info.res = kmalloc(size, GFP_KERNEL); if (!info.res) - goto res_alloc_fail; + return; info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum); if (!info.name) @@ -344,8 +341,6 @@ get_current_resources(struct acpi_device *device, int busnum, name_alloc_fail: kfree(info.res); -res_alloc_fail: - return; } struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) @@ -353,6 +348,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) struct acpi_device *device = root->device; int domain = root->segment; int busnum = root->secondary.start; + LIST_HEAD(resources); struct pci_bus *bus; struct pci_sysdata *sd; int node; @@ -407,11 +403,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) memcpy(bus->sysdata, sd, sizeof(*sd)); kfree(sd); } else { - bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); - if (bus) { - get_current_resources(device, busnum, domain, bus); + get_current_resources(device, busnum, domain, &resources); + if (list_empty(&resources)) + x86_pci_root_bus_resources(busnum, &resources); + bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, + &resources); + if (bus) bus->subordinate = pci_scan_child_bus(bus); - } + else + pci_free_resource_list(&resources); } /* After the PCI-E bus has been walked and all devices discovered, diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index 64a1228..fd3f655 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -7,45 +7,50 @@ int pci_root_num; struct pci_root_info pci_root_info[PCI_ROOT_NR]; -void x86_pci_root_bus_res_quirks(struct pci_bus *b) +void x86_pci_root_bus_resources(int bus, struct list_head *resources) { int i; int j; struct pci_root_info *info; - /* don't go for it if _CRS is used already */ - if (b->resource[0] != &ioport_resource || - b->resource[1] != &iomem_resource) - return; - if (!pci_root_num) - return; + goto default_resources; for (i = 0; i < pci_root_num; i++) { - if (pci_root_info[i].bus_min == b->number) + if (pci_root_info[i].bus_min == bus) break; } if (i == pci_root_num) - return; + goto default_resources; - printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", - b->number); + printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n", + bus); - pci_bus_remove_resources(b); info = &pci_root_info[i]; for (j = 0; j < info->res_num; j++) { struct resource *res; struct resource *root; res = &info->res[j]; - pci_bus_add_resource(b, res, 0); + pci_add_resource(resources, res); if (res->flags & IORESOURCE_IO) root = &ioport_resource; else root = &iomem_resource; insert_resource(root, res); } + return; + +default_resources: + /* + * We don't have any host bridge aperture information from the + * "native host bridge drivers," e.g., amd_bus or broadcom_bus, + * so fall back to the defaults historically used by pci_create_bus(). + */ + printk(KERN_DEBUG "PCI: root bus %02x: using default resources\n", bus); + pci_add_resource(resources, &ioport_resource); + pci_add_resource(resources, &iomem_resource); } void __devinit update_res(struct pci_root_info *info, resource_size_t start, diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 07c55ce..323481e 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -164,9 +164,6 @@ void __devinit pcibios_fixup_bus(struct pci_bus *b) { struct pci_dev *dev; - /* root bus? */ - if (!b->parent) - x86_pci_root_bus_res_quirks(b); pci_read_bridge_bases(b); list_for_each_entry(dev, &b->devices, bus_list) pcibios_fixup_device_resources(dev); @@ -433,6 +430,7 @@ void __init dmi_check_pciprobe(void) struct pci_bus * __devinit pcibios_scan_root(int busnum) { + LIST_HEAD(resources); struct pci_bus *bus = NULL; struct pci_sysdata *sd; @@ -456,9 +454,12 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) sd->node = get_mp_bus_to_node(busnum); printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum); - bus = pci_scan_bus(busnum, &pci_root_ops, sd); - if (!bus) + x86_pci_root_bus_resources(busnum, &resources); + bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources); + if (!bus) { + pci_free_resource_list(&resources); kfree(sd); + } return bus; } @@ -639,6 +640,7 @@ int pci_ext_cfg_avail(struct pci_dev *dev) struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) { + LIST_HEAD(resources); struct pci_bus *bus = NULL; struct pci_sysdata *sd; @@ -653,9 +655,12 @@ struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, return NULL; } sd->node = node; - bus = pci_scan_bus(busno, ops, sd); - if (!bus) + x86_pci_root_bus_resources(busno, &resources); + bus = pci_scan_root_bus(NULL, busno, ops, sd, &resources); + if (!bus) { + pci_free_resource_list(&resources); kfree(sd); + } return bus; } -- cgit v1.1 From 24d25dbfa63c376323096660bfa9ad45a08870ce Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 5 Jan 2012 14:27:19 -0700 Subject: x86/PCI: amd: factor out MMCONFIG discovery This factors out the AMD native MMCONFIG discovery so we can use it outside amd_bus.c. amd_bus.c reads AMD MSRs so it can remove the MMCONFIG area from the PCI resources. We may also need the MMCONFIG information to work around BIOS defects in the ACPI MCFG table. Cc: Borislav Petkov Cc: Yinghai Lu Cc: stable@kernel.org # 2.6.34+ Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/include/asm/amd_nb.h | 2 ++ arch/x86/kernel/amd_nb.c | 31 +++++++++++++++++++++++++++++++ arch/x86/pci/amd_bus.c | 42 +++++++++++------------------------------- 3 files changed, 44 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 8e41071..49ad773 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_AMD_NB_H #define _ASM_X86_AMD_NB_H +#include #include struct amd_nb_bus_dev_range { @@ -13,6 +14,7 @@ extern const struct pci_device_id amd_nb_misc_ids[]; extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; extern bool early_is_amd_nb(u32 value); +extern struct resource *amd_get_mmconfig_range(struct resource *res); extern int amd_cache_northbridges(void); extern void amd_flush_garts(void); extern int amd_numa_init(void); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4c39baa..bae1efe 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -119,6 +119,37 @@ bool __init early_is_amd_nb(u32 device) return false; } +struct resource *amd_get_mmconfig_range(struct resource *res) +{ + u32 address; + u64 base, msr; + unsigned segn_busn_bits; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return NULL; + + /* assume all cpus from fam10h have mmconfig */ + if (boot_cpu_data.x86 < 0x10) + return NULL; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, msr); + + /* mmconfig is not enabled */ + if (!(msr & FAM10H_MMIO_CONF_ENABLE)) + return NULL; + + base = msr & (FAM10H_MMIO_CONF_BASE_MASK<> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + res->flags = IORESOURCE_MEM; + res->start = base; + res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1; + return res; +} + int amd_get_subcaches(int cpu) { struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 7b7a897..0567df3 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -30,34 +30,6 @@ static struct pci_hostbridge_probe pci_probes[] __initdata = { { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, }; -static u64 __initdata fam10h_mmconf_start; -static u64 __initdata fam10h_mmconf_end; -static void __init get_pci_mmcfg_amd_fam10h_range(void) -{ - u32 address; - u64 base, msr; - unsigned segn_busn_bits; - - /* assume all cpus from fam10h have mmconf */ - if (boot_cpu_data.x86 < 0x10) - return; - - address = MSR_FAM10H_MMIO_CONF_BASE; - rdmsrl(address, msr); - - /* mmconfig is not enable */ - if (!(msr & FAM10H_MMIO_CONF_ENABLE)) - return; - - base = msr & (FAM10H_MMIO_CONF_BASE_MASK<> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & - FAM10H_MMIO_CONF_BUSRANGE_MASK; - - fam10h_mmconf_start = base; - fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1; -} - #define RANGE_NUM 16 /** @@ -85,6 +57,9 @@ static int __init early_fill_mp_bus_info(void) u64 val; u32 address; bool found; + struct resource fam10h_mmconf_res, *fam10h_mmconf; + u64 fam10h_mmconf_start; + u64 fam10h_mmconf_end; if (!early_pci_allowed()) return -1; @@ -211,12 +186,17 @@ static int __init early_fill_mp_bus_info(void) subtract_range(range, RANGE_NUM, 0, end); /* get mmconfig */ - get_pci_mmcfg_amd_fam10h_range(); + fam10h_mmconf = amd_get_mmconfig_range(&fam10h_mmconf_res); /* need to take out mmconf range */ - if (fam10h_mmconf_end) { - printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); + if (fam10h_mmconf) { + printk(KERN_DEBUG "Fam 10h mmconf %pR\n", fam10h_mmconf); + fam10h_mmconf_start = fam10h_mmconf->start; + fam10h_mmconf_end = fam10h_mmconf->end; subtract_range(range, RANGE_NUM, fam10h_mmconf_start, fam10h_mmconf_end + 1); + } else { + fam10h_mmconf_start = 0; + fam10h_mmconf_end = 0; } /* mmio resource */ -- cgit v1.1 From 76ccc297018d25d55b789bbd508861ef1e2cdb0c Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 16 Dec 2011 17:38:18 -0500 Subject: x86/PCI: Expand the x86_msi_ops to have a restore MSIs. The MSI restore function will become a function pointer in an x86_msi_ops struct. It defaults to the implementation in the io_apic.c and msi.c. We piggyback on the indirection mechanism introduced by "x86: Introduce x86_msi_ops". Cc: x86@kernel.org Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: linux-pci@vger.kernel.org Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci.h | 9 +++++++++ arch/x86/include/asm/x86_init.h | 1 + arch/x86/kernel/x86_init.c | 1 + 3 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index d498943..df75d07 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -112,19 +112,28 @@ static inline void x86_teardown_msi_irq(unsigned int irq) { x86_msi.teardown_msi_irq(irq); } +static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq) +{ + x86_msi.restore_msi_irqs(dev, irq); +} #define arch_setup_msi_irqs x86_setup_msi_irqs #define arch_teardown_msi_irqs x86_teardown_msi_irqs #define arch_teardown_msi_irq x86_teardown_msi_irq +#define arch_restore_msi_irqs x86_restore_msi_irqs /* implemented in arch/x86/kernel/apic/io_apic. */ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void native_teardown_msi_irq(unsigned int irq); +void native_restore_msi_irqs(struct pci_dev *dev, int irq); /* default to the implementation in drivers/lib/msi.c */ #define HAVE_DEFAULT_MSI_TEARDOWN_IRQS +#define HAVE_DEFAULT_MSI_RESTORE_IRQS void default_teardown_msi_irqs(struct pci_dev *dev); +void default_restore_msi_irqs(struct pci_dev *dev, int irq); #else #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL #define default_teardown_msi_irqs NULL +#define default_restore_msi_irqs NULL #endif #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 1971e65..cd52084 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -177,6 +177,7 @@ struct x86_msi_ops { int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type); void (*teardown_msi_irq)(unsigned int irq); void (*teardown_msi_irqs)(struct pci_dev *dev); + void (*restore_msi_irqs)(struct pci_dev *dev, int irq); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c1d6cd5..83b05ad 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -114,4 +114,5 @@ struct x86_msi_ops x86_msi = { .setup_msi_irqs = native_setup_msi_irqs, .teardown_msi_irq = native_teardown_msi_irq, .teardown_msi_irqs = default_teardown_msi_irqs, + .restore_msi_irqs = default_restore_msi_irqs, }; -- cgit v1.1 From e58d429209105e698e9d0357481d62b37fe9a7dd Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 6 Jan 2012 11:17:51 -0500 Subject: x86, reboot: Fix typo in nmi reboot path It was brought to my attention that my x86 change to use NMI in the reboot path broke Intel Nehalem and Westmere boxes when using kexec. I realized I had mistyped the if statement in commit 3603a2512f9e69dc87914ba922eb4a0812b21cd6 and stuck the ')' in the wrong spot. Putting it in the right spot fixes kexec again. Doh. Reported-by: Yinghai Lu Cc: Linus Torvalds Signed-off-by: Don Zickus Link: http://lkml.kernel.org/r/1325866671-9797-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 113acda..66c74f4 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -176,7 +176,7 @@ static void native_nmi_stop_other_cpus(int wait) */ if (num_online_cpus() > 1) { /* did someone beat us here? */ - if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id() != -1)) + if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) return; if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, -- cgit v1.1 From 72142fd4109105c6bd21658966ca5e93c1684081 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 7 Jan 2012 14:10:18 -0800 Subject: x86: Move from trace_syscalls.c to asm/syscall.h This reverts commit d5e553d6e0a4bdea43adae7373e3fa144b9a1aaa, which caused large numbers of build warnings on PowerPC. This moves the #include to , which makes some kind of sense since NR_syscalls is syscalls related. Reported-by: Stephen Rothwell Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/20111214181545.6e13bc954cb7ddce9086e861@canb.auug.org.au --- arch/x86/include/asm/syscall.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index c4a348f..d962e56 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -15,6 +15,7 @@ #include #include +#include /* For NR_syscalls */ extern const unsigned long sys_call_table[]; -- cgit v1.1 From dc6821e0cfe74802aefd2067b40fcdc03fc4599e Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Sat, 7 Jan 2012 21:27:38 -0500 Subject: xen/mmu: Fix compile errors introduced by x86/memblock mismerge. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The git commit d4bbf7e7759afc172e2bfbc5c416324590049cdd "Merge branch 'master' into x86/memblock" mismerged the 32-bit section causing: arch/x86/xen/mmu.c: In function ‘xen_setup_kernel_pagetable’: arch/x86/xen/mmu.c:1855: error: expected ‘;’ before ‘)’ token arch/x86/xen/mmu.c:1855: error: expected statement before ‘)’ token Acked-by: Tejun Heo Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f4bf8aa..58a0e46 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1852,7 +1852,7 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, xen_write_cr3(__pa(initial_page_table)); memblock_reserve(__pa(xen_start_info->pt_base), - xen_start_info->nr_pt_frames * PAGE_SIZE)); + xen_start_info->nr_pt_frames * PAGE_SIZE); return initial_page_table; } -- cgit v1.1 From e39f560239984c3098237ad94c9449b1494163f8 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 10 Jan 2012 15:10:21 -0800 Subject: fs: binfmt_elf: create Kconfig variable for PIE randomization Randomization of PIE load address is hard coded in binfmt_elf.c for X86 and ARM. Create a new Kconfig variable (CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE) for this and use it instead. Thus architecture specific policy is pushed out of the generic binfmt_elf.c and into the architecture Kconfig files. X86 and ARM Kconfigs are modified to select the new variable so there is no change in behavior. A follow on patch will select it for MIPS too. Signed-off-by: David Daney Cc: Russell King Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Alexander Viro Acked-by: H. Peter Anvin Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d2a69d..d6ddc0b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -62,6 +62,7 @@ config X86 select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER + select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_ARCH_JUMP_LABEL select HAVE_TEXT_POKE_SMP select HAVE_GENERIC_HARDIRQS -- cgit v1.1 From 5e6292c0f28f03dfdb8ea3d685f0b838a23bfba4 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 10 Jan 2012 15:11:17 -0800 Subject: signal: add block_sigmask() for adding sigmask to current->blocked Abstract the code sequence for adding a signal handler's sa_mask to current->blocked because the sequence is identical for all architectures. Furthermore, in the past some architectures actually got this code wrong, so introduce a wrapper that all architectures can use. Signed-off-by: Matt Fleming Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Tejun Heo Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/signal.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 54ddaeb2..46a01bdc 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -682,7 +682,6 @@ static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - sigset_t blocked; int ret; /* Are we from a system call? */ @@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ regs->flags &= ~X86_EFLAGS_TF; - sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, sig); - set_current_blocked(&blocked); + block_sigmask(ka, sig); tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); -- cgit v1.1 From b6c96c0214138186f495e3ee73737c6fc5e4efa2 Mon Sep 17 00:00:00 2001 From: Stratos Psomadakis Date: Thu, 12 Jan 2012 15:44:47 +1030 Subject: lguest: Make sure interrupt is allocated ok by lguest_setup_irq Make sure the interrupt is allocated correctly by lguest_setup_irq (check the return value of irq_alloc_desc_at for -ENOMEM) Signed-off-by: Stratos Psomadakis Signed-off-by: Rusty Russell (cleanups and commentry) --- arch/x86/lguest/boot.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index cf4603b..642d880 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -856,18 +856,23 @@ static void __init lguest_init_IRQ(void) } /* - * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so - * rather than set them in lguest_init_IRQ we are called here every time an - * lguest device needs an interrupt. - * - * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should - * pass that up! + * Interrupt descriptors are allocated as-needed, but low-numbered ones are + * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it + * tells us the irq is already used: other errors (ie. ENOMEM) we take + * seriously. */ -void lguest_setup_irq(unsigned int irq) +int lguest_setup_irq(unsigned int irq) { - irq_alloc_desc_at(irq, 0); + int err; + + /* Returns -ve error or vector number. */ + err = irq_alloc_desc_at(irq, 0); + if (err < 0 && err != -EEXIST) + return err; + irq_set_chip_and_handler_name(irq, &lguest_irq_controller, handle_level_irq, "level"); + return 0; } /* -- cgit v1.1 From 5cf9a4e69c1ff0ccdd1d2b7404f95c0531355274 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 12 Jan 2012 08:01:40 -0700 Subject: x86/PCI: build amd_bus.o only when CONFIG_AMD_NB=y We only need amd_bus.o for AMD systems with PCI. arch/x86/pci/Makefile already depends on CONFIG_PCI=y, so this patch just adds the dependency on CONFIG_AMD_NB. Cc: Yinghai Lu Cc: stable@kernel.org # 2.6.34+ (needs adjustment for k8 -> amd rename) Signed-off-by: Bjorn Helgaas Signed-off-by: Linus Torvalds --- arch/x86/pci/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 75b06f3..e76e18c 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -18,8 +18,9 @@ obj-$(CONFIG_X86_NUMAQ) += numaq_32.o obj-$(CONFIG_X86_INTEL_MID) += mrst.o obj-y += common.o early.o -obj-y += amd_bus.o bus_numa.o +obj-y += bus_numa.o +obj-$(CONFIG_AMD_NB) += amd_bus.o obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o ifeq ($(CONFIG_PCI_DEBUG),y) -- cgit v1.1 From bccd17294a26b67a8a19aaa120e3eeaa7da49281 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 11 Jan 2012 05:11:46 +0400 Subject: x86: Get rid of 'dubious one-bit signed bitfield' sprase warning This very noisy sparse warning appears on almost every file in the kernel: CHECK init/main.c arch/x86/include/asm/thread_info.h:43:55: error: dubious one-bit signed bitfield arch/x86/include/asm/thread_info.h:44:46: error: dubious one-bit signed bitfield This patch changes sig_on_uaccess_error and uaccess_err flags to unsigned type and thus fixes the warning. Signed-off-by: Anton Vorontsov Acked-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- arch/x86/include/asm/thread_info.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 7404715..bc817cd 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -40,8 +40,8 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif - int sig_on_uaccess_error:1; - int uaccess_err:1; /* uaccess failed */ + unsigned int sig_on_uaccess_error:1; + unsigned int uaccess_err:1; /* uaccess failed */ }; #define INIT_THREAD_INFO(tsk) \ -- cgit v1.1 From 476bc0015bf09dad39d36a8b19f76f0c181d1ec9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 13 Jan 2012 09:32:18 +1030 Subject: module_param: make bool parameters really bool (arch) module_param(bool) used to counter-intuitively take an int. In fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy trick. It's time to remove the int/unsigned int option. For this version it'll simply give a warning, but it'll break next kernel version. Signed-off-by: Rusty Russell --- arch/x86/kernel/apm_32.c | 16 ++++++++-------- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/vmx.c | 18 +++++++++--------- arch/x86/kvm/x86.c | 4 ++-- arch/x86/mm/mmio-mod.c | 4 ++-- arch/x86/platform/geode/alix.c | 2 +- arch/x86/platform/iris/iris.c | 2 +- 7 files changed, 24 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index a46bd38..f76623c 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -383,21 +383,21 @@ static int ignore_sys_suspend; static int ignore_normal_resume; static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; -static int debug __read_mostly; -static int smp __read_mostly; +static bool debug __read_mostly; +static bool smp __read_mostly; static int apm_disabled = -1; #ifdef CONFIG_SMP -static int power_off; +static bool power_off; #else -static int power_off = 1; +static bool power_off = 1; #endif -static int realmode_power_off; +static bool realmode_power_off; #ifdef CONFIG_APM_ALLOW_INTS -static int allow_ints = 1; +static bool allow_ints = 1; #else -static int allow_ints; +static bool allow_ints; #endif -static int broken_psr; +static bool broken_psr; static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2a2a9b4..224b02c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -74,7 +74,7 @@ enum { #endif #ifdef MMU_DEBUG -static int dbg = 0; +static bool dbg = 0; module_param(dbg, bool, 0644); #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 906a7e8..d29216c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -51,29 +51,29 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); -static int __read_mostly enable_vpid = 1; +static bool __read_mostly enable_vpid = 1; module_param_named(vpid, enable_vpid, bool, 0444); -static int __read_mostly flexpriority_enabled = 1; +static bool __read_mostly flexpriority_enabled = 1; module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); -static int __read_mostly enable_ept = 1; +static bool __read_mostly enable_ept = 1; module_param_named(ept, enable_ept, bool, S_IRUGO); -static int __read_mostly enable_unrestricted_guest = 1; +static bool __read_mostly enable_unrestricted_guest = 1; module_param_named(unrestricted_guest, enable_unrestricted_guest, bool, S_IRUGO); -static int __read_mostly emulate_invalid_guest_state = 0; +static bool __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); -static int __read_mostly vmm_exclusive = 1; +static bool __read_mostly vmm_exclusive = 1; module_param(vmm_exclusive, bool, S_IRUGO); -static int __read_mostly yield_on_hlt = 1; +static bool __read_mostly yield_on_hlt = 1; module_param(yield_on_hlt, bool, S_IRUGO); -static int __read_mostly fasteoi = 1; +static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); /* @@ -81,7 +81,7 @@ module_param(fasteoi, bool, S_IRUGO); * VMX and be a hypervisor for its own guests. If nested=0, guests may not * use VMX instructions. */ -static int __read_mostly nested = 0; +static bool __read_mostly nested = 0; module_param(nested, bool, S_IRUGO); #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1171def..14d6cad 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -88,8 +88,8 @@ static void process_nmi(struct kvm_vcpu *vcpu); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); -int ignore_msrs = 0; -module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); +static bool ignore_msrs = 0; +module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); bool kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index de54b9b..dc0b727 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -75,8 +75,8 @@ static LIST_HEAD(trace_list); /* struct remap_trace */ /* module parameters */ static unsigned long filter_offset; -static int nommiotrace; -static int trace_pc; +static bool nommiotrace; +static bool trace_pc; module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index ca19736..dc5f1d3 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -27,7 +27,7 @@ #include -static int force = 0; +static bool force = 0; module_param(force, bool, 0444); /* FIXME: Award bios is not automatically detected as Alix platform */ MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform"); diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c index 1ba7f5e..5917eb5 100644 --- a/arch/x86/platform/iris/iris.c +++ b/arch/x86/platform/iris/iris.c @@ -42,7 +42,7 @@ MODULE_AUTHOR("Sébastien Hinderer "); MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille"); MODULE_SUPPORTED_DEVICE("Eurobraille/Iris"); -static int force; +static bool force; module_param(force, bool, 0); MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation."); -- cgit v1.1 From 43570fd2f47ba518145e9289f54cde3dba4c8b25 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 12 Jan 2012 17:17:27 -0800 Subject: mm,slub,x86: decouple size of struct page from CONFIG_CMPXCHG_LOCAL While implementing cmpxchg_double() on s390 I realized that we don't set CONFIG_CMPXCHG_LOCAL despite the fact that we have support for it. However setting that option will increase the size of struct page by eight bytes on 64 bit, which we certainly do not want. Also, it doesn't make sense that a present cpu feature should increase the size of struct page. Besides that it looks like the dependency to CMPXCHG_LOCAL is wrong and that it should depend on CMPXCHG_DOUBLE instead. This patch: If an architecture supports CMPXCHG_LOCAL this shouldn't result automatically in larger struct pages if the SLUB allocator is used. Instead introduce a new config option "HAVE_ALIGNED_STRUCT_PAGE" which can be selected if a double word aligned struct page is required. Also update x86 Kconfig so that it should work as before. Signed-off-by: Heiko Carstens Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a150f4c..5201a2c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,6 +60,7 @@ config X86 select PERF_EVENTS select HAVE_PERF_EVENTS_NMI select ANON_INODES + select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE -- cgit v1.1 From 4156153c4daddf12dd386016f96a947a01e93bf4 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 12 Jan 2012 17:17:30 -0800 Subject: mm,x86,um: move CMPXCHG_LOCAL config option Move CMPXCHG_LOCAL and rename it to HAVE_CMPXCHG_LOCAL so architectures can simply select the option if it is supported. Signed-off-by: Heiko Carstens Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/Kconfig.cpu | 3 --- arch/x86/um/Kconfig | 4 ---- 3 files changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5201a2c..59717fd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -61,6 +61,7 @@ config X86 select HAVE_PERF_EVENTS_NMI select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 + select HAVE_CMPXCHG_LOCAL if !M386 select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index e3ca7e0..99d2ab8 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -309,9 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT config X86_CMPXCHG def_bool X86_64 || (X86_32 && !M386) -config CMPXCHG_LOCAL - def_bool X86_64 || (X86_32 && !M386) - config CMPXCHG_DOUBLE def_bool y diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 1d97bd8..a62bfc6 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -6,10 +6,6 @@ menu "UML-specific options" menu "Host processor type and features" -config CMPXCHG_LOCAL - bool - default n - config CMPXCHG_DOUBLE bool default n -- cgit v1.1 From 2565409fc0303f3ab8d66b8326702a687962a29b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 12 Jan 2012 17:17:33 -0800 Subject: mm,x86,um: move CMPXCHG_DOUBLE config option Move CMPXCHG_DOUBLE and rename it to HAVE_CMPXCHG_DOUBLE so architectures can simply select the option if it is supported. Signed-off-by: Heiko Carstens Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/Kconfig.cpu | 3 --- arch/x86/um/Kconfig | 4 ---- 3 files changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 59717fd..6c14ecd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -62,6 +62,7 @@ config X86 select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_CMPXCHG_LOCAL if !M386 + select HAVE_CMPXCHG_DOUBLE select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 99d2ab8..3c57033 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -309,9 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT config X86_CMPXCHG def_bool X86_64 || (X86_32 && !M386) -config CMPXCHG_DOUBLE - def_bool y - config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index a62bfc6..b2b54d2 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -6,10 +6,6 @@ menu "UML-specific options" menu "Host processor type and features" -config CMPXCHG_DOUBLE - bool - default n - source "arch/x86/Kconfig.cpu" endmenu -- cgit v1.1 From 9512938b885304f72c847379611d6018064af840 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Thu, 12 Jan 2012 17:20:09 -0800 Subject: cpumask: update setup_node_to_cpumask_map() comments node_to_cpumask() has been replaced by cpumask_of_node(), and wholly removed since commit 29c337a0 ("cpumask: remove obsolete node_to_cpumask now everyone uses cpumask_of_node"). So update the comments for setup_node_to_cpumask_map(). Signed-off-by: Wanlong Gao Acked-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 020cd2e..19d3fa0 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -110,7 +110,7 @@ void __cpuinit numa_clear_node(int cpu) * Allocate node_to_cpumask_map based on number of available nodes * Requires node_possible_map to be valid. * - * Note: node_to_cpumask() is not valid until after this is done. + * Note: cpumask_of_node() is not valid until after this is done. * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.) */ void __init setup_node_to_cpumask_map(void) -- cgit v1.1 From a3301b751b19f0efbafddc4034f8e7ce6bf3007b Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 14 Jan 2012 08:11:31 +0530 Subject: x86/mce: Fix CPU hotplug and suspend regression related to MCE Commit 8a25a2fd126c ("cpu: convert 'cpu' and 'machinecheck' sysdev_class to a regular subsystem") changed how things are dealt with in the MCE subsystem. Some of the things that got broken due to this are CPU hotplug and suspend/hibernate. MCE uses per_cpu allocations of struct device. So, when a CPU goes offline and comes back online, in order to ensure that we start from a clean slate with respect to the MCE subsystem, zero out the entire per_cpu device structure to 0 before using it. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index f22a9f7..29ba329 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2011,7 +2011,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&dev->kobj, 0, sizeof(struct kobject)); + memset(dev, 0, sizeof(struct device)); dev->id = cpu; dev->bus = &mce_subsys; -- cgit v1.1 From e032d80774315869aa2285b217fdbbfed86c0b49 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 16 Jan 2012 14:40:28 -0800 Subject: mce: fix warning messages about static struct mce_device When suspending, there was a large list of warnings going something like: Device 'machinecheck1' does not have a release() function, it is broken and must be fixed This patch turns the static mce_devices into dynamically allocated, and properly frees them when they are removed from the system. It solves the warning messages on my laptop here. Reported-by: "Srivatsa S. Bhat" Reported-by: Linus Torvalds Tested-by: Djalal Harouni Cc: Kay Sievers Cc: Tony Luck Cc: Borislav Petkov Signed-off-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 18 ++++++++++++++---- arch/x86/kernel/cpu/mcheck/mce_amd.c | 18 +++++++++++------- 3 files changed, 26 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index f35ce43..6aefb14 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -151,7 +151,7 @@ static inline void enable_p5_mce(void) {} void mce_setup(struct mce *m); void mce_log(struct mce *m); -DECLARE_PER_CPU(struct device, mce_device); +extern struct device *mce_device[CONFIG_NR_CPUS]; /* * Maximum banks number. diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 29ba329..5a11ae2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1859,7 +1859,7 @@ static struct bus_type mce_subsys = { .dev_name = "machinecheck", }; -DEFINE_PER_CPU(struct device, mce_device); +struct device *mce_device[CONFIG_NR_CPUS]; __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); @@ -2001,19 +2001,27 @@ static struct device_attribute *mce_device_attrs[] = { static cpumask_var_t mce_device_initialized; +static void mce_device_release(struct device *dev) +{ + kfree(dev); +} + /* Per cpu device init. All of the cpus still share the same ctrl bank: */ static __cpuinit int mce_device_create(unsigned int cpu) { - struct device *dev = &per_cpu(mce_device, cpu); + struct device *dev; int err; int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; - memset(dev, 0, sizeof(struct device)); + dev = kzalloc(sizeof *dev, GFP_KERNEL); + if (!dev) + return -ENOMEM; dev->id = cpu; dev->bus = &mce_subsys; + dev->release = &mce_device_release; err = device_register(dev); if (err) @@ -2030,6 +2038,7 @@ static __cpuinit int mce_device_create(unsigned int cpu) goto error2; } cpumask_set_cpu(cpu, mce_device_initialized); + mce_device[cpu] = dev; return 0; error2: @@ -2046,7 +2055,7 @@ error: static __cpuinit void mce_device_remove(unsigned int cpu) { - struct device *dev = &per_cpu(mce_device, cpu); + struct device *dev = mce_device[cpu]; int i; if (!cpumask_test_cpu(cpu, mce_device_initialized)) @@ -2060,6 +2069,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu) device_unregister(dev); cpumask_clear_cpu(cpu, mce_device_initialized); + mce_device[cpu] = NULL; } /* Make sure there are no machine checks on offlined CPUs. */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ba0b94a..786e76a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -523,6 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; + struct device *dev = mce_device[cpu]; char name[32]; sprintf(name, "threshold_bank%i", bank); @@ -543,8 +544,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (!b) goto out; - err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj, - b->kobj, name); + err = sysfs_create_link(&dev->kobj, b->kobj, name); if (err) goto out; @@ -565,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj); + b->kobj = kobject_create_and_add(name, &dev->kobj); if (!b->kobj) goto out_free; @@ -585,8 +585,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - err = sysfs_create_link(&per_cpu(mce_device, i).kobj, - b->kobj, name); + dev = mce_device[i]; + if (dev) + err = sysfs_create_link(&dev->kobj,b->kobj, name); if (err) goto out; @@ -649,6 +650,7 @@ static void deallocate_threshold_block(unsigned int cpu, static void threshold_remove_bank(unsigned int cpu, int bank) { struct threshold_bank *b; + struct device *dev; char name[32]; int i = 0; @@ -663,7 +665,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name); + sysfs_remove_link(&mce_device[cpu]->kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -675,7 +677,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - sysfs_remove_link(&per_cpu(mce_device, i).kobj, name); + dev = mce_device[i]; + if (dev) + sysfs_remove_link(&dev->kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; } -- cgit v1.1 From b54ac6d2a25084667da781c7ca2cebef52a2bcdd Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 8 Dec 2011 11:25:49 +0800 Subject: ACPI, Record ACPI NVS regions Some firmware will access memory in ACPI NVS region via APEI. That is, instructions in APEI ERST/EINJ table will read/write ACPI NVS region. The original resource conflict checking in APEI code will check memory/ioport accessed by APEI via general resource management mechanism. But ACPI NVS region is marked as busy already, so that the false resource conflict will prevent APEI ERST/EINJ to work. To fix this, this patch record ACPI NVS regions, so that we can avoid request resources for memory region inside it. Signed-off-by: Huang Ying Signed-off-by: Len Brown --- arch/x86/kernel/e820.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 303a0e4..51c3b18 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -714,7 +714,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) } #endif -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_ACPI /** * Mark ACPI NVS memory region, so that we can save/restore it during * hibernation and the subsequent resume. @@ -727,7 +727,7 @@ static int __init e820_mark_nvs_memory(void) struct e820entry *ei = &e820.map[i]; if (ei->type == E820_NVS) - suspend_nvs_register(ei->addr, ei->size); + acpi_nvs_register(ei->addr, ei->size); } return 0; -- cgit v1.1 From cd298f60a2451a16e0f077404bf69b62ec868733 Mon Sep 17 00:00:00 2001 From: Kurt Garloff Date: Tue, 17 Jan 2012 04:20:31 -0500 Subject: ACPI, x86: Use SRAT table rev to use 8bit or 32bit PXM fields (x86/x86-64) In SRAT v1, we had 8bit proximity domain (PXM) fields; SRAT v2 provides 32bits for these. The new fields were reserved before. According to the ACPI spec, the OS must disregrard reserved fields. x86/x86-64 was rather inconsistent prior to this patch; it used 8 bits for the pxm field in cpu_affinity, but 32 bits in mem_affinity. This patch makes it consistent: Either use 8 bits consistently (SRAT rev 1 or lower) or 32 bits (SRAT rev 2 or higher). cc: x86@kernel.org Signed-off-by: Kurt Garloff Signed-off-by: Len Brown --- arch/x86/mm/srat.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 81dbfde..7efd0c6 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -104,6 +104,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) return; pxm = pa->proximity_domain_lo; + if (acpi_srat_revision >= 2) + pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8; node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); @@ -155,6 +157,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) start = ma->base_address; end = start + ma->length; pxm = ma->proximity_domain; + if (acpi_srat_revision <= 1) + pxm &= 0xff; node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains.\n"); -- cgit v1.1 From d7e7528bcd456f5c36ad4a202ccfb43c5aa98bc4 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: Audit: push audit success and retcode into arch ptrace.h The audit system previously expected arches calling to audit_syscall_exit to supply as arguments if the syscall was a success and what the return code was. Audit also provides a helper AUDITSC_RESULT which was supposed to simplify things by converting from negative retcodes to an audit internal magic value stating success or failure. This helper was wrong and could indicate that a valid pointer returned to userspace was a failed syscall. The fix is to fix the layering foolishness. We now pass audit_syscall_exit a struct pt_reg and it in turns calls back into arch code to collect the return value and to determine if the syscall was a success or failure. We also define a generic is_syscall_success() macro which determines success/failure based on if the value is < -MAX_ERRNO. This works for arches like x86 which do not use a separate mechanism to indicate syscall failure. We make both the is_syscall_success() and regs_return_value() static inlines instead of macros. The reason is because the audit function must take a void* for the regs. (uml calls theirs struct uml_pt_regs instead of just struct pt_regs so audit_syscall_exit can't take a struct pt_regs). Since the audit function takes a void* we need to use static inlines to cast it back to the arch correct structure to dereference it. The other major change is that on some arches, like ia64, MIPS and ppc, we change regs_return_value() to give us the negative value on syscall failure. THE only other user of this macro, kretprobe_example.c, won't notice and it makes the value signed consistently for the audit functions across all archs. In arch/sh/kernel/ptrace_64.c I see that we were using regs[9] in the old audit code as the return value. But the ptrace_64.h code defined the macro regs_return_value() as regs[3]. I have no idea which one is correct, but this patch now uses the regs_return_value() function, so it now uses regs[3]. For powerpc we previously used regs->result but now use the regs_return_value() function which uses regs->gprs[3]. regs->gprs[3] is always positive so the regs_return_value(), much like ia64 makes it negative before calling the audit code when appropriate. Signed-off-by: Eric Paris Acked-by: H. Peter Anvin [for x86 portion] Acked-by: Tony Luck [for ia64] Acked-by: Richard Weinberger [for uml] Acked-by: David S. Miller [for sparc] Acked-by: Ralf Baechle [for mips] Acked-by: Benjamin Herrenschmidt [for ppc] --- arch/x86/ia32/ia32entry.S | 10 +++++----- arch/x86/kernel/entry_32.S | 8 ++++---- arch/x86/kernel/entry_64.S | 10 +++++----- arch/x86/kernel/ptrace.c | 3 +-- arch/x86/kernel/vm86_32.c | 4 ++-- arch/x86/um/shared/sysdep/ptrace.h | 5 +++++ 6 files changed, 22 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 3e27456..64ced0b 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -14,6 +14,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -208,12 +209,11 @@ sysexit_from_sys_call: TRACE_IRQS_ON sti movl %eax,%esi /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit - movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ + call __audit_syscall_exit + movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 22d0e21..a22facf 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -42,6 +42,7 @@ */ #include +#include #include #include #include @@ -466,11 +467,10 @@ sysexit_audit: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%eax /* zero-extend that */ - inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a20e1cb..e51393d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -563,17 +564,16 @@ auditsys: jmp system_call_fastpath /* - * Return fast path for syscall audit. Call audit_syscall_exit() + * Return fast path for syscall audit. Call __audit_syscall_exit() * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT * masked off. */ sysret_audit: movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ - cmpq $0,%rsi /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi jmp sysret_check #endif /* CONFIG_AUDITSYSCALL */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 89a04c7..8b02187 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1414,8 +1414,7 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 863f875..af17e1c 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -335,9 +335,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call audit_syscall_exit since we do not exit via the normal paths */ + /*call __audit_syscall_exit since we do not exit via the normal paths */ if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(0), 0); + __audit_syscall_exit(1, 0); __asm__ __volatile__( "movl %0,%%esp\n\t" diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 711b162..5ef9344 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -3,3 +3,8 @@ #else #include "ptrace_64.h" #endif + +static inline long regs_return_value(struct uml_pt_regs *regs) +{ + return UPT_SYSCALL_RET(regs); +} -- cgit v1.1 From f031cd25568a390dc2c9c3a4015054183753449a Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: audit: ia32entry.S sign extend error codes when calling 64 bit code In the ia32entry syscall exit audit fastpath we have assembly code which calls __audit_syscall_exit directly. This code was, however, zeroes the upper 32 bits of the return code. It then proceeded to call code which expects longs to be 64bits long. In order to handle code which expects longs to be 64bit we sign extend the return code if that code is an error. Thus the __audit_syscall_exit function can correctly handle using the values in snprintf("%ld"). This fixes the regression introduced in 5cbf1565f29eb57a86a. Old record: type=SYSCALL msg=audit(1306197182.256:281): arch=40000003 syscall=192 success=no exit=4294967283 New record: type=SYSCALL msg=audit(1306197182.256:281): arch=40000003 syscall=192 success=no exit=-13 Signed-off-by: Eric Paris Acked-by: H. Peter Anvin --- arch/x86/ia32/ia32entry.S | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 64ced0b..025f0f0 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -210,7 +210,9 @@ sysexit_from_sys_call: sti movl %eax,%esi /* second arg, syscall return value */ cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - setbe %al /* 1 if so, 0 if not */ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ call __audit_syscall_exit movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ -- cgit v1.1 From b05d8447e7821695bc2fa3359431f7a664232743 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 3 Jan 2012 14:23:06 -0500 Subject: audit: inline audit_syscall_entry to reduce burden on archs Every arch calls: if (unlikely(current->audit_context)) audit_syscall_entry() which requires knowledge about audit (the existance of audit_context) in the arch code. Just do it all in static inline in audit.h so that arch's can remain blissfully ignorant. Signed-off-by: Eric Paris --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/entry_64.S | 4 ++-- arch/x86/kernel/ptrace.c | 22 ++++++++++------------ 4 files changed, 14 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 025f0f0..cecfd9a 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -192,7 +192,7 @@ sysexit_from_sys_call: movl %ebx,%edx /* 3rd arg: 1st syscall arg */ movl %eax,%esi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a22facf..1ccd742 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -456,7 +456,7 @@ sysenter_audit: movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e51393d..1ca66b6 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -549,7 +549,7 @@ badsys: #ifdef CONFIG_AUDITSYSCALL /* * Fast path for syscall audit without full syscall trace. - * We just call audit_syscall_entry() directly, and then + * We just call __audit_syscall_entry() directly, and then * jump back to the normal fast path. */ auditsys: @@ -559,7 +559,7 @@ auditsys: movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ movq %rax,%rsi /* 2nd arg: syscall number */ movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ - call audit_syscall_entry + call __audit_syscall_entry LOAD_ARGS 0 /* reload call-clobbered registers */ jmp system_call_fastpath diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8b02187..5026738 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->orig_ax); - if (unlikely(current->audit_context)) { - if (IS_IA32) - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_ax, - regs->bx, regs->cx, - regs->dx, regs->si); + if (IS_IA32) + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); #ifdef CONFIG_X86_64 - else - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_ax, - regs->di, regs->si, - regs->dx, regs->r10); + else + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); #endif - } return ret ?: regs->orig_ax; } -- cgit v1.1 From 6015ff103133c7e50a753c198c69bcabc3a5e3b0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 18 Jan 2012 01:51:22 +0000 Subject: x86-32: Fix build failure with AUDIT=y, AUDITSYSCALL=n JONGMAN HEO reports: With current linus git (commit a25a2b84), I got following build error, arch/x86/kernel/vm86_32.c: In function 'do_sys_vm86': arch/x86/kernel/vm86_32.c:340: error: implicit declaration of function '__audit_syscall_exit' make[3]: *** [arch/x86/kernel/vm86_32.o] Error 1 OK, I can reproduce it (32bit allmodconfig with AUDIT=y, AUDITSYSCALL=n) It's due to commit d7e7528bcd45: "Audit: push audit success and retcode into arch ptrace.h". Reported-by: JONGMAN HEO Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- arch/x86/kernel/vm86_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index af17e1c..b466cab 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -336,8 +336,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk mark_screen_rdonly(tsk->mm); /*call __audit_syscall_exit since we do not exit via the normal paths */ +#ifdef CONFIG_AUDITSYSCALL if (unlikely(current->audit_context)) __audit_syscall_exit(1, 0); +#endif __asm__ __volatile__( "movl %0,%%esp\n\t" -- cgit v1.1 From 90a4c0f51e8e44111a926be6f4c87af3938a79c3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 18 Jan 2012 19:26:11 -0800 Subject: uml: fix compile for x86-64 Randy Dunlap reports that we get arch/x86/um/shared/sysdep/ptrace.h:7:20: error: redefinition of 'regs_return_value' arch/x86/um/shared/sysdep/ptrace.h:7:20: note: previous definition of 'regs_return_value' was here when compiling UML for x86-64. Stephen Rothwell root-caused it and says: "Caused by commit d7e7528bcd45 ("Audit: push audit success and retcode into arch ptrace.h") (another patch that was never in linux-next :-(). This file now needs protection against double inclusion." so let's do as the man says. Reported-by: Randy Dunlap Analyzed-by: Stephen Rothwell Signed-off-by: Linus Torvalds --- arch/x86/um/shared/sysdep/ptrace.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h index 5ef9344..2bbe1ec2 100644 --- a/arch/x86/um/shared/sysdep/ptrace.h +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -1,3 +1,6 @@ +#ifndef __SYSDEP_X86_PTRACE_H +#define __SYSDEP_X86_PTRACE_H + #ifdef __i386__ #include "ptrace_32.h" #else @@ -8,3 +11,5 @@ static inline long regs_return_value(struct uml_pt_regs *regs) { return UPT_SYSCALL_RET(regs); } + +#endif /* __SYSDEP_X86_PTRACE_H */ -- cgit v1.1