/*
 * Copyright IBM Corp. 2007,2009
 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

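/*
 * Allocation and freeing of CRST (region/segment) tables and page
 * tables, including RCU-deferred freeing, address space upgrade and
 * downgrade, and pgste setup for KVM (s390_enable_sie).
 */
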
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

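/*
 * Per-cpu batch of tables whose freeing is deferred until after an RCU
 * grace period.  Page tables are queued from the front of table[],
 * CRST tables from the back; the batch is full when the indices meet.
 */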
struct rcu_table_freelist {
        struct rcu_head rcu;
        struct mm_struct *mm;
        unsigned int pgt_index;
        unsigned int crst_index;
        unsigned long *table[0];
};

#define RCU_FREELIST_SIZE \
        ((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \
          / sizeof(unsigned long))

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist);

static void __page_table_free(struct mm_struct *mm, unsigned long *table);
static void __crst_table_free(struct mm_struct *mm, unsigned long *table);

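/*
 * Return the current cpu's freelist batch, allocating a new one with
 * GFP_ATOMIC if necessary.  Returns NULL if no page is available.
 */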
static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm)
{
        struct rcu_table_freelist **batchp = &__get_cpu_var(rcu_table_freelist);
        struct rcu_table_freelist *batch = *batchp;

        if (batch)
                return batch;
        batch = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC);
        if (batch) {
                batch->mm = mm;
                batch->pgt_index = 0;
                batch->crst_index = RCU_FREELIST_SIZE;
                *batchp = batch;
        }
        return batch;
}

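/*
 * RCU callback: free all page tables and CRST tables queued in the
 * batch, then the batch page itself.
 */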
static void rcu_table_freelist_callback(struct rcu_head *head)
{
        struct rcu_table_freelist *batch =
                container_of(head, struct rcu_table_freelist, rcu);

        while (batch->pgt_index > 0)
                __page_table_free(batch->mm, batch->table[--batch->pgt_index]);
        while (batch->crst_index < RCU_FREELIST_SIZE)
                __crst_table_free(batch->mm, batch->table[batch->crst_index++]);
        free_page((unsigned long) batch);
}

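/*
 * Hand the current cpu's batch over to RCU; a new batch is allocated
 * on the next deferred free.
 */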
void rcu_table_freelist_finish(void)
{
        struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist);

        if (!batch)
                return;
        call_rcu(&batch->rcu, rcu_table_freelist_callback);
        __get_cpu_var(rcu_table_freelist) = NULL;
}

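/*
 * Empty function for smp_call_function(): used as a synchronization
 * point with all other cpus when no batch page could be allocated and
 * a table has to be freed right away.
 */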
static void smp_sync(void *arg)
{
}

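/*
 * On 31 bit a page table is 1K (four per 4K page) and a CRST table is
 * 8K; on 64 bit a page table is 2K (two per page) and a CRST table is
 * 16K, hence ALLOC_ORDER.  FRAG_MASK covers the page->flags bits that
 * track allocated fragments, SECOND_HALVES the bits of the companion
 * fragments used for shadow tables (noexec) or pgstes.
 */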
#ifndef CONFIG_64BIT
#define ALLOC_ORDER     1
#define TABLES_PER_PAGE 4
#define FRAG_MASK       15UL
#define SECOND_HALVES   10UL

void clear_table_pgstes(unsigned long *table)
{
        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
        memset(table + 256, 0, PAGE_SIZE/4);
        clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
        memset(table + 768, 0, PAGE_SIZE/4);
}

#else
#define ALLOC_ORDER     2
#define TABLES_PER_PAGE 2
#define FRAG_MASK       3UL
#define SECOND_HALVES   2UL

void clear_table_pgstes(unsigned long *table)
{
        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
        memset(table + 256, 0, PAGE_SIZE/2);
}

#endif

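/*
 * VMALLOC_START is derived from the end and size of the vmalloc area;
 * the "vmalloc=<size>" early parameter overrides the size.
 */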
unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

static int __init parse_vmalloc(char *arg)
{
        if (!arg)
                return -EINVAL;
        VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
        return 0;
}
early_param("vmalloc", parse_vmalloc);

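/*
 * Allocate a CRST (region or segment) table.  For noexec a shadow
 * table is allocated as well and its address is stored in page->index.
 * The page is linked into the mm's crst_list.
 */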
unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
{
        struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

        if (!page)
                return NULL;
        page->index = 0;
        if (noexec) {
                struct page *shadow = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
                if (!shadow) {
                        __free_pages(page, ALLOC_ORDER);
                        return NULL;
                }
                page->index = page_to_phys(shadow);
        }
        spin_lock_bh(&mm->context.list_lock);
        list_add(&page->lru, &mm->context.crst_list);
        spin_unlock_bh(&mm->context.list_lock);
        return (unsigned long *) page_to_phys(page);
}

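/* Free a CRST table and its shadow without touching the mm's crst_list. */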
static void __crst_table_free(struct mm_struct *mm, unsigned long *table)
{
        unsigned long *shadow = get_shadow_table(table);

        if (shadow)
                free_pages((unsigned long) shadow, ALLOC_ORDER);
        free_pages((unsigned long) table, ALLOC_ORDER);
}

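/* Unlink a CRST table from the mm's crst_list and free it immediately. */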
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
        struct page *page = virt_to_page(table);

        spin_lock_bh(&mm->context.list_lock);
        list_del(&page->lru);
        spin_unlock_bh(&mm->context.list_lock);
        __crst_table_free(mm, table);
}

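/*
 * Free a CRST table with RCU deferral.  If the mm is used only on this
 * cpu the table is freed directly; otherwise it is queued on the
 * per-cpu batch.  If no batch page can be allocated, an empty IPI to
 * all other cpus is used as a synchronization point before the table
 * is freed immediately.
 */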
void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
        struct rcu_table_freelist *batch;
        struct page *page = virt_to_page(table);

        spin_lock_bh(&mm->context.list_lock);
        list_del(&page->lru);
        spin_unlock_bh(&mm->context.list_lock);
        if (atomic_read(&mm->mm_users) < 2 &&
            cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
                __crst_table_free(mm, table);
                return;
        }
        batch = rcu_table_freelist_get(mm);
        if (!batch) {
                smp_call_function(smp_sync, NULL, 1);
                __crst_table_free(mm, table);
                return;
        }
        batch->table[--batch->crst_index] = table;
        if (batch->pgt_index >= batch->crst_index)
                rcu_table_freelist_finish();
}

#ifdef CONFIG_64BIT
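/*
 * Upgrade the address space of an mm by stacking region tables on top
 * of the current top-level table until asce_limit reaches the
 * requested limit (at most 1UL << 53).
 */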
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
        unsigned long *table, *pgd;
        unsigned long entry;

        BUG_ON(limit > (1UL << 53));
repeat:
        table = crst_table_alloc(mm, mm->context.noexec);
        if (!table)
                return -ENOMEM;
        spin_lock_bh(&mm->page_table_lock);
        if (mm->context.asce_limit < limit) {
                pgd = (unsigned long *) mm->pgd;
                if (mm->context.asce_limit <= (1UL << 31)) {
                        entry = _REGION3_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                } else {
                        entry = _REGION2_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 53;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION2;
                }
                crst_table_init(table, entry);
                pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
                mm->pgd = (pgd_t *) table;
                mm->task_size = mm->context.asce_limit;
                table = NULL;
        }
        spin_unlock_bh(&mm->page_table_lock);
        if (table)
                crst_table_free(mm, table);
        if (mm->context.asce_limit < limit)
                goto repeat;
        update_mm(mm, current);
        return 0;
}

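/*
 * Downgrade the address space of an mm by removing top-level region
 * tables until asce_limit does not exceed the requested limit.
 */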
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
        pgd_t *pgd;

        if (mm->context.asce_limit <= limit)
                return;
        __tlb_flush_mm(mm);
        while (mm->context.asce_limit > limit) {
                pgd = mm->pgd;
                switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
                case _REGION_ENTRY_TYPE_R2:
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                        break;
                case _REGION_ENTRY_TYPE_R3:
                        mm->context.asce_limit = 1UL << 31;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_SEGMENT;
                        break;
                default:
                        BUG();
                }
                mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
                mm->task_size = mm->context.asce_limit;
                crst_table_free(mm, (unsigned long *) pgd);
        }
        update_mm(mm, current);
}
#endif

/*
 * Page table (pte) allocation/free routines.  Each 4K page holds
 * TABLES_PER_PAGE page table fragments; the low bits of page->flags
 * record which fragments are in use and pages with free fragments are
 * kept on the mm's pgtable_list.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
        struct page *page;
        unsigned long *table;
        unsigned long bits;

        bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
        spin_lock_bh(&mm->context.list_lock);
        page = NULL;
        if (!list_empty(&mm->context.pgtable_list)) {
                page = list_first_entry(&mm->context.pgtable_list,
                                        struct page, lru);
                if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
                        page = NULL;
        }
        if (!page) {
                spin_unlock_bh(&mm->context.list_lock);
                page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
                if (!page)
                        return NULL;
                pgtable_page_ctor(page);
                page->flags &= ~FRAG_MASK;
                table = (unsigned long *) page_to_phys(page);
                if (mm->context.has_pgste)
                        clear_table_pgstes(table);
                else
                        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
                spin_lock_bh(&mm->context.list_lock);
                list_add(&page->lru, &mm->context.pgtable_list);
        }
        table = (unsigned long *) page_to_phys(page);
        while (page->flags & bits) {
                table += 256;
                bits <<= 1;
        }
        page->flags |= bits;
        if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
                list_move_tail(&page->lru, &mm->context.pgtable_list);
        spin_unlock_bh(&mm->context.list_lock);
        return table;
}

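/*
 * Free a page table fragment whose fragment bits are encoded in the
 * low bits of the table pointer (see page_table_free_rcu).  The page
 * is released once all of its fragments are free.
 */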
static void __page_table_free(struct mm_struct *mm, unsigned long *table)
{
        struct page *page;
        unsigned long bits;

        bits = ((unsigned long) table) & 15;
        table = (unsigned long *)(((unsigned long) table) ^ bits);
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        page->flags ^= bits;
        if (!(page->flags & FRAG_MASK)) {
                pgtable_page_dtor(page);
                __free_page(page);
        }
}

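/*
 * Free a page table fragment immediately.  A page that still has used
 * fragments is moved to the front of pgtable_list so its free
 * fragments are reused first; a fully free page is released.
 */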
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
        struct page *page;
        unsigned long bits;

        bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
        bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        spin_lock_bh(&mm->context.list_lock);
        page->flags ^= bits;
        if (page->flags & FRAG_MASK) {
                /* Page now has some free pgtable fragments. */
                if (!list_empty(&page->lru))
                        list_move(&page->lru, &mm->context.pgtable_list);
                page = NULL;
        } else
                /* All fragments of the 4K page have been freed. */
                list_del(&page->lru);
        spin_unlock_bh(&mm->context.list_lock);
        if (page) {
                pgtable_page_dtor(page);
                __free_page(page);
        }
}

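/*
 * Free a page table fragment with RCU deferral.  The fragment bits are
 * encoded into the low bits of the table pointer and the page is taken
 * off pgtable_list so the fragment is not reused before the RCU
 * callback has run.  Falls back to immediate freeing when the mm is
 * used only on this cpu or no batch page is available.
 */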
void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
        struct rcu_table_freelist *batch;
        struct page *page;
        unsigned long bits;

        if (atomic_read(&mm->mm_users) < 2 &&
            cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
                page_table_free(mm, table);
                return;
        }
        batch = rcu_table_freelist_get(mm);
        if (!batch) {
                smp_call_function(smp_sync, NULL, 1);
                page_table_free(mm, table);
                return;
        }
        bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
        bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        spin_lock_bh(&mm->context.list_lock);
        /* Delayed freeing with rcu prevents reuse of pgtable fragments */
        list_del_init(&page->lru);
        spin_unlock_bh(&mm->context.list_lock);
        table = (unsigned long *)(((unsigned long) table) | bits);
        batch->table[batch->pgt_index++] = table;
        if (batch->pgt_index >= batch->crst_index)
                rcu_table_freelist_finish();
}

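/*
 * Switch off the noexec feature for an mm: free all shadow region and
 * segment tables and mark the second-half page table fragments as
 * available again.
 */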
void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
{
        struct page *page;

        spin_lock_bh(&mm->context.list_lock);
        /* Free shadow region and segment tables. */
        list_for_each_entry(page, &mm->context.crst_list, lru)
                if (page->index) {
                        free_pages((unsigned long) page->index, ALLOC_ORDER);
                        page->index = 0;
                }
        /* "Free" second halves of page tables. */
        list_for_each_entry(page, &mm->context.pgtable_list, lru)
                page->flags &= ~SECOND_HALVES;
        spin_unlock_bh(&mm->context.list_lock);
        mm->context.noexec = 0;
        update_mm(mm, tsk);
}

/*
 * Switch on pgstes for the current userspace process (needed for KVM).
 */
int s390_enable_sie(void)
{
        struct task_struct *tsk = current;
        struct mm_struct *mm, *old_mm;

        /* Do we have a switched amode? If not, we cannot do sie */
        if (user_mode == HOME_SPACE_MODE)
                return -EINVAL;

        /* Do we have pgstes? If yes, we are done */
        if (tsk->mm->context.has_pgste)
                return 0;

        /* Let's check if we are allowed to replace the mm */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
            !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
            tsk->mm != tsk->active_mm) {
                task_unlock(tsk);
                return -EINVAL;
        }
        task_unlock(tsk);

        /* We copy the mm and let dup_mm create the page tables with pgstes */
        tsk->mm->context.alloc_pgste = 1;
        mm = dup_mm(tsk);
        tsk->mm->context.alloc_pgste = 0;
        if (!mm)
                return -ENOMEM;

        /* Now let's check again if something happened */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
            !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
            tsk->mm != tsk->active_mm) {
                mmput(mm);
                task_unlock(tsk);
                return -EINVAL;
        }

        /* OK, we are alone. No ptrace, no threads, etc. */
        old_mm = tsk->mm;
        tsk->mm = tsk->active_mm = mm;
        preempt_disable();
        update_mm(mm, tsk);
        atomic_inc(&mm->context.attach_count);
        atomic_dec(&old_mm->context.attach_count);
        cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
        preempt_enable();
        task_unlock(tsk);
        mmput(old_mm);
        return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

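/*
 * Check whether a kernel page is mapped: lra (load real address) sets
 * condition code 0 only if the address can be translated.  Used by
 * hibernation code when CONFIG_DEBUG_PAGEALLOC may have unmapped
 * pages.
 */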
#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
bool kernel_page_present(struct page *page)
{
        unsigned long addr;
        int cc;

        addr = page_to_phys(page);
        asm volatile(
                "       lra     %1,0(%1)\n"
                "       ipm     %0\n"
                "       srl     %0,28"
                : "=d" (cc), "+a" (addr) : : "cc");
        return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */