Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 1 | /* |
Dave Jones | 835c34a | 2007-10-12 21:10:53 -0400 | [diff] [blame] | 2 | * handle transition of Linux booting another kernel |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> |
| 4 | * |
| 5 | * This source code is licensed under the GNU General Public License, |
| 6 | * Version 2. See the file COPYING for more details. |
| 7 | */ |
| 8 | |
| 9 | #include <linux/mm.h> |
| 10 | #include <linux/kexec.h> |
| 11 | #include <linux/delay.h> |
Rusty Russell | 1a3f239 | 2006-09-26 10:52:32 +0200 | [diff] [blame] | 12 | #include <linux/init.h> |
Ken'ichi Ohmichi | fd59d23 | 2007-10-16 23:27:27 -0700 | [diff] [blame] | 13 | #include <linux/numa.h> |
Ingo Molnar | f43fdad | 2008-05-12 21:20:43 +0200 | [diff] [blame] | 14 | #include <linux/ftrace.h> |
Huang Ying | 3122c33 | 2008-08-15 00:40:26 -0700 | [diff] [blame] | 15 | #include <linux/suspend.h> |
Huang Ying | 92be3d6 | 2008-10-31 09:48:08 +0800 | [diff] [blame] | 16 | #include <linux/gfp.h> |
Ingo Molnar | f43fdad | 2008-05-12 21:20:43 +0200 | [diff] [blame] | 17 | |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 18 | #include <asm/pgtable.h> |
| 19 | #include <asm/pgalloc.h> |
| 20 | #include <asm/tlbflush.h> |
| 21 | #include <asm/mmu_context.h> |
| 22 | #include <asm/io.h> |
| 23 | #include <asm/apic.h> |
| 24 | #include <asm/cpufeature.h> |
Eric W. Biederman | e7b47cc | 2005-07-29 13:01:18 -0600 | [diff] [blame] | 25 | #include <asm/desc.h> |
Zachary Amsden | 4bb0d3e | 2005-09-03 15:56:36 -0700 | [diff] [blame] | 26 | #include <asm/system.h> |
Huang Ying | 3ab8352 | 2008-07-25 19:45:07 -0700 | [diff] [blame] | 27 | #include <asm/cacheflush.h> |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 28 | |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 29 | static void set_idt(void *newidt, __u16 limit) |
| 30 | { |
Glauber de Oliveira Costa | 6b68f01 | 2008-01-30 13:31:12 +0100 | [diff] [blame] | 31 | struct desc_ptr curidt; |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 32 | |
| 33 | /* ia32 supports unaliged loads & stores */ |
Eric W. Biederman | e7b47cc | 2005-07-29 13:01:18 -0600 | [diff] [blame] | 34 | curidt.size = limit; |
| 35 | curidt.address = (unsigned long)newidt; |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 36 | |
Zachary Amsden | f2ab446 | 2005-09-03 15:56:42 -0700 | [diff] [blame] | 37 | load_idt(&curidt); |
WANG Cong | 378fc6e | 2008-06-24 16:21:18 +0100 | [diff] [blame] | 38 | } |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 39 | |
| 40 | |
| 41 | static void set_gdt(void *newgdt, __u16 limit) |
| 42 | { |
Glauber de Oliveira Costa | 6b68f01 | 2008-01-30 13:31:12 +0100 | [diff] [blame] | 43 | struct desc_ptr curgdt; |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 44 | |
| 45 | /* ia32 supports unaligned loads & stores */ |
Eric W. Biederman | e7b47cc | 2005-07-29 13:01:18 -0600 | [diff] [blame] | 46 | curgdt.size = limit; |
| 47 | curgdt.address = (unsigned long)newgdt; |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 48 | |
Zachary Amsden | f2ab446 | 2005-09-03 15:56:42 -0700 | [diff] [blame] | 49 | load_gdt(&curgdt); |
WANG Cong | 378fc6e | 2008-06-24 16:21:18 +0100 | [diff] [blame] | 50 | } |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 51 | |
/*
 * Force a reload of every segment register: a far jump to the next
 * instruction reloads %cs with __KERNEL_CS, after which %ds, %es, %fs,
 * %gs and %ss are each loaded with __KERNEL_DS (via %eax).
 */
static void load_segments(void)
{
/* Two-level stringification so the selector macros are expanded first. */
#define KEXEC_SEL_STR_(sel)	#sel
#define KEXEC_SEL_STR(sel)	KEXEC_SEL_STR_(sel)

	__asm__ __volatile__ (
		"\tljmp $" KEXEC_SEL_STR(__KERNEL_CS) ",$1f\n"
		"\t1:\n"
		"\tmovl $" KEXEC_SEL_STR(__KERNEL_DS) ",%%eax\n"
		"\tmovl %%eax,%%ds\n"
		"\tmovl %%eax,%%es\n"
		"\tmovl %%eax,%%fs\n"
		"\tmovl %%eax,%%gs\n"
		"\tmovl %%eax,%%ss\n"
		: /* no outputs */
		: /* no inputs */
		: "eax", "memory");

#undef KEXEC_SEL_STR
#undef KEXEC_SEL_STR_
}
| 70 | |
Huang Ying | 92be3d6 | 2008-10-31 09:48:08 +0800 | [diff] [blame] | 71 | static void machine_kexec_free_page_tables(struct kimage *image) |
| 72 | { |
| 73 | free_page((unsigned long)image->arch.pgd); |
| 74 | #ifdef CONFIG_X86_PAE |
| 75 | free_page((unsigned long)image->arch.pmd0); |
| 76 | free_page((unsigned long)image->arch.pmd1); |
| 77 | #endif |
| 78 | free_page((unsigned long)image->arch.pte0); |
| 79 | free_page((unsigned long)image->arch.pte1); |
| 80 | } |
| 81 | |
| 82 | static int machine_kexec_alloc_page_tables(struct kimage *image) |
| 83 | { |
| 84 | image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); |
| 85 | #ifdef CONFIG_X86_PAE |
| 86 | image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); |
| 87 | image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); |
| 88 | #endif |
| 89 | image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL); |
| 90 | image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL); |
| 91 | if (!image->arch.pgd || |
| 92 | #ifdef CONFIG_X86_PAE |
| 93 | !image->arch.pmd0 || !image->arch.pmd1 || |
| 94 | #endif |
| 95 | !image->arch.pte0 || !image->arch.pte1) { |
| 96 | machine_kexec_free_page_tables(image); |
| 97 | return -ENOMEM; |
| 98 | } |
| 99 | return 0; |
| 100 | } |
| 101 | |
Huang Ying | 9868ee6 | 2008-10-31 09:48:15 +0800 | [diff] [blame] | 102 | static void machine_kexec_page_table_set_one( |
| 103 | pgd_t *pgd, pmd_t *pmd, pte_t *pte, |
| 104 | unsigned long vaddr, unsigned long paddr) |
| 105 | { |
| 106 | pud_t *pud; |
| 107 | |
| 108 | pgd += pgd_index(vaddr); |
| 109 | #ifdef CONFIG_X86_PAE |
| 110 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) |
| 111 | set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); |
| 112 | #endif |
| 113 | pud = pud_offset(pgd, vaddr); |
| 114 | pmd = pmd_offset(pud, vaddr); |
| 115 | if (!(pmd_val(*pmd) & _PAGE_PRESENT)) |
| 116 | set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); |
| 117 | pte = pte_offset_kernel(pmd, vaddr); |
| 118 | set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); |
| 119 | } |
| 120 | |
| 121 | static void machine_kexec_prepare_page_tables(struct kimage *image) |
| 122 | { |
| 123 | void *control_page; |
| 124 | pmd_t *pmd = 0; |
| 125 | |
| 126 | control_page = page_address(image->control_code_page); |
| 127 | #ifdef CONFIG_X86_PAE |
| 128 | pmd = image->arch.pmd0; |
| 129 | #endif |
| 130 | machine_kexec_page_table_set_one( |
| 131 | image->arch.pgd, pmd, image->arch.pte0, |
| 132 | (unsigned long)control_page, __pa(control_page)); |
| 133 | #ifdef CONFIG_X86_PAE |
| 134 | pmd = image->arch.pmd1; |
| 135 | #endif |
| 136 | machine_kexec_page_table_set_one( |
| 137 | image->arch.pgd, pmd, image->arch.pte1, |
| 138 | __pa(control_page), __pa(control_page)); |
| 139 | } |
| 140 | |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 141 | /* |
| 142 | * A architecture hook called to validate the |
| 143 | * proposed image and prepare the control pages |
Huang Ying | 163f687 | 2008-08-15 00:40:22 -0700 | [diff] [blame] | 144 | * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 145 | * have been allocated, but the segments have yet |
| 146 | * been copied into the kernel. |
| 147 | * |
| 148 | * Do what every setup is needed on image and the |
| 149 | * reboot code buffer to allow us to avoid allocations |
| 150 | * later. |
| 151 | * |
Huang Ying | 92be3d6 | 2008-10-31 09:48:08 +0800 | [diff] [blame] | 152 | * - Make control page executable. |
| 153 | * - Allocate page tables |
Huang Ying | 9868ee6 | 2008-10-31 09:48:15 +0800 | [diff] [blame] | 154 | * - Setup page tables |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 155 | */ |
| 156 | int machine_kexec_prepare(struct kimage *image) |
| 157 | { |
Huang Ying | 9868ee6 | 2008-10-31 09:48:15 +0800 | [diff] [blame] | 158 | int error; |
| 159 | |
Huang Ying | 3ab8352 | 2008-07-25 19:45:07 -0700 | [diff] [blame] | 160 | if (nx_enabled) |
| 161 | set_pages_x(image->control_code_page, 1); |
Huang Ying | 9868ee6 | 2008-10-31 09:48:15 +0800 | [diff] [blame] | 162 | error = machine_kexec_alloc_page_tables(image); |
| 163 | if (error) |
| 164 | return error; |
| 165 | machine_kexec_prepare_page_tables(image); |
| 166 | return 0; |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 167 | } |
| 168 | |
| 169 | /* |
| 170 | * Undo anything leftover by machine_kexec_prepare |
| 171 | * when an image is freed. |
| 172 | */ |
| 173 | void machine_kexec_cleanup(struct kimage *image) |
| 174 | { |
Huang Ying | 3ab8352 | 2008-07-25 19:45:07 -0700 | [diff] [blame] | 175 | if (nx_enabled) |
| 176 | set_pages_nx(image->control_code_page, 1); |
Huang Ying | 92be3d6 | 2008-10-31 09:48:08 +0800 | [diff] [blame] | 177 | machine_kexec_free_page_tables(image); |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 178 | } |
| 179 | |
| 180 | /* |
| 181 | * Do not allocate memory (or fail in any way) in machine_kexec(). |
| 182 | * We are past the point of no return, committed to rebooting now. |
| 183 | */ |
Huang Ying | 3ab8352 | 2008-07-25 19:45:07 -0700 | [diff] [blame] | 184 | void machine_kexec(struct kimage *image) |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 185 | { |
Magnus Damm | 3566561 | 2006-09-26 10:52:38 +0200 | [diff] [blame] | 186 | unsigned long page_list[PAGES_NR]; |
| 187 | void *control_page; |
Huang Ying | 3122c33 | 2008-08-15 00:40:26 -0700 | [diff] [blame] | 188 | int save_ftrace_enabled; |
Huang Ying | 3ab8352 | 2008-07-25 19:45:07 -0700 | [diff] [blame] | 189 | asmlinkage unsigned long |
| 190 | (*relocate_kernel_ptr)(unsigned long indirection_page, |
| 191 | unsigned long control_page, |
| 192 | unsigned long start_address, |
| 193 | unsigned int has_pae, |
| 194 | unsigned int preserve_context); |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 195 | |
Huang Ying | 3122c33 | 2008-08-15 00:40:26 -0700 | [diff] [blame] | 196 | #ifdef CONFIG_KEXEC_JUMP |
| 197 | if (kexec_image->preserve_context) |
| 198 | save_processor_state(); |
| 199 | #endif |
| 200 | |
| 201 | save_ftrace_enabled = __ftrace_enabled_save(); |
Ingo Molnar | f43fdad | 2008-05-12 21:20:43 +0200 | [diff] [blame] | 202 | |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 203 | /* Interrupts aren't acceptable while we reboot */ |
| 204 | local_irq_disable(); |
| 205 | |
Huang Ying | 89081d1 | 2008-07-25 19:45:10 -0700 | [diff] [blame] | 206 | if (image->preserve_context) { |
| 207 | #ifdef CONFIG_X86_IO_APIC |
| 208 | /* We need to put APICs in legacy mode so that we can |
| 209 | * get timer interrupts in second kernel. kexec/kdump |
| 210 | * paths already have calls to disable_IO_APIC() in |
| 211 | * one form or other. kexec jump path also need |
| 212 | * one. |
| 213 | */ |
| 214 | disable_IO_APIC(); |
| 215 | #endif |
| 216 | } |
| 217 | |
Magnus Damm | 3566561 | 2006-09-26 10:52:38 +0200 | [diff] [blame] | 218 | control_page = page_address(image->control_code_page); |
Huang Ying | fb45daa | 2008-08-15 00:40:23 -0700 | [diff] [blame] | 219 | memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 220 | |
Huang Ying | 3ab8352 | 2008-07-25 19:45:07 -0700 | [diff] [blame] | 221 | relocate_kernel_ptr = control_page; |
Magnus Damm | 3566561 | 2006-09-26 10:52:38 +0200 | [diff] [blame] | 222 | page_list[PA_CONTROL_PAGE] = __pa(control_page); |
Huang Ying | 3ab8352 | 2008-07-25 19:45:07 -0700 | [diff] [blame] | 223 | page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; |
Huang Ying | 92be3d6 | 2008-10-31 09:48:08 +0800 | [diff] [blame] | 224 | page_list[PA_PGD] = __pa(image->arch.pgd); |
Ken'ichi Ohmichi | e7706fc | 2008-10-20 13:51:52 +0900 | [diff] [blame] | 225 | |
| 226 | if (image->type == KEXEC_TYPE_DEFAULT) |
| 227 | page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) |
| 228 | << PAGE_SHIFT); |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 229 | |
Eric W. Biederman | 2a8a3d5 | 2006-07-30 03:03:20 -0700 | [diff] [blame] | 230 | /* The segment registers are funny things, they have both a |
| 231 | * visible and an invisible part. Whenever the visible part is |
| 232 | * set to a specific selector, the invisible part is loaded |
| 233 | * with from a table in memory. At no other time is the |
| 234 | * descriptor table in memory accessed. |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 235 | * |
| 236 | * I take advantage of this here by force loading the |
| 237 | * segments, before I zap the gdt with an invalid value. |
| 238 | */ |
| 239 | load_segments(); |
| 240 | /* The gdt & idt are now invalid. |
| 241 | * If you want to load them you must set up your own idt & gdt. |
| 242 | */ |
| 243 | set_gdt(phys_to_virt(0),0); |
| 244 | set_idt(phys_to_virt(0),0); |
| 245 | |
| 246 | /* now call it */ |
Huang Ying | 3ab8352 | 2008-07-25 19:45:07 -0700 | [diff] [blame] | 247 | image->start = relocate_kernel_ptr((unsigned long)image->head, |
| 248 | (unsigned long)page_list, |
| 249 | image->start, cpu_has_pae, |
| 250 | image->preserve_context); |
Huang Ying | 3122c33 | 2008-08-15 00:40:26 -0700 | [diff] [blame] | 251 | |
| 252 | #ifdef CONFIG_KEXEC_JUMP |
| 253 | if (kexec_image->preserve_context) |
| 254 | restore_processor_state(); |
| 255 | #endif |
| 256 | |
| 257 | __ftrace_enabled_restore(save_ftrace_enabled); |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 258 | } |
Rusty Russell | 1a3f239 | 2006-09-26 10:52:32 +0200 | [diff] [blame] | 259 | |
/*
 * Record the architecture-specific vmcoreinfo entries: the node_data
 * symbol and its length (MAX_NUMNODES) when CONFIG_NUMA is set, and the
 * X86_PAE config marker when CONFIG_X86_PAE is set.
 */
void arch_crash_save_vmcoreinfo(void)
{
#if defined(CONFIG_NUMA)
	VMCOREINFO_SYMBOL(node_data);
	VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
#endif

#if defined(CONFIG_X86_PAE)
	VMCOREINFO_CONFIG(X86_PAE);
#endif
}
| 270 | |