/*
 * Machine check exception handling.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright 2013 IBM Corporation
 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
 */

#undef DEBUG
#define pr_fmt(fmt) "mce: " fmt

#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
#include <linux/export.h>
#include <linux/irq_work.h>
#include <asm/mce.h>

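/*
 * Per-CPU buffer of MCE events, used as a small stack so that nested
 * machine check interrupts each get their own slot; mce_nest_count
 * indexes the next free entry.
 */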
static DEFINE_PER_CPU(int, mce_nest_count);
static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);

/* Queue for delayed MCE events. */
static DEFINE_PER_CPU(int, mce_queue_count);
static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);

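/*
 * Queued events are processed via irq_work, which defers the actual
 * handling (printing/logging) until after the machine check interrupt
 * itself has returned, in a context where it is safe to do so.
 */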
static void machine_check_process_queued_event(struct irq_work *work);
struct irq_work mce_event_process_work = {
	.func = machine_check_process_queued_event,
};

static void mce_set_error_info(struct machine_check_event *mce,
			       struct mce_error_info *mce_err)
{
	mce->error_type = mce_err->error_type;
	switch (mce_err->error_type) {
	case MCE_ERROR_TYPE_UE:
		mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type;
		break;
	case MCE_ERROR_TYPE_SLB:
		mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type;
		break;
	case MCE_ERROR_TYPE_ERAT:
		mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type;
		break;
	case MCE_ERROR_TYPE_TLB:
		mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type;
		break;
	case MCE_ERROR_TYPE_UNKNOWN:
	default:
		break;
	}
}

/*
 * Decode and save high level MCE information into the per-CPU buffer,
 * which is an array of machine_check_event structures.
 */
void save_mce_event(struct pt_regs *regs, long handled,
		    struct mce_error_info *mce_err,
		    uint64_t nip, uint64_t addr)
{
	uint64_t srr1;
	int index = __get_cpu_var(mce_nest_count)++;
	struct machine_check_event *mce = &__get_cpu_var(mce_event[index]);

	/*
	 * Return if we don't have enough space to log the MCE event.
	 * mce_nest_count may go beyond MAX_MC_EVT, but that's OK;
	 * the check below stops a buffer overrun.
	 */
	if (index >= MAX_MC_EVT)
		return;

	/* Populate generic machine check info. */
	mce->version = MCE_V1;
	mce->srr0 = nip;
	mce->srr1 = regs->msr;
	mce->gpr3 = regs->gpr[3];
	mce->in_use = 1;

	mce->initiator = MCE_INITIATOR_CPU;
	if (handled)
		mce->disposition = MCE_DISPOSITION_RECOVERED;
	else
		mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;
	mce->severity = MCE_SEV_ERROR_SYNC;

	srr1 = regs->msr;

	/*
	 * Populate the mce error_type and type-specific error_type.
	 */
	mce_set_error_info(mce, mce_err);

	if (!addr)
		return;

	if (mce->error_type == MCE_ERROR_TYPE_TLB) {
		mce->u.tlb_error.effective_address_provided = true;
		mce->u.tlb_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_SLB) {
		mce->u.slb_error.effective_address_provided = true;
		mce->u.slb_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_ERAT) {
		mce->u.erat_error.effective_address_provided = true;
		mce->u.erat_error.effective_address = addr;
	} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
		mce->u.ue_error.effective_address_provided = true;
		mce->u.ue_error.effective_address = addr;
	}
}
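
/*
 * Illustrative sketch (not part of this file): a platform machine check
 * handler would typically decode the hardware error bits into a
 * struct mce_error_info and then record the event. Assuming an SLB
 * parity error decoded from SRR1, the call would look roughly like:
 *
 *	struct mce_error_info mce_err;
 *
 *	mce_err.error_type = MCE_ERROR_TYPE_SLB;
 *	mce_err.u.slb_error_type = MCE_SLB_ERROR_PARITY;
 *	save_mce_event(regs, handled, &mce_err, regs->nip, regs->dar);
 */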

/*
 * get_mce_event:
 *	mce	Pointer to machine_check_event structure to be filled.
 *	release	Flag to indicate whether to free the event slot or not.
 *		0 <= do not release the mce event. Caller will invoke
 *		     release_mce_event() once the event has been consumed.
 *		1 <= release the slot.
 *
 *	return	1 = success
 *		0 = failure
 *
 * get_mce_event() will be called by platform-specific machine check
 * handler routines and by KVM.
 * When we call get_mce_event(), we are still in interrupt context and
 * preemption will not be scheduled until the ret_from_except() routine
 * is called.
 */
int get_mce_event(struct machine_check_event *mce, bool release)
{
	int index = __get_cpu_var(mce_nest_count) - 1;
	struct machine_check_event *mc_evt;
	int ret = 0;

	/* Sanity check */
	if (index < 0)
		return ret;

	/* Check if we have MCE info to process. */
	if (index < MAX_MC_EVT) {
		mc_evt = &__get_cpu_var(mce_event[index]);
		/* Copy the event structure and release the original. */
		if (mce)
			*mce = *mc_evt;
		if (release)
			mc_evt->in_use = 0;
		ret = 1;
	}
	/* Decrement the count to free the slot. */
	if (release)
		__get_cpu_var(mce_nest_count)--;

	return ret;
}

void release_mce_event(void)
{
	get_mce_event(NULL, true);
}
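
/*
 * Illustrative usage (assumes the MCE_EVENT_RELEASE/MCE_EVENT_DONTRELEASE
 * flags from asm/mce.h): a consumer that wants to inspect the event
 * before freeing the slot would do:
 *
 *	struct machine_check_event evt;
 *
 *	if (get_mce_event(&evt, MCE_EVENT_DONTRELEASE)) {
 *		... examine evt ...
 *		release_mce_event();
 *	}
 */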

/*
 * Queue up the MCE event which then can be handled later.
 */
void machine_check_queue_event(void)
{
	int index;
	struct machine_check_event evt;

	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
		return;

	index = __get_cpu_var(mce_queue_count)++;
	/* If the queue is full, just return for now. */
	if (index >= MAX_MC_EVT) {
		__get_cpu_var(mce_queue_count)--;
		return;
	}
	__get_cpu_var(mce_event_queue[index]) = evt;

	/* Queue irq work to process this event later. */
	irq_work_queue(&mce_event_process_work);
}

/*
 * Process all pending MCE events from the MCE event queue. This function
 * is the irq_work callback queued above, so it runs after the machine
 * check interrupt itself has returned.
 */
static void machine_check_process_queued_event(struct irq_work *work)
{
	int index;

	/*
	 * For now just print it to console.
	 * TODO: log this error event to FSP or nvram.
	 */
	while (__get_cpu_var(mce_queue_count) > 0) {
		index = __get_cpu_var(mce_queue_count) - 1;
		machine_check_print_event_info(
				&__get_cpu_var(mce_event_queue[index]));
		__get_cpu_var(mce_queue_count)--;
	}
}

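/*
 * Print a human readable description of a single MCE event to the
 * console: severity, disposition, initiator, error type/subtype and
 * any recorded addresses.
 */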
void machine_check_print_event_info(struct machine_check_event *evt)
{
	const char *level, *sevstr, *subtype;
	static const char *mc_ue_types[] = {
		"Indeterminate",
		"Instruction fetch",
		"Page table walk ifetch",
		"Load/Store",
		"Page table walk Load/Store",
	};
	static const char *mc_slb_types[] = {
		"Indeterminate",
		"Parity",
		"Multihit",
	};
	static const char *mc_erat_types[] = {
		"Indeterminate",
		"Parity",
		"Multihit",
	};
	static const char *mc_tlb_types[] = {
		"Indeterminate",
		"Parity",
		"Multihit",
	};

	/* Print things out */
	if (evt->version != MCE_V1) {
		pr_err("Machine Check Exception, Unknown event version %d!\n",
		       evt->version);
		return;
	}
	switch (evt->severity) {
	case MCE_SEV_NO_ERROR:
		level = KERN_INFO;
		sevstr = "Harmless";
		break;
	case MCE_SEV_WARNING:
		level = KERN_WARNING;
		sevstr = "";
		break;
	case MCE_SEV_ERROR_SYNC:
		level = KERN_ERR;
		sevstr = "Severe";
		break;
	case MCE_SEV_FATAL:
	default:
		level = KERN_ERR;
		sevstr = "Fatal";
		break;
	}

	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
	       evt->disposition == MCE_DISPOSITION_RECOVERED ?
	       "Recovered" : "Not recovered");
	printk("%s Initiator: %s\n", level,
	       evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
	switch (evt->error_type) {
	case MCE_ERROR_TYPE_UE:
		subtype = evt->u.ue_error.ue_error_type <
			ARRAY_SIZE(mc_ue_types) ?
			mc_ue_types[evt->u.ue_error.ue_error_type]
			: "Unknown";
		printk("%s Error type: UE [%s]\n", level, subtype);
		if (evt->u.ue_error.effective_address_provided)
			printk("%s Effective address: %016llx\n",
			       level, evt->u.ue_error.effective_address);
		if (evt->u.ue_error.physical_address_provided)
			printk("%s Physical address: %016llx\n",
			       level, evt->u.ue_error.physical_address);
		break;
	case MCE_ERROR_TYPE_SLB:
		subtype = evt->u.slb_error.slb_error_type <
			ARRAY_SIZE(mc_slb_types) ?
			mc_slb_types[evt->u.slb_error.slb_error_type]
			: "Unknown";
		printk("%s Error type: SLB [%s]\n", level, subtype);
		if (evt->u.slb_error.effective_address_provided)
			printk("%s Effective address: %016llx\n",
			       level, evt->u.slb_error.effective_address);
		break;
	case MCE_ERROR_TYPE_ERAT:
		subtype = evt->u.erat_error.erat_error_type <
			ARRAY_SIZE(mc_erat_types) ?
			mc_erat_types[evt->u.erat_error.erat_error_type]
			: "Unknown";
		printk("%s Error type: ERAT [%s]\n", level, subtype);
		if (evt->u.erat_error.effective_address_provided)
			printk("%s Effective address: %016llx\n",
			       level, evt->u.erat_error.effective_address);
		break;
	case MCE_ERROR_TYPE_TLB:
		subtype = evt->u.tlb_error.tlb_error_type <
			ARRAY_SIZE(mc_tlb_types) ?
			mc_tlb_types[evt->u.tlb_error.tlb_error_type]
			: "Unknown";
		printk("%s Error type: TLB [%s]\n", level, subtype);
		if (evt->u.tlb_error.effective_address_provided)
			printk("%s Effective address: %016llx\n",
			       level, evt->u.tlb_error.effective_address);
		break;
	default:
	case MCE_ERROR_TYPE_UNKNOWN:
		printk("%s Error type: Unknown\n", level);
		break;
	}
}

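/*
 * Return the effective address recorded with an MCE event, or 0 if the
 * event did not carry one.
 */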
uint64_t get_mce_fault_addr(struct machine_check_event *evt)
{
	switch (evt->error_type) {
	case MCE_ERROR_TYPE_UE:
		if (evt->u.ue_error.effective_address_provided)
			return evt->u.ue_error.effective_address;
		break;
	case MCE_ERROR_TYPE_SLB:
		if (evt->u.slb_error.effective_address_provided)
			return evt->u.slb_error.effective_address;
		break;
	case MCE_ERROR_TYPE_ERAT:
		if (evt->u.erat_error.effective_address_provided)
			return evt->u.erat_error.effective_address;
		break;
	case MCE_ERROR_TYPE_TLB:
		if (evt->u.tlb_error.effective_address_provided)
			return evt->u.tlb_error.effective_address;
		break;
	default:
	case MCE_ERROR_TYPE_UNKNOWN:
		break;
	}
	return 0;
}
EXPORT_SYMBOL(get_mce_fault_addr);