diff --git a/Documentation/devicetree/bindings/edac/gic600-edac.txt b/Documentation/devicetree/bindings/edac/gic600-edac.txt new file mode 100644 index 000000000000..53490a536593 --- /dev/null +++ b/Documentation/devicetree/bindings/edac/gic600-edac.txt @@ -0,0 +1,40 @@ +* GIC EDAC node: + +GIC EDAC node is defined to describe on-chip error or fault detection and correction +for GIC RAM. GICT(GIC trace and debug) register map describes the syndrome information +and GICT support is available from GIC-600 onwards. + +GIC EDAC reports all Single Bit Error and Double Bit Error found in GIC RAM. +Current GIC ECC implementation is interrupt based. GIC scrub feature helps +to avoid potential errors accumulation of particular error location by a write-back +of all valid RAM entries periodically. + +GICT identifies software programming errors, correctable and uncorrectable +errors of Shared Peripheral Interrupt(SPI) RAM, Software Generated Interrupt(SGI) RAM, +Private Peripheral Interrupt(PPI) RAM, Locality-Specific Peripheral Interrupt(LPI) RAM +and Interrupt Translation Service (ITS) RAM. +For every error record following information available - + ERRXSTATUS - Error Record Status Register + ERRXMISC0 - Error Record Miscellaneous Register 0 + ERRXMISC1 - Error Record Miscellaneous Register 1 + ERRXADDR - Error Record Address Resister + +The following section describes the DT node binding for gic-erp. + +Required properties: +- compatible : "arm,gic-600-erp". +- reg-names : GIC Trace, GIC Distributor, GIC Redistributor bases +- interrupt-config : SPI interrupt numbers of Fault and Error interrupts + to configure GICT_ERRORQCR0, GICT_ERRORQCR1 registers respectively. +- interrupts : Interrupts for Fault and Error IRQs + +gict: gict@17a20000 { + compatible = "arm,gic-600-erp"; + reg = <0x17a20000 0x10000>; /* GICT */ + reg-names = "gict-base"; + interrupt-config = <46, 17>; + interrupt-names = "gict-fault", "gict-err"; + interrupts = , + ; +}; + diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index c4e5a0899a4e..694f2b7bb862 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -492,6 +492,39 @@ config EDAC_KRYO_ARM64_PANIC_ON_UE For production builds, you should probably say 'N' here. +config EDAC_GIC + depends on ARM_GIC_V3 + tristate "GIC ECC" + help + Supports error detection and correction on the + GIC RAMs. Reports correctable and uncorrectable errors caught by + GIC ECC mechanism. GICT(GIC Trace and debug) register map page + describes the syndrome registers GIC and GICT support is available + from GIC600 onwards. + For debugging issues having to do with stability and overall system + health, you should probably say 'Y' here. + +config EDAC_GIC_PANIC_ON_CE + depends on EDAC_GIC + bool "Panic on correctable errors - GIC" + help + Forcibly cause a kernel panic if an correctable error (CE) is + detected, even though the error is (by definition) correctable and + would otherwise result in no adverse system effects. This can reduce + debugging times on hardware which may be operating at voltages or + frequencies outside normal specification. + For production builds, you should definitely say 'N' here. + +config EDAC_GIC_PANIC_ON_UE + depends on EDAC_GIC + bool "Panic on un-correctable errors - GIC" + help + Forcibly cause a kernel panic if an uncorrectable error (UE) is + detected. This can reduce debugging times on hardware which may + be operating at voltages or frequencies outside + normal specification. + For production builds, you should definitely say 'N' here. + config EDAC_XGENE tristate "APM X-Gene SoC" depends on (ARM64 || COMPILE_TEST) diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 57bf313997ac..10a5ca3dc91a 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_EDAC_X38) += x38_edac.o obj-$(CONFIG_EDAC_I82860) += i82860_edac.o obj-$(CONFIG_EDAC_R82600) += r82600_edac.o obj-$(CONFIG_EDAC_KRYO_ARM64) += kryo_arm64_edac.o +obj-$(CONFIG_EDAC_GIC) += gic600_edac.o amd64_edac_mod-y := amd64_edac.o amd64_edac_mod-$(CONFIG_EDAC_DEBUG) += amd64_edac_dbg.o diff --git a/drivers/edac/gic600_edac.c b/drivers/edac/gic600_edac.c new file mode 100644 index 000000000000..ee86771ae00e --- /dev/null +++ b/drivers/edac/gic600_edac.c @@ -0,0 +1,448 @@ +/* Copyright (c) 2018, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#define pr_fmt(fmt) "GICT: %s(): " fmt, __func__ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "edac_mc.h" +#include "edac_device.h" + +#ifdef CONFIG_EDAC_GIC_PANIC_ON_CE +#define GICT_PANIC_ON_CE 1 +#else +#define GICT_PANIC_ON_CE 0 +#endif + +#ifdef CONFIG_EDAC_GIC_PANIC_ON_UE +#define GICT_PANIC_ON_UE 1 +#else +#define GICT_PANIC_ON_UE 0 +#endif + +#define EDAC_NODE "GICT" +#define RECORD_WIDTH(n) (64 * n) +#define GICT_ERR_FR(n) (RECORD_WIDTH(n) + 0x00) +#define GICT_ERR_CTRL(n) (RECORD_WIDTH(n) + 0x08) +#define GICT_ERR_STATUS(n) (RECORD_WIDTH(n) + 0x10) +#define GICT_ERR_ADDR(n) (RECORD_WIDTH(n) + 0x18) +#define GICT_ERR_MISC0(n) (RECORD_WIDTH(n) + 0x20) +#define GICT_ERR_MISC1(n) (RECORD_WIDTH(n) + 0x28) +#define GICT_ERRGSR 0xE000 +#define GICT_ERRDEVARCH 0xFFBC +#define GICT_ERRIDR 0xFFC8 +#define GICT_ERRIRQCR0 0xE800 +#define GICT_ERRIRQCR1 (GICT_ERRIRQCR0+0x8) + +#define GICD_FCTLR 0x0020 +#define GICR_FCTLR 0x0020 +#define GICR_OFFSET 0x20000 + + +#define RECORD0 "Software error in GICD programming(UEO)" +#define RECORD1 "Correctable SPI RAM errors(CE)" +#define RECORD2 "Uncorrectable SPI RAM errors(UER)" +#define RECORD3 "Correctable SGI RAM errors(CE)" +#define RECORD4 "Uncorrectable SGI RAM errors(UER)" +#define RECORD5 "Correctable TGT cache errors(CE)" +#define RECORD6 "Correctable TGT cache errors(UER)" +#define RECORD7 "Correctable PPI RAM errors(CE)" +#define RECORD8 "Uncorrectable PPI RAM errors(UER)" +#define RECORD9 "Correctable LPI RAM errors(CE)" +#define RECORD10 "Uncorrectable LPI RAM errors(UER)" +#define RECORD11 "Correctable ITS RAM errors(CE)" +#define RECORD12 "Uncorrectable ITS RAM errors(UEO)" +#define RECORD13 "Uncorrectable 2xITS RAM errors(UER)" +#define RECORD14 "Uncorrectable 2xITS RAM errors(UER)" +#define RECORD(N) RECORD##N + +#define ERR_FR_UI_MASK GENMASK(5, 4) +#define ERR_FR_FI_MASK GENMASK(7, 6) +#define ERR_FR_UE_MASK GENMASK(9, 8) +#define ERR_FR_CFI_MASK GENMASK(11, 10) + +#define ERR_CTRL_UI_MASK BIT(2) +#define ERR_CTRL_FI_MASK BIT(3) +#define ERR_CTRL_UE_MASK BIT(4) +#define ERR_CTRL_CFI_MASK BIT(8) + +#define ERR_STATUS_SERR_MASK GENMASK(7, 0) +#define ERR_STATUS_IERR_MASK GENMASK(15, 8) +#define ERR_STATUS_UET_MASK GENMASK(21, 20) +#define ERR_STATUS_CE_MASK GENMASK(25, 24) +#define ERR_STATUS_MV_MASK BIT(26) +#define ERR_STATUS_OF_MASK BIT(27) +#define ERR_STATUS_ER_MASK BIT(28) +#define ERR_STATUS_UE_MASK BIT(29) +#define ERR_STATUS_V_MASK BIT(30) +#define ERR_STATUS_AV_MASK BIT(31) + +#define ERR_MISC0_COUNT_SHIFT 32 +#define ERR_MISC0_COUNT_MASK GENMASK(39, 32) +#define FAULT_THRESHOLD 0xFF + +#define ERR_IRQCR_SPIID_MASK GENMASK(9, 0) + +#define ERR_ADDR_PADDR_MASK GENMASK(47, 0) +#define ERR_ADDR_NS_MASK BIT(63) + +struct errors_edac { + const char *const msg; + void (*func)(struct edac_device_ctl_info *edac_dev, + int inst_nr, int block_nr, const char *msg); +}; +#define GICT_IRQS 2 +#define GICT_FAULT_IRQ_IDX 0 +#define GICT_ERROR_IRQ_IDX 1 +#define GICT_BUFFER_SIZE 32 +static u32 gict_buffer[GICT_BUFFER_SIZE] __aligned(PAGE_SIZE); + +static void gict_spi_recovery(struct edac_device_ctl_info *edac_dev, + int inst_nr, int block_nr, const char *msg); + +static const struct errors_edac errors[] = { + {RECORD(0), edac_device_handle_ue }, + {RECORD(1), edac_device_handle_ce }, + {RECORD(2), gict_spi_recovery }, + {RECORD(3), edac_device_handle_ce }, + {RECORD(4), edac_device_handle_ue }, + {RECORD(5), edac_device_handle_ce }, + {RECORD(6), edac_device_handle_ce }, + {RECORD(7), edac_device_handle_ce }, + {RECORD(8), edac_device_handle_ue }, + {RECORD(9), edac_device_handle_ce }, + {RECORD(10), edac_device_handle_ue }, + {RECORD(11), edac_device_handle_ce }, + {RECORD(12), edac_device_handle_ue }, + {RECORD(13), edac_device_handle_ue }, + {RECORD(14), edac_device_handle_ue }, +}; + +struct erp_drvdata { + struct edac_device_ctl_info *edev_ctl; + void __iomem *base; + u32 interrupt_config[GICT_IRQS]; + u32 max_records; + struct mutex mutex; +}; +static struct erp_drvdata *gict_edac; + +static inline void gict_write(struct erp_drvdata *drv, u32 val, u32 off) +{ + writel_relaxed(val, drv->base + off); +} + +static inline u32 gict_read(struct erp_drvdata *drv, u32 off) +{ + return readl_relaxed(drv->base + off); +} + +static inline u64 gict_readq(struct erp_drvdata *drv, u32 off) +{ + return readq_relaxed(drv->base + off); +} + +static inline void gict_writeq(struct erp_drvdata *drv, u64 val, u32 off) +{ + writeq_relaxed(val, drv->base + off); +} + +#define for_each_bit(i, buf, max) \ + for (i = find_first_bit((unsigned long *)buf, max); \ + i < max; \ + i = find_next_bit((unsigned long *)buf, max, i + 1)) + +#define SCM_SVC_GIC 0x1D +#define GIC_ERROR_RECOVERY_SMC_ID 0x01 +#define MAX_IRQS 1023 +/* RECORD(2) - Recovery of GIC SPI interrupts */ +static void gict_spi_recovery(struct edac_device_ctl_info *edac_dev, + int inst_nr, int block_nr, const char *msg) +{ + struct scm_desc desc; + u64 i; + int ret; + + /* + * SPIs that are in the error state can be determined + * by reading the GICD_IERRRn register. + * Secure side reads GICD_IERRRn register into gict_buffer. + */ + memset(&gict_buffer, 0, sizeof(gict_buffer)); + desc.args[0] = virt_to_phys(gict_buffer); + desc.args[1] = sizeof(gict_buffer); + desc.arginfo = SCM_ARGS(2, SCM_RW, SCM_VAL); + + ret = scm_call2(SCM_SIP_FNID(SCM_SVC_GIC, + GIC_ERROR_RECOVERY_SMC_ID), &desc); + + if (ret) { + pr_warn("Recovery SCM call failed\n"); + return; + } + + for_each_bit(i, gict_buffer, MAX_IRQS) + pr_warn("GICT: Corrupted SPI:%d", i); +} + +static inline void gict_dump_err_record(int record, u64 errxstatus, + u64 errxmisc0, u64 errxmisc1, u64 errxaddr) +{ + edac_printk(KERN_INFO, EDAC_NODE, "RECORD: %d", record); + edac_printk(KERN_INFO, EDAC_NODE, "%s\n", errors[record].msg); + edac_printk(KERN_INFO, EDAC_NODE, "ERRXSTATUS: %llx\n", errxstatus); + edac_printk(KERN_INFO, EDAC_NODE, "ERRXMISC0: %llx\n", errxmisc0); + edac_printk(KERN_INFO, EDAC_NODE, "ERRXMISC1: %llx\n", errxmisc1); + edac_printk(KERN_INFO, EDAC_NODE, "ERRXADDR: %llx\n", errxaddr); +} + +static void handle_record(int record, int level, + struct edac_device_ctl_info *edev_ctl) +{ + errors[record].func(edev_ctl, raw_smp_processor_id(), + level, errors[record].msg); +} + +static void gict_check_error_records(struct erp_drvdata *drv) +{ + u64 errxstatus, errxaddr, errxmisc0, errxmisc1, errgsr, i; + + errgsr = gict_readq(drv, GICT_ERRGSR); + /* + * Lets first dump all error records + * details which are having a valid status. + */ + for_each_bit(i, &errgsr, drv->max_records) { + errxstatus = gict_readq(drv, GICT_ERR_STATUS(i)); + if (errxstatus & ERR_STATUS_V_MASK) { + errxaddr = gict_readq(drv, GICT_ERR_ADDR(i)); + errxmisc0 = gict_readq(drv, GICT_ERR_MISC0(i)); + errxmisc1 = gict_readq(drv, GICT_ERR_MISC1(i)); + gict_dump_err_record(i, errxstatus, + errxmisc0, errxmisc1, errxaddr); + } + } +} + +static void gict_handle_records(struct erp_drvdata *drv) +{ + u64 errxstatus, errgsr, errxstatus_ack, i; + + errgsr = gict_readq(drv, GICT_ERRGSR); + + for_each_bit(i, &errgsr, drv->max_records) { + errxstatus = gict_readq(drv, GICT_ERR_STATUS(i)); + errxstatus_ack = 0; + if (errxstatus & ERR_STATUS_V_MASK) { + handle_record(i, 0, drv->edev_ctl); + errxstatus_ack = ERR_STATUS_CE_MASK | + ERR_STATUS_UE_MASK | ERR_STATUS_UET_MASK | + ERR_STATUS_OF_MASK | ERR_STATUS_MV_MASK | + ERR_STATUS_AV_MASK | ERR_STATUS_V_MASK; + gict_write(drv, errxstatus_ack, GICT_ERR_STATUS(i)); + } + + } +} + +static void configure_thresholds(struct erp_drvdata *drv) +{ + u64 errxmisc0, i; + + /* + * Configure ERRXMISC0 count to 0xFF so that fault + * interrupt raised on every error observed. + */ + for (i = 0; i <= drv->max_records; i++) { + errxmisc0 = gict_readq(drv, GICT_ERR_MISC0(i)); + errxmisc0 |= (u64)FAULT_THRESHOLD << ERR_MISC0_COUNT_SHIFT; + gict_writeq(drv, errxmisc0, GICT_ERR_MISC0(i)); + } + +} + +static irqreturn_t gict_fault_handler(int irq, void *drvdata) +{ + struct erp_drvdata *drv = drvdata; + + mutex_lock(&drv->mutex); + gict_check_error_records(drv); + gict_handle_records(drv); + configure_thresholds(drv); + mutex_unlock(&drv->mutex); + return IRQ_HANDLED; +} + +static void configure_ctrl_registers(struct erp_drvdata *drv) +{ + u64 errxfr, errxctrl, i; + + configure_thresholds(drv); + + /* Configure Control Register for CFI, UE, FI and UI faults/errors. */ + for (i = 0; i <= drv->max_records; i++) { + errxfr = gict_readq(drv, GICT_ERR_FR(i)); + errxctrl = gict_readq(drv, GICT_ERR_CTRL(i)); + /* Configure to enable Correctable Fault Interrupt(CFI) */ + if (errxfr & ERR_FR_CFI_MASK) + errxctrl = errxctrl | ERR_CTRL_CFI_MASK; + /* Configure to enable Uncorrectable Error(UE) */ + if (errxfr & ERR_FR_UE_MASK) + errxctrl = errxctrl | ERR_CTRL_UE_MASK; + /* Configure to enable Uncorrectable Fault Interrupt(FI) */ + if (errxfr & ERR_FR_FI_MASK) + errxctrl = errxctrl | ERR_CTRL_FI_MASK; + /* Configure to enable Uncorrectable Interrupt(UI) */ + if (errxfr & ERR_FR_UI_MASK) + errxctrl = errxctrl | ERR_CTRL_UI_MASK; + + gict_writeq(drv, errxctrl, GICT_ERR_CTRL(i)); + } +} + +static int gic_erp_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct erp_drvdata *drv; + struct resource *res; + int ret, errirq, faultirq; + u64 errirqcrx; + const char *err_irqname = "gict-err"; + const char *fault_irqname = "gict-fault"; + + drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL); + if (!drv) + return -ENOMEM; + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "gict-base"); + if (!res) + return -ENXIO; + + drv->base = devm_ioremap_resource(dev, res); + if (IS_ERR_OR_NULL(drv->base)) + return -EBUSY; + + ret = of_property_read_u32_array(dev->of_node, + "interrupt-config", drv->interrupt_config, GICT_IRQS); + if (ret) + return -ENXIO; + + errirq = platform_get_irq_byname(pdev, err_irqname); + if (errirq < 0) + return errirq; + + faultirq = platform_get_irq_byname(pdev, fault_irqname); + if (faultirq < 0) + return faultirq; + + drv->edev_ctl = edac_device_alloc_ctl_info(0, "gic", + 1, "T", 1, 1, NULL, 0, + edac_device_alloc_index()); + + if (!drv->edev_ctl) + return -ENOMEM; + + drv->edev_ctl->dev = dev; + drv->edev_ctl->mod_name = dev_name(dev); + drv->edev_ctl->dev_name = dev_name(dev); + drv->edev_ctl->ctl_name = "GICT"; + drv->edev_ctl->panic_on_ce = GICT_PANIC_ON_CE; + drv->edev_ctl->panic_on_ue = GICT_PANIC_ON_UE; + platform_set_drvdata(pdev, drv); + gict_edac = drv; + + mutex_init(&drv->mutex); + ret = edac_device_add_device(drv->edev_ctl); + if (ret) + goto out_mem; + + /* + * Find no of error records supported by GICT from ERRIDR register + * and configure control registers for all records supported. + */ + drv->max_records = gict_read(drv, GICT_ERRIDR); + configure_ctrl_registers(drv); + + /* Configure GICT_ERRIRQCR0 register with fault_interrupt number */ + errirqcrx = gict_readq(drv, GICT_ERRIRQCR0); + errirqcrx |= (drv->interrupt_config[GICT_FAULT_IRQ_IDX] + & ERR_IRQCR_SPIID_MASK); + gict_writeq(drv, errirqcrx, GICT_ERRIRQCR0); + + /* Configure GICT_ERRIRQCR1 register with err_interrupt number */ + errirqcrx = gict_readq(drv, GICT_ERRIRQCR1); + errirqcrx |= (drv->interrupt_config[GICT_ERROR_IRQ_IDX] + & ERR_IRQCR_SPIID_MASK); + gict_writeq(drv, errirqcrx, GICT_ERRIRQCR1); + + ret = devm_request_threaded_irq(&pdev->dev, faultirq, + NULL, gict_fault_handler, + IRQF_ONESHOT | IRQF_TRIGGER_HIGH, fault_irqname, drv); + if (ret) { + dev_err(dev, "Failed to request %s IRQ %d: %d\n", + fault_irqname, res->start, ret); + goto out_dev; + } + + ret = devm_request_threaded_irq(&pdev->dev, errirq, + NULL, gict_fault_handler, + IRQF_ONESHOT | IRQF_TRIGGER_HIGH, err_irqname, drv); + if (ret) { + dev_err(dev, "Failed to request %s IRQ %d: %d\n", + err_irqname, res->start, ret); + goto out_dev; + } + return ret; + +out_dev: + edac_device_del_device(dev); +out_mem: + edac_device_free_ctl_info(drv->edev_ctl); + return ret; +} + +static int gic_erp_remove(struct platform_device *pdev) +{ + struct erp_drvdata *drv = dev_get_drvdata(&pdev->dev); + struct edac_device_ctl_info *edac_ctl = drv->edev_ctl; + + edac_device_del_device(edac_ctl->dev); + edac_device_free_ctl_info(edac_ctl); + + return 0; +} + +static const struct of_device_id gic_edac_match[] = { + { .compatible = "arm,gic-600-erp", }, + { } +}; +MODULE_DEVICE_TABLE(of, gic_edac_match); + +static struct platform_driver gic_edac_driver = { + .probe = gic_erp_probe, + .remove = gic_erp_remove, + .driver = { + .name = "gic_erp", + .of_match_table = of_match_ptr(gic_edac_match), + }, +}; +module_platform_driver(gic_edac_driver); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("GICT driver");