diff --git a/drivers/vservices/Makefile b/drivers/vservices/Makefile index 4ce9e48c72da..685ba0a961af 100644 --- a/drivers/vservices/Makefile +++ b/drivers/vservices/Makefile @@ -11,4 +11,6 @@ obj-$(CONFIG_VSERVICES_SERVER) += core_server.o obj-$(CONFIG_VSERVICES_SKELETON_DRIVER) += vservices_skeleton_driver.o vservices_skeleton_driver-objs = skeleton_driver.o +obj-$(CONFIG_VSERVICES_SUPPORT) += transport/ + obj-$(CONFIG_VSERVICES_SUPPORT) += protocol/ diff --git a/drivers/vservices/transport/Kconfig b/drivers/vservices/transport/Kconfig index cd1c97cddab1..37e84c4d4f1e 100644 --- a/drivers/vservices/transport/Kconfig +++ b/drivers/vservices/transport/Kconfig @@ -4,4 +4,17 @@ menu "Transport drivers" +config VSERVICES_OKL4_AXON + tristate "OKL4 Microvisor Axon driver" + depends on VSERVICES_SUPPORT && OKL4_GUEST + default y + help + This option adds support for Virtual Services sessions using an OKL4 + Microvisor Axon object as a transport. + + If this driver is to be used in a Cell that has multiple + discontiguous regions in its physical memory pool, the + CONFIG_DMA_CMA option must also be selected (or CONFIG_CMA + in older kernels that do not have CONFIG_DMA_CMA). + endmenu diff --git a/drivers/vservices/transport/Makefile b/drivers/vservices/transport/Makefile index ae1c943993ac..222fb512f946 100644 --- a/drivers/vservices/transport/Makefile +++ b/drivers/vservices/transport/Makefile @@ -1,2 +1,5 @@ ccflags-y += -Werror ccflags-$(CONFIG_VSERVICES_DEBUG) += -DDEBUG + +obj-$(CONFIG_VSERVICES_OKL4_AXON) += vtransport_axon.o +vtransport_axon-objs = axon.o diff --git a/drivers/vservices/transport/axon.c b/drivers/vservices/transport/axon.c new file mode 100644 index 000000000000..a140b4aac3e0 --- /dev/null +++ b/drivers/vservices/transport/axon.c @@ -0,0 +1,3573 @@ +/* + * drivers/vservices/transport/axon.c + * + * Copyright (c) 2015-2018 General Dynamics + * Copyright (c) 2015 Open Kernel Labs, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This is the OKL4 Virtual Services transport driver for OKL4 Microvisor + * Axons (virtual inter-Cell DMA engines). 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 12, 0) +#include +#endif +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "../transport.h" +#include "../session.h" +#include "../debug.h" + +#define DRIVER_AUTHOR "Cog Systems Pty Ltd" +#define DRIVER_DESC "OKL4 vServices Axon Transport Driver" +#define DRIVER_NAME "vtransport_axon" + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 18, 0) || \ + defined(CONFIG_NO_DEPRECATED_MEMORY_BARRIERS) +#define smp_mb__before_atomic_dec smp_mb__before_atomic +#define smp_mb__before_atomic_inc smp_mb__before_atomic +#define smp_mb__after_atomic_dec smp_mb__after_atomic +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,8,0) +#define DMA_ATTRS unsigned long +#else +#define DMA_ATTRS struct dma_attrs * +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 12, 0) && \ + !defined(CONFIG_CMA) +static inline struct cma *dev_get_cma_area(struct device *dev) +{ + return NULL; +} +#endif + +static struct kmem_cache *mbuf_cache; + +struct child_device { + struct device *dev; + struct list_head list; +}; + +/* Number of services in the transport array to allocate at a time */ +#define SERVICES_ALLOC_CHUNK 16 +#define MSG_SEND_FREE_BUFS VS_SERVICE_ID_RESERVED_1 + +/* The maximum value we allow for the free_bufs_balance counter */ +#define MAX_BALANCE 1 + +/* + * The free bufs quota must be enough to take free_bufs_balance from its + * minimum to its maximum. + */ +#define FREE_BUFS_QUOTA (MAX_BALANCE * 2) + +/* + * The free bufs retry delay is the period in jiffies that we delay retrying + * after an out-of-memory condition when trying to send a free bufs message. + */ +#define FREE_BUFS_RETRY_DELAY 2 + +/* The minimum values we permit for queue and message size. */ +#define MIN_QUEUE_SIZE ((size_t)4) +#define MIN_MSG_SIZE (32 - sizeof(vs_service_id_t)) + +/* + * The maximum size for a batched receive. This should be larger than the + * maximum message size, and large enough to avoid excessive context switching + * overheads, yet small enough to avoid blocking the tasklet queue for too + * long. + */ +#define MAX_TRANSFER_CHUNK 65536 + +#define INC_MOD(x, m) { \ + x++; \ + if (x == m) x = 0; \ +} + +/* Local Axon cleanup workqueue */ +struct workqueue_struct *work_queue; + +/* + * True if there is only one physical segment being used for kernel memory + * allocations. If this is false, the device must have a usable CMA region. + */ +static bool okl4_single_physical_segment; + +/* OKL4 MMU capability. */ +static okl4_kcap_t okl4_mmu_cap; + +/* + * Per-service TX buffer allocation pool. + * + * We cannot use a normal DMA pool for TX buffers, because alloc_mbuf can be + * called with GFP_ATOMIC, and a normal DMA pool alloc will take pages from + * a global emergency pool if GFP_WAIT is not set. The emergency pool is not + * guaranteed to be in the same physical segment as this device's DMA region, + * so it might not be usable by the axon. + * + * Using a very simple allocator with preallocated memory also speeds up the + * TX path. + * + * RX buffers use a standard Linux DMA pool, shared between all services, + * rather than this struct. They are preallocated by definition, so the speed + * of the allocator doesn't matter much for them. 
Also, they're always + * allocated with GFP_KERNEL (which includes GFP_WAIT) so the normal DMA pool + * will use memory from the axon's contiguous region. + */ +struct vs_axon_tx_pool { + struct vs_transport_axon *transport; + struct kref kref; + + void *base_vaddr; + dma_addr_t base_laddr; + + unsigned alloc_order; + unsigned count; + + struct work_struct free_work; + unsigned long alloc_bitmap[]; +}; + +struct vs_axon_rx_freelist_entry { + struct list_head list; + dma_addr_t laddr; +}; + +/* Service info */ +struct vs_mv_service_info { + struct vs_service_device *service; + + /* True if the session has started the service */ + bool ready; + + /* Number of send buffers we have allocated, in total. */ + atomic_t send_inflight; + + /* + * Number of send buffers we have allocated but not yet sent. + * This should always be zero if ready is false. + */ + atomic_t send_alloc; + + /* + * Number of receive buffers we have received and not yet freed. + * This should always be zero if ready is false. + */ + atomic_t recv_inflight; + + /* + * Number of receive buffers we have freed, but not told the other end + * about yet. + * + * The watermark is the maximum number of freed buffers we can + * accumulate before we send a dummy message to the remote end to ack + * them. This is used in situations where the protocol allows the remote + * end to reach its send quota without guaranteeing a reply; the dummy + * message lets it make progress even if our service driver doesn't send + * an answer that we can piggy-back the acks on. + */ + atomic_t recv_freed; + unsigned int recv_freed_watermark; + + /* + * Number of buffers that have been left allocated after a reset. If + * this count is nonzero, then the service has been disabled by the + * session layer, and needs to be re-enabled when it reaches zero. + */ + atomic_t outstanding_frees; + + /* TX allocation pool */ + struct vs_axon_tx_pool *tx_pool; + + /* RX allocation count */ + unsigned rx_allocated; + + /* Reference count for this info struct. */ + struct kref kref; + + /* RCU head for cleanup */ + struct rcu_head rcu_head; +}; + +/* + * Transport readiness state machine + * + * This is similar to the service readiness state machine, but simpler, + * because there are fewer transition triggers. + * + * The states are: + * INIT: Initial state. This occurs transiently during probe. + * LOCAL_RESET: We have initiated a reset at this end, but the remote end has + * not yet acknowledged it. We will enter the RESET state on receiving + * acknowledgement. + * RESET: The transport is inactive at both ends, and the session layer has + * not yet told us to start activating. + * LOCAL_READY: The session layer has told us to start activating, and we + * have notified the remote end that we're ready. + * REMOTE_READY: The remote end has notified us that it is ready, but the + * local session layer hasn't decided to become ready yet. + * ACTIVE: Both ends are ready to communicate. + * SHUTDOWN: The transport is shutting down and should not become ready. + */ +enum vs_transport_readiness { + VS_TRANSPORT_INIT = 0, + VS_TRANSPORT_LOCAL_RESET, + VS_TRANSPORT_RESET, + VS_TRANSPORT_LOCAL_READY, + VS_TRANSPORT_REMOTE_READY, + VS_TRANSPORT_ACTIVE, + VS_TRANSPORT_SHUTDOWN, +}; + +/* + * Transport reset / ready VIRQ payload bits + */ +enum vs_transport_reset_virq { + VS_TRANSPORT_VIRQ_RESET_REQ = (1 << 0), + VS_TRANSPORT_VIRQ_RESET_ACK = (1 << 1), + VS_TRANSPORT_VIRQ_READY = (1 << 2), +}; + +/* + * Internal definitions of the transport and message buffer structures. 
+ */ +#define MAX_NOTIFICATION_LINES 16 /* Enough for 512 notifications each way */ + +struct vs_transport_axon { + struct device *axon_dev; + + struct okl4_axon_tx *tx; + struct okl4_axon_queue_entry *tx_descs; + struct vs_axon_tx_pool **tx_pools; + struct okl4_axon_rx *rx; + struct okl4_axon_queue_entry *rx_descs; + void **rx_ptrs; + + dma_addr_t tx_phys, rx_phys; + size_t tx_size, rx_size; + + okl4_kcap_t segment; + okl4_laddr_t segment_base; + + okl4_kcap_t tx_cap, rx_cap, reset_cap; + unsigned int tx_irq, rx_irq, reset_irq; + okl4_interrupt_number_t reset_okl4_irq; + + unsigned int notify_tx_nirqs; + okl4_kcap_t notify_cap[MAX_NOTIFICATION_LINES]; + unsigned int notify_rx_nirqs; + unsigned int notify_irq[MAX_NOTIFICATION_LINES]; + + bool is_server; + size_t msg_size, queue_size; + + /* + * The handle to the device tree node for the virtual-session node + * associated with the axon. + */ + struct device_node *of_node; + + struct list_head child_dev_list; + + /* + * Hold queue and tx tasklet used to buffer and resend mbufs blocked + * by a full outgoing axon queue, due to a slow receiver or a halted + * axon. + */ + struct list_head tx_queue; + struct tasklet_struct tx_tasklet; + u32 tx_uptr_freed; + + /* + * The readiness state of the transport, and a spinlock protecting it. + * Note that this is different to the session's readiness state + * machine, though it has the same basic purpose. + */ + enum vs_transport_readiness readiness; + spinlock_t readiness_lock; + + struct tasklet_struct rx_tasklet; + struct timer_list rx_retry_timer; + struct list_head rx_freelist; + u32 rx_alloc_extra; + struct dma_pool *rx_pool; + spinlock_t rx_alloc_lock; + u32 rx_uptr_allocated; + + struct vs_session_device *session_dev; + struct vs_transport transport; + + DECLARE_BITMAP(service_bitmap, VS_SERVICE_ID_BITMAP_BITS); + + struct delayed_work free_bufs_work; + + /* + * Freed buffers messages balance counter. This counter is incremented + * when we send a freed buffers message and decremented when we receive + * one. If the balance is negative then we need to send a message + * as an acknowledgement to the other end, even if there are no + * freed buffers to acknowledge. + */ + atomic_t free_bufs_balance; + + /* + * Flag set when a service exceeds its freed buffers watermark, + * telling free_bufs_work to send a message when the balance + * counter is non-negative. This is ignored, and a message is + * sent in any case, if the balance is negative. + */ + bool free_bufs_pending; + + /* Pool for allocating outgoing free bufs messages */ + struct vs_axon_tx_pool *free_bufs_pool; +}; + +#define to_vs_transport_axon(t) \ + container_of(t, struct vs_transport_axon, transport) + +struct vs_mbuf_axon { + struct vs_mbuf base; + struct vs_transport_axon *owner; + dma_addr_t laddr; + struct vs_axon_tx_pool *pool; +}; + +#define to_vs_mbuf_axon(b) container_of(b, struct vs_mbuf_axon, base) + +/* + * Buffer allocation + * + * Buffers used by axons must be allocated within a single contiguous memory + * region, backed by a single OKL4 physical segment. This is similar to how + * the DMA allocator normally works, but we can't use the normal DMA allocator + * because the platform code will remap the allocated memory with caching + * disabled. + * + * We borrow the useful parts of the DMA allocator by providing our own DMA + * mapping ops which don't actually remap the memory. 
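+ *
+ * A rough usage sketch (this assumes the ops are installed on the axon
+ * device elsewhere in this driver, e.g. with set_dma_ops()):
+ *
+ *	set_dma_ops(transport->axon_dev, &axon_dma_ops);
+ *	va = dmam_alloc_coherent(transport->axon_dev, size, &laddr,
+ *			GFP_KERNEL);
+ *
+ * where va is an ordinary cached kernel mapping and laddr is the
+ * address that gets programmed into the axon queue descriptors.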
+ */ +static void *axon_dma_alloc(struct device *dev, size_t size, + dma_addr_t *handle, gfp_t gfp, DMA_ATTRS attrs) +{ + unsigned long order; + size_t count; + struct page *page; + void *ptr; + + *handle = DMA_ERROR_CODE; + size = PAGE_ALIGN(size); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) + if (!(gfp & __GFP_WAIT)) +#else + if (!(gfp & __GFP_RECLAIM)) +#endif + return NULL; + + order = get_order(size); + count = size >> PAGE_SHIFT; + + if (dev_get_cma_area(dev)) { + page = dma_alloc_from_contiguous(dev, count, order); + + if (!page) + return NULL; + } else { + struct page *p, *e; + page = alloc_pages(gfp, order); + + if (!page) + return NULL; + + /* Split huge page and free any excess pages */ + split_page(page, order); + for (p = page + count, e = page + (1 << order); p < e; p++) + __free_page(p); + } + + if (PageHighMem(page)) { + struct vm_struct *area = get_vm_area(size, VM_USERMAP); + if (!area) + goto free_pages; + ptr = area->addr; + area->phys_addr = __pfn_to_phys(page_to_pfn(page)); + + if (ioremap_page_range((unsigned long)ptr, + (unsigned long)ptr + size, + area->phys_addr, PAGE_KERNEL)) { + vunmap(ptr); + goto free_pages; + } + } else { + ptr = page_address(page); + } + + *handle = (dma_addr_t)page_to_pfn(page) << PAGE_SHIFT; + + dev_dbg(dev, "dma_alloc: %#tx bytes at %pK (%#llx), %s cma, %s high\n", + size, ptr, (long long)*handle, + dev_get_cma_area(dev) ? "is" : "not", + PageHighMem(page) ? "is" : "not"); + + return ptr; + +free_pages: + if (dev_get_cma_area(dev)) { + dma_release_from_contiguous(dev, page, count); + } else { + struct page *e = page + count; + + while (page < e) { + __free_page(page); + page++; + } + } + + return NULL; +} + +static void axon_dma_free(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t handle, DMA_ATTRS attrs) +{ + struct page *page = pfn_to_page(handle >> PAGE_SHIFT); + + size = PAGE_ALIGN(size); + + if (PageHighMem(page)) { + unmap_kernel_range((unsigned long)cpu_addr, size); + vunmap(cpu_addr); + } + + if (dev_get_cma_area(dev)) { + dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT); + } else { + struct page *e = page + (size >> PAGE_SHIFT); + + while (page < e) { + __free_page(page); + page++; + } + } +} + +struct dma_map_ops axon_dma_ops = { + .alloc = axon_dma_alloc, + .free = axon_dma_free, +}; + +/* + * Quotas + * ------ + * + * Each service has two quotas, one for send and one for receive. The + * send quota is incremented when we allocate an mbuf. The send quota + * is decremented by receiving a freed buffer ack from the remote + * end, either in the reserved bits of the service id or in a special + * free bufs message. + * + * The receive quota is incremented whenever we receive a message and + * decremented when we free the mbuf. Exceeding the receive quota + * indicates that something bad has happened, since the other end's + * send quota should have prevented it from sending the message; it + * is a driver bug, because the two ends are disagreeing about the + * quotas. If this happens then a warning is printed and the offending + * service is reset. + */ + +/* + * The base of the mbuf has the destination service id, but we pass the + * data pointer starting after the service id. The following helper + * functions are used to avoid ugly pointer arithmetic when handling + * mbufs.
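+ *
+ * Illustrated, for a payload of mbuf->base.size bytes:
+ *
+ *	mbuf_real_base(mbuf)          mbuf->base.data
+ *	|                             |
+ *	+-----------------------------+----------------------------+
+ *	| vs_service_id_t (dest id    | payload                    |
+ *	| plus reserved ack bits)     | (mbuf->base.size bytes)    |
+ *	+-----------------------------+----------------------------+
+ *	|<---------------- mbuf_real_size(mbuf) ------------------>|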
+ */ +static size_t mbuf_real_size(struct vs_mbuf_axon *mbuf) +{ + return mbuf->base.size + sizeof(vs_service_id_t); +} + +static void *mbuf_real_base(struct vs_mbuf_axon *mbuf) +{ + return mbuf->base.data - sizeof(vs_service_id_t); +} +/* + * Get the service_id and reserved bits from a message buffer and then + * clear the reserved bits so the upper layers don't see them. + */ +vs_service_id_t +transport_get_mbuf_service_id(struct vs_transport_axon *transport, + void *data, unsigned int *freed_acks) +{ + unsigned int reserved_bits; + vs_service_id_t id; + + /* Get the real service id and reserved bits */ + id = *(vs_service_id_t *)data; + reserved_bits = vs_get_service_id_reserved_bits(id); + id = vs_get_real_service_id(id); + + /* Clear the reserved bits in the service id */ + vs_set_service_id_reserved_bits(&id, 0); + if (freed_acks) { + *(vs_service_id_t *)data = id; + *freed_acks = reserved_bits; + } + return id; +} + +static void +__transport_get_service_info(struct vs_mv_service_info *service_info) +{ + kref_get(&service_info->kref); +} + +static struct vs_mv_service_info * +transport_get_service_info(struct vs_service_device *service) +{ + struct vs_mv_service_info *service_info; + + rcu_read_lock(); + service_info = rcu_dereference(service->transport_priv); + if (service_info) + __transport_get_service_info(service_info); + rcu_read_unlock(); + + return service_info; +} + +static struct vs_mv_service_info * +transport_get_service_id_info(struct vs_transport_axon *transport, + vs_service_id_t service_id) +{ + struct vs_service_device *service; + struct vs_mv_service_info *service_info; + + service = vs_session_get_service(transport->session_dev, service_id); + if (!service) + return NULL; + + service_info = transport_get_service_info(service); + + vs_put_service(service); + return service_info; +} + +static void transport_info_free(struct rcu_head *rcu_head) +{ + struct vs_mv_service_info *service_info = + container_of(rcu_head, struct vs_mv_service_info, rcu_head); + + vs_put_service(service_info->service); + kfree(service_info); +} + +static void transport_info_release(struct kref *kref) +{ + struct vs_mv_service_info *service_info = + container_of(kref, struct vs_mv_service_info, kref); + + call_rcu(&service_info->rcu_head, transport_info_free); +} + +static void transport_put_service_info(struct vs_mv_service_info *service_info) +{ + kref_put(&service_info->kref, transport_info_release); +} + +static bool transport_axon_reset(struct vs_transport_axon *transport); + +static void transport_fatal_error(struct vs_transport_axon *transport, + const char *msg) +{ + dev_err(transport->axon_dev, "Fatal transport error (%s); resetting\n", + msg); +#ifdef DEBUG + dump_stack(); +#endif + transport_axon_reset(transport); +} + +static unsigned int reduce_send_quota(struct vs_transport_axon *transport, + struct vs_mv_service_info *service_info, unsigned int count, + bool allow_tx_ready) +{ + int new_inflight, send_alloc; + bool was_over_quota, is_over_quota; + + /* FIXME: Redmine issue #1303 - philip. */ + spin_lock_irq(&transport->readiness_lock); + /* + * We read the current send_alloc for error checking *before* + * decrementing send_inflight. This avoids any false positives + * due to send_alloc being incremented by a concurrent alloc_mbuf. + * + * Note that there is an implicit smp_mb() before atomic_sub_return(), + * matching the explicit one in alloc_mbuf.
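+ *
+ * Roughly, against a concurrent transport_alloc_mbuf():
+ *
+ *	alloc_mbuf				reduce_send_quota
+ *	  inc send_inflight			  read send_alloc
+ *	  smp_mb__before_atomic_inc()		  smp_mb() (implicit)
+ *	  inc send_alloc			  sub send_inflight
+ *
+ * so any send_alloc increment observed here is ordered after its
+ * matching send_inflight increment, and the WARN_ON below should not
+ * fire spuriously.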
+ */ + send_alloc = atomic_read(&service_info->send_alloc); + new_inflight = atomic_sub_return(count, &service_info->send_inflight); + + spin_unlock_irq(&transport->readiness_lock); + if (WARN_ON(new_inflight < send_alloc)) { + dev_err(transport->axon_dev, + "inflight sent messages for service %d is less than the number of allocated messages (%d < %d, was reduced by %d)\n", + service_info->service->id, new_inflight, + send_alloc, count); + transport_fatal_error(transport, "sent msg count underrun"); + return 0; + } + + was_over_quota = (new_inflight + count >= + service_info->service->send_quota); + is_over_quota = (new_inflight > service_info->service->send_quota); + + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, + "Service %d quota %d -> %d (over_quota: %d -> %d)\n", + service_info->service->id, new_inflight + count, + new_inflight, was_over_quota, is_over_quota); + + /* + * Notify the service that a buffer has been freed. We call tx_ready + * if this is a notification from the remote end (i.e. not an unsent + * buffer) and the quota has just dropped below the maximum. + */ + vs_session_quota_available(transport->session_dev, + service_info->service->id, count, + !is_over_quota && was_over_quota && allow_tx_ready); + + return count; +} + +static void __transport_tx_pool_free(struct vs_axon_tx_pool *pool, + dma_addr_t laddr); + +static void +__transport_tx_cleanup(struct vs_transport_axon *transport) +{ + u32 uptr; + struct okl4_axon_queue_entry *desc; + + lockdep_assert_held(&transport->readiness_lock); + + uptr = transport->tx_uptr_freed; + desc = &transport->tx_descs[uptr]; + + while (!okl4_axon_data_info_getpending(&desc->info)) { + if (!transport->tx_pools[uptr]) + break; + + __transport_tx_pool_free(transport->tx_pools[uptr], + okl4_axon_data_info_getladdr(&desc->info)); + transport->tx_pools[uptr] = NULL; + + INC_MOD(uptr, transport->tx->queues[0].entries); + desc = &transport->tx_descs[uptr]; + transport->tx_uptr_freed = uptr; + } +} + +static void +transport_axon_free_tx_pool(struct work_struct *work) +{ + struct vs_axon_tx_pool *pool = container_of(work, + struct vs_axon_tx_pool, free_work); + struct vs_transport_axon *transport = pool->transport; + + dmam_free_coherent(transport->axon_dev, + pool->count << pool->alloc_order, + pool->base_vaddr, pool->base_laddr); + devm_kfree(transport->axon_dev, pool); +} + +static void +transport_axon_queue_free_tx_pool(struct kref *kref) +{ + struct vs_axon_tx_pool *pool = container_of(kref, + struct vs_axon_tx_pool, kref); + + /* + * Put the task on the axon local work queue for running in + * a context where IRQ is enabled. + */ + INIT_WORK(&pool->free_work, transport_axon_free_tx_pool); + queue_work(work_queue, &pool->free_work); +} + +static void +transport_axon_put_tx_pool(struct vs_axon_tx_pool *pool) +{ + kref_put(&pool->kref, transport_axon_queue_free_tx_pool); +} + +/* Low-level tx buffer allocation, without quota tracking. 
*/ +static struct vs_mbuf_axon * +__transport_alloc_mbuf(struct vs_transport_axon *transport, + vs_service_id_t service_id, struct vs_axon_tx_pool *pool, + size_t size, gfp_t gfp_flags) +{ + size_t real_size = size + sizeof(vs_service_id_t); + struct vs_mbuf_axon *mbuf; + unsigned index; + + if (WARN_ON(real_size > (1 << pool->alloc_order))) { + dev_err(transport->axon_dev, "Message too big (%zu > %zu)\n", + real_size, (size_t)1 << pool->alloc_order); + goto fail_message_size; + } + + kref_get(&pool->kref); + + do { + index = find_first_zero_bit(pool->alloc_bitmap, pool->count); + if (unlikely(index >= pool->count)) { + /* + * No buffers left. This can't be an out-of-quota + * situation, because we've already checked the quota; + * it must be because there's a buffer left over in + * the tx queue. Clean out the tx queue and retry. + */ + spin_lock_irq(&transport->readiness_lock); + __transport_tx_cleanup(transport); + spin_unlock_irq(&transport->readiness_lock); + + index = find_first_zero_bit(pool->alloc_bitmap, + pool->count); + } + if (unlikely(index >= pool->count)) + goto fail_buffer_alloc; + } while (unlikely(test_and_set_bit_lock(index, pool->alloc_bitmap))); + + mbuf = kmem_cache_alloc(mbuf_cache, gfp_flags & ~GFP_ZONEMASK); + if (!mbuf) + goto fail_mbuf_alloc; + + mbuf->base.is_recv = false; + mbuf->base.data = pool->base_vaddr + (index << pool->alloc_order); + mbuf->base.size = size; + mbuf->owner = transport; + mbuf->laddr = pool->base_laddr + (index << pool->alloc_order); + mbuf->pool = pool; + + /* + * We put the destination service id in the mbuf, but increment the + * data pointer past it so the receiver doesn't always need to skip + * the service id. + */ + *(vs_service_id_t *)mbuf->base.data = service_id; + mbuf->base.data += sizeof(vs_service_id_t); + + return mbuf; + +fail_mbuf_alloc: + clear_bit_unlock(index, pool->alloc_bitmap); +fail_buffer_alloc: + transport_axon_put_tx_pool(pool); +fail_message_size: + return NULL; +} + +/* Allocate a tx buffer for a specified service. */ +static struct vs_mbuf *transport_alloc_mbuf(struct vs_transport *_transport, + struct vs_service_device *service, size_t size, gfp_t gfp_flags) +{ + struct vs_transport_axon *transport = to_vs_transport_axon(_transport); + size_t real_size = size + sizeof(vs_service_id_t); + struct vs_mv_service_info *service_info = NULL; + struct vs_mbuf_axon *mbuf; + vs_service_id_t service_id = service->id; + + if (real_size > transport->msg_size) { + dev_err(transport->axon_dev, "Message too big (%zu > %zu)\n", + real_size, transport->msg_size); + return ERR_PTR(-EINVAL); + } + + if (WARN_ON(service_id == MSG_SEND_FREE_BUFS)) + return ERR_PTR(-ENXIO); + + service_info = transport_get_service_info(service); + if (WARN_ON(!service_info)) + return ERR_PTR(-EINVAL); + + if (!service_info->tx_pool) { + transport_put_service_info(service_info); + return ERR_PTR(-ECONNRESET); + } + + if (!atomic_add_unless(&service_info->send_inflight, 1, + service_info->service->send_quota)) { + /* Service has reached its quota */ + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, + "Service %d is at max send quota %d\n", + service_id, service_info->service->send_quota); + transport_put_service_info(service_info); + return ERR_PTR(-ENOBUFS); + } + + /* + * Increment the count of allocated but unsent mbufs. This is done + * *after* the send_inflight increment (with a barrier to enforce + * ordering) to ensure that send_inflight is never less than + * send_alloc - see reduce_send_quota(). 
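+ *
+ * The failure path below undoes the two increments in the reverse
+ * order, again with a barrier in between, for the same reason.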
+ */ + smp_mb__before_atomic_inc(); + atomic_inc(&service_info->send_alloc); + + mbuf = __transport_alloc_mbuf(transport, service_id, + service_info->tx_pool, size, gfp_flags); + if (!mbuf) { + /* + * Failed to allocate a buffer - decrement our quota back to + * where it was. + */ + atomic_dec(&service_info->send_alloc); + smp_mb__after_atomic_dec(); + atomic_dec(&service_info->send_inflight); + + transport_put_service_info(service_info); + + return ERR_PTR(-ENOMEM); + } + + transport_put_service_info(service_info); + + return &mbuf->base; +} + +static void transport_free_sent_mbuf(struct vs_transport_axon *transport, + struct vs_mbuf_axon *mbuf) +{ + kmem_cache_free(mbuf_cache, mbuf); +} + +static void __transport_tx_pool_free(struct vs_axon_tx_pool *pool, + dma_addr_t laddr) +{ + unsigned index = (laddr - pool->base_laddr) >> pool->alloc_order; + + if (WARN_ON(index >= pool->count)) { + printk(KERN_DEBUG "free %#llx base %#llx order %d count %d\n", + (long long)laddr, (long long)pool->base_laddr, + pool->alloc_order, pool->count); + return; + } + + clear_bit_unlock(index, pool->alloc_bitmap); + transport_axon_put_tx_pool(pool); +} + +static int transport_rx_queue_buffer(struct vs_transport_axon *transport, + void *ptr, dma_addr_t laddr); + +static void transport_rx_recycle(struct vs_transport_axon *transport, + struct vs_mbuf_axon *mbuf) +{ + void *data = mbuf_real_base(mbuf); + dma_addr_t laddr = mbuf->laddr; + unsigned long flags; + + spin_lock_irqsave(&transport->rx_alloc_lock, flags); + + if (transport->rx_alloc_extra) { + transport->rx_alloc_extra--; + dma_pool_free(transport->rx_pool, data, laddr); + } else if (transport_rx_queue_buffer(transport, data, laddr) < 0) { + struct vs_axon_rx_freelist_entry *buf = data; + buf->laddr = laddr; + list_add_tail(&buf->list, &transport->rx_freelist); + tasklet_schedule(&transport->rx_tasklet); + } else { + tasklet_schedule(&transport->rx_tasklet); + } + + spin_unlock_irqrestore(&transport->rx_alloc_lock, flags); +} + +static void transport_free_mbuf_pools(struct vs_transport_axon *transport, + struct vs_service_device *service, + struct vs_mv_service_info *service_info) +{ + /* + * Free the TX allocation pool. This will also free any buffer + * memory allocated from the pool, so it is essential that + * this happens only after we have successfully freed all + * mbufs. + * + * Note that the pool will not exist if the core client is reset + * before it receives a startup message. + */ + if (!IS_ERR_OR_NULL(service_info->tx_pool)) + transport_axon_put_tx_pool(service_info->tx_pool); + service_info->tx_pool = NULL; + + /* Mark the service's preallocated RX buffers as extra. 
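+ * The rx_alloc_extra count is drained by transport_rx_recycle(), which
+ * returns that many recycled buffers to the DMA pool instead of
+ * re-queueing them on the RX axon.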
*/ + spin_lock_irq(&transport->rx_alloc_lock); + transport->rx_alloc_extra += service_info->rx_allocated; + service_info->rx_allocated = 0; + spin_unlock_irq(&transport->rx_alloc_lock); +} + +/* Low-level tx or rx buffer free, with no quota tracking */ +static void __transport_free_mbuf(struct vs_transport_axon *transport, + struct vs_mbuf_axon *mbuf, bool is_rx) +{ + if (is_rx) { + transport_rx_recycle(transport, mbuf); + } else { + __transport_tx_pool_free(mbuf->pool, mbuf->laddr); + } + + kmem_cache_free(mbuf_cache, mbuf); +} + +static void transport_free_mbuf(struct vs_transport *_transport, + struct vs_service_device *service, struct vs_mbuf *_mbuf) +{ + struct vs_transport_axon *transport = to_vs_transport_axon(_transport); + struct vs_mbuf_axon *mbuf = to_vs_mbuf_axon(_mbuf); + struct vs_mv_service_info *service_info = NULL; + void *data = mbuf_real_base(mbuf); + vs_service_id_t service_id __maybe_unused = + transport_get_mbuf_service_id(transport, data, NULL); + bool is_recv = mbuf->base.is_recv; + + WARN_ON(!service); + service_info = transport_get_service_info(service); + + __transport_free_mbuf(transport, mbuf, is_recv); + + /* + * If this message was left over from a service that has already been + * deleted, we don't need to do any quota accounting. + */ + if (!service_info) + return; + + if (unlikely(atomic_read(&service_info->outstanding_frees))) { + if (atomic_dec_and_test(&service_info->outstanding_frees)) { + dev_dbg(transport->axon_dev, + "service %d all outstanding frees done\n", + service->id); + transport_free_mbuf_pools(transport, service, + service_info); + vs_service_enable(service); + } else { + dev_dbg(transport->axon_dev, + "service %d outstanding frees -> %d\n", + service->id, atomic_read( + &service_info->outstanding_frees)); + } + } else if (is_recv) { + smp_mb__before_atomic_dec(); + atomic_dec(&service_info->recv_inflight); + if (atomic_inc_return(&service_info->recv_freed) >= + service_info->recv_freed_watermark) { + transport->free_bufs_pending = true; + schedule_delayed_work(&transport->free_bufs_work, 0); + } + + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, + "Freed recv buffer for service %d rq=%d/%d, freed=%d (watermark = %d)\n", + service_id, + atomic_read(&service_info->recv_inflight), + service_info->service->recv_quota, + atomic_read(&service_info->recv_freed), + service_info->recv_freed_watermark); + } else { + /* + * We are freeing a message buffer that we allocated. This + * usually happens on error paths in application drivers if + * we allocated a buffer but failed to send it. In this case + * we need to decrement our own send quota since we didn't + * send anything. + */ + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, + "Freeing send buffer for service %d, send quota = %d\n", + service_id, atomic_read(&service_info->send_inflight)); + + smp_mb__before_atomic_dec(); + atomic_dec(&service_info->send_alloc); + + /* + * We don't allow the tx_ready handler to run when we are + * freeing an mbuf that we allocated. 
+ */ + reduce_send_quota(transport, service_info, 1, false); + } + + transport_put_service_info(service_info); +} + +static size_t transport_mbuf_size(struct vs_mbuf *_mbuf) +{ + struct vs_mbuf_axon *mbuf = to_vs_mbuf_axon(_mbuf); + + return mbuf_real_size(mbuf); +} + +static size_t transport_max_mbuf_size(struct vs_transport *_transport) +{ + struct vs_transport_axon *transport = to_vs_transport_axon(_transport); + + return transport->msg_size - sizeof(vs_service_id_t); +} + +static int okl4_error_to_errno(okl4_error_t err) { + switch (err) { + case OKL4_OK: + return 0; + case OKL4_ERROR_AXON_QUEUE_NOT_MAPPED: + /* Axon has been reset locally */ + return -ECONNRESET; + case OKL4_ERROR_AXON_QUEUE_NOT_READY: + /* No message buffers in the queue. */ + return -ENOBUFS; + case OKL4_ERROR_AXON_INVALID_OFFSET: + case OKL4_ERROR_AXON_AREA_TOO_BIG: + /* Buffer address is bad */ + return -EFAULT; + case OKL4_ERROR_AXON_BAD_MESSAGE_SIZE: + case OKL4_ERROR_AXON_TRANSFER_LIMIT_EXCEEDED: + /* One of the Axon's message size limits has been exceeded */ + return -EMSGSIZE; + default: + /* Miscellaneous failure, probably a bad cap */ + return -EIO; + } +} + +static void queue_tx_mbuf(struct vs_mbuf_axon *mbuf, struct vs_transport_axon *priv, + vs_service_id_t service_id) +{ + list_add_tail(&mbuf->base.queue, &priv->tx_queue); +} + +static void free_tx_mbufs(struct vs_transport_axon *priv) +{ + struct vs_mbuf_axon *child, *tmp; + + list_for_each_entry_safe(child, tmp, &priv->tx_queue, base.queue) { + list_del(&child->base.queue); + __transport_free_mbuf(priv, child, false); + } +} + +static int __transport_flush(struct vs_transport_axon *transport) +{ + _okl4_sys_axon_trigger_send(transport->tx_cap); + return 0; +} + +static int transport_flush(struct vs_transport *_transport, + struct vs_service_device *service) +{ + struct vs_transport_axon *transport = to_vs_transport_axon(_transport); + + return __transport_flush(transport); +} + +/* + * Low-level transport message send function. + * + * The caller must hold the transport->readiness_lock, and is responsible for + * freeing the mbuf on successful send (use transport_free_sent_mbuf). The + * mbuf should _not_ be freed if this function fails. The Virtual Service + * driver is responsible for freeing the mbuf in the failure case. + */ +static int __transport_send(struct vs_transport_axon *transport, + struct vs_mbuf_axon *mbuf, vs_service_id_t service_id, + unsigned long flags) +{ + u32 uptr; + struct okl4_axon_queue_entry *desc; + struct vs_axon_tx_pool *old_pool; + dma_addr_t old_laddr; + + lockdep_assert_held(&transport->readiness_lock); + + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, + "send %zu bytes to service %d\n", + mbuf->base.size, service_id); + vs_debug_dump_mbuf(transport->session_dev, &mbuf->base); + + uptr = ACCESS_ONCE(transport->tx->queues[0].uptr); + desc = &transport->tx_descs[uptr]; + + /* Is the descriptor ready to use? 
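+ * The pending bit is set below when a buffer is handed to the axon,
+ * and is expected to be cleared by the axon once the transfer has
+ * completed; a descriptor that is still pending here therefore means
+ * the ring is full and the caller must queue the mbuf and retry later.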
*/ + if (okl4_axon_data_info_getpending(&desc->info)) + return -ENOSPC; + mb(); + + /* The descriptor is ours; save its old state and increment the uptr */ + old_pool = transport->tx_pools[uptr]; + if (old_pool != NULL) + old_laddr = okl4_axon_data_info_getladdr(&desc->info); + transport->tx_pools[uptr] = mbuf->pool; + + INC_MOD(uptr, transport->tx->queues[0].entries); + ACCESS_ONCE(transport->tx->queues[0].uptr) = uptr; + + /* Set up the descriptor */ + desc->data_size = mbuf_real_size(mbuf); + okl4_axon_data_info_setladdr(&desc->info, mbuf->laddr); + + /* Message is ready to go */ + wmb(); + okl4_axon_data_info_setpending(&desc->info, true); + + if (flags & VS_TRANSPORT_SEND_FLAGS_MORE) { + /* + * This is a batched message, so we normally don't flush, + * unless we've filled the queue completely. + * + * Races on the queue descriptor don't matter here, because + * this is only an optimisation; the service should do an + * explicit flush when it finishes the batch anyway. + */ + desc = &transport->tx_descs[uptr]; + if (okl4_axon_data_info_getpending(&desc->info)) + __transport_flush(transport); + } else { + __transport_flush(transport); + } + + /* Free any buffer previously in the descriptor */ + if (old_pool != NULL) { + u32 uptr_freed = transport->tx_uptr_freed; + INC_MOD(uptr_freed, transport->tx->queues[0].entries); + WARN_ON(uptr_freed != uptr); + __transport_tx_pool_free(old_pool, old_laddr); + transport->tx_uptr_freed = uptr_freed; + } + + return 0; +} + +static int transport_send_might_queue(struct vs_transport_axon *transport, + struct vs_mbuf_axon *mbuf, vs_service_id_t service_id, + unsigned long flags, bool *queued) +{ + int ret = 0; + + lockdep_assert_held(&transport->readiness_lock); + *queued = false; + + if (transport->readiness != VS_TRANSPORT_ACTIVE) + return -ECONNRESET; + + if (!list_empty(&transport->tx_queue)) { + *queued = true; + } else { + ret = __transport_send(transport, mbuf, service_id, flags); + if (ret == -ENOSPC) { + *queued = true; + ret = 0; + } + } + + if (*queued) + queue_tx_mbuf(mbuf, transport, service_id); + + return ret; +} + +static int transport_send(struct vs_transport *_transport, + struct vs_service_device *service, struct vs_mbuf *_mbuf, + unsigned long flags) +{ + struct vs_transport_axon *transport = to_vs_transport_axon(_transport); + struct vs_mbuf_axon *mbuf = to_vs_mbuf_axon(_mbuf); + struct vs_mv_service_info *service_info; + vs_service_id_t service_id; + int recv_freed, freed_acks; + bool queued; + int err; + unsigned long irqflags; + + if (WARN_ON(!transport || !mbuf || mbuf->owner != transport)) + return -EINVAL; + + service_id = transport_get_mbuf_service_id(transport, + mbuf_real_base(mbuf), NULL); + + if (WARN_ON(service_id != service->id)) + return -EINVAL; + + service_info = transport_get_service_info(service); + if (!service_info) + return -EINVAL; + + if (mbuf->base.is_recv) { + /* + * This message buffer was allocated for receive. We don't + * allow receive message buffers to be reused for sending + * because it makes our quotas inconsistent. + */ + dev_err(&service_info->service->dev, + "Attempted to send a received message buffer\n"); + transport_put_service_info(service_info); + return -EINVAL; + } + + if (!service_info->ready) { + transport_put_service_info(service_info); + return -ECOMM; + } + + /* + * Set the message's service id reserved bits to the number of buffers + * we have freed. We can only ack 2 ^ VS_SERVICE_ID_RESERVED_BITS - 1 + * buffers in one message. 
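+ *
+ * For example (illustrative numbers only): if 20 receive buffers had
+ * been freed but the reserved field can only encode 15, this message
+ * would piggy-back 15 acks and leave recv_freed at 5, to be carried by
+ * a later message or by an explicit free bufs message.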
+ */ + do { + recv_freed = atomic_read(&service_info->recv_freed); + freed_acks = min_t(int, recv_freed, + VS_SERVICE_ID_TRANSPORT_MASK); + } while (recv_freed != atomic_cmpxchg(&service_info->recv_freed, + recv_freed, recv_freed - freed_acks)); + + service_id = service_info->service->id; + vs_set_service_id_reserved_bits(&service_id, freed_acks); + *(vs_service_id_t *)mbuf_real_base(mbuf) = service_id; + + spin_lock_irqsave(&transport->readiness_lock, irqflags); + err = transport_send_might_queue(transport, mbuf, + service_info->service->id, flags, &queued); + if (err) { + /* We failed to send, so revert the freed acks */ + if (atomic_add_return(freed_acks, + &service_info->recv_freed) >= + service_info->recv_freed_watermark) { + transport->free_bufs_pending = true; + schedule_delayed_work(&transport->free_bufs_work, 0); + } + transport_put_service_info(service_info); + spin_unlock_irqrestore(&transport->readiness_lock, irqflags); + return err; + } + + atomic_dec(&service_info->send_alloc); + + if (queued) { + transport_put_service_info(service_info); + spin_unlock_irqrestore(&transport->readiness_lock, irqflags); + return 0; + } + + /* + * The mbuf was sent successfully. We can free it locally since it is + * now owned by the remote end. + */ + transport_free_sent_mbuf(transport, mbuf); + + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, + "Send okay: service %d (0x%.2x) sq=%d/%d, alloc--=%d, rq=%d/%d, freed=%d/%d, bc=%d\n", + service_info->service->id, service_id, + atomic_read(&service_info->send_inflight), + service_info->service->send_quota, + atomic_read(&service_info->send_alloc), + atomic_read(&service_info->recv_inflight), + service_info->service->recv_quota, freed_acks, + atomic_read(&service_info->recv_freed), + atomic_read(&transport->free_bufs_balance)); + + transport_put_service_info(service_info); + spin_unlock_irqrestore(&transport->readiness_lock, irqflags); + + return 0; +} + +static void transport_free_bufs_work(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct vs_transport_axon *transport = container_of(dwork, + struct vs_transport_axon, free_bufs_work); + struct vs_mbuf_axon *mbuf; + int i, err, count = 0, old_balance; + bool queued; + size_t size; + u16 *p; + + /* + * Atomically decide whether to send a message, and increment + * the balance if we are going to. + * + * We don't need barriers before these reads because they're + * implicit in the work scheduling. + */ + do { + old_balance = atomic_read(&transport->free_bufs_balance); + + /* + * We only try to send if the balance is negative, + * or if we have been triggered by going over a + * watermark. + */ + if (old_balance >= 0 && !transport->free_bufs_pending) + return; + + /* + * If we've hit the max balance, we can't send. The + * tasklet will be rescheduled next time the balance + * is decremented, if free_bufs_pending is true. + */ + if (old_balance >= MAX_BALANCE) + return; + + } while (old_balance != atomic_cmpxchg(&transport->free_bufs_balance, + old_balance, old_balance + 1)); + + /* Try to allocate a message buffer. */ + mbuf = __transport_alloc_mbuf(transport, MSG_SEND_FREE_BUFS, + transport->free_bufs_pool, + transport->msg_size - sizeof(vs_service_id_t), + GFP_KERNEL | __GFP_NOWARN); + if (!mbuf) { + /* Out of memory at the moment; retry later. 
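+ * The balance increment taken above is rolled back first, so the
+ * retried work item re-runs the whole send decision from scratch.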
*/ + atomic_dec(&transport->free_bufs_balance); + schedule_delayed_work(dwork, FREE_BUFS_RETRY_DELAY); + return; + } + + /* + * Clear free_bufs_pending, because we are going to try to send. We + * need a write barrier afterwards to guarantee that this write is + * ordered before any writes to the recv_freed counts, and therefore + * before any remote free_bufs_pending = true when a service goes + * over its watermark right after we inspect it. + * + * The matching barrier is implicit in the atomic_inc_return in + * transport_free_mbuf(). + */ + transport->free_bufs_pending = false; + smp_wmb(); + + /* + * Fill in the buffer. Message format is: + * + * u16: Number of services + * + * For each service: + * u16: Service ID + * u16: Number of freed buffers + */ + p = mbuf->base.data; + *(p++) = 0; + + for_each_set_bit(i, transport->service_bitmap, + VS_SERVICE_ID_BITMAP_BITS) { + struct vs_mv_service_info *service_info; + int recv_freed; + u16 freed_acks; + + service_info = transport_get_service_id_info(transport, i); + if (!service_info) + continue; + + /* + * Don't let the message exceed the maximum size for the + * transport. + */ + size = sizeof(vs_service_id_t) + sizeof(u16) + + (count * (2 * sizeof(u16))); + if (size > transport->msg_size) { + /* FIXME: Jira ticket SDK-3131 - ryanm. */ + transport_put_service_info(service_info); + transport->free_bufs_pending = true; + break; + } + + /* + * We decrement each service's quota immediately by up to + * USHRT_MAX. If we subsequently fail to send the message then + * we return the count to what it was previously. + */ + do { + recv_freed = atomic_read(&service_info->recv_freed); + freed_acks = min_t(int, USHRT_MAX, recv_freed); + } while (recv_freed != atomic_cmpxchg( + &service_info->recv_freed, + recv_freed, recv_freed - freed_acks)); + + if (freed_acks) { + if (freed_acks < recv_freed) + transport->free_bufs_pending = true; + + *(p++) = service_info->service->id; + *(p++) = freed_acks; + count++; + + vs_dev_debug(VS_DEBUG_TRANSPORT, + transport->session_dev, + transport->axon_dev, + " [%.2d] Freed %.2d buffers\n", + service_info->service->id, + freed_acks); + } else { + vs_dev_debug(VS_DEBUG_TRANSPORT, + transport->session_dev, + transport->axon_dev, + " [%.2d] No buffers to free\n", + service_info->service->id); + } + + transport_put_service_info(service_info); + } + + if (transport->free_bufs_pending) + schedule_delayed_work(dwork, 0); + + if (count == 0 && old_balance >= 0) { + /* + * We are sending a new free bufs message, but we have no + * freed buffers to tell the other end about. We don't send + * an empty message unless the pre-increment balance was + * negative (in which case we need to ack a remote free_bufs). + * + * Note that nobody else can increase the balance, so we only + * need to check for a non-negative balance once before + * decrementing. However, if the incoming free-bufs handler + * concurrently decrements, the balance may become negative, + * in which case we reschedule ourselves immediately to send + * the ack. 
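+ *
+ * A rough sketch of the balance counter with MAX_BALANCE == 1:
+ *
+ *	 0	idle
+ *	+1	we sent a free bufs message; further sends are held off
+ *		until the remote end replies
+ *	 0	a remote free bufs message arrived
+ *	-1	a remote message arrived while we had nothing pending, so
+ *		we must now send a (possibly empty) message as an ack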
+ */ + if (atomic_dec_return(&transport->free_bufs_balance) < 0) + schedule_delayed_work(dwork, 0); + + __transport_free_mbuf(transport, mbuf, false); + + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, + "No services had buffers to free\n"); + + return; + } + + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, + "Sending free bufs message for %d services\n", count); + + /* Fix up the message size */ + p = mbuf->base.data; + *p = count; + mbuf->base.size = sizeof(u16) * ((count * 2) + 1); + + spin_lock_irq(&transport->readiness_lock); + err = transport_send_might_queue(transport, mbuf, MSG_SEND_FREE_BUFS, + 0, &queued); + if (err) { + spin_unlock_irq(&transport->readiness_lock); + goto fail; + } + + /* FIXME: Jira ticket SDK-4675 - ryanm. */ + if (!queued) { + /* + * The mbuf was sent successfully. We can free it locally + * since it is now owned by the remote end. + */ + transport_free_sent_mbuf(transport, mbuf); + } + spin_unlock_irq(&transport->readiness_lock); + + return; + +fail: + dev_err(transport->axon_dev, + "Failed to send free bufs message: %d\n", err); + transport_fatal_error(transport, "free bufs send failed"); +} + +int transport_notify(struct vs_transport *_transport, + struct vs_service_device *service, unsigned long bits) +{ + struct vs_transport_axon *transport = to_vs_transport_axon(_transport); + unsigned long bit_offset, bitmask, word; + int first_set_bit, spilled_bits; + + BUG_ON(!transport); + + if (!bits) + return -EINVAL; + + /* Check that the service isn't trying to raise bits it doesn't own */ + if (bits & ~((1UL << service->notify_send_bits) - 1)) + return -EINVAL; + + bit_offset = service->notify_send_offset; + word = BIT_WORD(bit_offset); + bitmask = bits << (bit_offset % BITS_PER_LONG); + + vs_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + "Sending notification %ld to service id %d\n", bitmask, + service->id); + + _okl4_sys_vinterrupt_raise(transport->notify_cap[word], bitmask); + + /* + * Bit range may spill into the next VIRQ line. + * + * Check by adding the bit offset to the index of the highest set bit in + * the requested bitmask. If we need to raise a bit that is greater than + * bit 31, we have spilled into the next word and need to raise that too. + */ + first_set_bit = find_first_bit(&bits, BITS_PER_LONG); + spilled_bits = first_set_bit + bit_offset - (BITS_PER_LONG - 1); + if (spilled_bits > 0) { + /* + * Calculate the new bitmask for the spilled bits. We do this by + * shifting the requested bits to the right. The number of shifts + * is determined by where the first spilled bit is.
+ */ + int first_spilled_bit = first_set_bit - spilled_bits + 1; + + bitmask = bits >> first_spilled_bit; + + vs_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + "Sending notification %ld to service id %d\n", bitmask, + service->id); + + _okl4_sys_vinterrupt_raise(transport->notify_cap[word + 1], bitmask); + } + + return 0; +} + +static void +transport_handle_free_bufs_message(struct vs_transport_axon *transport, + struct vs_mbuf_axon *mbuf) +{ + struct vs_mv_service_info *service_info; + vs_service_id_t service_id; + u16 *p = mbuf->base.data; + int i, count, freed_acks, new_balance; + + count = *(p++); + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, + "Free bufs message received for %d services\n", count); + for (i = 0; i < count; i++) { + int old_quota __maybe_unused; + + service_id = *(p++); + freed_acks = *(p++); + + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, " [%.2d] %.4d\n", + service_id, freed_acks); + + service_info = transport_get_service_id_info(transport, + service_id); + if (!service_info) { + vs_dev_debug(VS_DEBUG_TRANSPORT, + transport->session_dev, + transport->axon_dev, + "Got %d free_acks for unknown service %d\n", + freed_acks, service_id); + continue; + } + + old_quota = atomic_read(&service_info->send_inflight); + freed_acks = reduce_send_quota(transport, service_info, + freed_acks, service_info->ready); + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, + " [%.2d] Freed %.2d buffers (%d -> %d, quota = %d)\n", + service_id, freed_acks, old_quota, + atomic_read(&service_info->send_inflight), + service_info->service->send_quota); + + transport_put_service_info(service_info); + } + + __transport_free_mbuf(transport, mbuf, true); + + new_balance = atomic_dec_return(&transport->free_bufs_balance); + if (new_balance < -MAX_BALANCE) { + dev_err(transport->axon_dev, + "Balance counter fell below -MAX_BALANCE (%d < %d)\n", + atomic_read(&transport->free_bufs_balance), + -MAX_BALANCE); + transport_fatal_error(transport, "balance counter underrun"); + return; + } + + /* Check if we need to send a freed buffers message back */ + if (new_balance < 0 || transport->free_bufs_pending) + schedule_delayed_work(&transport->free_bufs_work, 0); +} + +static int transport_rx_queue_buffer(struct vs_transport_axon *transport, + void *ptr, dma_addr_t laddr) +{ + struct okl4_axon_queue_entry *desc; + okl4_axon_data_info_t info; + + /* Select the buffer desc to reallocate */ + desc = &transport->rx_descs[transport->rx_uptr_allocated]; + info = ACCESS_ONCE(desc->info); + + /* If there is no space in the rx queue, fail */ + if (okl4_axon_data_info_getusr(&info)) + return -ENOSPC; + + /* Don't update desc before reading the clear usr bit */ + smp_mb(); + + /* Update the buffer pointer in the desc and mark it valid. 
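+ * In this ring the usr bit means the descriptor currently owns a
+ * driver-supplied buffer, and the pending bit is expected to be
+ * cleared by the axon once it has written an incoming message into
+ * that buffer (see transport_process_msg()).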
*/ + transport->rx_ptrs[transport->rx_uptr_allocated] = ptr; + okl4_axon_data_info_setladdr(&info, (okl4_laddr_t)laddr); + okl4_axon_data_info_setpending(&info, true); + okl4_axon_data_info_setusr(&info, true); + mb(); + ACCESS_ONCE(desc->info) = info; + + /* Proceed to the next buffer */ + INC_MOD(transport->rx_uptr_allocated, + transport->rx->queues[0].entries); + + /* Return true if the next desc has no buffer yet */ + desc = &transport->rx_descs[transport->rx_uptr_allocated]; + return !okl4_axon_data_info_getusr(&desc->info); +} + +/* TODO: multiple queue support / small message prioritisation */ +static int transport_process_msg(struct vs_transport_axon *transport) +{ + struct vs_mv_service_info *service_info; + struct vs_mbuf_axon *mbuf; + vs_service_id_t service_id; + unsigned freed_acks; + u32 uptr; + struct okl4_axon_queue_entry *desc; + void **ptr; + okl4_axon_data_info_t info; + + /* Select the descriptor to receive from */ + uptr = ACCESS_ONCE(transport->rx->queues[0].uptr); + desc = &transport->rx_descs[uptr]; + ptr = &transport->rx_ptrs[uptr]; + info = ACCESS_ONCE(desc->info); + + /* Have we emptied the whole queue? */ + if (!okl4_axon_data_info_getusr(&info)) + return -ENOBUFS; + + /* Has the next buffer been filled yet? */ + if (okl4_axon_data_info_getpending(&info)) + return 0; + + /* Don't read the buffer or desc before seeing a cleared pending bit */ + rmb(); + + /* Is the message too small to be valid? */ + if (desc->data_size < sizeof(vs_service_id_t)) + return -EBADMSG; + + /* Allocate and set up the mbuf */ + mbuf = kmem_cache_alloc(mbuf_cache, GFP_ATOMIC); + if (!mbuf) + return -ENOMEM; + + mbuf->owner = transport; + mbuf->laddr = okl4_axon_data_info_getladdr(&info); + mbuf->pool = NULL; + mbuf->base.is_recv = true; + mbuf->base.data = *ptr + sizeof(vs_service_id_t); + mbuf->base.size = desc->data_size - sizeof(vs_service_id_t); + + INC_MOD(uptr, transport->rx->queues[0].entries); + ACCESS_ONCE(transport->rx->queues[0].uptr) = uptr; + + /* Finish reading desc before clearing usr bit */ + smp_mb(); + + /* Re-check the pending bit, in case we've just been reset */ + info = ACCESS_ONCE(desc->info); + if (unlikely(okl4_axon_data_info_getpending(&info))) { + kmem_cache_free(mbuf_cache, mbuf); + return 0; + } + + /* Clear usr bit; after this point the buffer is owned by the mbuf */ + okl4_axon_data_info_setusr(&info, false); + ACCESS_ONCE(desc->info) = info; + + /* Determine who to deliver the mbuf to */ + service_id = transport_get_mbuf_service_id(transport, + mbuf_real_base(mbuf), &freed_acks); + + if (service_id == MSG_SEND_FREE_BUFS) { + transport_handle_free_bufs_message(transport, mbuf); + return 1; + } + + service_info = transport_get_service_id_info(transport, service_id); + if (!service_info) { + vs_dev_debug(VS_DEBUG_TRANSPORT, + transport->session_dev, transport->axon_dev, + "discarding message for missing service %d\n", + service_id); + __transport_free_mbuf(transport, mbuf, true); + return -EIDRM; + } + + /* + * If the remote end has freed some buffers that we sent it, then we + * can decrement our send quota count by that amount. + */ + freed_acks = reduce_send_quota(transport, service_info, + freed_acks, service_info->ready); + + /* If the service has been reset, drop the message. 
*/ + if (!service_info->ready) { + vs_dev_debug(VS_DEBUG_TRANSPORT, + transport->session_dev, transport->axon_dev, + "discarding message for reset service %d\n", + service_id); + + __transport_free_mbuf(transport, mbuf, true); + transport_put_service_info(service_info); + + return 1; + } + + /* + * Increment our recv quota since we are now holding a buffer. We + * will decrement it when the buffer is freed in transport_free_mbuf. + */ + if (!atomic_add_unless(&service_info->recv_inflight, 1, + service_info->service->recv_quota)) { + /* + * Going over the recv_quota indicates that something bad + * has happened because either the other end has exceeded + * its send quota or the two ends have a disagreement about + * what the quota is. + * + * We free the buffer and reset the transport. + */ + dev_err(transport->axon_dev, + "Service %d is at max receive quota %d - resetting\n", + service_info->service->id, + service_info->service->recv_quota); + + transport_fatal_error(transport, "rx quota exceeded"); + + __transport_free_mbuf(transport, mbuf, true); + transport_put_service_info(service_info); + + return 0; + } + + WARN_ON(atomic_read(&service_info->recv_inflight) > + service_info->service->recv_quota); + + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, + "receive %zu bytes from service 0x%.2x (%d): sq=%d/%d, rq=%d/%d, freed_acks=%d, freed=%d/%d bc=%d\n", + mbuf->base.size, service_info->service->id, service_id, + atomic_read(&service_info->send_inflight), + service_info->service->send_quota, + atomic_read(&service_info->recv_inflight), + service_info->service->recv_quota, freed_acks, + atomic_read(&service_info->recv_freed), + service_info->recv_freed_watermark, + atomic_read(&transport->free_bufs_balance)); + vs_debug_dump_mbuf(transport->session_dev, &mbuf->base); + + if (vs_session_handle_message(transport->session_dev, &mbuf->base, + service_id) < 0) + transport_free_mbuf(&transport->transport, + service_info->service, &mbuf->base); + + transport_put_service_info(service_info); + + return 1; +} + +static void transport_flush_tx_queues(struct vs_transport_axon *transport) +{ + okl4_error_t err; + int i; + + lockdep_assert_held(&transport->readiness_lock); + + /* Release any queued mbufs */ + free_tx_mbufs(transport); + + /* + * Re-attach the TX Axon's segment, which implicitly invalidates + * the queues and stops any outgoing message transfers. The queues + * will be reconfigured when the transport becomes ready again. + */ + err = _okl4_sys_axon_set_send_segment(transport->tx_cap, + transport->segment, transport->segment_base); + if (err != OKL4_OK) { + dev_err(transport->axon_dev, "TX reattach failed: %d\n", + (int)err); + } + + /* + * The TX Axon has stopped, so we can safely clear the pending + * bit and free the buffer for any outgoing messages, and reset uptr + * and kptr to 0. 
+ */ + for (i = 0; i < transport->tx->queues[0].entries; i++) { + if (!transport->tx_pools[i]) + continue; + + okl4_axon_data_info_setpending( + &transport->tx_descs[i].info, false); + __transport_tx_pool_free(transport->tx_pools[i], + okl4_axon_data_info_getladdr( + &transport->tx_descs[i].info)); + transport->tx_pools[i] = NULL; + } + transport->tx->queues[0].uptr = 0; + transport->tx->queues[0].kptr = 0; + transport->tx_uptr_freed = 0; +} + +static void transport_flush_rx_queues(struct vs_transport_axon *transport) +{ + okl4_error_t err; + int i; + + lockdep_assert_held(&transport->readiness_lock); + + /* + * Re-attach the RX Axon's segment, which implicitly invalidates + * the queues and stops any incoming message transfers, though the + * sending end's reset should already have cancelled those. The queues + * will be reconfigured when the transport becomes ready again. + */ + err = _okl4_sys_axon_set_recv_segment(transport->rx_cap, + transport->segment, transport->segment_base); + if (err != OKL4_OK) { + dev_err(transport->axon_dev, "RX reattach failed: %d\n", + (int)err); + } + + /* + * The RX Axon has stopped, so we can reset the pending bit on all + * allocated message buffers to prepare them for reuse when the reset + * completes. + */ + for (i = 0; i < transport->rx->queues[0].entries; i++) { + if (okl4_axon_data_info_getusr(&transport->rx_descs[i].info)) + okl4_axon_data_info_setpending( + &transport->rx_descs[i].info, true); + } + + /* + * Reset kptr to the current uptr. + * + * We use a barrier here to ensure the pending bits are reset before + * reading uptr, matching the barrier in transport_process_msg between + * the uptr update and the second check of the pending bit. This means + * that races with transport_process_msg() will end in one of two + * ways: + * + * 1. transport_process_msg() updates uptr before this barrier, so the + * RX buffer is passed up to the session layer to be rejected there + * and recycled; or + * + * 2. the reset pending bit is seen by the second check in + * transport_process_msg(), which knows that it is being reset and + * can drop the message before it claims the buffer. + */ + smp_mb(); + transport->rx->queues[0].kptr = + ACCESS_ONCE(transport->rx->queues[0].uptr); + + /* + * Cancel any pending freed bufs work. We can't flush it here, but + * that is OK: we will do so before we become ready. + */ + cancel_delayed_work(&transport->free_bufs_work); +} + +static bool transport_axon_reset(struct vs_transport_axon *transport) +{ + okl4_error_t err; + unsigned long flags; + bool reset_complete = false; + + spin_lock_irqsave(&transport->readiness_lock, flags); + + /* + * Reset the transport, dumping any messages in transit, and tell the + * remote end that it should do the same. + * + * We only do this if the transport is not already marked reset. Doing + * otherwise would be redundant. + */ + if ((transport->readiness != VS_TRANSPORT_RESET) && + transport->readiness != VS_TRANSPORT_LOCAL_RESET && + transport->readiness != VS_TRANSPORT_REMOTE_READY) { + /* + * Flush the Axons' TX queues. We can't flush the RX queues + * until after the remote end has acknowledged the reset. + */ + transport_flush_tx_queues(transport); + + /* + * Raise a reset request VIRQ, and discard any incoming reset + * or ready notifications as they are now stale. Note that we + * must do this in a single syscall.
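+ * Doing the clear and the raise in one call presumably avoids a
+ * window in which a stale notification could arrive between the two
+ * and then be mistaken for a response to this new reset request.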
+ */ + err = _okl4_sys_vinterrupt_clear_and_raise( + transport->reset_okl4_irq, + transport->reset_cap, 0UL, + VS_TRANSPORT_VIRQ_RESET_REQ).error; + if (err != OKL4_OK) { + dev_err(transport->axon_dev, "Reset raise failed: %d\n", + (int)err); + } + + /* Local reset is complete */ + if (transport->readiness != VS_TRANSPORT_SHUTDOWN) + transport->readiness = VS_TRANSPORT_LOCAL_RESET; + } else { + /* Already in reset */ + reset_complete = true; + } + + spin_unlock_irqrestore(&transport->readiness_lock, flags); + + return reset_complete; +} + +static void transport_reset(struct vs_transport *_transport) +{ + struct vs_transport_axon *transport = to_vs_transport_axon(_transport); + + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, "reset\n"); + + if (transport_axon_reset(transport)) { + vs_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + "reset while already reset (no-op)\n"); + + vs_session_handle_reset(transport->session_dev); + } +} + +static void transport_ready(struct vs_transport *_transport) +{ + struct vs_transport_axon *transport = to_vs_transport_axon(_transport); + okl4_error_t err; + + vs_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + "%s: becoming ready\n", __func__); + + /* + * Make sure any previously scheduled freed bufs work is cancelled. + * It should not be possible for this to be rescheduled later, as long + * as the transport is in reset. + */ + cancel_delayed_work_sync(&transport->free_bufs_work); + spin_lock_irq(&transport->readiness_lock); + + atomic_set(&transport->free_bufs_balance, 0); + transport->free_bufs_pending = false; + + switch(transport->readiness) { + case VS_TRANSPORT_RESET: + transport->readiness = VS_TRANSPORT_LOCAL_READY; + break; + case VS_TRANSPORT_REMOTE_READY: + vs_session_handle_activate(transport->session_dev); + transport->readiness = VS_TRANSPORT_ACTIVE; + break; + case VS_TRANSPORT_LOCAL_RESET: + /* + * Session layer is confused; usually due to the reset at init + * time, which it did not explicitly request, not having + * completed yet. We just ignore it and wait for the reset. We + * could avoid this by not starting the session until the + * startup reset completes. + */ + spin_unlock_irq(&transport->readiness_lock); + return; + case VS_TRANSPORT_SHUTDOWN: + /* Do nothing. */ + spin_unlock_irq(&transport->readiness_lock); + return; + default: + /* Session layer is broken */ + WARN(1, "transport_ready() called in the wrong state: %d", + transport->readiness); + goto fail; + } + + /* Raise a ready notification VIRQ. */ + err = _okl4_sys_vinterrupt_raise(transport->reset_cap, + VS_TRANSPORT_VIRQ_READY); + if (err != OKL4_OK) { + dev_err(transport->axon_dev, "Ready raise failed: %d\n", + (int)err); + goto fail; + } + + /* + * Set up the Axons' queue pointers. 
+ */ + err = _okl4_sys_axon_set_send_area(transport->tx_cap, + transport->tx_phys, transport->tx_size); + if (err != OKL4_OK) { + dev_err(transport->axon_dev, "TX set area failed: %d\n", + (int)err); + goto fail; + } + + err = _okl4_sys_axon_set_send_queue(transport->tx_cap, + transport->tx_phys); + if (err != OKL4_OK) { + dev_err(transport->axon_dev, "TX set queue failed: %d\n", + (int)err); + goto fail; + } + + err = _okl4_sys_axon_set_recv_area(transport->rx_cap, + transport->rx_phys, transport->rx_size); + if (err != OKL4_OK) { + dev_err(transport->axon_dev, "RX set area failed: %d\n", + (int)err); + goto fail; + } + + err = _okl4_sys_axon_set_recv_queue(transport->rx_cap, + transport->rx_phys); + if (err != OKL4_OK) { + dev_err(transport->axon_dev, "RX set queue failed: %d\n", + (int)err); + goto fail; + } + + spin_unlock_irq(&transport->readiness_lock); + return; + +fail: + spin_unlock_irq(&transport->readiness_lock); + + transport_axon_reset(transport); +} + +static int transport_service_add(struct vs_transport *_transport, + struct vs_service_device *service) +{ + struct vs_transport_axon *transport = to_vs_transport_axon(_transport); + struct vs_mv_service_info *service_info; + + /* + * We can't print out the core service add because the session + * isn't fully registered at that time. + */ + if (service->id != 0) + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, + "Add service - id = %d\n", service->id); + + service_info = kzalloc(sizeof(*service_info), GFP_KERNEL); + if (!service_info) + return -ENOMEM; + + kref_init(&service_info->kref); + + /* Matching vs_put_service() is in transport_info_free */ + service_info->service = vs_get_service(service); + + /* Make the service_info visible */ + rcu_assign_pointer(service->transport_priv, service_info); + + __set_bit(service->id, transport->service_bitmap); + + return 0; +} + +static void transport_service_remove(struct vs_transport *_transport, + struct vs_service_device *service) +{ + struct vs_transport_axon *transport = to_vs_transport_axon(_transport); + struct vs_mv_service_info *service_info; + + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, "Remove service - id = %d\n", + service->id); + + __clear_bit(service->id, transport->service_bitmap); + + service_info = service->transport_priv; + rcu_assign_pointer(service->transport_priv, NULL); + + if (service_info->ready) { + dev_err(transport->axon_dev, + "Removing service %d while ready\n", + service->id); + transport_fatal_error(transport, "removing ready service"); + } + + transport_put_service_info(service_info); +} + +static struct vs_axon_tx_pool * +transport_axon_init_tx_pool(struct vs_transport_axon *transport, + size_t msg_size, unsigned send_quota) +{ + struct vs_axon_tx_pool *pool; + + pool = devm_kzalloc(transport->axon_dev, sizeof(*pool) + + (sizeof(unsigned long) * BITS_TO_LONGS(send_quota)), + GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + pool->transport = transport; + pool->alloc_order = ilog2(msg_size + sizeof(vs_service_id_t)); + pool->count = send_quota; + + pool->base_vaddr = dmam_alloc_coherent(transport->axon_dev, + send_quota << pool->alloc_order, &pool->base_laddr, + GFP_KERNEL); + if (!pool->base_vaddr) { + dev_err(transport->axon_dev, "Couldn't allocate %lu times %zu bytes for TX\n", + (unsigned long)pool->count, (size_t)1 << pool->alloc_order); + devm_kfree(transport->axon_dev, pool); + return ERR_PTR(-ENOMEM); + } + + kref_init(&pool->kref); + return pool; +} + +static int 
transport_service_start(struct vs_transport *_transport, + struct vs_service_device *service) +{ + struct vs_mv_service_info *service_info; + struct vs_transport_axon *transport = to_vs_transport_axon(_transport); + struct vs_notify_info *info; + int i, ret; + bool enable_rx; + + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, "Start service - id = %d\n", + service->id); + + service_info = service->transport_priv; + __transport_get_service_info(service_info); + + /* We shouldn't have any mbufs left from before the last reset. */ + if (WARN_ON(atomic_read(&service_info->outstanding_frees))) { + transport_put_service_info(service_info); + return -EBUSY; + } + + /* + * The watermark is set to half of the received-message quota, rounded + * down, plus one. This is fairly arbitrary. The constant offset + * ensures that we don't set it to 0 for services with 1 quota (and + * thus trigger infinite free_bufs messages). + */ + service_info->recv_freed_watermark = (service->recv_quota + 1) / 2; + + if (WARN_ON(service->notify_recv_bits + service->notify_recv_offset > + transport->notify_rx_nirqs * BITS_PER_LONG)) { + transport_put_service_info(service_info); + return -EINVAL; + } + + if (WARN_ON(service->notify_send_bits + service->notify_send_offset > + transport->notify_tx_nirqs * BITS_PER_LONG)) { + transport_put_service_info(service_info); + return -EINVAL; + } + + /* This is called twice for the core client only. */ + WARN_ON(service->id != 0 && service_info->ready); + + if (!service_info->ready) { + WARN_ON(atomic_read(&service_info->send_alloc)); + WARN_ON(atomic_read(&service_info->recv_freed)); + WARN_ON(atomic_read(&service_info->recv_inflight)); + } + + /* Create the TX buffer pool. */ + WARN_ON(service->send_quota && service_info->tx_pool); + if (service->send_quota) { + service_info->tx_pool = transport_axon_init_tx_pool(transport, + transport->msg_size, service->send_quota); + if (IS_ERR(service_info->tx_pool)) { + ret = PTR_ERR(service_info->tx_pool); + service_info->tx_pool = NULL; + transport_put_service_info(service_info); + return ret; + } + } + + /* Preallocate some RX buffers, if necessary. */ + spin_lock_irq(&transport->rx_alloc_lock); + i = min(transport->rx_alloc_extra, + service->recv_quota - service_info->rx_allocated); + transport->rx_alloc_extra -= i; + service_info->rx_allocated += i; + spin_unlock_irq(&transport->rx_alloc_lock); + + for (; service_info->rx_allocated < service->recv_quota; + service_info->rx_allocated++) { + dma_addr_t laddr; + struct vs_axon_rx_freelist_entry *buf = + dma_pool_alloc(transport->rx_pool, GFP_KERNEL, &laddr); + if (WARN_ON(!buf)) + break; + buf->laddr = laddr; + + spin_lock_irq(&transport->rx_alloc_lock); + list_add(&buf->list, &transport->rx_freelist); + spin_unlock_irq(&transport->rx_alloc_lock); + } + + for (i = 0; i < service->notify_recv_bits; i++) { + unsigned bit = i + service->notify_recv_offset; + info = &transport->transport.notify_info[bit]; + + info->service_id = service->id; + info->offset = service->notify_recv_offset; + } + + atomic_set(&service_info->send_inflight, 0); + + /* + * If this is the core service and it wasn't ready before, we need to + * enable RX for the whole transport. + */ + enable_rx = service->id == 0 && !service_info->ready; + + service_info->ready = true; + + /* We're now ready to receive. 
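For reference, the recv_freed_watermark expression near the top of this function behaves as follows; this is a standalone check of the arithmetic, not driver code.

#include <stdio.h>

int main(void)
{
	unsigned recv_quota;

	/* Same expression as recv_freed_watermark above. */
	for (recv_quota = 1; recv_quota <= 5; recv_quota++)
		printf("recv_quota=%u -> watermark=%u\n",
		       recv_quota, (recv_quota + 1) / 2);
	/* Prints 1, 1, 2, 2, 3: never zero, even for a quota of one. */
	return 0;
}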
*/ + if (enable_rx) + tasklet_enable(&transport->rx_tasklet); + + transport_put_service_info(service_info); + + return 0; +} + +static int transport_service_reset(struct vs_transport *_transport, + struct vs_service_device *service) +{ + struct vs_mv_service_info *service_info; + struct vs_transport_axon *transport = to_vs_transport_axon(_transport); + struct vs_mbuf_axon *child, *tmp; + int ret = 0, service_id, send_remaining, recv_remaining; + + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, "Reset service - id = %d\n", + service->id); + + service_info = service->transport_priv; + __transport_get_service_info(service_info); + + /* + * Clear the ready bit with the tasklet disabled. After this point, + * incoming messages will be discarded by transport_process_msg() + * without incrementing recv_inflight, so we won't spuriously see + * nonzero recv_inflight values for messages that would be discarded + * in the session layer. + */ + tasklet_disable(&transport->rx_tasklet); + service_info->ready = false; + if (service->id) + tasklet_enable(&transport->rx_tasklet); + + /* + * Cancel and free all pending outgoing messages for the service being + * reset; i.e. those that have been sent by the service but are not + * yet in the axon queue. + * + * Note that this does not clean out the axon queue; messages there + * are already visible to OKL4 and may be transferred at any time, + * so we treat those as already sent. + */ + spin_lock_irq(&transport->readiness_lock); + list_for_each_entry_safe(child, tmp, &transport->tx_queue, base.queue) { + service_id = transport_get_mbuf_service_id(transport, + mbuf_real_base(child), NULL); + if (service_id == service->id) { + list_del(&child->base.queue); + __transport_tx_pool_free(child->pool, child->laddr); + } + } + spin_unlock_irq(&transport->readiness_lock); + + /* + * If any buffers remain allocated, we mark them as outstanding frees. + * The transport will remain disabled until this count goes to zero. + */ + send_remaining = atomic_read(&service_info->send_alloc); + recv_remaining = atomic_read(&service_info->recv_inflight); + ret = atomic_add_return(send_remaining + recv_remaining, + &service_info->outstanding_frees); + dev_dbg(transport->axon_dev, "reset service %d with %d outstanding (send %d, recv %d)\n", + service->id, ret, send_remaining, recv_remaining); + + /* + * Reduce the send alloc count to 0, accounting for races with frees, + * which might have reduced either the alloc count or the outstanding + * count. + */ + while (send_remaining > 0) { + unsigned new_send_remaining = atomic_cmpxchg( + &service_info->send_alloc, send_remaining, 0); + if (send_remaining == new_send_remaining) { + smp_mb(); + break; + } + WARN_ON(send_remaining < new_send_remaining); + ret = atomic_sub_return(send_remaining - new_send_remaining, + &service_info->outstanding_frees); + send_remaining = new_send_remaining; + dev_dbg(transport->axon_dev, "failed to zero send quota, now %d outstanding (%d send)\n", + ret, send_remaining); + } + + /* Repeat the above for the recv inflight count. 
*/ + while (recv_remaining > 0) { + unsigned new_recv_remaining = atomic_cmpxchg( + &service_info->recv_inflight, recv_remaining, + 0); + if (recv_remaining == new_recv_remaining) { + smp_mb(); + break; + } + WARN_ON(recv_remaining < new_recv_remaining); + ret = atomic_sub_return(recv_remaining - new_recv_remaining, + &service_info->outstanding_frees); + recv_remaining = new_recv_remaining; + dev_dbg(transport->axon_dev, "failed to zero recv quota, now %d outstanding (%d recv)\n", + ret, recv_remaining); + } + + /* The outstanding frees count should never go negative */ + WARN_ON(ret < 0); + + /* Discard any outstanding freed buffer notifications. */ + atomic_set(&service_info->recv_freed, 0); + + /* + * Wait for any previously queued free_bufs work to finish. This + * guarantees that any freed buffer notifications that are already in + * progress will be sent to the remote end before we return, and thus + * before the reset is signalled. + */ + flush_delayed_work(&transport->free_bufs_work); + + if (!ret) + transport_free_mbuf_pools(transport, service, service_info); + + transport_put_service_info(service_info); + + return ret; +} + +static ssize_t transport_service_send_avail(struct vs_transport *_transport, + struct vs_service_device *service) +{ + struct vs_mv_service_info *service_info; + ssize_t count = 0; + + service_info = service->transport_priv; + if (!service_info) + return -EINVAL; + + __transport_get_service_info(service_info); + + count = service->send_quota - + atomic_read(&service_info->send_inflight); + + transport_put_service_info(service_info); + + return count < 0 ? 0 : count; +} + +static void transport_get_notify_bits(struct vs_transport *_transport, + unsigned *send_notify_bits, unsigned *recv_notify_bits) +{ + struct vs_transport_axon *transport = to_vs_transport_axon(_transport); + + *send_notify_bits = transport->notify_tx_nirqs * BITS_PER_LONG; + *recv_notify_bits = transport->notify_rx_nirqs * BITS_PER_LONG; +} + +static void transport_get_quota_limits(struct vs_transport *_transport, + unsigned *send_quota, unsigned *recv_quota) +{ + /* + * This driver does not need to enforce a quota limit, because message + * buffers are allocated from the kernel heap rather than a fixed + * buffer area. The queue length only determines the maximum size of + * a message batch, and the number of preallocated RX buffers. + * + * Note that per-service quotas are still enforced; there is simply no + * hard limit on the total of all service quotas.
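The two loops in transport_service_reset() above share one pattern: drain an atomic counter to zero with compare-and-swap, retrying if it changed underneath. The C11 sketch below shows just that core; the driver additionally hands any units freed concurrently back to its outstanding_frees count, which this simplified version omits.

#include <stdatomic.h>
#include <stdio.h>

/* Zero 'counter', returning how many units this caller actually drained. */
static unsigned drain_to_zero(atomic_uint *counter)
{
	unsigned seen = atomic_load(counter);

	while (seen > 0) {
		unsigned expected = seen;

		if (atomic_compare_exchange_strong(counter, &expected, 0))
			return seen;	/* we drained exactly 'seen' units */

		seen = expected;	/* raced with a concurrent free; retry */
	}
	return 0;
}

int main(void)
{
	atomic_uint send_alloc = 3;
	unsigned drained = drain_to_zero(&send_alloc);

	printf("drained %u, left %u\n", drained,
	       (unsigned)atomic_load(&send_alloc));	/* drained 3, left 0 */
	return 0;
}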
+ */ + + *send_quota = UINT_MAX; + *recv_quota = UINT_MAX; +} + +static const struct vs_transport_vtable tvt = { + .alloc_mbuf = transport_alloc_mbuf, + .free_mbuf = transport_free_mbuf, + .mbuf_size = transport_mbuf_size, + .max_mbuf_size = transport_max_mbuf_size, + .send = transport_send, + .flush = transport_flush, + .notify = transport_notify, + .reset = transport_reset, + .ready = transport_ready, + .service_add = transport_service_add, + .service_remove = transport_service_remove, + .service_start = transport_service_start, + .service_reset = transport_service_reset, + .service_send_avail = transport_service_send_avail, + .get_notify_bits = transport_get_notify_bits, + .get_quota_limits = transport_get_quota_limits, +}; + +/* Incoming notification handling for client */ +static irqreturn_t transport_axon_notify_virq(int irq, void *priv) +{ + struct vs_transport_axon *transport = (struct vs_transport_axon *)priv; + struct vs_notify_info *n_info; + unsigned long offset, bit = 0, notification; + int word; + okl4_virq_flags_t payload = okl4_get_virq_payload(irq); + + for (word = 0; word < transport->notify_rx_nirqs; word++) + if (irq == transport->notify_irq[word]) + break; + + if (word == transport->notify_rx_nirqs) { + dev_err(transport->axon_dev, "Bad IRQ %d\n", irq); + return IRQ_NONE; + } + + vs_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + "Got notification irq\n"); + +#if defined(__BIG_ENDIAN) + /* + * We rely on being able to use the Linux bitmap operations directly + * on the VIRQ payload. + */ + BUILD_BUG_ON((sizeof(payload) % sizeof(unsigned long)) != 0); +#endif + + for_each_set_bit(bit, (unsigned long *)&payload, sizeof(payload) * 8) { + offset = bit + word * BITS_PER_LONG; + + /* + * We need to know which service id is associated + * with which notification bit here. The transport is informed + * about notification bit - service id mapping during the + * initialhandshake protocol. + */ + n_info = &transport->transport.notify_info[offset]; + + notification = 1UL << (offset - n_info->offset); + vs_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + "Got notification bit %lu for service %d\n", + notification, n_info->service_id); + + /* FIXME: Jira ticket SDK-2145 - shivanik. */ + vs_session_handle_notify(transport->session_dev, notification, + n_info->service_id); + } + + return IRQ_HANDLED; +} + +static irqreturn_t transport_axon_reset_irq(int irq, void *priv) +{ + struct vs_transport_axon *transport = (struct vs_transport_axon *)priv; + bool do_reset = false; + + u32 payload = okl4_get_virq_payload(irq); + + spin_lock(&transport->readiness_lock); + + if (payload & VS_TRANSPORT_VIRQ_RESET_REQ) { + okl4_error_t err; + + transport->readiness = VS_TRANSPORT_RESET; + + /* Flush the queues in both directions */ + transport_flush_tx_queues(transport); + transport_flush_rx_queues(transport); + + /* + * When sending an ack, it is important to cancel any earlier + * ready notification, so the recipient can safely assume that + * the ack precedes any ready it sees + */ + err = _okl4_sys_vinterrupt_modify(transport->reset_cap, + ~VS_TRANSPORT_VIRQ_READY, + VS_TRANSPORT_VIRQ_RESET_ACK); + if (err != OKL4_OK) { + dev_warn(transport->axon_dev, + "Error sending reset ack: %d\n", (int)err); + } + + /* + * Discard any pending ready event; it must have happened + * before the reset request was raised, because we had not + * yet sent the reset ack. 
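Taken together with transport_ready() above, the readiness handling in this handler forms a small state machine: after a reset, both ends must signal ready before the transport becomes active, and a reset request or ack from either side drops it back to reset. A simplified, side-effect-free model of those transitions (ignoring the INIT and SHUTDOWN states):

#include <stdio.h>

enum readiness { RESET, LOCAL_RESET, LOCAL_READY, REMOTE_READY, ACTIVE };

enum event { EV_LOCAL_READY, EV_PEER_READY, EV_PEER_RESET_REQ, EV_PEER_RESET_ACK };

/* Pure transition function; queue flushes, acks and callbacks are omitted. */
static enum readiness next_state(enum readiness s, enum event e)
{
	switch (e) {
	case EV_LOCAL_READY:
		return s == RESET ? LOCAL_READY :
			s == REMOTE_READY ? ACTIVE : s;
	case EV_PEER_READY:
		return s == RESET ? REMOTE_READY :
			s == LOCAL_READY ? ACTIVE : s;
	case EV_PEER_RESET_REQ:
	case EV_PEER_RESET_ACK:
		return RESET;
	}
	return s;
}

int main(void)
{
	enum readiness s = RESET;

	s = next_state(s, EV_LOCAL_READY);	/* LOCAL_READY */
	s = next_state(s, EV_PEER_READY);	/* ACTIVE */
	printf("active: %d\n", s == ACTIVE);	/* active: 1 */
	return 0;
}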
+ */ + payload = 0; + do_reset = true; + } else if (payload & VS_TRANSPORT_VIRQ_RESET_ACK) { + transport->readiness = VS_TRANSPORT_RESET; + + /* + * Flush the RX queues, as we know at this point that the + * other end has flushed its TX queues. + */ + transport_flush_rx_queues(transport); + + /* + * Preserve any pending ready event; it must have been + * generated after the ack (see above) + */ + payload &= VS_TRANSPORT_VIRQ_READY; + do_reset = true; + } + + if (do_reset) { + /* + * Reset the session. Note that duplicate calls to this are + * expected if there are duplicate resets; they don't + * necessarily match activate calls. + */ + vs_session_handle_reset(transport->session_dev); + } + + if (payload & VS_TRANSPORT_VIRQ_READY) { + if (transport->readiness == VS_TRANSPORT_RESET) { + transport->readiness = VS_TRANSPORT_REMOTE_READY; + } else if (transport->readiness == VS_TRANSPORT_LOCAL_READY) { + vs_session_handle_activate(transport->session_dev); + transport->readiness = VS_TRANSPORT_ACTIVE; + } else { + /* Ready lost a race with reset; ignore it. */ + } + } + + spin_unlock(&transport->readiness_lock); + + return IRQ_HANDLED; +} + +/* + * Axon VIRQ handling. + */ +static irqreturn_t transport_axon_rx_irq(int irq, void *priv) +{ + struct vs_transport_axon *transport = (struct vs_transport_axon *)priv; + + okl4_axon_virq_flags_t flags = okl4_get_virq_payload(irq); + + if (okl4_axon_virq_flags_getfault(&flags)) { + dev_err_ratelimited(transport->axon_dev, + "fault on RX axon buffer or queue; resetting\n"); + transport_axon_reset(transport); + } else if (okl4_axon_virq_flags_getready(&flags)) { + tasklet_schedule(&transport->rx_tasklet); + } + + return IRQ_HANDLED; +} + +static irqreturn_t transport_axon_tx_irq(int irq, void *priv) +{ + struct vs_transport_axon *transport = (struct vs_transport_axon *)priv; + + okl4_axon_virq_flags_t flags = okl4_get_virq_payload(irq); + + if (okl4_axon_virq_flags_getfault(&flags)) { + dev_err_ratelimited(transport->axon_dev, + "fault on TX axon buffer or queue; resetting\n"); + transport_axon_reset(transport); + } else if (okl4_axon_virq_flags_getready(&flags)) { + spin_lock(&transport->readiness_lock); + if (!list_empty(&transport->tx_queue)) + tasklet_schedule(&transport->tx_tasklet); + spin_unlock(&transport->readiness_lock); + } + + return IRQ_HANDLED; +} + +static void transport_rx_tasklet(unsigned long data) +{ + struct vs_transport_axon *transport = (struct vs_transport_axon *)data; + int status; + struct _okl4_sys_axon_process_recv_return recv_result; + + /* Refill the RX queue */ + spin_lock_irq(&transport->rx_alloc_lock); + while (!list_empty(&transport->rx_freelist)) { + struct vs_axon_rx_freelist_entry *buf; + buf = list_first_entry(&transport->rx_freelist, + struct vs_axon_rx_freelist_entry, list); + list_del(&buf->list); + status = transport_rx_queue_buffer(transport, buf, buf->laddr); + if (status < 0) + list_add(&buf->list, &transport->rx_freelist); + if (status <= 0) + break; + } + spin_unlock_irq(&transport->rx_alloc_lock); + + /* Start the transfer */ + recv_result = _okl4_sys_axon_process_recv(transport->rx_cap, + MAX_TRANSFER_CHUNK); + + if (recv_result.error == OKL4_OK) { + status = 1; + } else { + status = okl4_error_to_errno(recv_result.error); + vs_dev_debug(VS_DEBUG_TRANSPORT, transport->session_dev, + transport->axon_dev, "rx syscall fail: %d", + status); + } + + /* Process the received messages */ + while (status > 0) + status = transport_process_msg(transport); + + if (status == -ENOMEM) { + /* Give kswapd some time to 
reclaim pages */ + mod_timer(&transport->rx_retry_timer, jiffies + HZ); + } else if (status == -ENOBUFS) { + /* + * Reschedule ourselves if more RX buffers are available, + * otherwise do nothing until a buffer is freed + */ + spin_lock_irq(&transport->rx_alloc_lock); + if (!list_empty(&transport->rx_freelist)) + tasklet_schedule(&transport->rx_tasklet); + spin_unlock_irq(&transport->rx_alloc_lock); + } else if (!status && !recv_result.send_empty) { + /* There are more messages waiting; reschedule */ + tasklet_schedule(&transport->rx_tasklet); + } else if (status < 0 && status != -ECONNRESET) { + /* Something else went wrong, other than a reset */ + dev_err(transport->axon_dev, "Fatal RX error %d\n", status); + transport_fatal_error(transport, "rx failure"); + } else { + /* Axon is empty; wait for an RX interrupt */ + } +} + +static void transport_tx_tasklet(unsigned long data) +{ + struct vs_transport_axon *transport = (struct vs_transport_axon *)data; + struct vs_mbuf_axon *mbuf; + vs_service_id_t service_id; + int err; + + spin_lock_irq(&transport->readiness_lock); + + /* Check to see if there is anything in the queue to send */ + if (list_empty(&transport->tx_queue)) { + /* + * Queue is empty, probably because a service reset cancelled + * some pending messages. Nothing to do. + */ + spin_unlock_irq(&transport->readiness_lock); + return; + } + + /* + * Try to send the mbuf. If it can't, the channel must be + * full again so wait until the next can send event. + */ + mbuf = list_first_entry(&transport->tx_queue, struct vs_mbuf_axon, + base.queue); + + service_id = transport_get_mbuf_service_id(transport, + mbuf_real_base(mbuf), NULL); + + err = __transport_send(transport, mbuf, service_id, + VS_TRANSPORT_SEND_FLAGS_MORE); + if (err == -ENOSPC) { + /* + * The channel is currently full. Leave the message in the + * queue and try again when it has emptied. + */ + __transport_flush(transport); + goto out_unlock; + } + if (err) { + /* + * We cannot properly handle a message send error here because + * we have already returned success for the send to the service + * driver when the message was queued. We don't want to leave + * the message in the queue, since it could cause a DoS if the + * error is persistent. Give up and force a transport reset. 
+ */ + dev_err(transport->axon_dev, + "Failed to send queued mbuf: %d\n", err); + spin_unlock_irq(&transport->readiness_lock); + transport_fatal_error(transport, "queued send failure"); + return; + } + + /* Message sent, remove it from the queue and free the local copy */ + list_del(&mbuf->base.queue); + transport_free_sent_mbuf(transport, mbuf); + + /* Check to see if we have run out of messages to send */ + if (list_empty(&transport->tx_queue)) { + /* Nothing left in the queue; flush and return */ + __transport_flush(transport); + } else { + /* Reschedule to send the next message */ + tasklet_schedule(&transport->tx_tasklet); + } + +out_unlock: + spin_unlock_irq(&transport->readiness_lock); +} + +static void transport_rx_retry_timer(unsigned long data) +{ + struct vs_transport_axon *transport = (struct vs_transport_axon *)data; + + /* Try to receive again; hopefully we have memory now */ + tasklet_schedule(&transport->rx_tasklet); +} + +/* Transport device management */ + +static int alloc_notify_info(struct device *dev, struct vs_notify_info **info, + int *info_size, int virqs) +{ + /* Each VIRQ can handle BITS_PER_LONG notifications */ + *info_size = sizeof(struct vs_notify_info) * (virqs * BITS_PER_LONG); + *info = devm_kzalloc(dev, *info_size, GFP_KERNEL); + if (!(*info)) + return -ENOMEM; + + memset(*info, 0, *info_size); + return 0; +} + +static int transport_axon_probe_virqs(struct vs_transport_axon *transport) +{ + struct device *device = transport->axon_dev; + struct device_node *axon_node = device->of_node; + struct device_node *vs_node = transport->of_node; + struct irq_data *irqd; + struct property *irqlines; + int ret, num_virq_lines; + struct device_node *virq_node = NULL; + u32 cap; + int i, irq_count; + + if (of_irq_count(axon_node) < 2) { + dev_err(device, "Missing axon interrupts\n"); + return -ENODEV; + } + + irq_count = of_irq_count(vs_node); + if (irq_count < 1) { + dev_err(device, "Missing reset interrupt\n"); + return -ENODEV; + } else if (irq_count > 1 + MAX_NOTIFICATION_LINES) { + dev_warn(device, + "Too many notification interrupts; only the first %d will be used\n", + MAX_NOTIFICATION_LINES); + } + + /* Find the TX and RX axon IRQs and the reset IRQ */ + transport->tx_irq = irq_of_parse_and_map(axon_node, 0); + if (!transport->tx_irq) { + dev_err(device, "No TX IRQ\n"); + return -ENODEV; + } + + transport->rx_irq = irq_of_parse_and_map(axon_node, 1); + if (!transport->rx_irq) { + dev_err(device, "No RX IRQ\n"); + return -ENODEV; + } + + transport->reset_irq = irq_of_parse_and_map(vs_node, 0); + if (!transport->reset_irq) { + dev_err(device, "No reset IRQ\n"); + return -ENODEV; + } + irqd = irq_get_irq_data(transport->reset_irq); + if (!irqd) { + dev_err(device, "No reset IRQ data\n"); + return -ENODEV; + } + transport->reset_okl4_irq = irqd_to_hwirq(irqd); + + /* Find the notification IRQs */ + transport->notify_rx_nirqs = irq_count - 1; + for (i = 0; i < transport->notify_rx_nirqs; i++) { + transport->notify_irq[i] = irq_of_parse_and_map(vs_node, + i + 1); + if (!transport->notify_irq[i]) { + dev_err(device, "Bad notify IRQ\n"); + return -ENODEV; + } + } + + /* Find all outgoing virq lines */ + irqlines = of_find_property(vs_node, "okl,interrupt-lines", NULL); + if (!irqlines || irqlines->length < sizeof(u32)) { + dev_err(device, "No VIRQ sources found"); + return -ENODEV; + } + num_virq_lines = irqlines->length / sizeof(u32); + + virq_node = of_parse_phandle(vs_node, "okl,interrupt-lines", 0); + if (!virq_node) { + dev_err(device, "No reset VIRQ line 
object\n"); + return -ENODEV; + } + ret = of_property_read_u32(virq_node, "reg", &cap); + if (ret || cap == OKL4_KCAP_INVALID) { + dev_err(device, "Bad reset VIRQ line\n"); + return -ENODEV; + } + transport->reset_cap = cap; + + transport->notify_tx_nirqs = num_virq_lines - 1; + for (i = 0; i < transport->notify_tx_nirqs; i++) { + virq_node = of_parse_phandle(vs_node, "okl,interrupt-lines", + i + 1); + if (!virq_node) { + dev_err(device, "No notify VIRQ line object\n"); + return -ENODEV; + } + ret = of_property_read_u32(virq_node, "reg", &cap); + if (ret || cap == OKL4_KCAP_INVALID) { + dev_err(device, "Bad notify VIRQ line\n"); + return -ENODEV; + } + transport->notify_cap[i] = cap; + } + + return 0; +} + +static int transport_axon_request_irqs(struct vs_transport_axon *transport) +{ + struct device *device = transport->axon_dev; + int i, ret; + + ret = devm_request_irq(device, transport->reset_irq, + transport_axon_reset_irq, IRQF_TRIGGER_HIGH, + dev_name(transport->axon_dev), transport); + if (ret < 0) + return ret; + + ret = devm_request_irq(device, transport->tx_irq, + transport_axon_tx_irq, IRQF_TRIGGER_HIGH, + dev_name(transport->axon_dev), transport); + if (ret < 0) + return ret; + + ret = devm_request_irq(device, transport->rx_irq, + transport_axon_rx_irq, IRQF_TRIGGER_HIGH, + dev_name(transport->axon_dev), transport); + if (ret < 0) + return ret; + + for (i = 0; i < transport->notify_rx_nirqs; i++) { + ret = devm_request_irq(device, transport->notify_irq[i], + transport_axon_notify_virq, IRQF_TRIGGER_HIGH, + dev_name(transport->axon_dev), transport); + if (ret < 0) + return ret; + } + + return 0; +} + +static int transport_axon_setup_descs(struct vs_transport_axon *transport) +{ + const int rx_buffer_order = ilog2(transport->msg_size + + sizeof(vs_service_id_t)); + const size_t rx_queue_size = sizeof(*transport->rx) + + (sizeof(*transport->rx_descs) * transport->queue_size) + + (sizeof(*transport->rx_ptrs) * transport->queue_size); + const size_t tx_queue_size = sizeof(*transport->tx) + + (sizeof(*transport->tx_descs) * transport->queue_size); + const size_t queue_size = ALIGN(rx_queue_size, + __alignof__(*transport->tx)) + tx_queue_size; + + struct _okl4_sys_mmu_lookup_pn_return lookup_return; + void *queue; + struct device_node *seg_node; + u32 seg_index; + okl4_kcap_t seg_cap; + okl4_error_t err; + dma_addr_t dma_handle; + const __be32 *prop; + int len, ret; + + /* + * Allocate memory for the queue descriptors. + * + * We allocate one block for both rx and tx because the minimum + * allocation from dmam_alloc_coherent is usually a whole page. + */ + ret = -ENOMEM; + queue = dmam_alloc_coherent(transport->axon_dev, queue_size, + &dma_handle, GFP_KERNEL); + if (queue == NULL) { + dev_err(transport->axon_dev, "Failed to allocate %zd bytes for queue descriptors\n", + queue_size); + goto fail_alloc_dma; + } + memset(queue, 0, queue_size); + + /* + * Find the OKL4 physical segment object to attach to the axons. + * + * If the device has a CMA area, and the cell's memory segments have + * not been split unnecessarily, then all allocations through the DMA + * API for this device will be within a single segment. So, we can + * simply look up the segment that contains the queue. + * + * The location and size of the CMA area can be configured elsewhere. + * In 3.12 and later a device-specific area can be reserved via the + * standard device tree reserved-memory properties. 
Otherwise, the + global area will be used, which has a size configurable on the + kernel command line and defaults to 16MB. + */ + + /* Locate the physical segment */ + ret = -ENODEV; + lookup_return = _okl4_sys_mmu_lookup_pn(okl4_mmu_cap, + dma_handle >> OKL4_DEFAULT_PAGEBITS, -1); + err = okl4_mmu_lookup_index_geterror(&lookup_return.segment_index); + if (err == OKL4_ERROR_NOT_IN_SEGMENT) { + dev_err(transport->axon_dev, + "No segment found for DMA address %pK (%#llx)!\n", + queue, (unsigned long long)dma_handle); + goto fail_lookup_segment; + } + if (err != OKL4_OK) { + dev_err(transport->axon_dev, + "Could not look up segment for DMA address %pK (%#llx): OKL4 error %d\n", + queue, (unsigned long long)dma_handle, + (int)err); + goto fail_lookup_segment; + } + seg_index = okl4_mmu_lookup_index_getindex(&lookup_return.segment_index); + + dev_dbg(transport->axon_dev, "lookup pn %#lx got error %ld segment %ld count %lu offset %#lx\n", + (long)(dma_handle >> OKL4_DEFAULT_PAGEBITS), + (long)err, (long)seg_index, + (unsigned long)lookup_return.count_pn, + (unsigned long)lookup_return.offset_pn); + + /* Locate the physical segment's OF node */ + for_each_compatible_node(seg_node, NULL, "okl,microvisor-segment") { + u32 attach_index; + ret = of_property_read_u32(seg_node, "okl,segment-attachment", + &attach_index); + if (!ret && attach_index == seg_index) + break; + } + if (seg_node == NULL) { + ret = -ENXIO; + dev_err(transport->axon_dev, "No physical segment found for %pK\n", + queue); + goto fail_lookup_segment; + } + + /* Determine the physical segment's cap */ + prop = of_get_property(seg_node, "reg", &len); + ret = !!prop ? 0 : -EPERM; + if (!ret) + seg_cap = of_read_number(prop, of_n_addr_cells(seg_node)); + if (!ret && seg_cap == OKL4_KCAP_INVALID) + ret = -ENXIO; + if (ret < 0) { + dev_err(transport->axon_dev, "missing physical-segment cap\n"); + goto fail_lookup_segment; + } + transport->segment = seg_cap; + transport->segment_base = + (round_down(dma_handle >> OKL4_DEFAULT_PAGEBITS, + lookup_return.count_pn) - + lookup_return.offset_pn) << OKL4_DEFAULT_PAGEBITS; + + dev_dbg(transport->axon_dev, "physical segment cap is %#lx, base %#llx\n", + (unsigned long)transport->segment, + (unsigned long long)transport->segment_base); + + /* Attach the segment to the Axon endpoints */ + err = _okl4_sys_axon_set_send_segment(transport->tx_cap, + transport->segment, transport->segment_base); + if (err != OKL4_OK) { + dev_err(transport->axon_dev, "TX attach failed: %d\n", + (int)err); + ret = okl4_error_to_errno(err); + goto fail_attach; + } + + err = _okl4_sys_axon_set_recv_segment(transport->rx_cap, + transport->segment, transport->segment_base); + if (err != OKL4_OK) { + dev_err(transport->axon_dev, "RX attach failed: %d\n", + (int)err); + ret = okl4_error_to_errno(err); + goto fail_attach; + } + + /* Array of pointers to the source TX pool for each outgoing buffer. */ + transport->tx_pools = devm_kzalloc(transport->axon_dev, + sizeof(*transport->tx_pools) * transport->queue_size, + GFP_KERNEL); + if (!transport->tx_pools) { + ret = -ENOMEM; + goto fail_alloc_tx_pools; + } + + /* Set up the rx queue descriptors.
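Before the pointer assignments below: the rx and tx queue structures share the single coherent allocation made earlier, with the TX header placed after the aligned RX header, descriptor array and pointer array. A standalone sketch of that size arithmetic, using made-up sizes in place of the real okl4_axon_* structures:

#include <stdio.h>
#include <stddef.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
	/* Made-up sizes standing in for the OKL4 queue structures. */
	size_t rx_hdr = 64, tx_hdr = 64, desc = 16, ptr = sizeof(void *);
	size_t entries = 16;	/* okl,queue-length */

	/* Mirrors rx_queue_size, tx_queue_size and queue_size above. */
	size_t rx_queue_size = rx_hdr + (desc + ptr) * entries;
	size_t tx_queue_size = tx_hdr + desc * entries;
	size_t total = ALIGN_UP(rx_queue_size, 8) + tx_queue_size;

	printf("rx=%zu tx=%zu total=%zu\n",
	       rx_queue_size, tx_queue_size, total);
	return 0;
}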
*/ + transport->rx = queue; + transport->rx_phys = dma_handle; + transport->rx_size = rx_queue_size; + transport->rx_descs = (void *)(transport->rx + 1); + transport->rx_ptrs = (void *)(transport->rx_descs + transport->queue_size); + okl4_axon_queue_size_setallocorder(&transport->rx->queue_sizes[0], + rx_buffer_order); + transport->rx->queues[0].queue_offset = sizeof(*transport->rx); + transport->rx->queues[0].entries = transport->queue_size; + transport->rx->queues[0].uptr = 0; + transport->rx->queues[0].kptr = 0; + transport->rx_uptr_allocated = 0; + + /* Set up the tx queue descriptors. */ + transport->tx = queue + ALIGN(rx_queue_size, + __alignof__(*transport->tx)); + transport->tx_phys = dma_handle + ((void *)transport->tx - queue); + transport->tx_size = tx_queue_size; + transport->tx_descs = (void *)(transport->tx + 1); + transport->tx->queues[0].queue_offset = sizeof(*transport->tx); + transport->tx->queues[0].entries = transport->queue_size; + transport->tx->queues[0].uptr = 0; + transport->tx->queues[0].kptr = 0; + transport->tx_uptr_freed = 0; + + /* Create a DMA pool for the RX buffers. */ + transport->rx_pool = dmam_pool_create("vs_axon_rx_pool", + transport->axon_dev, 1 << rx_buffer_order, + max(dma_get_cache_alignment(), + 1 << OKL4_PRESHIFT_LADDR_AXON_DATA_INFO), 0); + + return 0; + +fail_alloc_tx_pools: +fail_attach: +fail_lookup_segment: + dmam_free_coherent(transport->axon_dev, queue_size, queue, dma_handle); +fail_alloc_dma: + return ret; +} + +static void transport_axon_free_descs(struct vs_transport_axon *transport) +{ + int i; + + tasklet_disable(&transport->rx_tasklet); + tasklet_kill(&transport->rx_tasklet); + + tasklet_disable(&transport->tx_tasklet); + tasklet_kill(&transport->tx_tasklet); + + cancel_delayed_work_sync(&transport->free_bufs_work); + + transport->tx = NULL; + transport->tx_descs = NULL; + + for (i = 0; i < transport->rx->queues[0].entries; i++) { + struct okl4_axon_queue_entry *desc = &transport->rx_descs[i]; + + if (okl4_axon_data_info_getusr(&desc->info)) { + void *ptr = transport->rx_ptrs[i]; + dma_addr_t dma = okl4_axon_data_info_getladdr(&desc->info); + dma_pool_free(transport->rx_pool, ptr, dma); + } + } + + transport->rx = NULL; + transport->rx_descs = NULL; + transport->rx_ptrs = NULL; + + /* Let devm free the queues so we don't have to keep the dma handle */ +} + +static int transport_axon_probe(struct platform_device *dev) +{ + struct vs_transport_axon *priv = NULL; + u32 cap[2]; + u32 queue_size, msg_size; + int ret, i; + const char* name; + + if (!dev_get_cma_area(&dev->dev) && !okl4_single_physical_segment) { + dev_err(&dev->dev, "Multiple physical segments, but CMA is disabled\n"); + return -ENOSYS; + } + + dev->dev.coherent_dma_mask = ~(u64)0; + dev->dev.archdata.dma_ops = &axon_dma_ops; + + priv = devm_kzalloc(&dev->dev, sizeof(struct vs_transport_axon) + + sizeof(unsigned long), GFP_KERNEL); + if (priv == NULL) { + dev_err(&dev->dev, "create transport object failed\n"); + ret = -ENOMEM; + goto err_alloc_priv; + } + dev_set_drvdata(&dev->dev, priv); + + priv->of_node = of_get_child_by_name(dev->dev.of_node, + "virtual-session"); + if ((!priv->of_node) || + (!of_device_is_compatible(priv->of_node, + "okl,virtual-session"))) { + dev_err(&dev->dev, "missing virtual-session node\n"); + ret = -ENODEV; + goto error_of_node; + } + + name = dev->dev.of_node->full_name; + of_property_read_string(dev->dev.of_node, "label", &name); + + if (of_property_read_bool(priv->of_node, "okl,is-client")) { + priv->is_server = false; + } else if 
(of_property_read_bool(priv->of_node, "okl,is-server")) { + priv->is_server = true; + } else { + dev_err(&dev->dev, "virtual-session node is not marked as client or server\n"); + ret = -ENODEV; + goto error_of_node; + } + + priv->transport.vt = &tvt; + priv->transport.type = "microvisor"; + priv->axon_dev = &dev->dev; + + /* Read the Axon caps */ + ret = of_property_read_u32_array(dev->dev.of_node, "reg", cap, 2); + if (ret < 0 || cap[0] == OKL4_KCAP_INVALID || + cap[1] == OKL4_KCAP_INVALID) { + dev_err(&dev->dev, "missing axon endpoint caps\n"); + ret = -ENODEV; + goto error_of_node; + } + priv->tx_cap = cap[0]; + priv->rx_cap = cap[1]; + + /* Set transport properties; default to a 64kb buffer */ + queue_size = 16; + (void)of_property_read_u32(priv->of_node, "okl,queue-length", + &queue_size); + priv->queue_size = max((size_t)queue_size, MIN_QUEUE_SIZE); + + msg_size = PAGE_SIZE - sizeof(vs_service_id_t); + (void)of_property_read_u32(priv->of_node, "okl,message-size", + &msg_size); + priv->msg_size = max((size_t)msg_size, MIN_MSG_SIZE); + + /* + * Since the Axon API requires received message size limits to be + * powers of two, we must round up the message size (including the + * space reserved for the service ID). + */ + priv->msg_size = roundup_pow_of_two(priv->msg_size + + sizeof(vs_service_id_t)) - sizeof(vs_service_id_t); + if (priv->msg_size != msg_size) + dev_info(&dev->dev, "message size rounded up from %zd to %zd\n", + (size_t)msg_size, priv->msg_size); + + INIT_LIST_HEAD(&priv->tx_queue); + + /* Initialise the activation state, tasklets, and RX retry timer */ + spin_lock_init(&priv->readiness_lock); + priv->readiness = VS_TRANSPORT_INIT; + + tasklet_init(&priv->rx_tasklet, transport_rx_tasklet, + (unsigned long)priv); + tasklet_init(&priv->tx_tasklet, transport_tx_tasklet, + (unsigned long)priv); + + INIT_DELAYED_WORK(&priv->free_bufs_work, transport_free_bufs_work); + spin_lock_init(&priv->rx_alloc_lock); + priv->rx_alloc_extra = 0; + INIT_LIST_HEAD(&priv->rx_freelist); + + setup_timer(&priv->rx_retry_timer, transport_rx_retry_timer, + (unsigned long)priv); +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,8,0) + set_timer_slack(&priv->rx_retry_timer, HZ); +#endif + + /* Keep RX disabled until the core service is ready. 
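A quick standalone check of the message-size rounding above (the Axon buffer, payload plus service-ID header, must be a power of two). The 2-byte service ID and 4 KiB page size are assumptions of this sketch, not taken from the driver.

#include <stdio.h>

/* Local equivalent of the kernel's roundup_pow_of_two() for x >= 1. */
static unsigned long roundup_pow_of_two(unsigned long x)
{
	unsigned long p = 1;

	while (p < x)
		p <<= 1;
	return p;
}

int main(void)
{
	const unsigned long id = 2;	/* assumed sizeof(vs_service_id_t) */
	unsigned long requested[] = { 4094, 100, 512 };	/* 4094 = 4096 - id */
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long msg = roundup_pow_of_two(requested[i] + id) - id;

		printf("requested %lu -> usable %lu (buffer %lu)\n",
		       requested[i], msg, msg + id);
	}
	return 0;
}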
*/ + tasklet_disable(&priv->rx_tasklet); + + ret = transport_axon_probe_virqs(priv); + if (ret < 0) + goto err_probe_virqs; + + if (priv->notify_rx_nirqs) { + ret = alloc_notify_info(&dev->dev, &priv->transport.notify_info, + &priv->transport.notify_info_size, + priv->notify_rx_nirqs); + if (ret < 0) { + dev_err(&dev->dev, "Alloc notify_info failed\n"); + goto err_alloc_notify; + } + } else { + priv->transport.notify_info = NULL; + priv->transport.notify_info_size = 0; + } + + priv->free_bufs_pool = transport_axon_init_tx_pool(priv, priv->msg_size, + FREE_BUFS_QUOTA); + if (IS_ERR(priv->free_bufs_pool)) { + ret = PTR_ERR(priv->free_bufs_pool); + goto err_init_free_bufs_pool; + } + + ret = transport_axon_setup_descs(priv); + if (ret < 0) + goto err_setup_descs; + + /* Allocate RX buffers for free bufs messages */ + for (i = 0; i < FREE_BUFS_QUOTA; i++) { + dma_addr_t laddr; + struct vs_axon_rx_freelist_entry *buf = + dma_pool_alloc(priv->rx_pool, GFP_KERNEL, &laddr); + if (!buf) + goto err_alloc_rx_free_bufs; + buf->laddr = laddr; + + spin_lock_irq(&priv->rx_alloc_lock); + list_add_tail(&buf->list, &priv->rx_freelist); + spin_unlock_irq(&priv->rx_alloc_lock); + } + + /* Set up the session device */ + priv->session_dev = vs_session_register(&priv->transport, &dev->dev, + priv->is_server, name); + if (IS_ERR(priv->session_dev)) { + ret = PTR_ERR(priv->session_dev); + dev_err(&dev->dev, "failed to register session: %d\n", ret); + goto err_session_register; + } + + /* + * Start the core service. Note that it can't actually communicate + * until the initial reset completes. + */ + vs_session_start(priv->session_dev); + + /* + * Reset the transport. This will also set the Axons' segment + * attachments, and eventually the Axons' queue pointers (once the + * session marks the transport ready). + */ + transport_reset(&priv->transport); + + /* + * We're ready to start handling IRQs at this point, so register the + * handlers. 
+ */ + ret = transport_axon_request_irqs(priv); + if (ret < 0) + goto err_irq_register; + + return 0; + +err_irq_register: + vs_session_unregister(priv->session_dev); +err_session_register: +err_alloc_rx_free_bufs: + transport_axon_free_descs(priv); +err_setup_descs: + transport_axon_put_tx_pool(priv->free_bufs_pool); +err_init_free_bufs_pool: + if (priv->transport.notify_info) + devm_kfree(&dev->dev, priv->transport.notify_info); +err_alloc_notify: +err_probe_virqs: + del_timer_sync(&priv->rx_retry_timer); + tasklet_kill(&priv->rx_tasklet); + tasklet_kill(&priv->tx_tasklet); + cancel_delayed_work_sync(&priv->free_bufs_work); +error_of_node: + devm_kfree(&dev->dev, priv); +err_alloc_priv: + return ret; +} + +static int transport_axon_remove(struct platform_device *dev) +{ + struct vs_transport_axon *priv = dev_get_drvdata(&dev->dev); + int i; + + for (i = 0; i < priv->notify_rx_nirqs; i++) + devm_free_irq(&dev->dev, priv->notify_irq[i], priv); + + devm_free_irq(&dev->dev, priv->rx_irq, priv); + irq_dispose_mapping(priv->rx_irq); + devm_free_irq(&dev->dev, priv->tx_irq, priv); + irq_dispose_mapping(priv->tx_irq); + devm_free_irq(&dev->dev, priv->reset_irq, priv); + irq_dispose_mapping(priv->reset_irq); + + del_timer_sync(&priv->rx_retry_timer); + tasklet_kill(&priv->rx_tasklet); + tasklet_kill(&priv->tx_tasklet); + cancel_delayed_work_sync(&priv->free_bufs_work); + + priv->readiness = VS_TRANSPORT_SHUTDOWN; + vs_session_unregister(priv->session_dev); + WARN_ON(priv->readiness != VS_TRANSPORT_SHUTDOWN); + + transport_axon_free_descs(priv); + transport_axon_put_tx_pool(priv->free_bufs_pool); + + if (priv->transport.notify_info) + devm_kfree(&dev->dev, priv->transport.notify_info); + + free_tx_mbufs(priv); + + flush_workqueue(work_queue); + + while (!list_empty(&priv->rx_freelist)) { + struct vs_axon_rx_freelist_entry *buf; + buf = list_first_entry(&priv->rx_freelist, + struct vs_axon_rx_freelist_entry, list); + list_del(&buf->list); + dma_pool_free(priv->rx_pool, buf, buf->laddr); + } + + devm_kfree(&dev->dev, priv); + return 0; +} + +static const struct of_device_id transport_axon_of_match[] = { + { .compatible = "okl,microvisor-axon-transport", }, + {}, +}; +MODULE_DEVICE_TABLE(of, transport_axon_of_match); + +static struct platform_driver transport_axon_driver = { + .probe = transport_axon_probe, + .remove = transport_axon_remove, + .driver = { + .name = DRIVER_NAME, + .owner = THIS_MODULE, + .bus = &platform_bus_type, + .of_match_table = of_match_ptr(transport_axon_of_match), + }, +}; + +static int __init vs_transport_axon_init(void) +{ + int ret; + okl4_error_t err; + struct device_node *cpus; + struct zone *zone; + struct _okl4_sys_mmu_lookup_pn_return lookup_return; + u32 last_seen_attachment = -1; + bool first_attachment; + + printk(KERN_INFO "Virtual Services transport driver for OKL4 Axons\n"); + + /* Allocate the Axon cleanup workqueue */ + work_queue = alloc_workqueue("axon_cleanup", 0, 0); + if (!work_queue) { + ret = -ENOMEM; + goto fail_create_workqueue; + } + + /* Locate the MMU capability, needed for lookups */ + cpus = of_find_node_by_path("/cpus"); + if (IS_ERR_OR_NULL(cpus)) { + ret = -EINVAL; + goto fail_mmu_cap; + } + ret = of_property_read_u32(cpus, "okl,vmmu-capability", &okl4_mmu_cap); + if (ret) { + goto fail_mmu_cap; + } + if (okl4_mmu_cap == OKL4_KCAP_INVALID) { + printk(KERN_ERR "%s: OKL4 MMU capability not found\n", __func__); + ret = -EPERM; + goto fail_mmu_cap; + } + + /* + * Determine whether there are multiple OKL4 physical memory segments + * in this Cell. 
If so, every transport device must have a valid CMA + * region, to guarantee that its buffer allocations all come from the + * segment that is attached to the axon endpoints. + * + * We assume that each zone is contiguously mapped in stage 2 with a + * constant physical-to-IPA offset, typically 0. The weaver won't + * violate this assumption for Linux (or other HLOS) guests unless it + * is explicitly told to. + */ + okl4_single_physical_segment = true; + first_attachment = true; + for_each_zone(zone) { + u32 attachment; + + /* We only care about zones that the page allocator is using */ + if (!zone->managed_pages) + continue; + + /* Find the segment at the start of the zone */ + lookup_return = _okl4_sys_mmu_lookup_pn(okl4_mmu_cap, + zone->zone_start_pfn, -1); + err = okl4_mmu_lookup_index_geterror( + &lookup_return.segment_index); + if (err != OKL4_OK) { + printk(KERN_WARNING "%s: Unable to determine physical segment count, assuming >1\n", + __func__); + okl4_single_physical_segment = false; + break; + } + attachment = okl4_mmu_lookup_index_getindex( + &lookup_return.segment_index); + + if (first_attachment) { + last_seen_attachment = attachment; + first_attachment = false; + } else if (last_seen_attachment != attachment) { + okl4_single_physical_segment = false; + break; + } + + /* Find the segment at the end of the zone */ + lookup_return = _okl4_sys_mmu_lookup_pn(okl4_mmu_cap, + zone_end_pfn(zone) - 1, -1); + err = okl4_mmu_lookup_index_geterror( + &lookup_return.segment_index); + if (err != OKL4_OK) { + printk(KERN_WARNING "%s: Unable to determine physical segment count, assuming >1\n", + __func__); + okl4_single_physical_segment = false; + break; + } + attachment = okl4_mmu_lookup_index_getindex( + &lookup_return.segment_index); + + /* Check that it's still the same segment */ + if (last_seen_attachment != attachment) { + okl4_single_physical_segment = false; + break; + } + } + +#ifdef DEBUG + printk(KERN_DEBUG "%s: physical segment count %s\n", __func__, + okl4_single_physical_segment ? "1" : ">1"); +#endif + + mbuf_cache = KMEM_CACHE(vs_mbuf_axon, 0UL); + if (!mbuf_cache) { + ret = -ENOMEM; + goto kmem_cache_failed; + } + + ret = platform_driver_register(&transport_axon_driver); + if (ret) + goto register_plat_driver_failed; + + return ret; + +register_plat_driver_failed: + kmem_cache_destroy(mbuf_cache); + mbuf_cache = NULL; +kmem_cache_failed: +fail_mmu_cap: + if (work_queue) + destroy_workqueue(work_queue); +fail_create_workqueue: + return ret; +} + +static void __exit vs_transport_axon_exit(void) +{ + platform_driver_unregister(&transport_axon_driver); + + rcu_barrier(); + + if (mbuf_cache) + kmem_cache_destroy(mbuf_cache); + mbuf_cache = NULL; + + if (work_queue) + destroy_workqueue(work_queue); +} + +module_init(vs_transport_axon_init); +module_exit(vs_transport_axon_exit); + +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_AUTHOR(DRIVER_AUTHOR);