diff --git a/virtio.cc b/virtio.cc new file mode 100644 index 0000000000..94ad763169 --- /dev/null +++ b/virtio.cc @@ -0,0 +1,495 @@ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#include "virtio.hh" +#include "posix.hh" +#include "vla.hh" +#include "virtio-interface.hh" +#include "reactor.hh" +#include +#include +#include +#include +#include +#include +#include + +using namespace net; + +using phys = uint64_t; + +/* Rounds v up to a multiple of align; the mask arithmetic assumes align is a power of two. */ +template +inline +T align_up(T v, T align) { + return (v + align - 1) & ~(align - 1); +} + +/* Pointer overload of align_up, restricted by the static_assert to one-byte element types. */ +template +inline +T* align_up(T* v, size_t align) { + static_assert(sizeof(T) == 1, "align byte pointers only"); + return reinterpret_cast(align_up(reinterpret_cast(v), align)); +} + +/* Returns the numeric value of the pointer as the phys address; no translation is done here. */ +inline +phys virt_to_phys(void* p) { + return reinterpret_cast(p); +} + +/* A single ring: a descriptor table plus the avail and used areas located at the addresses given in config. Buffers are obtained from the producer callback in run(); complete() fulfils the per-descriptor promise with the length reported in the used element. */ +class vring { +public: + struct config { + char* descs; + char* avail; + char* used; + unsigned size; + bool event_index; + bool indirect; + bool mergable_buffers; + }; + struct buffer { + phys addr; + uint32_t len; + promise completed; + bool writeable; + }; + using buffer_chain = std::vector; + // provide buffers for the queue, wait on @available to gain buffer space + using producer_type = future> (semaphore& available); +private: + class desc { + public: + struct flags { + // This marks a buffer as continuing via the next field. + uint16_t has_next : 1; + // This marks a buffer as write-only (otherwise read-only). + uint16_t writeable : 1; + // This means the buffer contains a list of buffer descriptors. + uint16_t indirect : 1; + }; + + phys get_paddr(); + uint32_t get_len() { return _len; } + uint16_t next_idx() { return _next; } + + phys _paddr; + uint32_t _len; + flags _flags; + uint16_t _next; + }; + + // Guest to host + struct avail_layout { + struct flags { + // Mark that we do not need an interrupt for consuming a descriptor + // from the ring.
Unreliable so it's simply an optimization + uint16_t no_interrupts : 1; + }; + + std::atomic _flags; + + // Where we put the next descriptor + std::atomic _idx; + // There may be no more entries than the queue size read from device + uint16_t _ring[]; + // used event index is an optimization in order to get an interrupt from the host + // only when the value reaches this number + // The location of this field is placed after the variable length ring array, + // that's why we cannot fully define it within the struct and use a function accessor + //std::atomic used_event; + }; + + struct used_elem { + // Index of start of used _desc chain. (uint32_t for padding reasons) + uint32_t _id; + // Total length of the descriptor chain which was used (written to) + uint32_t _len; + }; + + // Host to guest + struct used_layout { + enum { + // The Host advises the Guest: don't kick me when + // you add a buffer. It's unreliable, so it's simply an + // optimization. Guest will still kick if it's out of buffers.
+ no_notify = 1 + }; + + /* NOTE(review): tests VRING_USED_F_NO_NOTIFY rather than the no_notify enumerator declared just above; presumably both are 1 - confirm */ + bool notifications_disabled() { + return (_flags.load(std::memory_order_relaxed) & VRING_USED_F_NO_NOTIFY) != 0; + } + + // Using std::atomic since it is being changed by the host + std::atomic _flags; + // Using std::atomic in order to have memory barriers for it + std::atomic _idx; + used_elem _used_elements[]; + // avail event index is an optimization: kick the host only when the value reaches this number + // The location of this field is placed after the variable length ring array, + // that's why we cannot fully define it within the struct and use a function accessor + //std::atomic avail_event; + }; + + struct avail { + explicit avail(config conf); + avail_layout* _shared; + std::atomic* _host_notify_on_index = nullptr; + uint16_t _head = 0; + }; + struct used { + explicit used(config conf); + used_layout* _shared; + std::atomic* _notify_on_index = nullptr; + uint16_t _tail = 0; + }; +private: + config _config; + readable_eventfd _notified; + writeable_eventfd _kick; + std::function _producer; + /* one completion promise per descriptor, indexed by descriptor id */ + std::unique_ptr[]> _completions; + desc* _descs; + avail _avail; + used _used; + semaphore _available_descriptors = { 0 }; + /* head of the free-descriptor list, which is chained through desc::_next; starts out as -1 (empty) */ + int _free_desc = -1; +public: + vring(config conf, readable_eventfd notified, writeable_eventfd kick, + std::function producer); + + // start the queue + void run(); + + // complete any buffers returned from the host + void complete(); + + // wait for the used ring to have at least @nr buffers + future<> on_used(size_t nr); + + // Total number of descriptors in ring + int size() { return _config.size; } + + // Let host know about interrupt delivery + void disable_interrupts(); + void enable_interrupts(); +private: + /* index mask derived from size() - 1; only meaningful when size() is a power of two */ + size_t mask() { return size() - 1; } + size_t masked(size_t idx) { return idx & mask(); } + size_t available(); + unsigned allocate_desc(); + void free_desc(unsigned id); + void setup(); +}; + +vring::avail::avail(config conf) + : _shared(reinterpret_cast(conf.avail)) { +} + +vring::used::used(config conf) + :
_shared(reinterpret_cast(conf.used)) { +} + +/* Pops the head of the free-descriptor list and returns its index. */ +inline +unsigned vring::allocate_desc() { + assert(_free_desc != -1); + auto desc = _free_desc; + /* desc::_next is 16 bits wide, so the -1 end-of-list marker written by free_desc() reads back as 0xffff; translate it back so the assert above can fire once the list is exhausted */ + auto next = _descs[desc]._next; + _free_desc = (next == 0xffff) ? -1 : next; + return desc; +} + +/* Pushes descriptor id onto the free list and signals _available_descriptors once. */ +inline +void vring::free_desc(unsigned id) { + _descs[id]._next = _free_desc; + _free_desc = id; + _available_descriptors.signal(); +} + +/* Stores the configuration, event fds and producer, allocates one completion promise per descriptor and puts every descriptor on the free list via setup(). */ +vring::vring(config conf, readable_eventfd notified, writeable_eventfd kick, + std::function producer) + : _config(conf) + , _notified(std::move(notified)) + , _kick(std::move(kick)) + , _producer(std::move(producer)) + , _completions(new promise[_config.size]) + , _descs(reinterpret_cast(conf.descs)) + , _avail(conf) + , _used(conf) +{ + setup(); +} + +/* Frees every descriptor once, which also signals _available_descriptors _config.size times. */ +void vring::setup() { + for (unsigned i = 0; i < _config.size; ++i) { + free_desc(i); + } +} + +/* Asks the producer for buffer chains, fills one descriptor per buffer, publishes each chain head in the avail ring, stores the new avail index, signals _kick, then calls complete() and re-arms itself. */ +void vring::run() { + _producer(_available_descriptors).then([this] (std::vector vbc) { + for (auto&& bc: vbc) { + bool has_prev = false; + unsigned prev_desc_idx = 0; + /* walk the chain back to front so each descriptor can point at the one filled in just before it */ + for (auto i = bc.rbegin(); i != bc.rend(); ++i) { + unsigned desc_idx = allocate_desc(); + desc& d = _descs[desc_idx]; + d._flags = {}; + d._flags.writeable = i->writeable; + d._flags.has_next = has_prev; + d._next = prev_desc_idx; + d._paddr = i->addr; + d._len = i->len; + prev_desc_idx = desc_idx; + /* every descriptor filled in after this one precedes it in the chain and must carry has_next */ + has_prev = true; + _completions[desc_idx] = std::move(i->completed); + } + auto desc_head = prev_desc_idx; + _avail._shared->_ring[masked(_avail._head++)] = desc_head; + } + _avail._shared->_idx.store(_avail._head, std::memory_order_release); + _kick.signal(1); + /* NOTE(review): complete() ends by arming a new _notified.wait(), and it is called on every iteration here - confirm that several outstanding waits are intended */ + complete(); + run(); + }); +} + +/* Drains the used ring: for each element not yet seen, fulfils the completion promise of the reported descriptor id with the reported length and frees that descriptor. */ +void vring::complete() { + auto used_head = _used._shared->_idx.load(std::memory_order_acquire); + while (used_head != _used._tail) { + auto ue = _used._shared->_used_elements[masked(_used._tail++)]; + //auto& d = _descs[ue._id]; + _completions[ue._id].set_value(ue._len); + free_desc(ue._id); + // FIXME: free buffers? length? chains?
+ } + /* call complete() again once _notified fires */ + _notified.wait().then([this] (size_t ignore) { + complete(); + }); +} + +/* net::device built on two vrings (tx and rx); the constructor registers both with /dev/vhost-net and attaches a tap interface as their backend. */ +class virtio_net_device : public net::device { + /* Creates the four eventfds and records their raw fd numbers. The constructor moves the eventfd objects into the queues and afterwards still needs the numbers for the vhost ioctls. */ + struct init { + readable_eventfd _txq_notify; + writeable_eventfd _txq_kick; + readable_eventfd _rxq_notify; + writeable_eventfd _rxq_kick; + int _txq_notify_fd; + int _txq_kick_fd; + int _rxq_notify_fd; + int _rxq_kick_fd; + init() { + _txq_notify_fd = _txq_notify.get_write_fd(); + _txq_kick_fd = _txq_kick.get_read_fd(); + _rxq_notify_fd = _rxq_notify.get_write_fd(); + /* each ring needs its own kick fd; the constructor hands this one to vhost for the rx ring */ + _rxq_kick_fd = _rxq_kick.get_read_fd(); + } + }; + /* Transmit side: packets passed to post() are queued and fed to the ring by transmit(). */ + class txq { + vring _ring; + public: + txq(vring::config config, readable_eventfd notified, writeable_eventfd kicked); + void run() { _ring.run(); } + future<> post(packet p); + private: + future> transmit(semaphore& available); + std::queue _tx_queue; + semaphore _tx_queue_length = { 0 }; + }; + /* Receive side: prepare_buffers() supplies empty buffers to the ring and forwards filled ones to the owning device. */ + class rxq { + virtio_net_device& _dev; + vring _ring; + size_t _header_len = 10; // adjust for mrg_buf + public: + rxq(virtio_net_device& _if, + vring::config config, readable_eventfd notified, writeable_eventfd kicked); + void run() { _ring.run(); } + private: + future> prepare_buffers(semaphore& available); + void received(char* buffer); + }; +private: + file_desc _tap_fd; + file_desc _vhost_fd; + std::unique_ptr _txq_storage; + std::unique_ptr _rxq_storage; + txq _txq; + rxq _rxq; + semaphore _rx_queue_length = { 0 }; + std::queue _rx_queue; +private: + vring::config txq_config(); + vring::config rxq_config(); + void queue_rx_packet(packet p); +public: + explicit virtio_net_device(sstring tap_device, init x = init()); + virtual future receive() override; + virtual future<> send(packet p) override; +}; + +/* Builds the tx ring with transmit() as its producer callback. */ +virtio_net_device::txq::txq(vring::config config, readable_eventfd notified, writeable_eventfd kicked) + : _ring(config, std::move(notified), std::move(kicked), + [this] (semaphore& available) { return transmit(available); }) { +} + +future> +virtio_net_device::txq::transmit(semaphore& available) { + return
_tx_queue_length.wait().then([this] { + /* NOTE(review): the available semaphore is not waited on here, unlike rxq::prepare_buffers - confirm the tx path cannot run out of descriptors */ + auto p = std::move(_tx_queue.front()); + _tx_queue.pop(); + std::vector vbc; + vring::buffer_chain bc; + vring::buffer b; + // dirty hack: assume there is a header there instead of allocating + // it ourselves + b.addr = virt_to_phys(p.fragments[0].base - 10); + b.len = p.fragments[0].size + 10; + b.writeable = false; + // schedule packet destruction + b.completed.get_future().then([p = std::move(p)] (size_t) {}); + bc.push_back(std::move(b)); + vbc.push_back(std::move(bc)); + return make_ready_future>(std::move(vbc)); + }); +} + +/* Queues p for transmit() and signals _tx_queue_length; always returns an already-ready future. */ +future<> +virtio_net_device::txq::post(packet p) { + _tx_queue.push(std::move(p)); + _tx_queue_length.signal(); + return make_ready_future<>(); // FIXME: queue bounds +} + +/* Builds the rx ring with prepare_buffers() as its producer callback and remembers the owning device. */ +virtio_net_device::rxq::rxq(virtio_net_device& netif, + vring::config config, readable_eventfd notified, writeable_eventfd kicked) + : _dev(netif), _ring(config, std::move(notified), std::move(kicked), + [this] (semaphore& available) { return prepare_buffers(available); }) { +} + +/* Waits for one unit of the available semaphore, then tries to take whatever else it currently holds, and returns that many single-buffer chains of 4096 writeable bytes each. */ +future> +virtio_net_device::rxq::prepare_buffers(semaphore& available) { + return available.wait(1).then([this, &available] { + unsigned count = 1; + auto opportunistic = available.current(); + if (available.try_wait(opportunistic)) { + count += opportunistic; + } + std::vector ret; + ret.reserve(count); + for (unsigned i = 0; i < count; ++i) { + vring::buffer_chain bc; + std::unique_ptr buf(new char[4096]); + vring::buffer b; + b.addr = virt_to_phys(buf.get()); + b.len = 4096; + b.writeable = true; + /* on completion, wrap the bytes after the first _header_len in a packet and queue it on the device; the raw buffer is deleted when the packet's completed future resolves */ + /* NOTE(review): len - _header_len wraps around if the reported len is below _header_len - confirm that cannot happen */ + b.completed.get_future().then([this, buf = buf.get()] (size_t len) { + packet p; + p.fragments.push_back(fragment{buf + _header_len, len - _header_len}); + p.completed.get_future().then([buf] { + delete[] buf; + }); + _dev.queue_rx_packet(std::move(p)); + }); + bc.push_back(std::move(b)); + /* ownership now rests with the completion continuation above, which holds the raw pointer */ + buf.release(); + ret.push_back(std::move(bc)); + } + return make_ready_future>(std::move(ret)); + }); +} + +virtio_net_device::virtio_net_device(sstring tap_device, init
x) + : _tap_fd(file_desc::open("/dev/net/tun", O_RDWR | O_NONBLOCK)) + , _vhost_fd(file_desc::open("/dev/vhost-net", O_RDWR)) + , _txq_storage(allocate_aligned_buffer(3*4096, 4096)) + , _rxq_storage(allocate_aligned_buffer(3*4096, 4096)) + , _txq(txq_config(), std::move(x._txq_notify), std::move(x._txq_kick)) + , _rxq(*this, rxq_config(), std::move(x._rxq_notify), std::move(x._rxq_kick)) { + /* the assert guards the strcpy of the interface name below */ + assert(tap_device.size() + 1 <= IFNAMSIZ); + ifreq ifr = {}; + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR; + strcpy(ifr.ifr_ifrn.ifrn_name, tap_device.c_str()); + _tap_fd.ioctl(TUNSETIFF, ifr); + _vhost_fd.ioctl(VHOST_SET_OWNER); + /* a single memory region with guest_phys_addr and userspace_addr both 0, consistent with virt_to_phys() returning the pointer value unchanged */ + auto mem_table = make_struct_with_vla(&vhost_memory::regions, 1); + mem_table->nregions = 1; + auto& region = mem_table->regions[0]; + region.guest_phys_addr = 0; + region.memory_size = (size_t(1) << 47) - 4096; + region.userspace_addr = 0; + region.flags_padding = 0; + _vhost_fd.ioctl(VHOST_SET_MEM_TABLE, *mem_table); + uint64_t features = + /* VIRTIO_RING_F_EVENT_IDX + | */ VIRTIO_RING_F_INDIRECT_DESC + /* | VIRTIO_NET_F_MRG_RXBUF */; + _vhost_fd.ioctl(VHOST_SET_FEATURES, features); + /* both rings are sized 256, the same value txq_config() and rxq_config() use */ + vhost_vring_state vvs0 = { 0, 256 }; + _vhost_fd.ioctl(VHOST_SET_VRING_NUM, vvs0); + vhost_vring_state vvs1 = { 1, 256 }; + _vhost_fd.ioctl(VHOST_SET_VRING_NUM, vvs1); + auto tov = [](char* x) { return reinterpret_cast(x); }; + + /* vhost ring index 0 is given the rx ring and index 1 the tx ring, for the addresses as well as the kick and call fds */ + _vhost_fd.ioctl(VHOST_SET_VRING_ADDR, vhost_vring_addr{ + 0, 0, tov(rxq_config().descs), tov(rxq_config().used), tov(rxq_config().avail), 0 + }); + _vhost_fd.ioctl(VHOST_SET_VRING_ADDR, vhost_vring_addr{ + 1, 0, tov(txq_config().descs), tov(txq_config().used), tov(txq_config().avail), 0 + }); + _vhost_fd.ioctl(VHOST_SET_VRING_KICK, vhost_vring_file{0, x._rxq_kick_fd}); + _vhost_fd.ioctl(VHOST_SET_VRING_CALL, vhost_vring_file{0, x._rxq_notify_fd}); + _vhost_fd.ioctl(VHOST_SET_VRING_KICK, vhost_vring_file{1, x._txq_kick_fd}); + _vhost_fd.ioctl(VHOST_SET_VRING_CALL, vhost_vring_file{1, x._txq_notify_fd}); +
_vhost_fd.ioctl(VHOST_NET_SET_BACKEND, vhost_vring_file{0, _tap_fd.get()}); + _vhost_fd.ioctl(VHOST_NET_SET_BACKEND, vhost_vring_file{1, _tap_fd.get()}); + _txq.run(); + _rxq.run(); +} + +/* Ring layout inside _txq_storage (3*4096 bytes): descriptors at offset 0 with 16 bytes reserved per entry, the avail area directly after them, and the used area at the next 4096-byte boundary. */ +vring::config virtio_net_device::txq_config() { + vring::config r; + auto size = 256; + r.descs = _txq_storage.get(); + r.avail = r.descs + 16 * size; + r.used = align_up(r.avail + 2 * size + 6, 4096); + r.size = size; + r.event_index = !true; + r.indirect = false; + r.mergable_buffers = false; + return r; +} + +/* Same layout as txq_config(), placed in _rxq_storage. NOTE(review): mergable_buffers is set to true here although VIRTIO_NET_F_MRG_RXBUF is commented out of the feature word in the constructor - confirm which is intended. */ +vring::config virtio_net_device::rxq_config() { + vring::config r; + auto size = 256; + r.descs = _rxq_storage.get(); + r.avail = r.descs + 16 * size; + r.used = align_up(r.avail + 2 * size + 6, 4096); + r.size = size; + r.event_index = !true; + r.indirect = false; + r.mergable_buffers = true; + return r; +} + +/* Waits on _rx_queue_length, then pops and returns the packet at the front of _rx_queue. */ +future +virtio_net_device::receive() { + return _rx_queue_length.wait().then([this] { + auto p = std::move(_rx_queue.front()); + _rx_queue.pop(); + return make_ready_future(std::move(p)); + }); +} + +/* Forwards p to the tx queue's post(). */ +future<> +virtio_net_device::send(packet p) { + return _txq.post(std::move(p)); +} + +/* Appends p to _rx_queue and signals _rx_queue_length once, releasing one receive() waiter. */ +void virtio_net_device::queue_rx_packet(packet p) { + _rx_queue.push(std::move(p)); + _rx_queue_length.signal(1); +} + +/* Factory declared in virtio.hh: constructs the device for the named tap interface. */ +std::unique_ptr create_virtio_net_device(sstring tap_device) { + return std::make_unique(tap_device); +} diff --git a/virtio.hh b/virtio.hh new file mode 100644 index 0000000000..07d185794d --- /dev/null +++ b/virtio.hh @@ -0,0 +1,14 @@ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef VIRTIO_HH_ +#define VIRTIO_HH_ + +#include +#include "net.hh" +#include "sstring.hh" + +std::unique_ptr create_virtio_net_device(sstring tap_device); + +#endif /* VIRTIO_HH_ */