summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/Makefile.objs3
-rw-r--r--net/colo-compare.c737
-rw-r--r--net/colo.c211
-rw-r--r--net/colo.h88
-rw-r--r--net/filter-mirror.c68
-rw-r--r--net/filter-rewriter.c256
-rw-r--r--net/filter.c2
-rw-r--r--net/net.c14
-rw-r--r--net/slirp.c24
-rw-r--r--net/socket.c128
-rw-r--r--net/tap-bsd.c6
-rw-r--r--net/tap.c4
-rw-r--r--net/trace-events18
-rw-r--r--net/vhost-user.c90
14 files changed, 1484 insertions, 165 deletions
diff --git a/net/Makefile.objs b/net/Makefile.objs
index b7c22fddbf..2a80df5fa7 100644
--- a/net/Makefile.objs
+++ b/net/Makefile.objs
@@ -16,3 +16,6 @@ common-obj-$(CONFIG_NETMAP) += netmap.o
common-obj-y += filter.o
common-obj-y += filter-buffer.o
common-obj-y += filter-mirror.o
+common-obj-y += colo-compare.o
+common-obj-y += colo.o
+common-obj-y += filter-rewriter.o
diff --git a/net/colo-compare.c b/net/colo-compare.c
new file mode 100644
index 0000000000..9bfc736f55
--- /dev/null
+++ b/net/colo-compare.c
@@ -0,0 +1,737 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+#include "qemu-common.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/error.h"
+#include "net/net.h"
+#include "net/eth.h"
+#include "qom/object_interfaces.h"
+#include "qemu/iov.h"
+#include "qom/object.h"
+#include "qemu/typedefs.h"
+#include "net/queue.h"
+#include "sysemu/char.h"
+#include "qemu/sockets.h"
+#include "qapi-visit.h"
+#include "net/colo.h"
+
+#define TYPE_COLO_COMPARE "colo-compare"
+#define COLO_COMPARE(obj) \
+ OBJECT_CHECK(CompareState, (obj), TYPE_COLO_COMPARE)
+
+#define COMPARE_READ_LEN_MAX NET_BUFSIZE
+#define MAX_QUEUE_SIZE 1024
+
+/* TODO: Should be configurable */
+#define REGULAR_PACKET_CHECK_MS 3000
+
+/*
+ + CompareState ++
+ | |
+ +---------------+ +---------------+ +---------------+
+ |conn list +--->conn +--------->conn |
+ +---------------+ +---------------+ +---------------+
+ | | | | | |
+ +---------------+ +---v----+ +---v----+ +---v----+ +---v----+
+ |primary | |secondary |primary | |secondary
+ |packet | |packet + |packet | |packet +
+ +--------+ +--------+ +--------+ +--------+
+ | | | |
+ +---v----+ +---v----+ +---v----+ +---v----+
+ |primary | |secondary |primary | |secondary
+ |packet | |packet + |packet | |packet +
+ +--------+ +--------+ +--------+ +--------+
+ | | | |
+ +---v----+ +---v----+ +---v----+ +---v----+
+ |primary | |secondary |primary | |secondary
+ |packet | |packet + |packet | |packet +
+ +--------+ +--------+ +--------+ +--------+
+*/
+typedef struct CompareState {
+ Object parent;
+
+ char *pri_indev; /* chardev id given by the "primary_in" property */
+ char *sec_indev; /* chardev id given by the "secondary_in" property */
+ char *outdev; /* chardev id given by the "outdev" property */
+ CharBackend chr_pri_in; /* backend bound to pri_indev in colo_compare_complete() */
+ CharBackend chr_sec_in; /* backend bound to sec_indev in colo_compare_complete() */
+ CharBackend chr_out; /* backend bound to outdev; compare_chr_send() writes here */
+ SocketReadState pri_rs; /* reassembly state fed by compare_pri_chr_in() */
+ SocketReadState sec_rs; /* reassembly state fed by compare_sec_chr_in() */
+
+ /* connection list: the connections belonged to this NIC could be found
+ * in this list.
+ * element type: Connection
+ */
+ GQueue conn_list;
+ /* hashtable to save connection */
+ GHashTable *connection_track_table;
+ /* compare thread, a thread for each NIC */
+ QemuThread thread;
+ /* Timer used on the primary to find packets that are never matched */
+ QEMUTimer *timer;
+ QemuMutex timer_check_lock; /* held around primary_list pop/push and the old-packet scan */
+} CompareState;
+
+typedef struct CompareClass {
+ ObjectClass parent_class;
+} CompareClass;
+
+enum {
+ PRIMARY_IN = 0, /* packet_enqueue() reads the packet from pri_rs */
+ SECONDARY_IN, /* packet_enqueue() reads the packet from sec_rs */
+};
+
+static int compare_chr_send(CharBackend *out,
+ const uint8_t *buf,
+ uint32_t size);
+
+/*
+ * Copy the packet that was just completed in the primary or secondary
+ * read state (selected by @mode) and queue it on its connection.
+ *
+ * Return 0 when the packet was accepted for comparison; this includes
+ * the case where it had to be dropped because the per-connection queue
+ * is full. Return -1 when parse_packet_early() rejects the packet
+ * (unsupported, e.g. arp and ipv6); such packets will be sent later by
+ * the caller.
+ */
+static int packet_enqueue(CompareState *s, int mode)
+{
+    ConnectionKey key;
+    Packet *pkt = NULL;
+    Connection *conn;
+
+    if (mode == PRIMARY_IN) {
+        pkt = packet_new(s->pri_rs.buf, s->pri_rs.packet_len);
+    } else {
+        pkt = packet_new(s->sec_rs.buf, s->sec_rs.packet_len);
+    }
+
+    if (parse_packet_early(pkt)) {
+        packet_destroy(pkt, NULL);
+        pkt = NULL;
+        return -1;
+    }
+    fill_connection_key(pkt, &key);
+
+    conn = connection_get(s->connection_track_table,
+                          &key,
+                          &s->conn_list);
+
+    /* a connection is put on conn_list only the first time it is seen */
+    if (!conn->processing) {
+        g_queue_push_tail(&s->conn_list, conn);
+        conn->processing = true;
+    }
+
+    if (mode == PRIMARY_IN) {
+        if (g_queue_get_length(&conn->primary_list) <=
+            MAX_QUEUE_SIZE) {
+            g_queue_push_tail(&conn->primary_list, pkt);
+        } else {
+            error_report("colo compare primary queue size too big,"
+                         "drop packet");
+            /* a dropped packet has no owner: free it instead of leaking */
+            packet_destroy(pkt, NULL);
+        }
+    } else {
+        if (g_queue_get_length(&conn->secondary_list) <=
+            MAX_QUEUE_SIZE) {
+            g_queue_push_tail(&conn->secondary_list, pkt);
+        } else {
+            error_report("colo compare secondary queue size too big,"
+                         "drop packet");
+            /* a dropped packet has no owner: free it instead of leaking */
+            packet_destroy(pkt, NULL);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * The IP packets sent by primary and secondary
+ * will be compared in here
+ * TODO support ip fragment, Out-Of-Order
+ * return:    0  means packet same
+ *            > 0 || < 0 means packet different
+ */
+static int colo_packet_compare(Packet *ppkt, Packet *spkt)
+{
+    /*
+     * inet_ntoa() formats into a single static buffer, so every result
+     * has to be copied out before the next call; otherwise all four
+     * trace arguments would point at the same (last formatted) address.
+     */
+    char pri_ip_src[sizeof("255.255.255.255")];
+    char pri_ip_dst[sizeof("255.255.255.255")];
+    char sec_ip_src[sizeof("255.255.255.255")];
+    char sec_ip_dst[sizeof("255.255.255.255")];
+
+    snprintf(pri_ip_src, sizeof(pri_ip_src), "%s",
+             inet_ntoa(ppkt->ip->ip_src));
+    snprintf(pri_ip_dst, sizeof(pri_ip_dst), "%s",
+             inet_ntoa(ppkt->ip->ip_dst));
+    snprintf(sec_ip_src, sizeof(sec_ip_src), "%s",
+             inet_ntoa(spkt->ip->ip_src));
+    snprintf(sec_ip_dst, sizeof(sec_ip_dst), "%s",
+             inet_ntoa(spkt->ip->ip_dst));
+
+    trace_colo_compare_ip_info(ppkt->size, pri_ip_src, pri_ip_dst,
+                               spkt->size, sec_ip_src, sec_ip_dst);
+
+    if (ppkt->size == spkt->size) {
+        return memcmp(ppkt->data, spkt->data, spkt->size);
+    } else {
+        return -1;
+    }
+}
+
+/*
+ * Called from the compare thread on the primary
+ * for compare tcp packet
+ * compare_tcp copied from Dr. David Alan Gilbert's branch
+ *
+ * Used as the comparison callback of g_queue_find_custom(): @spkt is the
+ * queued secondary packet, @ppkt the primary packet being searched for.
+ * Returns 0 when both packets have the same size and identical bytes
+ * after the Ethernet header, non-zero otherwise.
+ *
+ * Side effect: when the primary packet has IP_DF set, the ip_id and
+ * ip_sum fields of @spkt are overwritten with the primary's values
+ * before the comparison.
+ */
+static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
+{
+ struct tcphdr *ptcp, *stcp;
+ int res;
+
+ trace_colo_compare_main("compare tcp");
+ if (ppkt->size != spkt->size) {
+ if (trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
+ trace_colo_compare_main("pkt size not same");
+ }
+ return -1;
+ }
+
+ ptcp = (struct tcphdr *)ppkt->transport_header;
+ stcp = (struct tcphdr *)spkt->transport_header;
+
+ /*
+ * The 'identification' field in the IP header is *very* random
+ * it almost never matches. Fudge this by ignoring differences in
+ * unfragmented packets; they'll normally sort themselves out if different
+ * anyway, and it should recover at the TCP level.
+ * An alternative would be to get both the primary and secondary to rewrite
+ * somehow; but that would need some sync traffic to sync the state
+ */
+ if (ntohs(ppkt->ip->ip_off) & IP_DF) {
+ spkt->ip->ip_id = ppkt->ip->ip_id;
+ /* and the sum will be different if the IDs were different */
+ spkt->ip->ip_sum = ppkt->ip->ip_sum;
+ }
+
+ res = memcmp(ppkt->data + ETH_HLEN, spkt->data + ETH_HLEN,
+ (spkt->size - ETH_HLEN)); /* sizes are equal here, so one length covers both */
+
+ if (res != 0 && trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) { /* NOTE(review): the "src" trace pairs ppkt's address with spkt's TCP fields - confirm intended */
+ trace_colo_compare_pkt_info_src(inet_ntoa(ppkt->ip->ip_src),
+ ntohl(stcp->th_seq),
+ ntohl(stcp->th_ack),
+ res, stcp->th_flags,
+ spkt->size);
+
+ trace_colo_compare_pkt_info_dst(inet_ntoa(ppkt->ip->ip_dst),
+ ntohl(ptcp->th_seq),
+ ntohl(ptcp->th_ack),
+ res, ptcp->th_flags,
+ ppkt->size);
+
+ qemu_hexdump((char *)ppkt->data, stderr,
+ "colo-compare ppkt", ppkt->size);
+ qemu_hexdump((char *)spkt->data, stderr,
+ "colo-compare spkt", spkt->size);
+ }
+
+ return res;
+}
+
+/*
+ * UDP comparison callback, run in the compare thread on the primary.
+ * Delegates to colo_packet_compare() and, on a mismatch, traces the
+ * sizes and hex-dumps both packets to stderr.
+ * Returns the result of colo_packet_compare() (0 means identical).
+ */
+static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
+{
+    int diff;
+
+    trace_colo_compare_main("compare udp");
+    diff = colo_packet_compare(ppkt, spkt);
+    if (diff == 0) {
+        return 0;
+    }
+
+    trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size);
+    qemu_hexdump((char *)ppkt->data, stderr, "colo-compare", ppkt->size);
+    trace_colo_compare_udp_miscompare("Secondary pkt size", spkt->size);
+    qemu_hexdump((char *)spkt->data, stderr, "colo-compare", spkt->size);
+
+    return diff;
+}
+
+/*
+ * ICMP comparison callback, run in the compare thread on the primary.
+ * Returns 0 when the packets are identical and -1 otherwise, including
+ * when the sizes differ or the primary packet is shorter than its
+ * Ethernet plus IP headers.
+ */
+static int colo_packet_compare_icmp(Packet *spkt, Packet *ppkt)
+{
+    int ip_hdr_len;
+
+    trace_colo_compare_main("compare icmp");
+    ip_hdr_len = ppkt->ip->ip_hl * 4;
+
+    if (ppkt->size != spkt->size) {
+        return -1;
+    }
+    if (ppkt->size < ip_hdr_len + ETH_HLEN) {
+        return -1;
+    }
+
+    if (!colo_packet_compare(ppkt, spkt)) {
+        return 0;
+    }
+
+    /* mismatch: record both packets for diagnosis */
+    trace_colo_compare_icmp_miscompare("primary pkt size",
+                                       ppkt->size);
+    qemu_hexdump((char *)ppkt->data, stderr, "colo-compare",
+                 ppkt->size);
+    trace_colo_compare_icmp_miscompare("Secondary pkt size",
+                                       spkt->size);
+    qemu_hexdump((char *)spkt->data, stderr, "colo-compare",
+                 spkt->size);
+    return -1;
+}
+
+/*
+ * Called from the compare thread on the primary
+ * for compare other packet
+ *
+ * Traces the addresses of both packets, then returns the result of
+ * colo_packet_compare() (0 means identical).
+ */
+static int colo_packet_compare_other(Packet *spkt, Packet *ppkt)
+{
+    /*
+     * inet_ntoa() reuses one static buffer; copy each address out
+     * before formatting the next so the trace shows four distinct values.
+     */
+    char pri_ip_src[sizeof("255.255.255.255")];
+    char pri_ip_dst[sizeof("255.255.255.255")];
+    char sec_ip_src[sizeof("255.255.255.255")];
+    char sec_ip_dst[sizeof("255.255.255.255")];
+
+    trace_colo_compare_main("compare other");
+
+    snprintf(pri_ip_src, sizeof(pri_ip_src), "%s",
+             inet_ntoa(ppkt->ip->ip_src));
+    snprintf(pri_ip_dst, sizeof(pri_ip_dst), "%s",
+             inet_ntoa(ppkt->ip->ip_dst));
+    snprintf(sec_ip_src, sizeof(sec_ip_src), "%s",
+             inet_ntoa(spkt->ip->ip_src));
+    snprintf(sec_ip_dst, sizeof(sec_ip_dst), "%s",
+             inet_ntoa(spkt->ip->ip_dst));
+
+    trace_colo_compare_ip_info(ppkt->size, pri_ip_src, pri_ip_dst,
+                               spkt->size, sec_ip_src, sec_ip_dst);
+    return colo_packet_compare(ppkt, spkt);
+}
+
+/*
+ * g_queue_find_custom() callback: returns 0 ("found") when @pkt was
+ * created more than *@check_time milliseconds ago, 1 otherwise.
+ */
+static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time)
+{
+    int64_t age_ms = qemu_clock_get_ms(QEMU_CLOCK_HOST) - pkt->creation_ms;
+
+    if (age_ms <= *check_time) {
+        return 1;
+    }
+
+    trace_colo_old_packet_check_found(pkt->creation_ms);
+    return 0;
+}
+
+/*
+ * g_queue_foreach() callback: look in one connection's primary queue for
+ * a packet older than REGULAR_PACKET_CHECK_MS. Nothing is done with the
+ * result yet (see TODO).
+ */
+static void colo_old_packet_check_one_conn(void *opaque,
+                                           void *user_data)
+{
+    Connection *conn = opaque;
+    int64_t max_age_ms = REGULAR_PACKET_CHECK_MS;
+    GList *stale;
+
+    stale = g_queue_find_custom(&conn->primary_list,
+                                &max_age_ms,
+                                (GCompareFunc)colo_old_packet_check_one);
+
+    if (stale) {
+        /* do checkpoint will flush old packet */
+        /* TODO: colo_notify_checkpoint();*/
+    }
+}
+
+/*
+ * Walk every tracked connection and look for primary packets the
+ * secondary has not matched for too long; if there are some, a
+ * checkpoint is needed to wake the secondary up.
+ */
+static void colo_old_packet_check(void *opaque)
+{
+    CompareState *state = opaque;
+    GQueue *connections = &state->conn_list;
+
+    g_queue_foreach(connections, colo_old_packet_check_one_conn, NULL);
+}
+
+/*
+ * Called from the compare thread on the primary
+ * for compare connection
+ *
+ * g_queue_foreach() callback: @opaque is the Connection, @user_data the
+ * CompareState. While both queues hold packets, the oldest primary
+ * packet is looked up in the secondary queue with the comparison
+ * function for the connection's protocol. A matched primary packet is
+ * written to the output chardev and both copies are freed; on the first
+ * mismatch the primary packet is put back and the loop stops.
+ */
+static void colo_compare_connection(void *opaque, void *user_data)
+{
+    CompareState *s = user_data;
+    Connection *conn = opaque;
+    GCompareFunc compare;
+    Packet *pkt = NULL;
+    GList *result = NULL;
+    int ret;
+
+    /* the protocol is fixed per connection, so pick the comparer once */
+    switch (conn->ip_proto) {
+    case IPPROTO_TCP:
+        compare = (GCompareFunc)colo_packet_compare_tcp;
+        break;
+    case IPPROTO_UDP:
+        compare = (GCompareFunc)colo_packet_compare_udp;
+        break;
+    case IPPROTO_ICMP:
+        compare = (GCompareFunc)colo_packet_compare_icmp;
+        break;
+    default:
+        compare = (GCompareFunc)colo_packet_compare_other;
+        break;
+    }
+
+    while (!g_queue_is_empty(&conn->primary_list) &&
+           !g_queue_is_empty(&conn->secondary_list)) {
+        /*
+         * Packets are appended at the tail, so the head is the oldest
+         * one. Taking the head keeps the released packets in arrival
+         * order; taking the tail would release them newest-first.
+         */
+        qemu_mutex_lock(&s->timer_check_lock);
+        pkt = g_queue_pop_head(&conn->primary_list);
+        qemu_mutex_unlock(&s->timer_check_lock);
+
+        result = g_queue_find_custom(&conn->secondary_list, pkt, compare);
+
+        if (result) {
+            ret = compare_chr_send(&s->chr_out, pkt->data, pkt->size);
+            if (ret < 0) {
+                error_report("colo_send_primary_packet failed");
+            }
+            trace_colo_compare_main("packet same and release packet");
+            /* free the matched secondary copy too, not just its link */
+            packet_destroy(result->data, NULL);
+            g_queue_delete_link(&conn->secondary_list, result);
+            packet_destroy(pkt, NULL);
+        } else {
+            /*
+             * If one packet arrive late, the secondary_list or
+             * primary_list will be empty, so we can't compare it
+             * until next comparison.
+             */
+            trace_colo_compare_main("packet different");
+            /* restore the packet to the position it was taken from */
+            qemu_mutex_lock(&s->timer_check_lock);
+            g_queue_push_head(&conn->primary_list, pkt);
+            qemu_mutex_unlock(&s->timer_check_lock);
+            /* TODO: colo_notify_checkpoint();*/
+            break;
+        }
+    }
+}
+
+/*
+ * Write one packet to @out as a 4-byte network-order length followed by
+ * @size bytes of @buf. A zero @size writes nothing.
+ * Returns 0 on success; on a failed or short write returns the negative
+ * value reported by the write, or -EIO when it was not negative.
+ */
+static int compare_chr_send(CharBackend *out,
+                            const uint8_t *buf,
+                            uint32_t size)
+{
+    uint32_t len = htonl(size);
+    int ret;
+
+    if (!size) {
+        return 0;
+    }
+
+    ret = qemu_chr_fe_write_all(out, (uint8_t *)&len, sizeof(len));
+    if (ret == sizeof(len)) {
+        ret = qemu_chr_fe_write_all(out, (uint8_t *)buf, size);
+        if (ret == size) {
+            return 0;
+        }
+    }
+
+    return ret < 0 ? ret : -EIO;
+}
+
+/*
+ * Can-read callback installed on both input chardevs: always reports
+ * COMPARE_READ_LEN_MAX bytes.
+ */
+static int compare_chr_can_read(void *opaque)
+{
+    const int max_read_len = COMPARE_READ_LEN_MAX;
+
+    return max_read_len;
+}
+
+/*
+ * Read callback of the primary input chardev: feed the received bytes
+ * into the primary read state. If net_fill_rstate() reports -1 the
+ * chardev handlers are removed and an error is reported.
+ */
+static void compare_pri_chr_in(void *opaque, const uint8_t *buf, int size)
+{
+    CompareState *s = COLO_COMPARE(opaque);
+
+    if (net_fill_rstate(&s->pri_rs, buf, size) != -1) {
+        return;
+    }
+
+    qemu_chr_fe_set_handlers(&s->chr_pri_in, NULL, NULL, NULL,
+                             NULL, NULL, true);
+    error_report("colo-compare primary_in error");
+}
+
+/*
+ * Read callback of the secondary input chardev: feed the received bytes
+ * into the secondary read state. If net_fill_rstate() reports -1 the
+ * chardev handlers are removed and an error is reported.
+ */
+static void compare_sec_chr_in(void *opaque, const uint8_t *buf, int size)
+{
+    CompareState *s = COLO_COMPARE(opaque);
+
+    if (net_fill_rstate(&s->sec_rs, buf, size) != -1) {
+        return;
+    }
+
+    qemu_chr_fe_set_handlers(&s->chr_sec_in, NULL, NULL, NULL,
+                             NULL, NULL, true);
+    error_report("colo-compare secondary_in error");
+}
+
+/*
+ * Body of the per-object compare thread: creates a private GMainContext,
+ * attaches the read handlers of both input chardevs to it and runs a
+ * main loop on that context. Always returns NULL.
+ * NOTE(review): nothing in this file calls g_main_loop_quit() on this
+ * loop, so the cleanup after g_main_loop_run() may never be reached -
+ * confirm.
+ */
+static void *colo_compare_thread(void *opaque)
+{
+    CompareState *s = opaque;
+    GMainContext *ctx = g_main_context_new();
+    GMainLoop *loop;
+
+    qemu_chr_fe_set_handlers(&s->chr_pri_in, compare_chr_can_read,
+                             compare_pri_chr_in, NULL, s, ctx, true);
+    qemu_chr_fe_set_handlers(&s->chr_sec_in, compare_chr_can_read,
+                             compare_sec_chr_in, NULL, s, ctx, true);
+
+    loop = g_main_loop_new(ctx, FALSE);
+    g_main_loop_run(loop);
+
+    g_main_loop_unref(loop);
+    g_main_context_unref(ctx);
+    return NULL;
+}
+
+/* "primary_in" getter: returns a newly allocated copy of the stored id. */
+static char *compare_get_pri_indev(Object *obj, Error **errp)
+{
+    return g_strdup(COLO_COMPARE(obj)->pri_indev);
+}
+
+/* "primary_in" setter: replaces the stored id with a copy of @value. */
+static void compare_set_pri_indev(Object *obj, const char *value, Error **errp)
+{
+    CompareState *state = COLO_COMPARE(obj);
+    char **slot = &state->pri_indev;
+
+    g_free(*slot);
+    *slot = g_strdup(value);
+}
+
+/* "secondary_in" getter: returns a newly allocated copy of the stored id. */
+static char *compare_get_sec_indev(Object *obj, Error **errp)
+{
+    return g_strdup(COLO_COMPARE(obj)->sec_indev);
+}
+
+/* "secondary_in" setter: replaces the stored id with a copy of @value. */
+static void compare_set_sec_indev(Object *obj, const char *value, Error **errp)
+{
+    CompareState *state = COLO_COMPARE(obj);
+    char **slot = &state->sec_indev;
+
+    g_free(*slot);
+    *slot = g_strdup(value);
+}
+
+/* "outdev" getter: returns a newly allocated copy of the stored id. */
+static char *compare_get_outdev(Object *obj, Error **errp)
+{
+    return g_strdup(COLO_COMPARE(obj)->outdev);
+}
+
+/* "outdev" setter: replaces the stored id with a copy of @value. */
+static void compare_set_outdev(Object *obj, const char *value, Error **errp)
+{
+    CompareState *state = COLO_COMPARE(obj);
+    char **slot = &state->outdev;
+
+    g_free(*slot);
+    *slot = g_strdup(value);
+}
+
+/*
+ * Invoked when the primary read state has assembled a full packet.
+ * A packet accepted by packet_enqueue() triggers a comparison pass over
+ * all connections; a rejected one is forwarded unchanged to the output
+ * chardev.
+ */
+static void compare_pri_rs_finalize(SocketReadState *pri_rs)
+{
+    CompareState *s = container_of(pri_rs, CompareState, pri_rs);
+
+    if (packet_enqueue(s, PRIMARY_IN) == 0) {
+        /* compare connection */
+        g_queue_foreach(&s->conn_list, colo_compare_connection, s);
+        return;
+    }
+
+    trace_colo_compare_main("primary: unsupported packet in");
+    compare_chr_send(&s->chr_out, pri_rs->buf, pri_rs->packet_len);
+}
+
+/*
+ * Invoked when the secondary read state has assembled a full packet.
+ * A packet accepted by packet_enqueue() triggers a comparison pass over
+ * all connections; a rejected one is only traced.
+ */
+static void compare_sec_rs_finalize(SocketReadState *sec_rs)
+{
+    CompareState *s = container_of(sec_rs, CompareState, sec_rs);
+
+    if (packet_enqueue(s, SECONDARY_IN) == 0) {
+        /* compare connection */
+        g_queue_foreach(&s->conn_list, colo_compare_connection, s);
+        return;
+    }
+
+    trace_colo_compare_main("secondary: unsupported packet in");
+}
+
+
+/*
+ * Look up the chardev named @chr_name, store it in *@chr and verify it
+ * has the reconnectable feature.
+ * Returns 0 on success; returns 1 with @errp set on failure.
+ */
+static int find_and_check_chardev(CharDriverState **chr,
+                                  char *chr_name,
+                                  Error **errp)
+{
+    CharDriverState *found = qemu_chr_find(chr_name);
+
+    *chr = found;
+    if (found == NULL) {
+        error_setg(errp, "Device '%s' not found",
+                   chr_name);
+        return 1;
+    }
+
+    if (qemu_chr_has_feature(found, QEMU_CHAR_FEATURE_RECONNECTABLE)) {
+        return 0;
+    }
+
+    error_setg(errp, "chardev \"%s\" is not reconnectable",
+               chr_name);
+    return 1;
+}
+
+/*
+ * Timer callback: re-arm the timer REGULAR_PACKET_CHECK_MS into the
+ * future, then scan for primary packets that the secondary hasn't
+ * produced equivalents of.
+ */
+static void check_old_packet_regular(void *opaque)
+{
+    CompareState *s = opaque;
+    int64_t next_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+                      REGULAR_PACKET_CHECK_MS;
+
+    timer_mod(s->timer, next_ms);
+    /* if have old packet we will notify checkpoint */
+    /*
+     * TODO: Make timer handler run in compare thread
+     * like qemu_chr_add_handlers_full.
+     */
+    qemu_mutex_lock(&s->timer_check_lock);
+    colo_old_packet_check(s);
+    qemu_mutex_unlock(&s->timer_check_lock);
+}
+
+/*
+ * Called from the main thread on the primary
+ * to setup colo-compare.
+ *
+ * Validates that the three chardev properties are set and distinct,
+ * binds the chardev backends, initialises the read states, connection
+ * list, lock and connection table, starts the compare thread and arms
+ * the old-packet timer. On a validation or chardev error @errp is set
+ * and the function returns early.
+ */
+static void colo_compare_complete(UserCreatable *uc, Error **errp)
+{
+    CompareState *s = COLO_COMPARE(uc);
+    CharDriverState *chr;
+    char thread_name[64];
+    static int compare_id;
+
+    if (!s->pri_indev || !s->sec_indev || !s->outdev) {
+        error_setg(errp, "colo compare needs 'primary_in' ,"
+                   "'secondary_in','outdev' property set");
+        return;
+    } else if (!strcmp(s->pri_indev, s->outdev) ||
+               !strcmp(s->sec_indev, s->outdev) ||
+               !strcmp(s->pri_indev, s->sec_indev)) {
+        error_setg(errp, "'indev' and 'outdev' could not be same "
+                   "for compare module");
+        return;
+    }
+
+    if (find_and_check_chardev(&chr, s->pri_indev, errp) ||
+        !qemu_chr_fe_init(&s->chr_pri_in, chr, errp)) {
+        return;
+    }
+
+    if (find_and_check_chardev(&chr, s->sec_indev, errp) ||
+        !qemu_chr_fe_init(&s->chr_sec_in, chr, errp)) {
+        return;
+    }
+
+    if (find_and_check_chardev(&chr, s->outdev, errp) ||
+        !qemu_chr_fe_init(&s->chr_out, chr, errp)) {
+        return;
+    }
+
+    net_socket_rs_init(&s->pri_rs, compare_pri_rs_finalize);
+    net_socket_rs_init(&s->sec_rs, compare_sec_rs_finalize);
+
+    g_queue_init(&s->conn_list);
+    qemu_mutex_init(&s->timer_check_lock);
+
+    s->connection_track_table = g_hash_table_new_full(connection_key_hash,
+                                                      connection_key_equal,
+                                                      g_free,
+                                                      connection_destroy);
+
+    /* bounded formatting: never write past thread_name */
+    snprintf(thread_name, sizeof(thread_name), "colo-compare %d",
+             compare_id);
+    qemu_thread_create(&s->thread, thread_name,
+                       colo_compare_thread, s,
+                       QEMU_THREAD_JOINABLE);
+    compare_id++;
+
+    /* A regular timer to kick any packets that the secondary doesn't match */
+    s->timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, /* Only when guest runs */
+                            check_old_packet_regular, s);
+    timer_mod(s->timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+                        REGULAR_PACKET_CHECK_MS);
+
+    return;
+}
+
+/* Class initialiser: installs colo_compare_complete() as the completion hook. */
+static void colo_compare_class_init(ObjectClass *oc, void *data)
+{
+    USER_CREATABLE_CLASS(oc)->complete = colo_compare_complete;
+}
+
+/*
+ * Instance initialiser: registers the "primary_in", "secondary_in" and
+ * "outdev" string properties, in that order.
+ */
+static void colo_compare_init(Object *obj)
+{
+    static const struct {
+        const char *name;
+        char *(*get)(Object *, Error **);
+        void (*set)(Object *, const char *, Error **);
+    } props[] = {
+        { "primary_in", compare_get_pri_indev, compare_set_pri_indev },
+        { "secondary_in", compare_get_sec_indev, compare_set_sec_indev },
+        { "outdev", compare_get_outdev, compare_set_outdev },
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(props) / sizeof(props[0]); i++) {
+        object_property_add_str(obj, props[i].name,
+                                props[i].get, props[i].set,
+                                NULL);
+    }
+}
+
+/*
+ * Instance finaliser: releases the chardev backends, the connection
+ * list, the timer, the lock and the property strings.
+ *
+ * NOTE(review): the thread is only joined when finalize runs *in* the
+ * compare thread, which looks inverted; it is left unchanged because the
+ * thread's main loop is never asked to quit, so joining from another
+ * thread would block forever.
+ * NOTE(review): connection_track_table and the QEMUTimer object itself
+ * are not released here - confirm whether that is intended.
+ */
+static void colo_compare_finalize(Object *obj)
+{
+    CompareState *s = COLO_COMPARE(obj);
+
+    qemu_chr_fe_deinit(&s->chr_pri_in);
+    qemu_chr_fe_deinit(&s->chr_sec_in);
+    qemu_chr_fe_deinit(&s->chr_out);
+
+    /* the list must still be intact while it is walked */
+    if (qemu_thread_is_self(&s->thread)) {
+        /* compare connection */
+        g_queue_foreach(&s->conn_list, colo_compare_connection, s);
+        qemu_thread_join(&s->thread);
+    }
+
+    /*
+     * conn_list is embedded in CompareState (set up with g_queue_init),
+     * so only its links may be released; g_queue_free() would try to
+     * free the GQueue structure itself.
+     */
+    g_queue_clear(&s->conn_list);
+
+    if (s->timer) {
+        timer_del(s->timer);
+    }
+
+    qemu_mutex_destroy(&s->timer_check_lock);
+
+    g_free(s->pri_indev);
+    g_free(s->sec_indev);
+    g_free(s->outdev);
+}
+
+static const TypeInfo colo_compare_info = {
+ .name = TYPE_COLO_COMPARE, /* "colo-compare" */
+ .parent = TYPE_OBJECT,
+ .instance_size = sizeof(CompareState),
+ .instance_init = colo_compare_init, /* registers the three string properties */
+ .instance_finalize = colo_compare_finalize,
+ .class_size = sizeof(CompareClass),
+ .class_init = colo_compare_class_init, /* hooks colo_compare_complete() */
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_USER_CREATABLE },
+ { } /* empty terminating entry */
+ }
+};
+
+static void register_types(void)
+{
+ type_register_static(&colo_compare_info); /* registers the "colo-compare" type */
+}
+
+type_init(register_types);
diff --git a/net/colo.c b/net/colo.c
new file mode 100644
index 0000000000..6a6eacd2dc
--- /dev/null
+++ b/net/colo.c
@@ -0,0 +1,211 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "trace.h"
+#include "net/colo.h"
+
+/*
+ * Hash callback for the connection table: mixes the addresses, ports
+ * and protocol of the ConnectionKey pointed to by @opaque with the
+ * Jenkins hash primitives and returns the 32-bit result.
+ */
+uint32_t connection_key_hash(const void *opaque)
+{
+    const ConnectionKey *key = opaque;
+    uint32_t a, b, c;
+
+    /* Jenkins hash */
+    a = b = c = JHASH_INITVAL + sizeof(*key);
+    a += key->src.s_addr;
+    b += key->dst.s_addr;
+    /*
+     * Widen before shifting: a uint16_t is promoted to (signed) int, and
+     * shifting a port >= 0x8000 left by 16 would overflow it.
+     */
+    c += (key->src_port | (uint32_t)key->dst_port << 16);
+    __jhash_mix(a, b, c);
+
+    a += key->ip_proto;
+    __jhash_final(a, b, c);
+
+    return c;
+}
+
+/*
+ * Equality callback for the connection table: returns 1 when the two
+ * ConnectionKey objects are byte-for-byte identical, 0 otherwise.
+ */
+int connection_key_equal(const void *key1, const void *key2)
+{
+    return !memcmp(key1, key2, sizeof(ConnectionKey));
+}
+
+/*
+ * Locate the network and transport headers of @pkt.
+ *
+ * On success returns 0 with pkt->network_header and
+ * pkt->transport_header set. Returns 1 when the packet is too short,
+ * carries a VLAN tag, is not IPv4, or is shorter than its own IP header.
+ */
+int parse_packet_early(Packet *pkt)
+{
+    int network_length;
+    static const uint8_t vlan[] = {0x81, 0x00};
+    uint8_t *data = pkt->data;
+    uint16_t l3_proto;
+    ssize_t l2hdr_len;
+
+    if (pkt->size < ETH_HLEN) {
+        trace_colo_proxy_main("pkt->size < ETH_HLEN");
+        return 1;
+    }
+
+    /*
+     * TODO: support vlan.
+     */
+    if (!memcmp(&data[12], vlan, sizeof(vlan))) {
+        trace_colo_proxy_main("COLO-proxy don't support vlan");
+        return 1;
+    }
+
+    /* inspect the header only once a complete one is known to be there */
+    l2hdr_len = eth_get_l2_hdr_length(data);
+    pkt->network_header = data + l2hdr_len;
+
+    const struct iovec l2vec = {
+        .iov_base = (void *) data,
+        .iov_len = l2hdr_len
+    };
+    l3_proto = eth_get_l3_proto(&l2vec, 1, l2hdr_len);
+
+    if (l3_proto != ETH_P_IP) {
+        return 1;
+    }
+
+    /* the fixed IP header must be inside the packet before ip_hl is read */
+    if (pkt->size < l2hdr_len + (ssize_t)sizeof(*pkt->ip)) {
+        trace_colo_proxy_main("pkt->size < network_header + network_length");
+        return 1;
+    }
+
+    network_length = pkt->ip->ip_hl * 4;
+    if (pkt->size < l2hdr_len + network_length) {
+        trace_colo_proxy_main("pkt->size < network_header + network_length");
+        return 1;
+    }
+    pkt->transport_header = pkt->network_header + network_length;
+
+    return 0;
+}
+
+/*
+ * Build the connection key of @pkt into @key.
+ *
+ * The key is zeroed first and the IP protocol is always stored. For the
+ * protocols listed below the addresses are stored too, and the two
+ * 16-bit fields found at the start of the transport header (4 bytes in
+ * for AH) become src_port/dst_port in host byte order. If the packet is
+ * too short to contain those fields the ports stay 0.
+ *
+ * @pkt must already have been accepted by parse_packet_early().
+ */
+void fill_connection_key(Packet *pkt, ConnectionKey *key)
+{
+    ptrdiff_t offset;
+    ptrdiff_t available;
+    uint16_t src_port;
+    uint16_t dst_port;
+
+    memset(key, 0, sizeof(*key));
+    key->ip_proto = pkt->ip->ip_p;
+
+    switch (key->ip_proto) {
+    case IPPROTO_TCP:
+    case IPPROTO_UDP:
+    case IPPROTO_DCCP:
+    case IPPROTO_ESP:
+    case IPPROTO_SCTP:
+    case IPPROTO_UDPLITE:
+        offset = 0;
+        break;
+    case IPPROTO_AH:
+        offset = 4;
+        break;
+    default:
+        return;
+    }
+
+    key->src = pkt->ip->ip_src;
+    key->dst = pkt->ip->ip_dst;
+
+    /* never read the port fields past the end of the packet */
+    available = pkt->size - (pkt->transport_header - (uint8_t *)pkt->data);
+    if (available < offset + 4) {
+        return;
+    }
+
+    /*
+     * Copy each field out with memcpy: the transport header has no
+     * alignment guarantee, and splitting a host-order 32-bit load would
+     * swap the two ports on big-endian hosts.
+     */
+    memcpy(&src_port, pkt->transport_header + offset, sizeof(src_port));
+    memcpy(&dst_port, pkt->transport_header + offset + 2, sizeof(dst_port));
+    key->src_port = ntohs(src_port);
+    key->dst_port = ntohs(dst_port);
+}
+
+/* Swap the source and destination address and port of @key in place. */
+void reverse_connection_key(ConnectionKey *key)
+{
+    const struct in_addr old_src = key->src;
+    const uint16_t old_src_port = key->src_port;
+
+    key->src = key->dst;
+    key->src_port = key->dst_port;
+
+    key->dst = old_src;
+    key->dst_port = old_src_port;
+}
+
+/*
+ * Allocate a Connection for @key: the protocol is taken from the key,
+ * both packet queues start empty and the remaining fields are cleared.
+ */
+Connection *connection_new(ConnectionKey *key)
+{
+    Connection *fresh = g_slice_new(Connection);
+
+    g_queue_init(&fresh->primary_list);
+    g_queue_init(&fresh->secondary_list);
+    fresh->processing = false;
+    fresh->ip_proto = key->ip_proto;
+    fresh->offset = 0;
+    fresh->syn_flag = 0;
+
+    return fresh;
+}
+
+/*
+ * Free a Connection (passed as @opaque) together with every packet still
+ * queued on it.
+ */
+void connection_destroy(void *opaque)
+{
+    Connection *conn = opaque;
+
+    /*
+     * Both queues are embedded in the Connection (see connection_new),
+     * so only their links are released with g_queue_clear();
+     * g_queue_free() would try to free the GQueue structure itself.
+     */
+    g_queue_foreach(&conn->primary_list, packet_destroy, NULL);
+    g_queue_clear(&conn->primary_list);
+    g_queue_foreach(&conn->secondary_list, packet_destroy, NULL);
+    g_queue_clear(&conn->secondary_list);
+    g_slice_free(Connection, conn);
+}
+
+/*
+ * Allocate a Packet holding a private copy of the @size bytes at @data,
+ * stamped with the current host-clock time in milliseconds. The header
+ * pointers are not set here.
+ */
+Packet *packet_new(const void *data, int size)
+{
+    Packet *fresh = g_slice_new(Packet);
+
+    fresh->creation_ms = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+    fresh->size = size;
+    fresh->data = g_memdup(data, size);
+
+    return fresh;
+}
+
+/*
+ * Free a Packet and its data copy. The (opaque, user_data) signature
+ * lets it be passed to g_queue_foreach(); @user_data is unused.
+ */
+void packet_destroy(void *opaque, void *user_data)
+{
+    Packet *doomed = opaque;
+    void *payload = doomed->data;
+
+    g_free(payload);
+    g_slice_free(Packet, doomed);
+}
+
+/*
+ * Clear hashtable, stop this hash growing really huge
+ *
+ * Removes every entry of @connection_track_table; keys and values are
+ * released by whatever destroy functions the table was created with.
+ */
+void connection_hashtable_reset(GHashTable *connection_track_table)
+{
+ g_hash_table_remove_all(connection_track_table);
+}
+
+/*
+ * Return the Connection stored for @key in @connection_track_table.
+ * if not found, create a new connection and add to hash table
+ *
+ * When the table already holds more than HASHTABLE_MAX_SIZE entries it
+ * is emptied before the new connection is inserted, and @conn_list (may
+ * be NULL) is emptied with it so that it holds no stale pointers.
+ */
+Connection *connection_get(GHashTable *connection_track_table,
+                           ConnectionKey *key,
+                           GQueue *conn_list)
+{
+    Connection *conn = g_hash_table_lookup(connection_track_table, key);
+
+    if (conn == NULL) {
+        ConnectionKey *new_key = g_memdup(key, sizeof(*key));
+
+        conn = connection_new(key);
+
+        if (g_hash_table_size(connection_track_table) > HASHTABLE_MAX_SIZE) {
+            trace_colo_proxy_main("colo proxy connection hashtable full,"
+                                  " clear it");
+            /*
+             * clear the conn_list
+             *
+             * Only the links are dropped here. The connections are owned
+             * by the table, whose value-destroy function
+             * (connection_destroy in the table created by colo-compare)
+             * frees them during the reset below; destroying them through
+             * the list as well would free each one twice.
+             */
+            if (conn_list) {
+                g_queue_clear(conn_list);
+            }
+            connection_hashtable_reset(connection_track_table);
+        }
+
+        g_hash_table_insert(connection_track_table, new_key, conn);
+    }
+
+    return conn;
+}
diff --git a/net/colo.h b/net/colo.h
new file mode 100644
index 0000000000..7c524f3a1c
--- /dev/null
+++ b/net/colo.h
@@ -0,0 +1,88 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_COLO_PROXY_H
+#define QEMU_COLO_PROXY_H
+
+#include "slirp/slirp.h"
+#include "qemu/jhash.h"
+#include "qemu/timer.h"
+
+#define HASHTABLE_MAX_SIZE 16384
+
+#ifndef IPPROTO_DCCP
+#define IPPROTO_DCCP 33
+#endif
+
+#ifndef IPPROTO_SCTP
+#define IPPROTO_SCTP 132
+#endif
+
+#ifndef IPPROTO_UDPLITE
+#define IPPROTO_UDPLITE 136
+#endif
+
+typedef struct Packet {
+ void *data; /* private copy of the frame, made by packet_new() and freed by packet_destroy() */
+ union {
+ uint8_t *network_header; /* points into data; set by parse_packet_early() */
+ struct ip *ip; /* same pointer viewed as an IP header */
+ };
+ uint8_t *transport_header; /* network_header + ip_hl * 4; set by parse_packet_early() */
+ int size; /* number of bytes in data */
+ /* Time of packet creation, in wall clock ms */
+ int64_t creation_ms;
+} Packet;
+
+typedef struct ConnectionKey {
+ /* (src, dst) must be grouped, in the same way than in IP header */
+ struct in_addr src;
+ struct in_addr dst;
+ uint16_t src_port; /* converted with ntohs() in fill_connection_key() */
+ uint16_t dst_port; /* converted with ntohs() in fill_connection_key() */
+ uint8_t ip_proto; /* ip_p of the packet the key was built from */
+} QEMU_PACKED ConnectionKey; /* keys are compared with memcmp() in connection_key_equal() */
+
+typedef struct Connection {
+ /* connection primary send queue: element type: Packet */
+ GQueue primary_list;
+ /* connection secondary send queue: element type: Packet */
+ GQueue secondary_list;
+ /* flag to enqueue unprocessed_connections */
+ bool processing; /* initialised to false by connection_new() */
+ uint8_t ip_proto; /* copied from the ConnectionKey in connection_new() */
+ /* offset = secondary_seq - primary_seq */
+ tcp_seq offset;
+ /*
+ * we use this flag update offset func
+ * run once in independent tcp connection
+ */
+ int syn_flag;
+} Connection;
+
+uint32_t connection_key_hash(const void *opaque);
+int connection_key_equal(const void *opaque1, const void *opaque2);
+int parse_packet_early(Packet *pkt);
+void fill_connection_key(Packet *pkt, ConnectionKey *key);
+void reverse_connection_key(ConnectionKey *key);
+Connection *connection_new(ConnectionKey *key);
+void connection_destroy(void *opaque);
+Connection *connection_get(GHashTable *connection_track_table,
+ ConnectionKey *key,
+ GQueue *conn_list);
+void connection_hashtable_reset(GHashTable *connection_track_table);
+Packet *packet_new(const void *data, int size);
+void packet_destroy(void *opaque, void *user_data);
+
+#endif /* QEMU_COLO_PROXY_H */
diff --git a/net/filter-mirror.c b/net/filter-mirror.c
index 35df37451d..b7d645617c 100644
--- a/net/filter-mirror.c
+++ b/net/filter-mirror.c
@@ -38,12 +38,12 @@ typedef struct MirrorState {
NetFilterState parent_obj;
char *indev;
char *outdev;
- CharDriverState *chr_in;
- CharDriverState *chr_out;
+ CharBackend chr_in;
+ CharBackend chr_out;
SocketReadState rs;
} MirrorState;
-static int filter_mirror_send(CharDriverState *chr_out,
+static int filter_mirror_send(CharBackend *chr_out,
const struct iovec *iov,
int iovcnt)
{
@@ -110,7 +110,8 @@ static void redirector_chr_read(void *opaque, const uint8_t *buf, int size)
ret = net_fill_rstate(&s->rs, buf, size);
if (ret == -1) {
- qemu_chr_add_handlers(s->chr_in, NULL, NULL, NULL, NULL);
+ qemu_chr_fe_set_handlers(&s->chr_in, NULL, NULL, NULL,
+ NULL, NULL, true);
}
}
@@ -121,7 +122,8 @@ static void redirector_chr_event(void *opaque, int event)
switch (event) {
case CHR_EVENT_CLOSED:
- qemu_chr_add_handlers(s->chr_in, NULL, NULL, NULL, NULL);
+ qemu_chr_fe_set_handlers(&s->chr_in, NULL, NULL, NULL,
+ NULL, NULL, true);
break;
default:
break;
@@ -138,7 +140,7 @@ static ssize_t filter_mirror_receive_iov(NetFilterState *nf,
MirrorState *s = FILTER_MIRROR(nf);
int ret;
- ret = filter_mirror_send(s->chr_out, iov, iovcnt);
+ ret = filter_mirror_send(&s->chr_out, iov, iovcnt);
if (ret) {
error_report("filter_mirror_send failed(%s)", strerror(-ret));
}
@@ -160,8 +162,8 @@ static ssize_t filter_redirector_receive_iov(NetFilterState *nf,
MirrorState *s = FILTER_REDIRECTOR(nf);
int ret;
- if (s->chr_out) {
- ret = filter_mirror_send(s->chr_out, iov, iovcnt);
+ if (qemu_chr_fe_get_driver(&s->chr_out)) {
+ ret = filter_mirror_send(&s->chr_out, iov, iovcnt);
if (ret) {
error_report("filter_mirror_send failed(%s)", strerror(-ret));
}
@@ -175,45 +177,36 @@ static void filter_mirror_cleanup(NetFilterState *nf)
{
MirrorState *s = FILTER_MIRROR(nf);
- if (s->chr_out) {
- qemu_chr_fe_release(s->chr_out);
- }
+ qemu_chr_fe_deinit(&s->chr_out);
}
static void filter_redirector_cleanup(NetFilterState *nf)
{
MirrorState *s = FILTER_REDIRECTOR(nf);
- if (s->chr_in) {
- qemu_chr_add_handlers(s->chr_in, NULL, NULL, NULL, NULL);
- qemu_chr_fe_release(s->chr_in);
- }
- if (s->chr_out) {
- qemu_chr_fe_release(s->chr_out);
- }
+ qemu_chr_fe_deinit(&s->chr_in);
+ qemu_chr_fe_deinit(&s->chr_out);
}
static void filter_mirror_setup(NetFilterState *nf, Error **errp)
{
MirrorState *s = FILTER_MIRROR(nf);
+ CharDriverState *chr;
if (!s->outdev) {
- error_setg(errp, "filter filter mirror needs 'outdev' "
+ error_setg(errp, "filter mirror needs 'outdev' "
"property set");
return;
}
- s->chr_out = qemu_chr_find(s->outdev);
- if (s->chr_out == NULL) {
+ chr = qemu_chr_find(s->outdev);
+ if (chr == NULL) {
error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
"Device '%s' not found", s->outdev);
return;
}
- if (qemu_chr_fe_claim(s->chr_out) != 0) {
- error_setg(errp, QERR_DEVICE_IN_USE, s->outdev);
- return;
- }
+ qemu_chr_fe_init(&s->chr_out, chr, errp);
}
static void redirector_rs_finalize(SocketReadState *rs)
@@ -227,6 +220,7 @@ static void redirector_rs_finalize(SocketReadState *rs)
static void filter_redirector_setup(NetFilterState *nf, Error **errp)
{
MirrorState *s = FILTER_REDIRECTOR(nf);
+ CharDriverState *chr;
if (!s->indev && !s->outdev) {
error_setg(errp, "filter redirector needs 'indev' or "
@@ -243,26 +237,32 @@ static void filter_redirector_setup(NetFilterState *nf, Error **errp)
net_socket_rs_init(&s->rs, redirector_rs_finalize);
if (s->indev) {
- s->chr_in = qemu_chr_find(s->indev);
- if (s->chr_in == NULL) {
+ chr = qemu_chr_find(s->indev);
+ if (chr == NULL) {
error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
"IN Device '%s' not found", s->indev);
return;
}
- qemu_chr_fe_claim_no_fail(s->chr_in);
- qemu_chr_add_handlers(s->chr_in, redirector_chr_can_read,
- redirector_chr_read, redirector_chr_event, nf);
+ if (!qemu_chr_fe_init(&s->chr_in, chr, errp)) {
+ return;
+ }
+
+ qemu_chr_fe_set_handlers(&s->chr_in, redirector_chr_can_read,
+ redirector_chr_read, redirector_chr_event,
+ nf, NULL, true);
}
if (s->outdev) {
- s->chr_out = qemu_chr_find(s->outdev);
- if (s->chr_out == NULL) {
+ chr = qemu_chr_find(s->outdev);
+ if (chr == NULL) {
error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
"OUT Device '%s' not found", s->outdev);
return;
}
- qemu_chr_fe_claim_no_fail(s->chr_out);
+ if (!qemu_chr_fe_init(&s->chr_out, chr, errp)) {
+ return;
+ }
}
}
@@ -315,7 +315,7 @@ filter_mirror_set_outdev(Object *obj, const char *value, Error **errp)
g_free(s->outdev);
s->outdev = g_strdup(value);
if (!s->outdev) {
- error_setg(errp, "filter filter mirror needs 'outdev' "
+ error_setg(errp, "filter mirror needs 'outdev' "
"property set");
return;
}
diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c
new file mode 100644
index 0000000000..c4ab91cdee
--- /dev/null
+++ b/net/filter-rewriter.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "trace.h"
+#include "net/colo.h"
+#include "net/filter.h"
+#include "net/net.h"
+#include "qemu-common.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi-visit.h"
+#include "qom/object.h"
+#include "qemu/main-loop.h"
+#include "qemu/iov.h"
+#include "net/checksum.h"
+
+/*
+ * Cast a QOM object pointer to RewriterState via OBJECT_CHECK against
+ * TYPE_FILTER_REWRITER (defined just below; macros expand at use time).
+ */
+#define FILTER_COLO_REWRITER(obj) \
+ OBJECT_CHECK(RewriterState, (obj), TYPE_FILTER_REWRITER)
+
+/* Type name used in colo_rewriter_info to register this filter. */
+#define TYPE_FILTER_REWRITER "filter-rewriter"
+
+/* Per-instance state of the filter-rewriter net filter. */
+typedef struct RewriterState {
+ NetFilterState parent_obj;
+ /*
+ * queue the rewritten TCP packets are sent through; created in
+ * colo_rewriter_setup with qemu_netfilter_pass_to_next as deliver hook
+ */
+ NetQueue *incoming_queue;
+ /*
+ * hashtable to save connection; looked up with a ConnectionKey via
+ * connection_get(), values are released with connection_destroy
+ */
+ GHashTable *connection_track_table;
+} RewriterState;
+
+/*
+ * Try to flush every packet waiting on the filter's incoming queue.
+ * If the queue cannot be emptied, the packets left over are purged.
+ */
+static void filter_rewriter_flush(NetFilterState *nf)
+{
+    RewriterState *rs = FILTER_COLO_REWRITER(nf);
+    NetQueue *queue = rs->incoming_queue;
+
+    if (qemu_net_queue_flush(queue)) {
+        /* everything went out, nothing left to drop */
+        return;
+    }
+
+    /* Unable to empty the queue, purge remaining packets */
+    qemu_net_queue_purge(queue, nf->netdev);
+}
+
+/*
+ * Return 1 if pkt could be parsed by parse_packet_early() and its
+ * IP protocol field is IPPROTO_TCP, otherwise return 0.
+ */
+static int is_tcp_packet(Packet *pkt)
+{
+    if (parse_packet_early(pkt)) {
+        /* non-zero means the early parse did not succeed */
+        return 0;
+    }
+
+    return pkt->ip->ip_p == IPPROTO_TCP;
+}
+
+/*
+ * handle tcp packet from primary guest
+ *
+ * A pure SYN arms conn->syn_flag.  On the first pure ACK seen while the
+ * flag is set, (th_ack - 1) is subtracted from conn->offset and the flag
+ * is cleared.  Every pure ACK then gets conn->offset added to th_ack and
+ * the packet checksums are recalculated.
+ *
+ * @nf:   filter instance (unused here)
+ * @conn: tracked connection the packet belongs to
+ * @pkt:  parsed packet; modified in place
+ *
+ * Always returns 0.
+ */
+static int handle_primary_tcp_pkt(NetFilterState *nf,
+                                  Connection *conn,
+                                  Packet *pkt)
+{
+    struct tcphdr *tcp_pkt;
+
+    tcp_pkt = (struct tcphdr *)pkt->transport_header;
+    if (trace_event_get_state(TRACE_COLO_FILTER_REWRITER_DEBUG)) {
+        /*
+         * inet_ntoa() hands back a pointer to one static buffer, so two
+         * calls inside a single argument list would both show the same
+         * address.  Take a private copy of each result first.
+         */
+        char *src_str = g_strdup(inet_ntoa(pkt->ip->ip_src));
+        char *dst_str = g_strdup(inet_ntoa(pkt->ip->ip_dst));
+
+        trace_colo_filter_rewriter_pkt_info(__func__,
+                    src_str, dst_str,
+                    ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack),
+                    tcp_pkt->th_flags);
+        trace_colo_filter_rewriter_conn_offset(conn->offset);
+
+        g_free(src_str);
+        g_free(dst_str);
+    }
+
+    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
+        /*
+         * we use this flag update offset func
+         * run once in independent tcp connection
+         */
+        conn->syn_flag = 1;
+    }
+
+    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK)) {
+        if (conn->syn_flag) {
+            /*
+             * offset = secondary_seq - primary seq
+             * ack packet sent by guest from primary node,
+             * so we use th_ack - 1 get primary_seq
+             */
+            conn->offset -= (ntohl(tcp_pkt->th_ack) - 1);
+            conn->syn_flag = 0;
+        }
+        /* handle packets to the secondary from the primary */
+        tcp_pkt->th_ack = htonl(ntohl(tcp_pkt->th_ack) + conn->offset);
+
+        net_checksum_calculate((uint8_t *)pkt->data, pkt->size);
+    }
+
+    return 0;
+}
+
+/*
+ * handle tcp packet from secondary guest
+ *
+ * A SYN|ACK stores its th_seq in conn->offset.  A pure ACK gets
+ * conn->offset subtracted from th_seq and the packet checksums are
+ * recalculated.
+ *
+ * @nf:   filter instance (unused here)
+ * @conn: tracked connection the packet belongs to
+ * @pkt:  parsed packet; modified in place
+ *
+ * Always returns 0.
+ */
+static int handle_secondary_tcp_pkt(NetFilterState *nf,
+                                    Connection *conn,
+                                    Packet *pkt)
+{
+    struct tcphdr *tcp_pkt;
+
+    tcp_pkt = (struct tcphdr *)pkt->transport_header;
+
+    if (trace_event_get_state(TRACE_COLO_FILTER_REWRITER_DEBUG)) {
+        /*
+         * inet_ntoa() hands back a pointer to one static buffer, so two
+         * calls inside a single argument list would both show the same
+         * address.  Take a private copy of each result first.
+         */
+        char *src_str = g_strdup(inet_ntoa(pkt->ip->ip_src));
+        char *dst_str = g_strdup(inet_ntoa(pkt->ip->ip_dst));
+
+        trace_colo_filter_rewriter_pkt_info(__func__,
+                    src_str, dst_str,
+                    ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack),
+                    tcp_pkt->th_flags);
+        trace_colo_filter_rewriter_conn_offset(conn->offset);
+
+        g_free(src_str);
+        g_free(dst_str);
+    }
+
+    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) {
+        /*
+         * save offset = secondary_seq and then
+         * in handle_primary_tcp_pkt make offset
+         * = secondary_seq - primary_seq
+         */
+        conn->offset = ntohl(tcp_pkt->th_seq);
+    }
+
+    if ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK) {
+        /* handle packets to the primary from the secondary */
+        tcp_pkt->th_seq = htonl(ntohl(tcp_pkt->th_seq) - conn->offset);
+
+        net_checksum_calculate((uint8_t *)pkt->data, pkt->size);
+    }
+
+    return 0;
+}
+
+/*
+ * receive_iov hook of the rewriter filter.
+ *
+ * The iovec is linearised into a Packet.  TCP packets are matched to a
+ * tracked connection, rewritten by the primary or secondary handler
+ * (chosen by whether the sender is nf->netdev) and then sent through
+ * the filter's incoming queue.
+ *
+ * Returns 1 when the rewritten packet was handed to the incoming queue,
+ * 0 when the packet was not taken over by this filter.
+ */
+static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
+                                         NetClientState *sender,
+                                         unsigned flags,
+                                         const struct iovec *iov,
+                                         int iovcnt,
+                                         NetPacketSent *sent_cb)
+{
+    RewriterState *s = FILTER_COLO_REWRITER(nf);
+    ssize_t size = iov_size(iov, iovcnt);
+    char *buf = g_malloc0(size);
+    ConnectionKey key;
+    Connection *conn;
+    Packet *pkt;
+    int failed;
+
+    iov_to_buf(iov, iovcnt, 0, buf, size);
+    pkt = packet_new(buf, size);
+    g_free(buf);
+
+    /*
+     * Only tcp packets are rewritten, to make the secondary guest's
+     * connection establish successfully; everything else is left alone.
+     */
+    if (!pkt || !is_tcp_packet(pkt)) {
+        packet_destroy(pkt, NULL);
+        return 0;
+    }
+
+    fill_connection_key(pkt, &key);
+    if (sender == nf->netdev) {
+        /* We need make tcp TX and RX packet into one connection. */
+        reverse_connection_key(&key);
+    }
+    conn = connection_get(s->connection_track_table, &key, NULL);
+
+    if (sender == nf->netdev) {
+        /* NET_FILTER_DIRECTION_TX */
+        failed = handle_primary_tcp_pkt(nf, conn, pkt);
+    } else {
+        /* NET_FILTER_DIRECTION_RX */
+        failed = handle_secondary_tcp_pkt(nf, conn, pkt);
+    }
+
+    if (failed) {
+        packet_destroy(pkt, NULL);
+        return 0;
+    }
+
+    /*
+     * We block the original packet here; the rewritten copy is what
+     * gets sent on.
+     */
+    qemu_net_queue_send(s->incoming_queue, sender, 0,
+                        (const uint8_t *)pkt->data, pkt->size, NULL);
+    packet_destroy(pkt, NULL);
+    return 1;
+}
+
+/*
+ * cleanup hook: release what colo_rewriter_setup created.
+ *
+ * The incoming queue is flushed (or purged) and freed, and the
+ * connection tracking table is destroyed so that its keys and
+ * Connection values are released through the destroy callbacks it was
+ * created with.  Both pointers are checked first so the hook does
+ * nothing for a member that is still NULL.
+ */
+static void colo_rewriter_cleanup(NetFilterState *nf)
+{
+    RewriterState *s = FILTER_COLO_REWRITER(nf);
+
+    /* flush packets */
+    if (s->incoming_queue) {
+        filter_rewriter_flush(nf);
+        g_free(s->incoming_queue);
+        s->incoming_queue = NULL;
+    }
+
+    /* the table was never released before, leaking every tracked entry */
+    if (s->connection_track_table) {
+        g_hash_table_destroy(s->connection_track_table);
+        s->connection_track_table = NULL;
+    }
+}
+
+/*
+ * setup hook: create the connection tracking table and the queue that
+ * rewritten packets are sent through.  errp is not written to.
+ */
+static void colo_rewriter_setup(NetFilterState *nf, Error **errp)
+{
+    RewriterState *rs = FILTER_COLO_REWRITER(nf);
+    GHashTable *table;
+
+    /* keys are released with g_free, values with connection_destroy */
+    table = g_hash_table_new_full(connection_key_hash,
+                                  connection_key_equal,
+                                  g_free,
+                                  connection_destroy);
+    rs->connection_track_table = table;
+
+    rs->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
+}
+
+/*
+ * Class initialiser: install the rewriter's setup, cleanup and
+ * receive_iov hooks on the net filter class.  data is unused.
+ */
+static void colo_rewriter_class_init(ObjectClass *oc, void *data)
+{
+ NetFilterClass *nfc = NETFILTER_CLASS(oc);
+
+ nfc->setup = colo_rewriter_setup;
+ nfc->cleanup = colo_rewriter_cleanup;
+ nfc->receive_iov = colo_rewriter_receive_iov;
+}
+
+/*
+ * Type description for "filter-rewriter": a TYPE_NETFILTER subtype whose
+ * instances are RewriterState sized.
+ */
+static const TypeInfo colo_rewriter_info = {
+ .name = TYPE_FILTER_REWRITER,
+ .parent = TYPE_NETFILTER,
+ .class_init = colo_rewriter_class_init,
+ .instance_size = sizeof(RewriterState),
+};
+
+/* Register the filter-rewriter type description. */
+static void register_types(void)
+{
+ type_register_static(&colo_rewriter_info);
+}
+
+/* Hook register_types into type initialisation. */
+type_init(register_types);
diff --git a/net/filter.c b/net/filter.c
index 888fe6dd93..1dfd2caa23 100644
--- a/net/filter.c
+++ b/net/filter.c
@@ -239,7 +239,7 @@ static void netfilter_finalize(Object *obj)
}
if (nf->netdev && !QTAILQ_EMPTY(&nf->netdev->filters) &&
- nf->next.tqe_prev) {
+ QTAILQ_IN_USE(nf, next)) {
QTAILQ_REMOVE(&nf->netdev->filters, nf, next);
}
g_free(nf->netdev_id);
diff --git a/net/net.c b/net/net.c
index 660e904742..5c0897d323 100644
--- a/net/net.c
+++ b/net/net.c
@@ -690,9 +690,13 @@ static ssize_t nc_sendv_compat(NetClientState *nc, const struct iovec *iov,
buffer = iov[0].iov_base;
offset = iov[0].iov_len;
} else {
- buf = g_new(uint8_t, NET_BUFSIZE);
+ offset = iov_size(iov, iovcnt);
+ if (offset > NET_BUFSIZE) {
+ return -1;
+ }
+ buf = g_malloc(offset);
buffer = buf;
- offset = iov_to_buf(iov, iovcnt, 0, buf, NET_BUFSIZE);
+ offset = iov_to_buf(iov, iovcnt, 0, buf, offset);
}
if (flags & QEMU_NET_PACKET_FLAG_RAW && nc->info->receive_raw) {
@@ -1179,6 +1183,7 @@ void hmp_host_net_remove(Monitor *mon, const QDict *qdict)
qemu_del_net_client(nc->peer);
qemu_del_net_client(nc);
+ qemu_opts_del(qemu_opts_find(qemu_find_opts("net"), device));
}
void netdev_add(QemuOpts *opts, Error **errp)
@@ -1673,9 +1678,8 @@ int net_fill_rstate(SocketReadState *rs, const uint8_t *buf, int size)
if (rs->index >= rs->packet_len) {
rs->index = 0;
rs->state = 0;
- if (rs->finalize) {
- rs->finalize(rs);
- }
+ assert(rs->finalize);
+ rs->finalize(rs);
}
break;
}
diff --git a/net/slirp.c b/net/slirp.c
index e810ee30a5..76ebb7f029 100644
--- a/net/slirp.c
+++ b/net/slirp.c
@@ -40,6 +40,7 @@
#include "sysemu/char.h"
#include "sysemu/sysemu.h"
#include "qemu/cutils.h"
+#include "qapi/error.h"
static int get_str_sep(char *buf, int buf_size, const char **pp, int sep)
{
@@ -732,7 +733,7 @@ int net_slirp_smb(const char *exported_dir)
#endif /* !defined(_WIN32) */
struct GuestFwd {
- CharDriverState *hd;
+ CharBackend hd;
struct in_addr server;
int port;
Slirp *slirp;
@@ -796,15 +797,23 @@ static int slirp_guestfwd(SlirpState *s, const char *config_str,
return -1;
}
} else {
- fwd = g_new(struct GuestFwd, 1);
- fwd->hd = qemu_chr_new(buf, p, NULL);
- if (!fwd->hd) {
+ Error *err = NULL;
+ CharDriverState *chr = qemu_chr_new(buf, p);
+
+ if (!chr) {
error_report("could not open guest forwarding device '%s'", buf);
+ return -1;
+ }
+
+ fwd = g_new(struct GuestFwd, 1);
+ qemu_chr_fe_init(&fwd->hd, chr, &err);
+ if (err) {
+ error_report_err(err);
g_free(fwd);
return -1;
}
- if (slirp_add_exec(s->slirp, 3, fwd->hd, &server, port) < 0) {
+ if (slirp_add_exec(s->slirp, 3, &fwd->hd, &server, port) < 0) {
error_report("conflicting/invalid host:port in guest forwarding "
"rule '%s'", config_str);
g_free(fwd);
@@ -814,9 +823,8 @@ static int slirp_guestfwd(SlirpState *s, const char *config_str,
fwd->port = port;
fwd->slirp = s->slirp;
- qemu_chr_fe_claim_no_fail(fwd->hd);
- qemu_chr_add_handlers(fwd->hd, guestfwd_can_read, guestfwd_read,
- NULL, fwd);
+ qemu_chr_fe_set_handlers(&fwd->hd, guestfwd_can_read, guestfwd_read,
+ NULL, fwd, NULL, true);
}
return 0;
diff --git a/net/socket.c b/net/socket.c
index 3f98eefb34..fe3547b018 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -489,90 +489,106 @@ static int net_socket_listen_init(NetClientState *peer,
{
NetClientState *nc;
NetSocketState *s;
- struct sockaddr_in saddr;
- int fd, ret;
-
- if (parse_host_port(&saddr, host_str) < 0)
- return -1;
+ SocketAddress *saddr;
+ int ret;
+ Error *local_error = NULL;
- fd = qemu_socket(PF_INET, SOCK_STREAM, 0);
- if (fd < 0) {
- perror("socket");
+ saddr = socket_parse(host_str, &local_error);
+ if (saddr == NULL) {
+ error_report_err(local_error);
return -1;
}
- qemu_set_nonblock(fd);
-
- socket_set_fast_reuse(fd);
- ret = bind(fd, (struct sockaddr *)&saddr, sizeof(saddr));
- if (ret < 0) {
- perror("bind");
- closesocket(fd);
- return -1;
- }
- ret = listen(fd, 0);
+ ret = socket_listen(saddr, &local_error);
if (ret < 0) {
- perror("listen");
- closesocket(fd);
+ qapi_free_SocketAddress(saddr);
+ error_report_err(local_error);
return -1;
}
nc = qemu_new_net_client(&net_socket_info, peer, model, name);
s = DO_UPCAST(NetSocketState, nc, nc);
s->fd = -1;
- s->listen_fd = fd;
+ s->listen_fd = ret;
s->nc.link_down = true;
+ net_socket_rs_init(&s->rs, net_socket_rs_finalize);
qemu_set_fd_handler(s->listen_fd, net_socket_accept, NULL, s);
+ qapi_free_SocketAddress(saddr);
return 0;
}
+/*
+ * Arguments of net_socket_connect_init() kept alive for the
+ * net_socket_connected() callback.  model, name and saddr are owned by
+ * this structure and released in socket_connect_data_free(); peer is not
+ * freed there.
+ */
+typedef struct {
+ NetClientState *peer;
+ SocketAddress *saddr;
+ char *model;
+ char *name;
+} socket_connect_data;
+
+/*
+ * Free a socket_connect_data together with the address and the model
+ * and name strings it holds.  c must not be NULL (it is dereferenced).
+ */
+static void socket_connect_data_free(socket_connect_data *c)
+{
+ qapi_free_SocketAddress(c->saddr);
+ g_free(c->model);
+ g_free(c->name);
+ g_free(c);
+}
+
+/*
+ * Callback passed to socket_connect() by net_socket_connect_init().
+ *
+ * Wraps fd in a socket net client and records the peer address in its
+ * info string.  On any failure fd is closed.  The socket_connect_data
+ * passed as opaque is always freed before returning.
+ *
+ * NOTE(review): err is never inspected here, so a failed connect is not
+ * told apart from a successful one before fd is used - confirm against
+ * the socket_connect() callback contract.
+ */
+static void net_socket_connected(int fd, Error *err, void *opaque)
+{
+    socket_connect_data *c = opaque;
+    Error *local_error = NULL;
+    char *addr_str = socket_address_to_string(c->saddr, &local_error);
+
+    if (addr_str == NULL) {
+        error_report_err(local_error);
+        closesocket(fd);
+    } else {
+        NetSocketState *s = net_socket_fd_init(c->peer, c->model, c->name,
+                                               fd, true);
+
+        if (s) {
+            snprintf(s->nc.info_str, sizeof(s->nc.info_str),
+                     "socket: connect to %s", addr_str);
+        } else {
+            closesocket(fd);
+        }
+    }
+
+    g_free(addr_str);
+    socket_connect_data_free(c);
+}
+
static int net_socket_connect_init(NetClientState *peer,
const char *model,
const char *name,
const char *host_str)
{
- NetSocketState *s;
- int fd, connected, ret;
- struct sockaddr_in saddr;
+ socket_connect_data *c = g_new0(socket_connect_data, 1);
+ int fd = -1;
+ Error *local_error = NULL;
- if (parse_host_port(&saddr, host_str) < 0)
- return -1;
+ c->peer = peer;
+ c->model = g_strdup(model);
+ c->name = g_strdup(name);
+ c->saddr = socket_parse(host_str, &local_error);
+ if (c->saddr == NULL) {
+ goto err;
+ }
- fd = qemu_socket(PF_INET, SOCK_STREAM, 0);
+ fd = socket_connect(c->saddr, &local_error, net_socket_connected, c);
if (fd < 0) {
- perror("socket");
- return -1;
+ goto err;
}
- qemu_set_nonblock(fd);
- connected = 0;
- for(;;) {
- ret = connect(fd, (struct sockaddr *)&saddr, sizeof(saddr));
- if (ret < 0) {
- if (errno == EINTR || errno == EWOULDBLOCK) {
- /* continue */
- } else if (errno == EINPROGRESS ||
- errno == EALREADY ||
- errno == EINVAL) {
- break;
- } else {
- perror("connect");
- closesocket(fd);
- return -1;
- }
- } else {
- connected = 1;
- break;
- }
- }
- s = net_socket_fd_init(peer, model, name, fd, connected);
- if (!s)
- return -1;
- snprintf(s->nc.info_str, sizeof(s->nc.info_str),
- "socket: connect to %s:%d",
- inet_ntoa(saddr.sin_addr), ntohs(saddr.sin_port));
return 0;
+
+err:
+ error_report_err(local_error);
+ socket_connect_data_free(c);
+ return -1;
}
static int net_socket_mcast_init(NetClientState *peer,
diff --git a/net/tap-bsd.c b/net/tap-bsd.c
index c506ac31d6..6c9692263d 100644
--- a/net/tap-bsd.c
+++ b/net/tap-bsd.c
@@ -35,6 +35,10 @@
#include <net/if_tap.h>
#endif
+#if defined(__OpenBSD__)
+#include <sys/param.h>
+#endif
+
#ifndef __FreeBSD__
int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
int vnet_hdr_required, int mq_required, Error **errp)
@@ -55,7 +59,7 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
if (*ifname) {
snprintf(dname, sizeof dname, "/dev/%s", ifname);
} else {
-#if defined(__OpenBSD__)
+#if defined(__OpenBSD__) && OpenBSD < 201605
snprintf(dname, sizeof dname, "/dev/tun%d", i);
#else
snprintf(dname, sizeof dname, "/dev/tap%d", i);
diff --git a/net/tap.c b/net/tap.c
index e82dfe9df3..57cfe1a30c 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -918,7 +918,9 @@ free_fail:
return -1;
}
- fd = net_bridge_run_helper(tap->helper, DEFAULT_BRIDGE_INTERFACE,
+ fd = net_bridge_run_helper(tap->helper,
+ tap->has_br ?
+ tap->br : DEFAULT_BRIDGE_INTERFACE,
errp);
if (fd == -1) {
return -1;
diff --git a/net/trace-events b/net/trace-events
index 65c46a48fb..35198bc742 100644
--- a/net/trace-events
+++ b/net/trace-events
@@ -2,3 +2,21 @@
# net/vhost-user.c
vhost_user_event(const char *chr, int event) "chr: %s got event: %d"
+
+# net/colo.c
+colo_proxy_main(const char *chr) ": %s"
+
+# net/colo-compare.c
+colo_compare_main(const char *chr) ": %s"
+colo_compare_udp_miscompare(const char *sta, int size) ": %s = %d"
+colo_compare_icmp_miscompare(const char *sta, int size) ": %s = %d"
+colo_compare_ip_info(int psize, const char *sta, const char *stb, int ssize, const char *stc, const char *std) "ppkt size = %d, ip_src = %s, ip_dst = %s, spkt size = %d, ip_src = %s, ip_dst = %s"
+colo_old_packet_check_found(int64_t old_time) "%" PRId64
+colo_compare_miscompare(void) ""
+colo_compare_pkt_info_src(const char *src, uint32_t sseq, uint32_t sack, int res, uint32_t sflag, int ssize) "src/dst: %s s: seq/ack=%u/%u res=%d flags=%x spkt_size: %d\n"
+colo_compare_pkt_info_dst(const char *dst, uint32_t dseq, uint32_t dack, int res, uint32_t dflag, int dsize) "src/dst: %s d: seq/ack=%u/%u res=%d flags=%x dpkt_size: %d\n"
+
+# net/filter-rewriter.c
+colo_filter_rewriter_debug(void) ""
+colo_filter_rewriter_pkt_info(const char *func, const char *src, const char *dst, uint32_t seq, uint32_t ack, uint32_t flag) "%s: src/dst: %s/%s p: seq/ack=%u/%u flags=%x\n"
+colo_filter_rewriter_conn_offset(uint32_t offset) ": offset=%u\n"
diff --git a/net/vhost-user.c b/net/vhost-user.c
index b0595f8781..7aff77ee4a 100644
--- a/net/vhost-user.c
+++ b/net/vhost-user.c
@@ -20,18 +20,13 @@
typedef struct VhostUserState {
NetClientState nc;
- CharDriverState *chr;
+ CharBackend chr; /* only queue index 0 */
VHostNetState *vhost_net;
guint watch;
uint64_t acked_features;
bool started;
} VhostUserState;
-typedef struct VhostUserChardevProps {
- bool is_socket;
- bool is_unix;
-} VhostUserChardevProps;
-
VHostNetState *vhost_user_get_vhost_net(NetClientState *nc)
{
VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc);
@@ -67,7 +62,7 @@ static void vhost_user_stop(int queues, NetClientState *ncs[])
}
}
-static int vhost_user_start(int queues, NetClientState *ncs[])
+static int vhost_user_start(int queues, NetClientState *ncs[], CharBackend *be)
{
VhostNetOptions options;
struct vhost_net *net = NULL;
@@ -83,7 +78,7 @@ static int vhost_user_start(int queues, NetClientState *ncs[])
s = DO_UPCAST(VhostUserState, nc, ncs[i]);
options.net_backend = ncs[i];
- options.opaque = s->chr;
+ options.opaque = be;
options.busyloop_timeout = 0;
net = vhost_net_init(&options);
if (!net) {
@@ -155,10 +150,8 @@ static void vhost_user_cleanup(NetClientState *nc)
g_free(s->vhost_net);
s->vhost_net = NULL;
}
- if (s->chr) {
- qemu_chr_add_handlers(s->chr, NULL, NULL, NULL, NULL);
- qemu_chr_fe_release(s->chr);
- s->chr = NULL;
+ if (nc->queue_index == 0) {
+ qemu_chr_fe_deinit(&s->chr);
}
qemu_purge_queued_packets(nc);
@@ -192,7 +185,7 @@ static gboolean net_vhost_user_watch(GIOChannel *chan, GIOCondition cond,
{
VhostUserState *s = opaque;
- qemu_chr_disconnect(s->chr);
+ qemu_chr_fe_disconnect(&s->chr);
return FALSE;
}
@@ -202,6 +195,7 @@ static void net_vhost_user_event(void *opaque, int event)
const char *name = opaque;
NetClientState *ncs[MAX_QUEUE_NUM];
VhostUserState *s;
+ CharDriverState *chr;
Error *err = NULL;
int queues;
@@ -211,13 +205,14 @@ static void net_vhost_user_event(void *opaque, int event)
assert(queues < MAX_QUEUE_NUM);
s = DO_UPCAST(VhostUserState, nc, ncs[0]);
- trace_vhost_user_event(s->chr->label, event);
+ chr = qemu_chr_fe_get_driver(&s->chr);
+ trace_vhost_user_event(chr->label, event);
switch (event) {
case CHR_EVENT_OPENED:
- s->watch = qemu_chr_fe_add_watch(s->chr, G_IO_HUP,
+ s->watch = qemu_chr_fe_add_watch(&s->chr, G_IO_HUP,
net_vhost_user_watch, s);
- if (vhost_user_start(queues, ncs) < 0) {
- qemu_chr_disconnect(s->chr);
+ if (vhost_user_start(queues, ncs, &s->chr) < 0) {
+ qemu_chr_fe_disconnect(&s->chr);
return;
}
qmp_set_link(name, true, &err);
@@ -240,6 +235,7 @@ static int net_vhost_user_init(NetClientState *peer, const char *device,
const char *name, CharDriverState *chr,
int queues)
{
+ Error *err = NULL;
NetClientState *nc, *nc0 = NULL;
VhostUserState *s;
int i;
@@ -249,28 +245,28 @@ static int net_vhost_user_init(NetClientState *peer, const char *device,
for (i = 0; i < queues; i++) {
nc = qemu_new_net_client(&net_vhost_user_info, peer, device, name);
- if (!nc0) {
- nc0 = nc;
- }
-
snprintf(nc->info_str, sizeof(nc->info_str), "vhost-user%d to %s",
i, chr->label);
-
nc->queue_index = i;
+ if (!nc0) {
+ nc0 = nc;
+ s = DO_UPCAST(VhostUserState, nc, nc);
+ if (!qemu_chr_fe_init(&s->chr, chr, &err)) {
+ error_report_err(err);
+ return -1;
+ }
+ }
- s = DO_UPCAST(VhostUserState, nc, nc);
- s->chr = chr;
}
s = DO_UPCAST(VhostUserState, nc, nc0);
do {
- Error *err = NULL;
- if (qemu_chr_wait_connected(chr, &err) < 0) {
+ if (qemu_chr_fe_wait_connected(&s->chr, &err) < 0) {
error_report_err(err);
return -1;
}
- qemu_chr_add_handlers(chr, NULL, NULL,
- net_vhost_user_event, nc0->name);
+ qemu_chr_fe_set_handlers(&s->chr, NULL, NULL,
+ net_vhost_user_event, nc0->name, NULL, true);
} while (!s->started);
assert(s->vhost_net);
@@ -278,51 +274,27 @@ static int net_vhost_user_init(NetClientState *peer, const char *device,
return 0;
}
-static int net_vhost_chardev_opts(void *opaque,
- const char *name, const char *value,
- Error **errp)
-{
- VhostUserChardevProps *props = opaque;
-
- if (strcmp(name, "backend") == 0 && strcmp(value, "socket") == 0) {
- props->is_socket = true;
- } else if (strcmp(name, "path") == 0) {
- props->is_unix = true;
- } else if (strcmp(name, "server") == 0) {
- } else {
- error_setg(errp,
- "vhost-user does not support a chardev with option %s=%s",
- name, value);
- return -1;
- }
- return 0;
-}
-
-static CharDriverState *net_vhost_parse_chardev(
+static CharDriverState *net_vhost_claim_chardev(
const NetdevVhostUserOptions *opts, Error **errp)
{
CharDriverState *chr = qemu_chr_find(opts->chardev);
- VhostUserChardevProps props;
if (chr == NULL) {
error_setg(errp, "chardev \"%s\" not found", opts->chardev);
return NULL;
}
- /* inspect chardev opts */
- memset(&props, 0, sizeof(props));
- if (qemu_opt_foreach(chr->opts, net_vhost_chardev_opts, &props, errp)) {
+ if (!qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_RECONNECTABLE)) {
+ error_setg(errp, "chardev \"%s\" is not reconnectable",
+ opts->chardev);
return NULL;
}
-
- if (!props.is_socket || !props.is_unix) {
- error_setg(errp, "chardev \"%s\" is not a unix socket",
+ if (!qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_FD_PASS)) {
+ error_setg(errp, "chardev \"%s\" does not support FD passing",
opts->chardev);
return NULL;
}
- qemu_chr_fe_claim_no_fail(chr);
-
return chr;
}
@@ -357,7 +329,7 @@ int net_init_vhost_user(const Netdev *netdev, const char *name,
assert(netdev->type == NET_CLIENT_DRIVER_VHOST_USER);
vhost_user_opts = &netdev->u.vhost_user;
- chr = net_vhost_parse_chardev(vhost_user_opts, errp);
+ chr = net_vhost_claim_chardev(vhost_user_opts, errp);
if (!chr) {
return -1;
}