Creating TUN/TAP interfaces in Linux

The basic approach to writing a TUN/TAP client (such as a VPN) for Linux is:

1. Open the /dev/net/tun device as a file, which (once configured) will communicate network traffic to userspace.
2. Allocate (or bind) a virtual network interface with the file handle using ioctl(TUNSETIFF).
3. Configure the network interface's address and link state.
4. Process network traffic in the userspace program.

There's reasonably complete documentation about each step of this process, but I couldn't find a worked example that tied it all together. The following C program is intended to serve as a basic minimal TUN/TAP client.

Steps 1-2: Allocating a TUN/TAP interface

Opening a file is straightforward, so the important part of this function is the ioctl(TUNSETIFF) call. It's this call that creates the network interface, and there are two user-configurable fields:

The ifr_name field contains the interface name, which may be specified by the caller. If unset (empty), then the kernel will assign a name such as tun0 or tap0.
The ifr_flags field sets whether to create a TUN or TAP interface. TUN interfaces process IP packets, and TAP interfaces process Ethernet frames.

The set of possible flags and their effects are documented at Linux Networking Documentation » Universal TUN/TAP device driver.

The interface name, if provided, must be shorter than IFNAMSIZ bytes so that it fits in ifr_name together with its terminating NUL. After the ioctl call returns, the ifr_name field can be inspected to see what name the interface was created with.

/* Copyright (c) John Millikin <john@john-millikin.com> */
/* SPDX-License-Identifier: 0BSD */
#define _POSIX_C_SOURCE 200809L

#include <errno.h>
#include <fcntl.h>
#include <linux/if.h>
#include <linux/if_tun.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

/*
 * Open /dev/net/tun and attach the descriptor to a TUN/TAP interface via
 * ioctl(TUNSETIFF).
 *
 * iface_name:     requested interface name, or NULL to leave ifr_name empty.
 *                 Must be shorter than IFNAMSIZ so the NUL terminator fits.
 * flags:          value stored in ifr_flags (e.g. IFF_TUN or IFF_TAP).
 * iface_name_out: if non-NULL, receives IFNAMSIZ bytes copied from ifr_name
 *                 after the ioctl has returned.
 *
 * Returns the open descriptor, or -1 with errno set (EINVAL for a name that
 * is too long, otherwise the error from open() or ioctl()).
 */
int tuntap_connect(const char *iface_name, short flags, char *iface_name_out) {
	struct ifreq ifr;
	size_t name_size = 0;
	int fd;

	if (iface_name != NULL) {
		/* Size including the terminator, which must fit in ifr_name. */
		name_size = strlen(iface_name) + 1;
		if (name_size > IFNAMSIZ) {
			errno = EINVAL;
			return -1;
		}
	}

	fd = open("/dev/net/tun", O_RDWR | O_CLOEXEC);
	if (fd == -1) {
		return -1;
	}

	memset(&ifr, 0, sizeof ifr);
	ifr.ifr_flags = flags;
	if (name_size != 0) {
		memcpy(ifr.ifr_name, iface_name, name_size);
	}

	if (ioctl(fd, TUNSETIFF, &ifr) == -1) {
		/* close() may overwrite errno; report the ioctl() failure. */
		int saved_errno = errno;
		close(fd);
		errno = saved_errno;
		return -1;
	}

	if (iface_name_out != NULL) {
		memcpy(iface_name_out, ifr.ifr_name, IFNAMSIZ);
	}
	return fd;
}

Step 3: Configure the interface with Netlink

At this point, most TUN/TAP examples I've found tell the user to configure the newly-created network interface by using the command line to run tools from iproute2. In this post I will instead use the Linux kernel's native Netlink subsystem.

Netlink can be thought of as a sort of RPC-ish request/response protocol, where messages are assembled manually from C structs. Besides the kernel docs linked above, the following manpages are useful for writing a Netlink client: netlink(7), rtnetlink(7), netlink(3), and rtnetlink(3).

In this example we will be using the NETLINK_ROUTE protocol to send RTM_NEWADDR and RTM_NEWLINK requests. Netlink error handling is a bit awkward since it requires reading and parsing the kernel's response messages by hand, so I'm not going to bother with it for this example.

The first step is to open an AF_NETLINK socket by calling socket(AF_NETLINK). I'm also calling bind(), which isn't strictly necessary but provides metadata useful to strace[1].

/* Copyright (c) John Millikin <john@john-millikin.com> */
/* SPDX-License-Identifier: 0BSD */
#include <arpa/inet.h>
#include <linux/if.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <net/if.h>
#include <stdint.h>
#include <string.h>

/*
 * Create a NETLINK_ROUTE socket and bind it to a zeroed Netlink address.
 *
 * Returns the socket descriptor, or -1 with errno set to the error from
 * socket() or bind().
 */
int netlink_connect() {
	struct sockaddr_nl local_addr;
	int sock;

	sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
	if (sock == -1) {
		return -1;
	}

	memset(&local_addr, 0, sizeof local_addr);
	local_addr.nl_family = AF_NETLINK;

	if (bind(sock, (struct sockaddr*) &local_addr, sizeof local_addr) == -1) {
		/* close() may overwrite errno; keep bind()'s error for the caller. */
		int saved_errno = errno;
		close(sock);
		errno = saved_errno;
		return -1;
	}

	return sock;
}

The first Netlink command will be RTM_NEWADDR, which sets the address and prefix length (netmask) of the interface. I've only implemented IPv4 support for this example, but IPv6 is similar.

A Netlink request contains a header (struct nlmsghdr), message content (here that's a struct ifaddrmsg), and an optional list of key-value attributes. The set of necessary attributes isn't well documented, so I ran strace ip addr add and replicated its requests.

/*
 * Send an RTM_NEWADDR request that assigns an IPv4 address to an interface.
 *
 * netlink_fd:          NETLINK_ROUTE socket the request is sent on.
 * iface_name:          interface name, resolved with if_nametoindex().
 * address:             dotted-quad IPv4 address, parsed with inet_pton().
 * network_prefix_bits: prefix length stored in ifa_prefixlen.
 *
 * Returns 0 once the request has been sent, or -1 with errno set: EINVAL if
 * `address` is not a valid IPv4 address, the error left by if_nametoindex()
 * if the interface cannot be resolved, or the error from send(). Nothing is
 * sent when validation fails. The kernel's reply is not read here, so a
 * request that the kernel later rejects is still reported as 0.
 */
int netlink_set_addr_ipv4(
	  int netlink_fd
	, const char *iface_name
	, const char *address
	, uint8_t network_prefix_bits
) {
	struct {
		struct nlmsghdr  header;
		struct ifaddrmsg content;
		char             attributes_buf[64];
	} request;

	struct rtattr *request_attr;
	size_t attributes_buf_avail = sizeof request.attributes_buf;
	struct in_addr parsed_address;
	unsigned int iface_index;

	/* Validate the inputs first so a bad value never reaches the kernel as
	 * 0.0.0.0 or as interface index 0. */
	if (inet_pton(AF_INET, address, &parsed_address) != 1) {
		errno = EINVAL;
		return -1;
	}
	iface_index = if_nametoindex(iface_name);
	if (iface_index == 0) {
		return -1;
	}

	memset(&request, 0, sizeof request);
	request.header.nlmsg_len = NLMSG_LENGTH(sizeof request.content);
	request.header.nlmsg_flags = NLM_F_REQUEST | NLM_F_EXCL | NLM_F_CREATE;
	request.header.nlmsg_type = RTM_NEWADDR;
	request.content.ifa_index = iface_index;
	request.content.ifa_family = AF_INET;
	request.content.ifa_prefixlen = network_prefix_bits;

	/* request.attributes[IFA_LOCAL] = address */
	request_attr = IFA_RTA(&request.content);
	request_attr->rta_type = IFA_LOCAL;
	request_attr->rta_len = RTA_LENGTH(sizeof (struct in_addr));
	request.header.nlmsg_len += request_attr->rta_len;
	memcpy(RTA_DATA(request_attr), &parsed_address, sizeof parsed_address);

	/* request.attributes[IFA_ADDRESS] = address */
	request_attr = RTA_NEXT(request_attr, attributes_buf_avail);
	request_attr->rta_type = IFA_ADDRESS;
	request_attr->rta_len = RTA_LENGTH(sizeof (struct in_addr));
	request.header.nlmsg_len += request_attr->rta_len;
	memcpy(RTA_DATA(request_attr), &parsed_address, sizeof parsed_address);

	if (send(netlink_fd, &request, request.header.nlmsg_len, 0) == -1) {
		return -1;
	}
	return 0;
}

The second Netlink command uses RTM_NEWLINK to enable the interface. It's equivalent to running ip link set <interface> up.

/*
 * Send an RTM_NEWLINK request that sets the IFF_UP flag on an interface.
 *
 * netlink_fd: NETLINK_ROUTE socket the request is sent on.
 * iface_name: interface name, resolved with if_nametoindex().
 *
 * Returns 0 once the request has been sent, or -1 with errno set: the error
 * left by if_nametoindex() if the interface cannot be resolved (nothing is
 * sent in that case), or the error from send(). The kernel's reply is not
 * read here.
 */
int netlink_link_up(int netlink_fd, const char *iface_name) {
	struct {
		struct nlmsghdr  header;
		struct ifinfomsg content;
	} request;
	unsigned int iface_index;

	/* if_nametoindex() returns 0 on failure; don't send index 0. */
	iface_index = if_nametoindex(iface_name);
	if (iface_index == 0) {
		return -1;
	}

	memset(&request, 0, sizeof request);
	request.header.nlmsg_len = NLMSG_LENGTH(sizeof request.content);
	request.header.nlmsg_flags = NLM_F_REQUEST;
	request.header.nlmsg_type = RTM_NEWLINK;
	request.content.ifi_index = (int) iface_index;
	request.content.ifi_flags = IFF_UP;
	/* ifi_change is the mask of flag bits being changed: only IFF_UP. */
	request.content.ifi_change = IFF_UP;

	if (send(netlink_fd, &request, request.header.nlmsg_len, 0) == -1) {
		return -1;
	}
	return 0;
}

At this point the TUN/TAP interface has been fully configured and is just waiting for our process to read/write network data.

Step 4: Process network traffic

For this example I'll be writing a very simple tun2udp binary, which forwards IPv4 packets to/from UDP on localhost. Compile it with GCC or Clang:

gcc -o tun2udp tun2udp.c
send_port=12345
recv_port=12346
sudo ./tun2udp 10.11.12.0/24 $send_port $recv_port

/* Copyright (c) John Millikin <john@john-millikin.com> */
/* SPDX-License-Identifier: 0BSD */
#include <poll.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Shuttle packets between a TUN/TAP descriptor and two UDP sockets.
 *
 * Everything read from tuntap_fd is passed to send() on send_fd, and every
 * datagram received on recv_fd is written to tuntap_fd. The loop does not
 * end on its own; the function only returns on failure.
 *
 * Returns -1 with errno set when:
 *   - poll() fails with anything other than EINTR,
 *   - read(), recv() or write() fails,
 *   - poll() reports POLLNVAL (errno = EBADF) or POLLERR/POLLHUP
 *     (errno = EIO) on a descriptor that has no data left to read.
 */
int run_proxy(int tuntap_fd, int send_fd, int recv_fd) {
	struct pollfd poll_fds[2];
	char recv_buf[UINT16_MAX];
	size_t i;

	poll_fds[0].fd = tuntap_fd;
	poll_fds[0].events = POLLIN;
	poll_fds[1].fd = recv_fd;
	poll_fds[1].events = POLLIN;

	for (;;) {
		if (poll(poll_fds, 2, -1) == -1) {
			if (errno == EINTR) {
				/* Interrupted by a signal: nothing was reported, poll again. */
				continue;
			}
			return -1;
		}

		/* An error/hang-up condition with nothing to read would make poll()
		 * return immediately forever, so treat it as fatal instead of
		 * spinning. */
		for (i = 0; i < 2; i++) {
			short revents = poll_fds[i].revents;
			if ((revents & POLLIN) == 0
			    && (revents & (POLLERR | POLLHUP | POLLNVAL)) != 0) {
				errno = (revents & POLLNVAL) != 0 ? EBADF : EIO;
				return -1;
			}
		}

		if ((poll_fds[0].revents & POLLIN) != 0) {
			ssize_t count = read(tuntap_fd, recv_buf, sizeof recv_buf);
			if (count < 0) {
				return -1;
			}
			/* Best effort: a failed send() drops this packet but keeps the
			 * proxy running. */
			(void) send(send_fd, recv_buf, (size_t) count, 0);
		}

		if ((poll_fds[1].revents & POLLIN) != 0) {
			ssize_t count = recv(recv_fd, recv_buf, sizeof recv_buf, 0);
			if (count < 0) {
				return -1;
			}
			if (write(tuntap_fd, recv_buf, (size_t) count) == -1) {
				return -1;
			}
		}
	}
}

/*
 * Create a UDP socket whose peer address is 127.0.0.1:<port>.
 *
 * NOTE: despite the "bind" in its name, this function calls connect(), not
 * bind(). main() uses the result as the socket that packets are sent on.
 *
 * Returns the socket descriptor, or -1 with errno set to the error from
 * socket() or connect().
 */
int bind_localhost_udp(uint16_t port) {
	struct sockaddr_in peer;
	int sock;

	sock = socket(AF_INET, SOCK_DGRAM, 0);
	if (sock == -1) {
		return -1;
	}

	memset(&peer, 0, sizeof peer);
	peer.sin_family = AF_INET;
	peer.sin_addr.s_addr = inet_addr("127.0.0.1");
	peer.sin_port = htons(port);

	if (connect(sock, (struct sockaddr*) &peer, sizeof peer) == -1) {
		/* close() may overwrite errno; keep connect()'s error. */
		int saved_errno = errno;
		close(sock);
		errno = saved_errno;
		return -1;
	}

	return sock;
}

/*
 * Create a UDP socket bound to the local address 127.0.0.1:<port>.
 *
 * NOTE: despite the "connect" in its name, this function calls bind(), not
 * connect(). main() uses the result as the socket that packets are received
 * on.
 *
 * Returns the socket descriptor, or -1 with errno set to the error from
 * socket() or bind().
 */
int connect_localhost_udp(uint16_t port) {
	struct sockaddr_in local;
	int sock;

	sock = socket(AF_INET, SOCK_DGRAM, 0);
	if (sock == -1) {
		return -1;
	}

	memset(&local, 0, sizeof local);
	local.sin_family = AF_INET;
	local.sin_addr.s_addr = inet_addr("127.0.0.1");
	local.sin_port = htons(port);

	if (bind(sock, (struct sockaddr*) &local, sizeof local) == -1) {
		/* close() may overwrite errno; keep bind()'s error. */
		int saved_errno = errno;
		close(sock);
		errno = saved_errno;
		return -1;
	}

	return sock;
}

The rest of the code is just argument parsing. For the TUN interface address it accepts an IPv4 dotted quad, with an optional netmask (defaulting to /32).

/*
 * Split "A.B.C.D" or "A.B.C.D/N" into an address string and a prefix length.
 *
 * address_str:         modified in place. On success, a '/' separator (if
 *                      present) is overwritten with NUL so that address_str
 *                      holds only the dotted-quad address.
 * network_prefix_bits: receives N, or 32 when no "/N" suffix is given. It is
 *                      only written on success.
 *
 * The address must be accepted by inet_pton(AF_INET), and N must be a plain
 * decimal number in the range 0..32 (no sign, whitespace, or trailing text;
 * an empty prefix such as "10.0.0.1/" is rejected).
 *
 * Returns 0 on success. Returns -1 on invalid input, in which case
 * address_str is restored to its original contents.
 */
int split_address(char *address_str, uint8_t *network_prefix_bits) {
	struct in_addr parsed_address;
	char *prefix_sep;
	long prefix_raw = 32;

	prefix_sep = strchr(address_str, '/');
	if (prefix_sep != NULL) {
		/* Terminate the address part so it can be parsed on its own. */
		*prefix_sep = 0;
	}

	if (inet_pton(AF_INET, address_str, &parsed_address) != 1) {
		goto invalid;
	}

	if (prefix_sep != NULL) {
		const char *prefix_str = prefix_sep + 1;
		char *prefix_extra;

		/* strtol() skips leading whitespace, accepts a sign, and reports an
		 * empty string as 0; insist on a leading digit to rule those out. */
		if (*prefix_str < '0' || *prefix_str > '9') {
			goto invalid;
		}
		prefix_raw = strtol(prefix_str, &prefix_extra, 10);
		if (*prefix_extra != 0 || prefix_raw < 0 || prefix_raw > 32) {
			goto invalid;
		}
	}

	*network_prefix_bits = (uint8_t) prefix_raw;
	return 0;

invalid:
	/* Put the separator back so the caller can report the full input. */
	if (prefix_sep != NULL) {
		*prefix_sep = '/';
	}
	return -1;
}

/*
 * Parse a decimal port number.
 *
 * port_str: NUL-terminated string that must consist only of decimal digits
 *           (no sign, whitespace, or trailing text) and must not be empty.
 * port:     receives the parsed value; only written on success.
 *
 * Returns 0 on success, or -1 if port_str is not a number in the range
 * 0..UINT16_MAX.
 */
int parse_port(char *port_str, uint16_t *port) {
	char *extra;
	long raw;

	/* strtol() skips leading whitespace, accepts a sign, and reports an
	 * empty string as 0; insist on a leading digit to rule those out. */
	if (*port_str < '0' || *port_str > '9') {
		return -1;
	}

	raw = strtol(port_str, &extra, 10);
	if (*extra != 0) {
		return -1;
	}
	if (raw < 0 || raw > UINT16_MAX) {
		return -1;
	}

	*port = (uint16_t) raw;
	return 0;
}

Finally we get to main() and can glue everything together. Copy (or #include) the TUN/TAP and Netlink code from earlier sections. The TUN/TAP flags are hardcoded to IFF_TUN | IFF_NO_PI, which means it will send/receive IP packets with no additional framing. The interface name will be assigned by the kernel.

/*
 * tun2udp entry point.
 *
 * Usage: tun2udp <address> <send-port> <recv-port>
 *
 * Parses the arguments, opens the two UDP sockets, creates a TUN interface
 * (IFF_TUN | IFF_NO_PI, no name requested), assigns the address and brings
 * the link up over Netlink, then runs the proxy loop. Returns 1 after
 * printing a diagnostic if any step fails.
 */
int main(int argc, char **argv) {
	char tun_name[IFNAMSIZ];
	uint8_t prefix_len;
	uint16_t port_out, port_in;
	int tun_fd, nl_fd, udp_out_fd, udp_in_fd;

	if (argc < 4) {
		fprintf(stderr, "Usage: %s <address> <send-port> <recv-port>\n", argv[0]);
		return 1;
	}

	/* On success split_address() leaves argv[1] holding only the address. */
	if (split_address(argv[1], &prefix_len) == -1) {
		fprintf(stderr, "Invalid address \"%s\"\n", argv[1]);
		return 1;
	}
	if (parse_port(argv[2], &port_out) == -1) {
		fprintf(stderr, "Invalid port \"%s\"\n", argv[2]);
		return 1;
	}
	if (parse_port(argv[3], &port_in) == -1) {
		fprintf(stderr, "Invalid port \"%s\"\n", argv[3]);
		return 1;
	}

	/* UDP endpoints: one socket for outgoing packets, one for incoming. */
	udp_out_fd = bind_localhost_udp(port_out);
	if (udp_out_fd == -1) {
		fprintf(stderr, "bind_localhost_udp(%u): ", port_out);
		perror(NULL);
		return 1;
	}
	udp_in_fd = connect_localhost_udp(port_in);
	if (udp_in_fd == -1) {
		fprintf(stderr, "connect_localhost_udp(%u): ", port_in);
		perror(NULL);
		return 1;
	}

	/* Create the interface; tun_name receives the name it was given. */
	tun_fd = tuntap_connect(NULL, IFF_TUN | IFF_NO_PI, tun_name);
	if (tun_fd == -1) {
		perror("tuntap_connect");
		return 1;
	}

	/* The Netlink socket is only needed while configuring the interface. */
	nl_fd = netlink_connect();
	if (nl_fd == -1) {
		perror("netlink_connect");
		return 1;
	}
	if (netlink_set_addr_ipv4(nl_fd, tun_name, argv[1], prefix_len) == -1) {
		perror("netlink_set_addr_ipv4");
		return 1;
	}
	if (netlink_link_up(nl_fd, tun_name) == -1) {
		perror("netlink_link_up");
		return 1;
	}
	close(nl_fd);

	if (run_proxy(tun_fd, udp_out_fd, udp_in_fd) == -1) {
		perror("run_proxy");
		return 1;
	}
	return 0;
}

If the Netlink socket has bind() called on it, then the traced RTM_NEWADDR command is formatted like this:

sendto(6, [
	{
		nlmsg_len=40,
		nlmsg_type=RTM_NEWADDR,
		nlmsg_flags=NLM_F_REQUEST|NLM_F_EXCL|NLM_F_CREATE,
		nlmsg_seq=0,
		nlmsg_pid=0
	}, {
		ifa_family=AF_INET,
		ifa_prefixlen=24,
		ifa_flags=0,
		ifa_scope=RT_SCOPE_UNIVERSE,
		ifa_index=if_nametoindex("tun0")
	}, [
		[{nla_len=8, nla_type=IFA_LOCAL}, inet_addr("10.10.0.1")],
		[{nla_len=8, nla_type=IFA_ADDRESS}, inet_addr("10.10.0.1")]
	]
], 40, 0, NULL, 0) = 40

If the socket does not have bind() called on it, then the same command is formatted like this:

sendto(6, [
	{
		nlmsg_len=40,
		nlmsg_type=0x14 /* NLMSG_??? */,
		nlmsg_flags=NLM_F_REQUEST|0x600,
		nlmsg_seq=0,
		nlmsg_pid=0
	}, "\x02\x18\x00\x00\x55\x00\x00\x00\x08\x00\x02\x00\x0a\x0a\x00\x01\x08\x00\x01\x00\x0a\x0a\x00\x01"
], 40, 0, NULL, 0) = 40