5. Epoll Decouple interest set registration from poll +: O(1) on the wait +: Edge trigger - : system call for adding onto the set int epoll_create(int size); //desc, need close int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event); int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout); typedef union epoll_data { void *ptr; int fd; uint32_t u32; uint64_t u64; } epoll_data_t; struct epoll_event { uint32_t events; /* Epoll events */ epoll_data_t data; /* User data variable */ };
6. IOCTL Device / special file control int ioctl(int d, int request, ...); Request is specific to device being controlled, and may have a payload (ioctl_list)
7. Filesystem events int inotify_init(void); // desc, need close int inotify_add_watch(int fd, const char *pathname, uint32_t mask); // watch desc int inotify_rm_watch(int fd, uint32_t wd); FIONREAD ioctl fcntl: F_NOTIFY struct inotify_event { int wd; /* watch descriptor */ uint32_t mask; /* mask of events */ uint32_t cookie; /* unique cookie */ uint32_t len; /* size of 'name' field */ char name[]; /* null-terminated name */ };
8. int inotifyd_main(int argc UNUSED_PARAM, char **argv) { unsigned mask = IN_ALL_EVENTS; // assume we want all events struct pollfd pfd; char **watched = ++argv; // watched name list const char *args[] = { *argv, NULL, NULL, NULL, NULL }; // open inotify pfd.fd = inotify_init(); if (pfd.fd < 0) bb_perror_msg_and_die("no kernel support"); // setup watched while (*++argv) { char *path = *argv; char *masks = strchr(path, ':'); int wd; // watch descriptor // if mask is specified -> if (masks) { *masks = '\0'; // split path and mask // convert mask names to mask bitset mask = 0; while (*++masks) { int i = strchr(mask_names, *masks) - mask_names; if (i >= 0) { mask |= (1 << i); } } } // add watch wd = inotify_add_watch(pfd.fd, path, mask); if (wd < 0) { bb_perror_msg_and_die("add watch (%s) failed", path); } } static const char mask_names[] ALIGN1 = "a" // 0x00000001 File was accessed "c" // 0x00000002 File was modified "e" // 0x00000004 Metadata changed "w" // 0x00000008 Writable file was closed "0" // 0x00000010 Unwritable file closed "r" // 0x00000020 File was opened "m" // 0x00000040 File was moved from X "y" // 0x00000080 File was moved to Y "n" // 0x00000100 Subfile was created "d" // 0x00000200 Subfile was deleted "D" // 0x00000400 Self was deleted "M" // 0x00000800 Self was moved ; pfd.events = POLLIN; while (!signalled && poll(&pfd, 1, -1) > 0) { ssize_t len; void *buf; struct inotify_event *ie; // read out all pending events xioctl(pfd.fd, FIONREAD, &len); #define eventbuf bb_common_bufsiz1 ie = buf = (len <= sizeof(eventbuf)) ? eventbuf : xmalloc(len); len = full_read(pfd.fd, buf, len); // process events. N.B. events may vary in length while (len > 0) { int i; char events[12]; char *s = events; unsigned m = ie->mask; for (i = 0; i < 12; ++i, m >>= 1) { if (m & 1) { *s++ = mask_names[i]; } } *s = '\0'; args[1] = events; args[2] = watched[ie->wd]; args[3] = ie->len ? 
ie->name : NULL; xspawn((char **)args); // next event i = sizeof(struct inotify_event) + ie->len; len -= i; ie = (void*)((char*)ie + i); } if (eventbuf != buf) free(buf); } return EXIT_SUCCESS; }
9. Asynchronous I/O Only on O_DIRECT struct aiocb { int aio_fildes; /* file descriptor */ int aio_lio_opcode; /* operation to perform */ int aio_reqprio; /* request priority offset */ volatile void *aio_buf; /* pointer to buffer */ size_t aio_nbytes; /* length of operation */ struct sigevent aio_sigevent; /* signal number and value */ /* internal, private members follow... */ }; int aio_read (struct aiocb *aiocbp); int aio_write (struct aiocb *aiocbp); int aio_error (const struct aiocb *aiocbp); int aio_return (struct aiocb *aiocbp); int aio_cancel (int fd, struct aiocb *aiocbp); int aio_fsync (int op, struct aiocb *aiocbp); int aio_suspend (const struct aiocb * const cblist[], int n, const struct timespec *timeout);
10. Network Architecture Application – telnet/ftp/http...etc Presentation -- intended for e.g. encryption Session -- e.g. iSCSI Transport – PORTS Network – IP, ATM Link -- Physical – Ethernet, wifi... OSI Packets and Data Encapsulation Protocols can be stacked on top of that e.g. CIM over HTTP ------------------------------------------------------------- | Eth | IP | TCP | App | DDDDAAAATTTTAAAA | -------------------------------------------------------------
11. Focus Link is handled by HW and drivers Network: IP, handled by kernel, affects addressing and byte ordering Transport layer TCP – Reliable, sequenced, Connection-oriented UDP – Unreliable, unsequenced, connectionless Handled by kernel which provides us an interface Application is what you are writing
15. Name Service what hosts (sometimes, what service) DNS/BIND, NIS/YP, LDAP DNS: domain name (fully qualified) The Resolver named /etc/hosts Order: /etc/host.conf
16. Name / Address Info address ==> name Name ==> address(es) String ==> Address Address ==> String My host Info int getnameinfo(const struct sockaddr *sa, socklen_t salen, char *host, size_t hostlen, char *serv, size_t servlen, int flags); int getaddrinfo(const char *node, const char *service, const struct addrinfo *hints, struct addrinfo **res); void freeaddrinfo(struct addrinfo *res); const char *gai_strerror(int errcode); struct addrinfo { int ai_flags; int ai_family; int ai_socktype; int ai_protocol; size_t ai_addrlen; struct sockaddr *ai_addr; char *ai_canonname; struct addrinfo *ai_next; }; int inet_pton(int af, const char *src, void *dst); const char *inet_ntop(int af, const void *src, char *dst, socklen_t cnt); NI_NOFQDN NI_NUMERICHOST NI_NAMEREQD NI_NUMERICSERV NI_DGRAM int gethostname(char *name, size_t len);
17. Legacy Name/Address Info struct hostent *gethostbyname(const char *name); struct hostent *gethostbyaddr(const void *addr, socklen_t len, int type); void herror(const char *s); const char *hstrerror(int err); Require a deep copy GNU extensions: re-entrancy (_r), POSIX extension: gethostent(void) IPV4 only: inet_ntoa/aton and family struct hostent { char *h_name; char **h_aliases; int h_addrtype; int h_length; char **h_addr_list; }
18. Sockets Model Virtual hookup (like the phone) A special “descriptor” (hooks VFS to transport layer) Creation int socket(int domain, int type, int protocol); Domains: PF_{INET, INET6, UNIX, NETLINK ....} Types: SOCK_{STREAM, DGRAM, RAW, ...} Protocols and getprotoent() Address / Socket binding int bind(int sockfd, const struct sockaddr *addr, socklen_t addrlen); INADDR_ANY (IPv4), in6addr_any / IN6ADDR_ANY_INIT (IPv6)
19. Reliable Sockets Connect to server address int connect(int sockfd, const struct sockaddr *serv_addr, socklen_t addrlen); Listening to incoming connections int listen(int sockfd, int backlog); Accepting a new connection int accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen); Gets a new “child” socket descriptor Stevens et al
21. Socket Options int getsockopt(int s, int level, int optname, void *optval, socklen_t *optlen); int setsockopt(int s, int level, int optname, const void *optval, socklen_t optlen); Some important options: SO_KEEPALIVE SO_RCVBUF / SO_SNDBUF SO_LINGER SO_REUSEADDR
22. Unreliable Communication ssize_t sendto(int s, const void *buf, size_t len, int flags, const struct sockaddr *to, socklen_t tolen); ssize_t recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from, socklen_t *fromlen); To add reliability: Connection (You can still connect, no handshake) Sequence Replies + timeouts + retransmission
23. I/O Like File I/O: read/write/readv/writev/poll/select/ fcntl-SIGIO... ssize_t send(int s, const void *buf, size_t len, int flags); ssize_t recv(int s, void *buf, size_t len, int flags); Flags only matter on connections MSG_{CONFIRM, DONTROUTE, DONTWAIT, EOR, MORE, NOSIGNAL, OOB, WAITALL, PEEK}