分类 系统安全 下的文章

ss源代码调试&原理分析

源代码调试

ss位于iproute2这个库中,可以从iproute2官网下载到源代码,配置其源代码调试的方式和netstat源代码调试这篇文章一样.

在根目录下创建CMakeLists.txt文件,内容如下:

cmake_minimum_required(VERSION 3.13)
project(test C)
 
set(BUILD_DIR .)
 
#add_executable()
add_custom_target(ss command -c ${BUILD_DIR})

同时将Makefile文件中第45行的 CCOPTS = -O2 修改为 CCOPTS = -O0 -g3

在clion中配置Target:

clion-settings.png

Netid  State      Recv-Q Send-Q Local Address:Port             Peer Address:Port
tcp    ESTAB      0      0      127.0.0.1:57354                127.0.0.1:socks               
tcp    ESTAB      0      0      127.0.0.1:37350                127.0.0.1:socks               
tcp    ESTAB      0      0      172.16.40.154:43450                45.8.223.61:17250               
tcp    CLOSE-WAIT 1      0      127.0.0.1:57398                127.0.0.1:socks               
tcp    ESTAB      0      0      127.0.0.1:57062                127.0.0.1:socks

和直接运行ss命令得到的结果一样.接下来就是分析整个ss程序的执行流程

main

main函数就是用于对各种选项进行解析,并以此判断执行什么函数.

int main(int argc, char *argv[])
{
    int saw_states = 0;
    int saw_query = 0;
    int do_summary = 0;
    const char *dump_tcpdiag = NULL;
    FILE *filter_fp = NULL;
    int ch;
    int state_filter = 0;
    int addrp_width, screen_width = 80;
 
    while ((ch = getopt_long(argc, argv,
                 "dhaletuwxnro460spbEf:miA:D:F:vVzZN:KHS",
                 long_opts, NULL)) != EOF) {
        switch (ch) {
        case 'n':
            resolve_services = 0;
            break;
        ......
        }
        .....
    }

在默认情况下,会进入到如下代码中

if (do_default) {
    state_filter = state_filter ? state_filter : SS_CONN;
    filter_default_dbs(&current_filter);
}

程序会执行filter_default_dbs()函数,设置默认的过滤条件.

filter_default_dbs

/*
 * Enable the default set of socket tables on filter `f`.
 * Called when ss runs without explicit protocol options, so that every
 * supported socket family (TCP/UDP/DCCP/RAW, UNIX, packet, netlink,
 * SCTP) is shown by default.
 */
static void filter_default_dbs(struct filter *f) {
    static const int dbs[] = {
        UDP_DB, DCCP_DB, TCP_DB, RAW_DB,
        UNIX_ST_DB, UNIX_DG_DB, UNIX_SQ_DB,
        PACKET_R_DB, PACKET_DG_DB,
        NETLINK_DB, SCTP_DB,
    };
    unsigned int i;

    for (i = 0; i < sizeof(dbs) / sizeof(dbs[0]); i++)
        filter_db_set(f, dbs[i]);
}

filter_default_dbs很简单,就是在默认情况下设置的过滤条件.

之后程序会执行到unix_show(&current_filter);

unix_show

函数代码如下:

/* Set the default database filter: with no protocol options given,
 * every supported socket table is selected for display. */
static void filter_default_dbs(struct filter *f) {
    filter_db_set(f, UDP_DB);
    filter_db_set(f, DCCP_DB);
    filter_db_set(f, TCP_DB);
    filter_db_set(f, RAW_DB);
    filter_db_set(f, UNIX_ST_DB);      /* UNIX stream sockets */
    filter_db_set(f, UNIX_DG_DB);      /* UNIX datagram sockets */
    filter_db_set(f, UNIX_SQ_DB);      /* UNIX seqpacket sockets */
    filter_db_set(f, PACKET_R_DB);     /* raw packet sockets */
    filter_db_set(f, PACKET_DG_DB);    /* datagram packet sockets */
    filter_db_set(f, NETLINK_DB);
    filter_db_set(f, SCTP_DB);
}
filter_default_dbs很简单就是在默认情况下设置的过滤条件.

之后程序会执行到unix_show(&current_filter);

unix_show
函数代码如下:

unix_show  Collapse source
/*
 * Dump AF_UNIX sockets.  First tries the sock_diag netlink interface
 * (unix_show_netlink); only falls back to parsing /proc/net/unix when
 * the PROC_NET_UNIX or PROC_ROOT environment overrides are set or the
 * netlink dump fails.  Returns 0 on success, -1 on I/O error.
 */
static int unix_show(struct filter *f)
{
    FILE *fp;
    char buf[256];
    char name[128];
    int  newformat = 0;
    int  cnt;
    struct sockstat *list = NULL;
    /* Maps the kernel's UNIX socket state values (1-based) onto ss's
     * generic SS_* state enumeration. */
    const int unix_state_map[] = { SS_CLOSE, SS_SYN_SENT,
                       SS_ESTABLISHED, SS_CLOSING };
 
    if (!filter_af_get(f, AF_UNIX))
        return 0;
 
    /* Preferred path: netlink sock_diag dump, unless /proc overrides
     * are in effect. */
    if (!getenv("PROC_NET_UNIX") && !getenv("PROC_ROOT")
        && unix_show_netlink(f) == 0)
        return 0;
 
    if ((fp = net_unix_open()) == NULL)
        return -1;
    /* Consume the header line of /proc/net/unix. */
    if (!fgets(buf, sizeof(buf), fp)) {
        fclose(fp);
        return -1;
    }
 
    /* NOTE(review): header starting with "Peer" presumably marks the
     * newer /proc format that carries peer/queue columns — confirm. */
    if (memcmp(buf, "Peer", 4) == 0)
        newformat = 1;
    cnt = 0;
 
    while (fgets(buf, sizeof(buf), fp)) {
        struct sockstat *u, **insp;
        int flags;
 
        if (!(u = calloc(1, sizeof(*u))))
            break;
 
        /* Path column may be absent; then name stays empty. */
        if (sscanf(buf, "%x: %x %x %x %x %x %d %s",
               &u->rport, &u->rq, &u->wq, &flags, &u->type,
               &u->state, &u->ino, name) < 8)
            name[0] = 0;
 
        /* UNIX sockets have no ports; reuse the inode as "local port"
         * so generic printing code has something to show. */
        u->lport = u->ino;
        u->local.family = u->remote.family = AF_UNIX;
 
        if (flags & (1 << 16)) {
            /* Flag bit 16 (__SO_ACCEPTCON) set => listening socket. */
            u->state = SS_LISTEN;
        } else if (u->state > 0 &&
               u->state <= ARRAY_SIZE(unix_state_map)) {
            u->state = unix_state_map[u->state-1];
            /* A connected datagram socket shows as CLOSE in /proc;
             * report it as established if it has a peer. */
            if (u->type == SOCK_DGRAM && u->state == SS_CLOSE && u->rport)
                u->state = SS_ESTABLISHED;
        }
        if (unix_type_skip(u, f) ||
            !(f->states & (1 << u->state))) {
            free(u);
            continue;
        }
 
        /* Old format has no peer/queue columns: zero them out. */
        if (!newformat) {
            u->rport = 0;
            u->rq = 0;
            u->wq = 0;
        }
 
        if (name[0]) {
            u->name = strdup(name);
            if (!u->name) {
                free(u);
                break;
            }
        }
 
        /* Resolve the peer's name by looking up its inode among the
         * sockets collected so far. */
        if (u->rport) {
            struct sockstat *p;
 
            for (p = list; p; p = p->next) {
                if (u->rport == p->lport)
                    break;
            }
            if (!p)
                u->peer_name = "?";
            else
                u->peer_name = p->name ? : "*";
        }
 
        /* Apply the user-supplied expression filter (f->f), matching
         * on local/peer path names. */
        if (f->f) {
            struct sockstat st = {
                .local.family = AF_UNIX,
                .remote.family = AF_UNIX,
            };
 
            memcpy(st.local.data, &u->name, sizeof(u->name));
            if (strcmp(u->peer_name, "*"))
                memcpy(st.remote.data, &u->peer_name,
                       sizeof(u->peer_name));
            if (run_ssfilter(f->f, &st) == 0) {
                free(u->name);
                free(u);
                continue;
            }
        }
 
        /* Insert sorted by (type, inode) to keep output stable. */
        insp = &list;
        while (*insp) {
            if (u->type < (*insp)->type ||
                (u->type == (*insp)->type &&
                 u->ino < (*insp)->ino))
                break;
            insp = &(*insp)->next;
        }
        u->next = *insp;
        *insp = u;
 
        /* Bound memory use: flush and print the list once it grows
         * past MAX_UNIX_REMEMBER entries. */
        if (++cnt > MAX_UNIX_REMEMBER) {
            while (list) {
                unix_stats_print(list, f);
                printf("\n");
 
                unix_list_drop_first(&list);
            }
            cnt = 0;
        }
    }
    fclose(fp);
    /* Print whatever remains in the list. */
    while (list) {
        unix_stats_print(list, f);
        printf("\n");
 
        unix_list_drop_first(&list);
    }
 
    return 0;
}

这个函数就是解析网络数据的核心函数.代码较多,还是分步分析这些代码.

unix_show_netlink

if (!getenv("PROC_NET_UNIX") && !getenv("PROC_ROOT")
       && unix_show_netlink(f) == 0)
       return 0;
  • getenv判断PROC_NET_UNIX和PROC_ROOT这两个环境变量是否存在
  • unix_show_netlink(f)创建netlink

追踪进入到unix_show_netlink()

/*
 * Build and send an AF_UNIX sock_diag dump request over netlink.
 * `f` supplies the state mask; replies are handled by the
 * unix_show_sock callback.  Returns handle_netlink_request's result
 * (0 on success, negative on failure).
 */
static int unix_show_netlink(struct filter *f)
{
    /* DIAG_REQUEST wraps a struct nlmsghdr plus the request payload. */
    DIAG_REQUEST(req, struct unix_diag_req r);
 
    req.r.sdiag_family = AF_UNIX;
    req.r.udiag_states = f->states;   /* only dump sockets in these states */
    req.r.udiag_show = UDIAG_SHOW_NAME | UDIAG_SHOW_PEER | UDIAG_SHOW_RQLEN;
    if (show_mem)
        req.r.udiag_show |= UDIAG_SHOW_MEMINFO;  /* -m: also request memory info */
 
    return handle_netlink_request(f, &req.nlh, sizeof(req), unix_show_sock);
}

f是一个filter,用于设置一些简单的过滤条件.

req.r.sdiag_family = AF_UNIX;
req.r.udiag_states = f->states;
req.r.udiag_show = UDIAG_SHOW_NAME | UDIAG_SHOW_PEER | UDIAG_SHOW_RQLEN;

是用于设置diag_netnetlink的请求头,之后调用handle_netlink_request(f, &req.nlh, sizeof(req),unix_show_sock);

handle_netlink_request

跟踪进入到handle_netlink_request的实现

/*
 * Open a NETLINK_SOCK_DIAG socket, send the prepared request `req` of
 * `size` bytes, and feed every reply message to `show_one_sock` (with
 * `f` as its argument).  Returns 0 on success, -1 on any failure; the
 * netlink handle is always closed via the Exit label.
 */
static int handle_netlink_request(struct filter *f, struct nlmsghdr *req,
        size_t size, rtnl_filter_t show_one_sock)
{
    int ret = -1;
    struct rtnl_handle rth;
 
    if (rtnl_open_byproto(&rth, 0, NETLINK_SOCK_DIAG))
        return -1;
 
    /* Sequence number used to match dump replies to this request. */
    rth.dump = MAGIC_SEQ;
 
    if (rtnl_send(&rth, req, size) < 0)
        goto Exit;
 
    /* Receive loop: invokes show_one_sock for each socket record. */
    if (rtnl_dump_filter(&rth, show_one_sock, f))
        goto Exit;
 
    ret = 0;
Exit:
    rtnl_close(&rth);
    return ret;
}
  • 调用rtnl_send(&rth, req, size)用于发送diag_netnetlink的消息头.
  • rtnl_dump_filter(&rth, show_one_sock,f)获取netlink的返回消息,回调show_one_sock()函数.

rtnl_send

跟踪进入到lib/libnetlink.c

/* Send `len` bytes from `buf` on the netlink socket held by `rth`.
 * Thin wrapper around send(2); returns its result directly. */
int rtnl_send(struct rtnl_handle *rth, const void *buf, int len)
{
    return send(rth->fd, buf, len, 0);
}

rtnl_send直接调用send()方法发送信息.

rtnl_dump_filter

跟踪进入到lib/libnetlink.c

/*
 * Convenience front-end for rtnl_dump_filter_l: wraps a single
 * (filter, arg1, nc_flags) callback into a NULL-terminated argument
 * array and runs the dump loop with it.
 */
int rtnl_dump_filter_nc(struct rtnl_handle *rth,
             rtnl_filter_t filter,
             void *arg1, __u16 nc_flags)
{
    const struct rtnl_dump_filter_arg a[2] = {
        { .filter = filter, .arg1 = arg1, .nc_flags = nc_flags, },
        { .filter = NULL,   .arg1 = NULL, .nc_flags = 0, },  /* terminator */
    };
 
    return rtnl_dump_filter_l(rth, a);
}

rtnl_dump_filter_nc()中设置rtnl_dump_filter_arg过滤函数,之后调用rtnl_dump_filter_l()

/*
 * Core netlink dump receive loop: reads messages from rth->fd until a
 * NLMSG_DONE arrives, passing each valid nlmsghdr to every filter in
 * the NULL-terminated `arg` array.  Returns 0 when the dump completes,
 * -1 on receive/parse errors (a filter may also return its own
 * negative code).
 */
int rtnl_dump_filter_l(struct rtnl_handle *rth,
               const struct rtnl_dump_filter_arg *arg)
{
    struct sockaddr_nl nladdr;
    struct iovec iov;
    struct msghdr msg = {
        .msg_name = &nladdr,
        .msg_namelen = sizeof(nladdr),
        .msg_iov = &iov,
        .msg_iovlen = 1,
    };
    char buf[32768];
    int dump_intr = 0;
 
    iov.iov_base = buf;
    while (1) {
        int status;
        const struct rtnl_dump_filter_arg *a;
        int found_done = 0;
        int msglen = 0;
 
        iov.iov_len = sizeof(buf);
        status = recvmsg(rth->fd, &msg, 0);
 
        if (status < 0) {
            /* Transient errors: retry the receive. */
            if (errno == EINTR || errno == EAGAIN)
                continue;
            fprintf(stderr, "netlink receive error %s (%d)\n",
                strerror(errno), errno);
            return -1;
        }
 
        if (status == 0) {
            fprintf(stderr, "EOF on netlink\n");
            return -1;
        }
 
        /* Optional raw capture of the dump stream. */
        if (rth->dump_fp)
            fwrite(buf, 1, NLMSG_ALIGN(status), rth->dump_fp);
 
        /* Run every registered filter over the received batch. */
        for (a = arg; a->filter; a++) {
            struct nlmsghdr *h = (struct nlmsghdr *)buf;
 
            msglen = status;
 
            while (NLMSG_OK(h, msglen)) {
                int err = 0;
 
                h->nlmsg_flags &= ~a->nc_flags;
 
                /* Skip messages not addressed to us or not belonging
                 * to this dump sequence. */
                if (nladdr.nl_pid != 0 ||
                    h->nlmsg_pid != rth->local.nl_pid ||
                    h->nlmsg_seq != rth->dump)
                    goto skip_it;
 
                /* Kernel signals the dump was interrupted and may be
                 * inconsistent; warn once at the end. */
                if (h->nlmsg_flags & NLM_F_DUMP_INTR)
                    dump_intr = 1;
 
                if (h->nlmsg_type == NLMSG_DONE) {
                    err = rtnl_dump_done(h);
                    if (err < 0)
                        return -1;
 
                    found_done = 1;
                    break; /* process next filter */
                }
 
                if (h->nlmsg_type == NLMSG_ERROR) {
                    rtnl_dump_error(rth, h);
                    return -1;
                }
 
                /* In capture mode the callback is skipped. */
                if (!rth->dump_fp) {
                    err = a->filter(&nladdr, h, a->arg1);
                    if (err < 0)
                        return err;
                }
 
skip_it:
                h = NLMSG_NEXT(h, msglen);
            }
        }
 
        if (found_done) {
            if (dump_intr)
                fprintf(stderr,
                    "Dump was interrupted and may be inconsistent.\n");
            return 0;
        }
 
        if (msg.msg_flags & MSG_TRUNC) {
            fprintf(stderr, "Message truncated\n");
            continue;
        }
        if (msglen) {
            fprintf(stderr, "!!!Remnant of size %d\n", msglen);
            exit(1);
        }
    }
}

rtnl_dump_filter_l()实现了通过netlink获取数据,然后根据rtnl_dump_filter_arg过滤数据.

获取数据:

struct sockaddr_nl nladdr;
struct iovec iov;
struct msghdr msg = {
    .msg_name = &nladdr,
    .msg_namelen = sizeof(nladdr),
    .msg_iov = &iov,
    .msg_iovlen = 1,
};
.....
status = recvmsg(rth->fd, &msg, 0);

过滤数据:

for (a = arg; a->filter; a++) {
    struct nlmsghdr *h = (struct nlmsghdr *)buf;
    .....
    h->nlmsg_flags &= ~a->nc_flags;
    if (nladdr.nl_pid != 0 ||
                h->nlmsg_pid != rth->local.nl_pid ||
                h->nlmsg_seq != rth->dump)
                goto skip_it;
 
            if (h->nlmsg_flags & NLM_F_DUMP_INTR)
                dump_intr = 1;
 
            if (h->nlmsg_type == NLMSG_DONE) {
                err = rtnl_dump_done(h);
                if (err < 0)
                    return -1;
 
                found_done = 1;
                break; /* process next filter */
            }
            .......

之前说过,handle_netlink_request(f, &req.nlh, sizeof(req), unix_show_sock);程序最终会回调unix_show_sock函数.

unix_show_sock

跟踪unix_show_sock的实现

/*
 * Netlink dump callback for AF_UNIX sockets: decodes one
 * unix_diag_msg reply, fills a sockstat, applies the filter in `arg`,
 * and prints the record.  Always returns 0 (skips, never aborts the
 * dump).
 */
static int unix_show_sock(const struct sockaddr_nl *addr, struct nlmsghdr *nlh,
        void *arg)
{
    struct filter *f = (struct filter *)arg;
    struct unix_diag_msg *r = NLMSG_DATA(nlh);
    struct rtattr *tb[UNIX_DIAG_MAX+1];
    char name[128];
    struct sockstat stat = { .name = "*", .peer_name = "*" };
 
    /* Index the attributes (name, peer, rqlen, ...) following the
     * fixed-size unix_diag_msg header. */
    parse_rtattr(tb, UNIX_DIAG_MAX, (struct rtattr *)(r+1),
             nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*r)));
 
    stat.type  = r->udiag_type;
    stat.state = r->udiag_state;
    /* UNIX sockets have no ports; the inode doubles as "local port". */
    stat.ino   = stat.lport = r->udiag_ino;
    stat.local.family = stat.remote.family = AF_UNIX;
 
    if (unix_type_skip(&stat, f))
        return 0;
 
    if (tb[UNIX_DIAG_RQLEN]) {
        struct unix_diag_rqlen *rql = RTA_DATA(tb[UNIX_DIAG_RQLEN]);
 
        stat.rq = rql->udiag_rqueue;
        stat.wq = rql->udiag_wqueue;
    }
    if (tb[UNIX_DIAG_NAME]) {
        int len = RTA_PAYLOAD(tb[UNIX_DIAG_NAME]);
 
        /* NOTE(review): len is not checked against sizeof(name);
         * a path >= 128 bytes would overflow — confirm kernel bound. */
        memcpy(name, RTA_DATA(tb[UNIX_DIAG_NAME]), len);
        name[len] = '\0';
        if (name[0] == '\0') {
            /* Abstract-namespace socket: replace embedded NULs with
             * '@' so the name is printable. */
            int i;
            for (i = 0; i < len; i++)
                if (name[i] == '\0')
                    name[i] = '@';
        }
        stat.name = &name[0];
        memcpy(stat.local.data, &stat.name, sizeof(stat.name));
    }
    if (tb[UNIX_DIAG_PEER])
        stat.rport = rta_getattr_u32(tb[UNIX_DIAG_PEER]);  /* peer inode */
 
    /* User-supplied filter expression. */
    if (f->f && run_ssfilter(f->f, &stat) == 0)
        return 0;
 
    unix_stats_print(&stat, f);
 
    if (show_mem)
        print_skmeminfo(tb, UNIX_DIAG_MEMINFO);
    if (show_details) {
        if (tb[UNIX_DIAG_SHUTDOWN]) {
            unsigned char mask;
 
            /* Bit 0 = receive shutdown, bit 1 = send shutdown. */
            mask = rta_getattr_u8(tb[UNIX_DIAG_SHUTDOWN]);
            printf(" %c-%c", mask & 1 ? '-' : '<', mask & 2 ? '-' : '>');
        }
    }
    printf("\n");
 
    return 0;
}

1.struct unix_diag_msg *r = NLMSG_DATA(nlh); parse_rtattr(tb, UNIX_DIAG_MAX, (struct rtattr *)(r+1),nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*r)));获取netlink的数据

2.解析数据并赋值

stat.type  = r->udiag_type;
stat.state = r->udiag_state;
stat.ino   = stat.lport = r->udiag_ino;
stat.local.family = stat.remote.family = AF_UNIX;
-------------------------------------------------
stat.rq = rql->udiag_rqueue;
stat.wq = rql->udiag_wqueue;

unix_stats_print

unix_stats_print(&stat, f);获取网络的连接状态

/*
 * Print one AF_UNIX socket record: netid/state/queues, then the local
 * and peer names (path + inode-as-port), then optional process info.
 */
static void unix_stats_print(struct sockstat *s, struct filter *f)
{
    char port_name[30] = {};
 
    sock_state_print(s);
 
    /* "*" stands in for an unnamed (unbound) socket. */
    sock_addr_print(s->name ?: "*", " ",
            int_to_str(s->lport, port_name), NULL);
    sock_addr_print(s->peer_name ?: "*", " ",
            int_to_str(s->rport, port_name), NULL);
 
    /* Appends users:(...) when -p / context options are active. */
    proc_ctx_print(s);
}

sock_state_print

跟踪进入到sock_state_print()

/*
 * Print the leading columns of one socket line: Netid, State and the
 * Recv-Q/Send-Q counters.  The netid string is chosen from the
 * socket's address family; the state string comes from sstate_name.
 */
static void sock_state_print(struct sockstat *s)
{
    const char *sock_name;
    /* Maps SS_* state codes to the strings shown in the State column. */
    static const char * const sstate_name[] = {
        "UNKNOWN",
        [SS_ESTABLISHED] = "ESTAB",
        [SS_SYN_SENT] = "SYN-SENT",
        [SS_SYN_RECV] = "SYN-RECV",
        [SS_FIN_WAIT1] = "FIN-WAIT-1",
        [SS_FIN_WAIT2] = "FIN-WAIT-2",
        [SS_TIME_WAIT] = "TIME-WAIT",
        [SS_CLOSE] = "UNCONN",
        [SS_CLOSE_WAIT] = "CLOSE-WAIT",
        [SS_LAST_ACK] = "LAST-ACK",
        [SS_LISTEN] =   "LISTEN",
        [SS_CLOSING] = "CLOSING",
    };
 
    switch (s->local.family) {
    case AF_UNIX:
        sock_name = unix_netid_name(s->type);   /* u_str/u_dgr/u_seq */
        break;
    case AF_INET:
    case AF_INET6:
        sock_name = proto_name(s->type);        /* tcp/udp/raw */
        break;
    case AF_PACKET:
        sock_name = s->type == SOCK_RAW ? "p_raw" : "p_dgr";
        break;
    case AF_NETLINK:
        sock_name = "nl";
        break;
    default:
        sock_name = "unknown";
    }
 
    /* Column widths are globals computed from the terminal width. */
    if (netid_width)
        printf("%-*s ", netid_width,
               is_sctp_assoc(s, sock_name) ? "" : sock_name);
    if (state_width) {
        /* SCTP associations are shown indented under their endpoint. */
        if (is_sctp_assoc(s, sock_name))
            printf("`- %-*s ", state_width - 3,
                   sctp_sstate_name[s->state]);
        else
            printf("%-*s ", state_width, sstate_name[s->state]);
    }
 
    printf("%-6d %-6d ", s->rq, s->wq);
}

根据s->local.family分别输出对应的内容,代码就不做过多的解释了,就是简单的switch case的判断.全部执行完毕之后,输出的结果是:

Netid  State      Recv-Q Send-Q Local Address:Port                 Peer Address:Port               
u_seq  ESTAB      0      0      @00017 309855                * 309856

可以发现其实在ss的默认输出情况下也是没有pid信息.如果我们采用ss -p,结果是:

Netid  State      Recv-Q Send-Q Local Address:Port                 Peer Address:Port               
u_seq  ESTAB      0      0      @00017 309855                * 309856                users:(("code",pid=17009,fd=17))
u_seq  ESTAB      0      0      @00012 157444                * 157445                users:(("chrome",pid=5834,fd=10))

user_ent_hash_build

当我们加了-p参数之后,程序运行的结果:

case 'p':
    show_users++;
    user_ent_hash_build();
    break;

show_users的值变为1,程序接着执行user_ent_hash_build()

/*
 * Build the inode -> (process, pid, fd) hash used by -p/--processes.
 * Walks every /proc/<pid>/fd/ directory, readlink()s each fd, and for
 * links of the form "socket:[<ino>]" records the owning process via
 * user_ent_add().  Runs at most once per invocation.
 */
static void user_ent_hash_build(void)
{
    /* PROC_ROOT lets tests/containers point at an alternate /proc. */
    const char *root = getenv("PROC_ROOT") ? : "/proc/";
    struct dirent *d;
    char name[1024];
    int nameoff;
    DIR *dir;
    char *pid_context;
    char *sock_context;
    const char *no_ctx = "unavailable";
    static int user_ent_hash_build_init;
 
    /* If show_users & show_proc_ctx set only do this once */
    if (user_ent_hash_build_init != 0)
        return;
 
    user_ent_hash_build_init = 1;
 
    strlcpy(name, root, sizeof(name));
 
    if (strlen(name) == 0 || name[strlen(name)-1] != '/')
        strcat(name, "/");
 
    nameoff = strlen(name);   /* offset where "<pid>/fd/" is appended */
 
    dir = opendir(name);
    if (!dir)
        return;
 
    while ((d = readdir(dir)) != NULL) {
        struct dirent *d1;
        char process[16];
        char *p;
        int pid, pos;
        DIR *dir1;
        char crap;
 
        /* Only purely-numeric entries are process directories;
         * the trailing %c rejects names like "123abc". */
        if (sscanf(d->d_name, "%d%c", &pid, &crap) != 1)
            continue;
 
        /* SELinux process context; "unavailable" when not supported. */
        if (getpidcon(pid, &pid_context) != 0)
            pid_context = strdup(no_ctx);
 
        snprintf(name + nameoff, sizeof(name) - nameoff, "%d/fd/", pid);
        pos = strlen(name);
        if ((dir1 = opendir(name)) == NULL) {
            free(pid_context);
            continue;
        }
 
        process[0] = '\0';
        p = process;
 
        while ((d1 = readdir(dir1)) != NULL) {
            const char *pattern = "socket:[";
            unsigned int ino;
            char lnk[64];
            int fd;
            ssize_t link_len;
            char tmp[1024];
 
            if (sscanf(d1->d_name, "%d%c", &fd, &crap) != 1)
                continue;
 
            snprintf(name+pos, sizeof(name) - pos, "%d", fd);
 
            link_len = readlink(name, lnk, sizeof(lnk)-1);
            if (link_len == -1)
                continue;
            lnk[link_len] = '\0';
 
            /* Only fds that are sockets: "socket:[<inode>]". */
            if (strncmp(lnk, pattern, strlen(pattern)))
                continue;
 
            sscanf(lnk, "socket:[%u]", &ino);
 
            snprintf(tmp, sizeof(tmp), "%s/%d/fd/%s",
                    root, pid, d1->d_name);
 
            /* SELinux context of the socket fd itself. */
            if (getfilecon(tmp, &sock_context) <= 0)
                sock_context = strdup(no_ctx);
 
            /* Lazily read the process name (comm) from
             * /proc/<pid>/stat on the first socket fd found. */
            if (*p == '\0') {
                FILE *fp;
 
                snprintf(tmp, sizeof(tmp), "%s/%d/stat",
                    root, pid);
                if ((fp = fopen(tmp, "r")) != NULL) {
                    if (fscanf(fp, "%*d (%[^)])", p) < 1)
                        ; /* ignore */
                    fclose(fp);
                }
            }
            user_ent_add(ino, p, pid, fd,
                    pid_context, sock_context);
            free(sock_context);
        }
        free(pid_context);
        closedir(dir1);
    }
    closedir(dir);
}

这个解析方法与netstat中的prg_cache_load的方式类似.都是解析/proc/pid/fd下面的内容获得socket的inode编号.得到pid,inode和fd之后,调用user_ent_add()方法.

user_ent_add

/*
 * Insert one (inode, process, pid, fd, contexts) record at the head of
 * its bucket in user_ent_hash.  Aborts on malloc failure.
 * NOTE(review): the strdup() results are not checked — a failed strdup
 * would leave NULL pointers that later printf("%s") paths dereference;
 * confirm against upstream.
 */
static void user_ent_add(unsigned int ino, char *process,
                    int pid, int fd,
                    char *proc_ctx,
                    char *sock_ctx)
{
    struct user_ent *p, **pp;
 
    p = malloc(sizeof(struct user_ent));
    if (!p) {
        fprintf(stderr, "ss: failed to malloc buffer\n");
        abort();
    }
    p->next = NULL;
    p->ino = ino;
    p->pid = pid;
    p->fd = fd;
    /* Copy the strings: the caller's buffers are reused/freed. */
    p->process = strdup(process);
    p->process_ctx = strdup(proc_ctx);
    p->socket_ctx = strdup(sock_ctx);
 
    /* Push onto the bucket's singly-linked list. */
    pp = &user_ent_hash[user_ent_hashfn(ino)];
    p->next = *pp;
    *pp = p;
}

获取inode,pidfd信息,最终组成一个链表.

proc_ctx_print

程序在输出结果的时候,调用proc_ctx_print()

/*
 * Append the " users:(...)" suffix for socket `s` when -p or the
 * SELinux context options are active, using the inode hash built by
 * user_ent_hash_build().  Prints nothing when no entry matches.
 */
static void proc_ctx_print(struct sockstat *s)
{
    char *buf;
 
    if (show_proc_ctx || show_sock_ctx) {
        /* Both context flags set => include socket context too. */
        if (find_entry(s->ino, &buf,
                (show_proc_ctx & show_sock_ctx) ?
                PROC_SOCK_CTX : PROC_CTX) > 0) {
            printf(" users:(%s)", buf);
            free(buf);   /* find_entry allocates buf */
        }
    } else if (show_users) {
        if (find_entry(s->ino, &buf, USERS) > 0) {
            printf(" users:(%s)", buf);
            free(buf);
        }
    }
}

如果show_users>0,执行find_entry(),根据inode编号找到对应进程的信息:

find_entry

/*
 * Look up socket inode `ino` in user_ent_hash and format every match
 * into a newly-allocated, comma-separated string stored in *buf
 * (caller frees).  `type` selects USERS / PROC_CTX / PROC_SOCK_CTX
 * formatting.  Returns the number of matches; 0 leaves *buf NULL.
 */
static int find_entry(unsigned int ino, char **buf, int type)
{
    struct user_ent *p;
    int cnt = 0;
    char *ptr;
    char *new_buf;
    int len, new_buf_len;
    int buf_used = 0;
    int buf_len = 0;
 
    if (!ino)
        return 0;
 
    p = user_ent_hash[user_ent_hashfn(ino)];
    ptr = *buf = NULL;
    while (p) {
        if (p->ino != ino)
            goto next;
 
        /* Retry loop: format into the buffer, growing it by
         * ENTRY_BUF_SIZE whenever snprintf reports truncation. */
        while (1) {
            ptr = *buf + buf_used;
            switch (type) {
            case USERS:
                len = snprintf(ptr, buf_len - buf_used,
                    "(\"%s\",pid=%d,fd=%d),",
                    p->process, p->pid, p->fd);
                break;
            case PROC_CTX:
                len = snprintf(ptr, buf_len - buf_used,
                    "(\"%s\",pid=%d,proc_ctx=%s,fd=%d),",
                    p->process, p->pid,
                    p->process_ctx, p->fd);
                break;
            case PROC_SOCK_CTX:
                len = snprintf(ptr, buf_len - buf_used,
                    "(\"%s\",pid=%d,proc_ctx=%s,fd=%d,sock_ctx=%s),",
                    p->process, p->pid,
                    p->process_ctx, p->fd,
                    p->socket_ctx);
                break;
            default:
                fprintf(stderr, "ss: invalid type: %d\n", type);
                abort();
            }
 
            if (len < 0 || len >= buf_len - buf_used) {
                /* Too small (or error): grow and re-format. */
                new_buf_len = buf_len + ENTRY_BUF_SIZE;
                new_buf = realloc(*buf, new_buf_len);
                if (!new_buf) {
                    fprintf(stderr, "ss: failed to malloc buffer\n");
                    abort();
                }
                *buf = new_buf;
                buf_len = new_buf_len;
                continue;
            } else {
                buf_used += len;
                break;
            }
        }
        cnt++;
next:
        p = p->next;
    }
    if (buf_used) {
        /* Overwrite the trailing comma with a terminator. */
        ptr = *buf + buf_used;
        ptr[-1] = '\0';
    }
    return cnt;
}

通过遍历p = user_ent_hash[user_ent_hashfn(ino)];这个链表得到得到所有的节点.然后利用

p = user_ent_hash[user_ent_hashfn(ino)];
ptr = *buf = NULL;
while (p) {
    if (p->ino != ino)
        goto next;

如果遍历得到inode相等,那么就说明找到了pid,最终输出的结果如下:

switch (type) {
            case USERS:
                len = snprintf(ptr, buf_len - buf_used,
                    "(\"%s\",pid=%d,fd=%d),",
                    p->process, p->pid, p->fd);
                break;

最终输出的结果是:

Netid  State      Recv-Q Send-Q Local Address:Port                 Peer Address:Port               
u_seq  ESTAB      0      0      @00017 309855                * 309856                users:(("code",pid=17009,fd=17))

总结

由于ss和netstat数据获取的方式不同,导致在执行效率上面存在很大的差别.ss和netstat这两种方式也给我们需要获取主机上面的网络数据提供了一个很好的思路.

netstat源代码调试&原理分析

说明

估计平时大部分人都是通过netstat来查看网络状态,但事实是netstat已经逐渐被其他的命令替代,很多新的Linux发行版本已经默认不再安装netstat。以ubuntu 18.04为例来进行说明:

~ netstat 
zsh: command not found: netstat

按照difference between netstat and ss in linux?这篇文章的说法:

NOTE This program is obsolete. Replacement for netstat is ss.
Replacement for netstat -r is ip route. Replacement for netstat -i is
ip -s link. Replacement for netstat -g is ip maddr.

中文含义就是:netstat已经过时了,netstat的部分命令已经被ip这个命令取代了,当然还有更为强大的ss。ss命令用来显示处于活动状态的套接字信息。ss命令可以用来获取socket统计信息,它可以显示和netstat类似的内容。但ss的优势在于它能够显示更多更详细的有关TCP和连接状态的信息,而且比netstat更快速更高效。netstat显示网络状态的原理仅仅只是解析/proc/net/tcp,所以如果服务器的socket连接数量变得非常大,那么通过netstat执行速度是非常慢的。而ss采用的是通过tcp_diag的方式来获取网络信息,tcp_diag通过netlink的方式从内核拿到网络信息,这也是ss更高效更全面的原因。

下图就展示了ssnetstat在监控上面的区别。

ss.png

ss是获取的socket的信息,而netstat是通过解析/proc/net/下面的文件来获取信息包括Sockets,TCP/UDPIPEthernet信息。

netstatss的效率的对比,找同一台机器执行:

time ss
........
real    0m0.016s
user    0m0.001s
sys        0m0.001s
--------------------------------
time netstat
real    0m0.198s
user    0m0.009s
sys        0m0.011s

ss明显比netstat更加高效.

netstat简介

netstat是在net-tools工具包下面的一个工具集,net-tools提供了一份net-tools的源码,我们通过net-tools来看看netstat的实现原理。

netstat源代码调试

下载net-tools之后,导入到Clion中,创建CMakeLists.txt文件,内容如下:

cmake_minimum_required(VERSION 3.13)
project(test C)

set(BUILD_DIR .)

#add_executable()
add_custom_target(netstat command -c ${BUILD_DIR})

修改根目录下的Makefile中的59行的编译配置为:

CFLAGS ?= -O0 -g3

netstat.png

按照如上图设置自己的编译选项

以上就是搭建netstat的源代码调试过程。

tcp show

在netstat不需要任何参数的情况,程序首先会运行到2317行的tcp_info()

#if HAVE_AFINET
    if (!flag_arg || flag_tcp) {
        i = tcp_info();
        if (i)
        return (i);
    }

    if (!flag_arg || flag_sctp) {
        i = sctp_info();
        if (i)
        return (i);
    }
.........

跟踪进入到tcp_info():

/* Dump TCP connections: expands INFO_GUTS6 to parse /proc/net/tcp and
 * /proc/net/tcp6 line-by-line via the tcp_do_one callback. */
static int tcp_info(void)
{
    INFO_GUTS6(_PATH_PROCNET_TCP, _PATH_PROCNET_TCP6, "AF INET (tcp)",
           tcp_do_one, "tcp", "tcp6");
}

参数的情况如下:

_PATH_PROCNET_TCP,在lib/pathnames.h中定义,是#define _PATH_PROCNET_TCP "/proc/net/tcp"

_PATH_PROCNET_TCP6, 在lib/pathnames.h中定义, 是#define _PATH_PROCNET_TCP6 "/proc/net/tcp6"

tcp_do_one,函数指针,位于1100行,部分代码如下:

static void tcp_do_one(int lnr, const char *line, const char *prot)
{
unsigned long rxq, txq, time_len, retr, inode;
int num, local_port, rem_port, d, state, uid, timer_run, timeout;
char rem_addr[128], local_addr[128], timers[64];
const struct aftype *ap;
struct sockaddr_storage localsas, remsas;
struct sockaddr_in *localaddr = (struct sockaddr_in *)&localsas;
struct sockaddr_in *remaddr = (struct sockaddr_in *)&remsas;
......

tcp_do_one()就是用来解析/proc/net/tcp/proc/net/tcp6每一行的含义的,关于/proc/net/tcp的每一行的含义可以参考之前写过的osquery源码解读之分析process_open_socket中的扩展章节。

INFO_GUTS6

/*
 * Shared body for the *_info() functions: declares the line buffer and
 * counters, then parses the IPv4 file (INFO_GUTS1) and/or the IPv6
 * file (INFO_GUTS2) depending on the flag_inet/flag_inet6 options,
 * calling `proc` on every line.  INFO_GUTS3 supplies the return.
 */
#define INFO_GUTS6(file,file6,name,proc,prot4,prot6)    \
 char buffer[8192];                    \
 int rc = 0;                        \
 int lnr = 0;                        \
 if (!flag_arg || flag_inet) {                \
    INFO_GUTS1(file,name,proc,prot4)            \
 }                            \
 if (!flag_arg || flag_inet6) {                \
    INFO_GUTS2(file6,proc,prot6)            \
 }                            \
 INFO_GUTS3

INFO_GUTS6采用了#define的方式进行定义,最终根据是flag_inet(IPv4)或者flag_inet6(IPv6)的选项分别调用不同的函数,我们以INFO_GUTS1(file,name,proc,prot4)进一步分析。

INFO_GUTS1

/*
 * Open `file` (e.g. /proc/net/tcp) and invoke the parser `proc` on
 * each line with an incrementing line number and protocol tag `prot`.
 * A missing/unreadable file is tolerated (warned about when verbose);
 * other open errors return -1 from the enclosing function.
 */
#define INFO_GUTS1(file,name,proc,prot)            \
  procinfo = proc_fopen((file));            \
  if (procinfo == NULL) {                \
    if (errno != ENOENT && errno != EACCES) {        \
      perror((file));                    \
      return -1;                    \
    }                            \
    if (!flag_noprot && (flag_arg || flag_ver))        \
      ESYSNOT("netstat", (name));            \
    if (!flag_noprot && flag_arg)            \
      rc = 1;                        \
  } else {                        \
    do {                        \
      if (fgets(buffer, sizeof(buffer), procinfo))    \
        (proc)(lnr++, buffer,prot);            \
    } while (!feof(procinfo));                \
    fclose(procinfo);                    \
  }

procinfo = proc_fopen((file)) 获取/proc/net/tcp的文件句柄

fgets(buffer, sizeof(buffer), procinfo) 解析文件内容并将每一行的内容存储在buffer

(proc)(lnr++, buffer,prot),利用(proc)函数解析buffer(proc)就是前面说明的tcp_do_one()函数

tcp_do_one

" 14: 020110AC:B498 CF0DE1B9:4362 06 00000000:00000000 03:000001B2 00000000 0 0 0 3 0000000000000000这一行为例来说明tcp_do_one()函数的执行过程。

tcp_do_one_1.png

由于分析是Ipv4,所以会跳过#if HAVE_AFINET6这段代码。之后执行:

num = sscanf(line,
    "%d: %64[0-9A-Fa-f]:%X %64[0-9A-Fa-f]:%X %X %lX:%lX %X:%lX %lX %d %d %lu %*s\n",
         &d, local_addr, &local_port, rem_addr, &rem_port, &state,
         &txq, &rxq, &timer_run, &time_len, &retr, &uid, &timeout, &inode);
if (num < 11) {
    fprintf(stderr, _("warning, got bogus tcp line.\n"));
    return;
}

解析数据,并将每一列的数据分别填充到对应的字段上面。分析一下其中的每个字段的定义:

char rem_addr[128], local_addr[128], timers[64];
struct sockaddr_storage localsas, remsas;
struct sockaddr_in *localaddr = (struct sockaddr_in *)&localsas;
struct sockaddr_in *remaddr = (struct sockaddr_in *)&remsas;

在Linux中sockaddr_insockaddr_storage的定义如下:

struct sockaddr {
   unsigned short    sa_family;    // address family, AF_xxx
   char              sa_data[14];  // 14 bytes of protocol address
};


struct  sockaddr_in {
    short  int  sin_family;                      /* Address family */
    unsigned  short  int  sin_port;       /* Port number */
    struct  in_addr  sin_addr;              /* Internet address */
    unsigned  char  sin_zero[8];         /* Same size as struct sockaddr */
};
/* Internet address. */
struct in_addr {
  uint32_t       s_addr;     /* address in network byte order */
};

struct sockaddr_storage {
    sa_family_t  ss_family;     // address family

    // all this is padding, implementation specific, ignore it:
    char      __ss_pad1[_SS_PAD1SIZE];
    int64_t   __ss_align;
    char      __ss_pad2[_SS_PAD2SIZE];
};

之后代码继续执行:

sscanf(local_addr, "%X", &localaddr->sin_addr.s_addr);
sscanf(rem_addr, "%X", &remaddr->sin_addr.s_addr);
localsas.ss_family = AF_INET;
remsas.ss_family = AF_INET;

local_addr使用sscanf(,"%X")得到对应的十六进制,保存到&localaddr->sin_addr.s_addr(即in_addr结构体中的s_addr)中,同理&remaddr->sin_addr.s_addr。运行结果如下所示:

saddr.png

addr_do_one

addr_do_one(local_addr, sizeof(local_addr), 22, ap, &localsas, local_port, "tcp");
addr_do_one(rem_addr, sizeof(rem_addr), 22, ap, &remsas, rem_port, "tcp");

程序继续执行,最终会执行到addr_do_one()函数,用于解析本地IP地址和端口,以及远程IP地址和端口。

/*
 * Format one "address:port" column into `buf`.  The address is
 * rendered (and optionally resolved) by the family's sprint hook, the
 * port by get_sname(); unless -W/--wide is set, the combined string is
 * truncated to `short_len` characters, preferring to keep the port.
 */
static void addr_do_one(char *buf, size_t buf_len, size_t short_len, const struct aftype *ap,
            const struct sockaddr_storage *addr,
            int port, const char *proto
)
{
    const char *sport, *saddr;
    size_t port_len, addr_len;

    /* FLAG_NUM_HOST / FLAG_NUM_PORT suppress name resolution
     * (--numeric-hosts / --numeric-ports). */
    saddr = ap->sprint(addr, flag_not & FLAG_NUM_HOST);
    sport = get_sname(htons(port), proto, flag_not & FLAG_NUM_PORT);
    addr_len = strlen(saddr);
    port_len = strlen(sport);
    if (!flag_wide && (addr_len + port_len > short_len)) {
        /* Assume port name is short */
        port_len = netmin(port_len, short_len - 4);
        addr_len = short_len - port_len;
        strncpy(buf, saddr, addr_len);
        buf[addr_len] = '\0';
        strcat(buf, ":");
        strncat(buf, sport, port_len);
    } else
          snprintf(buf, buf_len, "%s:%s", saddr, sport);
}

1.saddr = ap->sprint(addr, flag_not & FLAG_NUM_HOST); 这个表示是否需要将addr转换为域名的形式。由于addr值是127.0.0.1,转换之后得到的就是localhost,其中FLAG_NUM_HOST的就等价于--numeric-hosts的选项。

2.sport = get_sname(htons(port), proto, flag_not & FLAG_NUM_PORT);,表示是否需要将port转换为对应的服务名,其中的FLAG_NUM_PORT就等价于--numeric-ports这个选项。

3.!flag_wide && (addr_len + port_len > short_len 这个代码的含义是判断是否需要对IP和PORT进行截断。其中flag_wide的等同于-W, --wide don't truncate IP addresses。而short_len长度是22.

4.snprintf(buf, buf_len, "%s:%s", saddr, sport);,将IP:PORT赋值给buf.

output

最终程序执行

printf("%-4s  %6ld %6ld %-*s %-*s %-11s",
           prot, rxq, txq, (int)netmax(23,strlen(local_addr)), local_addr, (int)netmax(23,strlen(rem_addr)), rem_addr, _(tcp_state[state]));

按照制定的格式解析,输出结果

finish_this_one

最终程序会执行finish_this_one(uid,inode,timers);.

/* Print the trailing per-socket columns (user, inode, program, SELinux
 * context, timers) selected by the command-line flags, then end the line. */
static void finish_this_one(int uid, unsigned long inode, const char *timers)
{
    struct passwd *pw;

    /* Extended output (-e): show the owning user (name if resolvable and
     * not suppressed by numeric-user mode, otherwise the raw uid) plus
     * the socket inode. */
    if (flag_exp > 1) {
    if (!(flag_not & FLAG_NUM_USER) && ((pw = getpwuid(uid)) != NULL))
        printf(" %-10s ", pw->pw_name);
    else
        printf(" %-10d ", uid);
    printf("%-10lu",inode);
    }
    if (flag_prg)   /* -p: PID/program name looked up by inode */
    printf(" %-" PROGNAME_WIDTHs "s",prg_cache_get(inode));
    if (flag_selinux)   /* -Z: SELinux security context */
    printf(" %-" SELINUX_WIDTHs "s",prg_cache_get_con(inode));

    if (flag_opt)   /* -o: timer information string */
    printf(" %s", timers);
    putchar('\n');
}

1.flag_exp 等同于-e的参数。-e, --extend display other/more information.举例如下:

netstat -e 
Proto Recv-Q Send-Q Local Address           Foreign Address         State       User       Inode
tcp        0      0 localhost:6379          172.16.1.200:46702    ESTABLISHED redis      437788048

netstat
Proto Recv-Q Send-Q Local Address           Foreign Address         State      
tcp        0      0 localhost:6379          172.16.1.200:46702    ESTABLISHED

发现使用-e参数会多显示User和Inode号码。而在本例中,如果用户名不存在,则显示uid。
getpwuid

2.flag_prg等同于-p, --programs display PID/Program name for sockets.举例如下:

netstat -pe
Proto Recv-Q Send-Q Local Address           Foreign Address         State       User       Inode      PID/Program name
tcp        0      0 localhost:6379          172.16.1.200:34062      ESTABLISHED redis      437672000  6017/redis-server *

netstat -e
Proto Recv-Q Send-Q Local Address           Foreign Address         State       User       Inode
tcp        0      0 localhost:6379          172.16.1.200:46702    ESTABLISHED redis      437788048

可以看到是通过prg_cache_get(inode),由inode来找到对应的PID和进程信息;

3.flag_selinux等同于-Z, --context display SELinux security context for sockets

prg_cache_get

对于上面的通过inode找到对应进程的方法非常的好奇,于是去追踪prg_cache_get()函数的实现。

#define PRG_HASH_SIZE 211

/* Bucket index for an inode: simple modulo hash. */
#define PRG_HASHIT(x) ((x) % PRG_HASH_SIZE)

/* One chained hash node mapping a socket inode to the owning program's
 * "pid/name" display string and its SELinux context. */
static struct prg_node {
    struct prg_node *next;
    unsigned long inode;
    char name[PROGNAME_WIDTH];
    char scon[SELINUX_WIDTH];
} *prg_hash[PRG_HASH_SIZE];

/* Return the cached "pid/progname" string for a socket inode,
 * or "-" when no process owns that inode. */
static const char *prg_cache_get(unsigned long inode)
{
    struct prg_node *node = prg_hash[PRG_HASHIT(inode)];

    /* Walk the bucket's chain until the inode matches. */
    while (node) {
        if (node->inode == inode)
            return (node->name);
        node = node->next;
    }
    return ("-");
}

prg_hash中存储了所有的inode编号与program的对应关系,所以当给定一个inode编号时就能够找到对应的程序名称。那么prg_hash又是如何初始化的呢?

prg_cache_load

我们使用debug模式,加入-p的运行参数:

netstat-p.png

程序会运行到2289行的prg_cache_load(); 进入到prg_cache_load()函数中.

由于整个函数的代码较长,拆分来分析.

一、获取fd

#define PATH_PROC      "/proc"
#define PATH_FD_SUFF    "fd"
#define PATH_FD_SUFFl       strlen(PATH_FD_SUFF)
#define PATH_PROC_X_FD      PATH_PROC "/%s/" PATH_FD_SUFF
#define PATH_CMDLINE    "cmdline"
#define PATH_CMDLINEl       strlen(PATH_CMDLINE)
 
if (!(dirproc=opendir(PATH_PROC))) goto fail;
    while (errno = 0, direproc = readdir(dirproc)) {
    for (cs = direproc->d_name; *cs; cs++)
        if (!isdigit(*cs))
        break;
    if (*cs)
        continue;
    procfdlen = snprintf(line,sizeof(line),PATH_PROC_X_FD,direproc->d_name);
    if (procfdlen <= 0 || procfdlen >= sizeof(line) - 5)
        continue;
    errno = 0;
    dirfd = opendir(line);
    if (! dirfd) {
        if (errno == EACCES)
        eacces = 1;
        continue;
    }
    line[procfdlen] = '/';
    cmdlp = NULL;

1.dirproc=opendir(PATH_PROC);errno = 0, direproc = readdir(dirproc) 遍历/proc拿到所有的pid

2.procfdlen = snprintf(line,sizeof(line),PATH_PROC_X_FD,direproc->d_name); 遍历所有的/proc/pid拿到所有进程的fd

3.dirfd = opendir(line); 得到/proc/pid/fd的文件句柄

二、获取inode

while ((direfd = readdir(dirfd))) {
        /* Skip . and .. */
        if (!isdigit(direfd->d_name[0]))
            continue;
    if (procfdlen + 1 + strlen(direfd->d_name) + 1 > sizeof(line))
       continue;
    memcpy(line + procfdlen - PATH_FD_SUFFl, PATH_FD_SUFF "/",
        PATH_FD_SUFFl + 1);
    safe_strncpy(line + procfdlen + 1, direfd->d_name,
                    sizeof(line) - procfdlen - 1);
    lnamelen = readlink(line, lname, sizeof(lname) - 1);
    if (lnamelen == -1)
        continue;
        lname[lnamelen] = '\0';  /*make it a null-terminated string*/
 
        if (extract_type_1_socket_inode(lname, &inode) < 0)
            if (extract_type_2_socket_inode(lname, &inode) < 0)
            continue;

1.memcpy(line + procfdlen - PATH_FD_SUFFl, PATH_FD_SUFF "/",PATH_FD_SUFFl + 1);safe_strncpy(line + procfdlen + 1, direfd->d_name, sizeof(line) - procfdlen - 1); 得到遍历之后的fd信息,比如/proc/pid/fd

2.lnamelen = readlink(line, lname, sizeof(lname) - 1); 得到fd所指向的link,因为通常情况下fd一般都是链接,要么是socket链接要么是pipe链接.如下所示:

$ ls -al /proc/1289/fd
total 0
dr-x------ 2 username username  0 May 25 15:45 .
dr-xr-xr-x 9 username username  0 May 25 09:11 ..
lr-x------ 1 username username 64 May 25 16:23 0 -> 'pipe:[365366]'
l-wx------ 1 username username 64 May 25 16:23 1 -> 'pipe:[365367]'
l-wx------ 1 username username 64 May 25 16:23 2 -> 'pipe:[365368]'
lr-x------ 1 username username 64 May 25 16:23 3 -> /proc/uptime

3.通过extract_type_1_socket_inode获取到link中对应的inode编号.

#define PRG_SOCKET_PFX    "socket:["
#define PRG_SOCKET_PFXl (strlen(PRG_SOCKET_PFX))
/* Parse a symlink target of the form "socket:[12345]" and store 12345
 * in *inode_p; return -1 when lname does not match that form. */
static int extract_type_1_socket_inode(const char lname[], unsigned long * inode_p) {
 
/* If lname is of the form "socket:[12345]", extract the "12345"
   as *inode_p.  Otherwise, return -1 as *inode_p.
   */
// Reject anything shorter than strlen("socket:[") + 3
if (strlen(lname) < PRG_SOCKET_PFXl+3) return(-1);
 
// memcmp() compares the first n bytes of the two memory areas.
// Reject names that do not start with "socket:["
if (memcmp(lname, PRG_SOCKET_PFX, PRG_SOCKET_PFXl)) return(-1);
if (lname[strlen(lname)-1] != ']') return(-1);  {
    char inode_str[strlen(lname + 1)];  /* e.g. "12345" */
    const int inode_str_len = strlen(lname) - PRG_SOCKET_PFXl - 1;
    char *serr;
 
    // Copy out the digits between "socket:[" and "]"
    strncpy(inode_str, lname+PRG_SOCKET_PFXl, inode_str_len);
    inode_str[inode_str_len] = '\0';
    *inode_p = strtoul(inode_str, &serr, 0);
    if (!serr || *serr || *inode_p == ~0)
        return(-1);
}
/* NOTE(review): this excerpt is truncated in the article — the upstream
 * function ends with "return(0);" and a closing brace after this block. */

4.获取程序对应的cmdline

if (!cmdlp) {
    if (procfdlen - PATH_FD_SUFFl + PATH_CMDLINEl >=sizeof(line) - 5)
        continue;
    safe_strncpy(line + procfdlen - PATH_FD_SUFFl, PATH_CMDLINE,sizeof(line) - procfdlen + PATH_FD_SUFFl);
fd = open(line, O_RDONLY);
if (fd < 0)
    continue;
cmdllen = read(fd, cmdlbuf, sizeof(cmdlbuf) - 1);
if (close(fd))
    continue;
if (cmdllen == -1)
    continue;
if (cmdllen < sizeof(cmdlbuf) - 1)
    cmdlbuf[cmdllen]='\0';
if (cmdlbuf[0] == '/' && (cmdlp = strrchr(cmdlbuf, '/')))
    cmdlp++;
else
    cmdlp = cmdlbuf;
}

由于cmdline是可以直接读取的,所以并不需要像读取fd那样借助与readlink()函数,直接通过read(fd, cmdlbuf, sizeof(cmdlbuf) - 1)即可读取文件内容.

5.snprintf(finbuf, sizeof(finbuf), "%s/%s", direproc->d_name, cmdlp); 拼接pid和cmdlp,最终得到的就是类似于6017/redis-server *这样的效果 

6.最终程序调用prg_cache_add(inode, finbuf, "-");将解析得到的inode和finbuf加入到缓存中.

prg_cache_add

#define PRG_HASH_SIZE 211
/* Bucket index for an inode: simple modulo hash. */
#define PRG_HASHIT(x) ((x) % PRG_HASH_SIZE)
/* One chained hash node mapping a socket inode to the owning program's
 * "pid/name" display string and its SELinux context. */
static struct prg_node {
    struct prg_node *next;
    unsigned long inode;
    char name[PROGNAME_WIDTH];
    char scon[SELINUX_WIDTH];
} *prg_hash[PRG_HASH_SIZE];  /* size restored: "[ ]" was a transcription error
                              * (see the identical definition earlier in the
                              * article, which uses PRG_HASH_SIZE) */
 
/* Insert an inode -> name/context mapping into prg_hash.
 * Duplicate inodes are ignored (first entry wins); allocation failure
 * silently drops the entry. */
static void prg_cache_add(unsigned long inode, char *name, const char *scon)
{
    unsigned hi = PRG_HASHIT(inode);  /* bucket index */
    struct prg_node **pnp,*pn;
 
    prg_cache_loaded = 2;
    /* Walk the bucket chain; stop at the tail (*pnp == NULL) or bail out
     * if the inode is already present. */
    for (pnp = prg_hash + hi; (pn = *pnp); pnp = &pn->next) {
    if (pn->inode == inode) {
        /* Some warning should be appropriate here
           as we got multiple processes for one i-node */
        return;
    }
    }
    if (!(*pnp = malloc(sizeof(**pnp))))
    return;  /* out of memory: drop this entry */
    /* Link the new node at the tail and fill it in. */
    pn = *pnp;
    pn->next = NULL;
    pn->inode = inode;
    safe_strncpy(pn->name, name, sizeof(pn->name));
 
    {
    /* If scon does not fit the fixed-size field, keep only its tail. */
    int len = (strlen(scon) - sizeof(pn->scon)) + 1;
    if (len > 0)
            safe_strncpy(pn->scon, &scon[len + 1], sizeof(pn->scon));
    else
            safe_strncpy(pn->scon, scon, sizeof(pn->scon));
    }
 
}

1.unsigned hi = PRG_HASHIT(inode); 使用inode对211取模,将结果作为hash值;

2.for (pnp = prg_hash + hi; (pn = *pnp); pnp = &pn->next) 由于prg_hash是一个链表结构,所以通过for循环找到链表的结尾;

3.pn = *pnp;pn->next = NULL;pn->inode = inode;safe_strncpy(pn->name, name, sizeof(pn->name)); 为新的inode赋值并将其加入到链表的末尾;

所以prg_hash是一个全局变量,是一个链表结构的数组,保存了inode编号与pid/cmdline之间的对应关系;

prg_cache_get

/* Look up the "pid/progname" string cached for a socket inode.
 * Returns "-" when the inode has no cached entry. */
static const char *prg_cache_get(unsigned long inode)
{
    unsigned hi = PRG_HASHIT(inode);  /* bucket index: inode % PRG_HASH_SIZE */
    struct prg_node *pn;
 
    /* Scan the bucket's chain for a matching inode. */
    for (pn = prg_hash[hi]; pn; pn = pn->next)
    if (pn->inode == inode)
        return (pn->name);
    return ("-");
}

分析完毕prg_cache_add()之后,看prg_cache_get()就很简单了.

1.unsigned hi = PRG_HASHIT(inode);通过inode号拿到hash

2.for (pn = prg_hash[hi]; pn; pn = pn->next) 遍历prg_hash链表中的每一个节点,如果遍历的inode与目标的inode相符就返回对应的信息.

总结

通过对netstat的一个简单的分析,可以发现其实netstat就是通过遍历/proc目录下的目录或者是文件来获取对应的信息.如果一个网络进程频繁地打开和关闭,那么使用netstat显然是相当耗时的.

osquery源码解读之分析shell_history

说明

前面两篇主要是对osquery的使用进行了说明,本篇文章将会分析osquery的源码。本文将主要对shell_historyprocess_open_sockets两张表进行说明。通过对这些表的实现分析,一方面能够了解osquery的实现通过SQL查询系统信息的机制,另一方面可以加深对Linux系统的理解。

表的说明

shell_history是用于查看shell的历史记录,而process_open_sockets是用于记录主机当前的网络行为。示例用法如下:

shell_history

osquery> select * from shell_history limit 3;
+------+------+-------------------------------------------------------------------+-----------------------------+
| uid  | time | command                                                           | history_file                |
+------+------+-------------------------------------------------------------------+-----------------------------+
| 1000 | 0    | pwd                                                               | /home/username/.bash_history |
| 1000 | 0    | ps -ef                                                            | /home/username/.bash_history |
| 1000 | 0    | ps -ef | grep java                                                | /home/username/.bash_history |
+------+------+-------------------------------------------------------------------+-----------------------------+

process_open_socket显示了一个反弹shell的链接。

osquery> select * from process_open_sockets order by pid desc limit 1;
+--------+----+----------+--------+----------+---------------+----------------+------------+-------------+------+------------+---------------+
| pid    | fd | socket   | family | protocol | local_address | remote_address | local_port | remote_port | path | state      | net_namespace |
+--------+----+----------+--------+----------+---------------+----------------+------------+-------------+------+------------+---------------+
| 115567 | 3  | 16467630 | 2      | 6        | 192.168.2.142 | 192.168.2.143  | 46368      | 8888        |      | ESTABLISH  | 0             |
+--------+----+----------+--------+----------+---------------+----------------+------------+-------------+------+------------+---------------+

osquery整体的代码结构十分地清晰。所有表的定义都是位于specs下面,所有表的实现都是位于osquery/tables

我们以shell_history为例,其表的定义是在specs/posix/shell_history.table

table_name("shell_history")
description("A line-delimited (command) table of per-user .*_history data.")
schema([
    Column("uid", BIGINT, "Shell history owner", additional=True),
    Column("time", INTEGER, "Entry timestamp. It could be absent, default value is 0."),
    Column("command", TEXT, "Unparsed date/line/command history line"),
    Column("history_file", TEXT, "Path to the .*_history for this user"),
    ForeignKey(column="uid", table="users"),
])
attributes(user_data=True, no_pkey=True)
implementation("shell_history@genShellHistory")
examples([
    "select * from users join shell_history using (uid)",
])
fuzz_paths([
    "/home",
    "/Users",
])

shell_history.table中已经定义了相关的信息,入口是shell_history.cpp中的genShellHistory()函数,甚至给出了示例的SQL语句select * from users join shell_history using (uid)shell_history.cpp是位于osquery/tables/system/posix/shell_history.cpp中。

同理,process_open_sockets的表定义位于specs/process_open_sockets.table,实现位于osquery/tables/networking/[linux|freebsd|windows]/process_open_sockets.cpp。可以看到由于process_open_sockets在多个平台上面都有,所以在linux/freebsd/windows中都存在process_open_sockets.cpp的实现。本文主要是以linux为例。

shell_history实现

前提知识

在分析之前,介绍一下Linux中的一些基本概念。我们常常会看到各种不同的unix shell,如bash、zsh、tcsh、sh等等。bash是我们目前最常见的,它几乎是所有的类unix操作中内置的一个shell。而zsh相对于bash增加了更多的功能。我们在终端输入各种命令时,其实都是使用的这些shell。

我们在用户的根目录下方利用ls -all就可以发现存在.bash_history文件,此文件就记录了我们在终端中输入的所有的命令。同样地,如果我们使用zsh,则会存在一个.zsh_history记录我们的命令。

同时在用户的根目录下还存在.bash_sessions的目录,根据这篇文章的介绍:

A new folder (~/.bash_sessions/) is used to store HISTFILE’s and .session files that are unique to sessions. If $BASH_SESSION or $TERM_SESSION_ID is set upon launching the shell (i.e. if Terminal is resuming from a saved state), the associated HISTFILE is merged into the current one, and the .session file is ran. Session saving is facilitated by means of an EXIT trap being set for a function bash_update_session_state.

.bash_sessions中存储了特定SESSION的HISTFILE和.session文件。如果在启动shell时设置了$BASH_SESSION$TERM_SESSION_ID。当此特定的SESSION启动了之后就会利用$BASH_SESSION$TERM_SESSION_ID恢复之前的状态。这也说明在.bash_sessions目录下也会存在*.history用于记录特定SESSION的历史命令信息。

分析

// Table generator for shell_history: for every user visible in the query
// context, collect history entries from the user's shell history files and
// from saved ~/.bash_sessions histories.
QueryData genShellHistory(QueryContext& context) {
    QueryData results;
    // Iterate over each user
    QueryData users = usersFromContext(context);
    for (const auto& row : users) {
        auto uid = row.find("uid");
        auto gid = row.find("gid");
        auto dir = row.find("directory");
        // Only process users with a complete uid/gid/home-directory triple.
        if (uid != row.end() && gid != row.end() && dir != row.end()) {
            genShellHistoryForUser(uid->second, gid->second, dir->second, results);
            genShellHistoryFromBashSessions(uid->second, dir->second, results);
        }
    }

    return results;
}

分析shell_history.cpp的入口函数genShellHistory():

遍历所有的用户,拿到uid、gid和directory。之后调用genShellHistoryForUser()获取用户的shell记录,genShellHistoryFromBashSessions()与genShellHistoryForUser()作用类似。

genShellHistoryForUser():

// Read every known shell history file under `directory` for one user.
// Privileges are dropped to the user's uid/gid first, so a crafted history
// file cannot be exploited while the daemon runs as root; the dropper
// restores effective permissions when it goes out of scope.
void genShellHistoryForUser(const std::string& uid, const std::string& gid, const std::string& directory, QueryData& results) {
    auto dropper = DropPrivileges::get();
    if (!dropper->dropTo(uid, gid)) {
        VLOG(1) << "Cannot drop privileges to UID " << uid;
        return;
    }

    // Try each candidate history file name in the user's home directory.
    for (const auto& hfile : kShellHistoryFiles) {
        boost::filesystem::path history_file = directory;
        history_file /= hfile;
        genShellHistoryFromFile(uid, history_file, results);
    }
}

可以看到在执行之前调用了:

auto dropper = DropPrivileges::get();
if (!dropper->dropTo(uid, gid)) {
    VLOG(1) << "Cannot drop privileges to UID " << uid;
    return;
}

用于对giduid降权,为什么要这么做呢?后来询问外国网友,给了一个很详尽的答案:

Think about a scenario where you are a malicious user and you spotted a vulnerability(buffer overflow) which none of us has. In the code (osquery which is running usually with root permission) you also know that history files(controlled by you) are being read by code(osquery). Now you stored a shell code (a code which is capable of destroying anything in the system)such a way that it would overwrite the saved rip. So once the function returns program control is with the injected code(shell code) with root privilege. With dropping privilege you reduce the chance of putting entire system into danger.

There are other mitigation techniques (e.g. stack guard) to avoid above scenario but multiple defenses are required

简而言之,osquery一般都是使用root权限运行的,如果攻击者在.bash_history中注入了一段恶意的shellcode代码。那么当osquery读到了这个文件之后,攻击者就能够获取到root权限了,所以通过降权的方式就能够很好地避免这样的问题。

/**
* @brief The privilege/permissions dropper deconstructor will restore
* effective permissions.
*
* There should only be a single drop of privilege/permission active.
*/
virtual ~DropPrivileges();

可以看到当函数被析构之后,就会重新恢复对应文件的权限。

之后遍历kShellHistoryFiles文件,执行genShellHistoryFromFile()代码。kShellHistoryFiles在之前已经定义,内容是:

// Well-known per-user history file names for common shells
// (bash, zsh, csh/tcsh, sh).
const std::vector<std::string> kShellHistoryFiles = {
    ".bash_history", ".zsh_history", ".zhistory", ".history", ".sh_history",
};

可以发现其实在kShellHistoryFiles定义的就是常见的bash用于记录shell history目录的文件。最后调用genShellHistoryFromFile()读取.history文件,解析数据。

// Parse one shell history file and append a Row per command to `results`.
// Three line formats are recognized:
//   - bash extended history: a "#<epoch>" line whose timestamp applies to
//     the FOLLOWING command line
//   - zsh extended history:  ": <epoch>:<duration>;<command>" on one line
//   - plain lines: no timestamp available, time is reported as 0
void genShellHistoryFromFile(const std::string& uid, const boost::filesystem::path& history_file, QueryData& results) {
    std::string history_content;
    if (forensicReadFile(history_file, history_content).ok()) {
        auto bash_timestamp_rx = xp::sregex::compile("^#(?P<timestamp>[0-9]+)$");
        auto zsh_timestamp_rx = xp::sregex::compile("^: {0,10}(?P<timestamp>[0-9]{1,11}):[0-9]+;(?P<command>.*)$");
        std::string prev_bash_timestamp;
        for (const auto& line : split(history_content, "\n")) {
            xp::smatch bash_timestamp_matches;
            xp::smatch zsh_timestamp_matches;

            // A bash "#<epoch>" line carries the timestamp of the next
            // line; remember it and continue to that command line.
            if (prev_bash_timestamp.empty() &&
                xp::regex_search(line, bash_timestamp_matches, bash_timestamp_rx)) {
                prev_bash_timestamp = bash_timestamp_matches["timestamp"];
                continue;
            }

            Row r;

            if (!prev_bash_timestamp.empty()) {
                // Command that follows a bash timestamp line.
                r["time"] = INTEGER(prev_bash_timestamp);
                r["command"] = line;
                prev_bash_timestamp.clear();
            } else if (xp::regex_search(
                    line, zsh_timestamp_matches, zsh_timestamp_rx)) {
                // zsh extended-history: timestamp and command on one line.
                std::string timestamp = zsh_timestamp_matches["timestamp"];
                r["time"] = INTEGER(timestamp);
                r["command"] = zsh_timestamp_matches["command"];
            } else {
                // Plain history line without timestamp information.
                r["time"] = INTEGER(0);
                r["command"] = line;
            }

            r["uid"] = uid;
            r["history_file"] = history_file.string();
            results.push_back(r);
        }
    }
}

整个代码逻辑非常地清晰。

  1. forensicReadFile(history_file, history_content)读取文件内容。
  2. 定义bash_timestamp_rxzsh_timestamp_rx的正则表达式,用于解析对应的.history文件的内容。 for (const auto& line : split(history_content, "\n"))读取文件的每一行,分别利用bash_timestamp_rxzsh_timestamp_rx解析每一行的内容。
  3. Row r;...;r["history_file"] = history_file.string();results.push_back(r);将解析之后的内容写入到Row中返回。

自此就完成了shell_history的解析工作。执行select * from shell_history就会按照上述的流程返回所有的历史命令的结果。

对于genShellHistoryFromBashSessions()函数:

void genShellHistoryFromBashSessions(const std::string &uid,const std::string &directory,QueryData &results) {
    boost::filesystem::path bash_sessions = directory;
    bash_sessions /= ".bash_sessions";

    if (pathExists(bash_sessions)) {
        bash_sessions /= "*.history";
        std::vector <std::string> session_hist_files;
        resolveFilePattern(bash_sessions, session_hist_files);

        for (const auto &hfile : session_hist_files) {
            boost::filesystem::path history_file = hfile;
            genShellHistoryFromFile(uid, history_file, results);
        }
    }
}

genShellHistoryFromBashSessions()获取历史命令的方法比较简单。

  1. 获取到.bash_sessions/*.history所有的文件;
  2. 同样调用genShellHistoryFromFile(uid, history_file, results);方法获取到历史命令;

总结

阅读一些优秀的开源软件的代码,不仅能够学习到相关的知识更能够了解到一些设计哲学。拥有快速学习能⼒的⽩帽子,是不能有短板的。有的只是⼤量的标准板和⼏块长板。

使用osqueryd监控系统

0x01 说明

osquery初识主要是借由osqueryi的方式对osquery进行了一个基本的介绍。可以看到osqueryi是一个交互式的shell,我们可以很方便使用它进行测试,但是如果我们要将osquery投入实际使用,明显是osqueryd更加合适。本篇文章将详细地介绍osqueryd的使用。

0x02 osqueryd配置

如果使用osqueryi,我们可以通过osqueryi -audit_allow_config=true --audit_allow_sockets=true --audit_persist=true这样的方式传入设置。如果是osqueryd呢?其实我们安装好osquery之后,会以service的方式存在于系统中,同时可以利用systemctl的方式进行控制,其文件位于/usr/lib/systemd/system/osqueryd.service

[Unit]
Description=The osquery Daemon
After=network.service syslog.service

[Service]
TimeoutStartSec=0
EnvironmentFile=/etc/sysconfig/osqueryd
ExecStartPre=/bin/sh -c "if [ ! -f $FLAG_FILE ]; then touch $FLAG_FILE; fi"
ExecStartPre=/bin/sh -c "if [ -f $LOCAL_PIDFILE ]; then mv $LOCAL_PIDFILE $PIDFILE; fi"
ExecStart=/usr/bin/osqueryd \
  --flagfile $FLAG_FILE \
  --config_path $CONFIG_FILE
Restart=on-failure
KillMode=process
KillSignal=SIGTERM

[Install]
WantedBy=multi-user.target

启动方式就是ExecStart=/usr/bin/osqueryd --flagfile $FLAG_FILE --config_path $CONFIG_FILE,通过--flagfile--config_path的方式指定配置文件的路径。$FLAG_FILE和$CONFIG_FILE是在/etc/sysconfig/osqueryd中定义。

FLAG_FILE="/etc/osquery/osquery.flags"
CONFIG_FILE="/etc/osquery/osquery.conf"
LOCAL_PIDFILE="/var/osquery/osqueryd.pidfile"
PIDFILE="/var/run/osqueryd.pidfile"

默认的配置文件就是位于/etc/osquery/osquery.flags/etc/osquery/osquery.conf。当启动osqueryd时,如果不存在osquery.flagsosquery.conf会创建两个空文件,否则直接读取此文件的内容。其实osquery.conf可以认为是osquery.flags的超集,因为osquery.flags仅仅只是设置一些配置,而这些配置也同样可以在osquery.conf中实现,同时在osquery.conf中还可以配置osqueryd需要执行的SQL。所以接下来本文将仅仅只介绍osquery.conf的使用。

0x03 osquery.conf

osquery本身提供了一个osquery.conf的例子,其写法是一个JSON格式的文件,在这里我们将其简化一下。

{
  // Configure the daemon below:
  "options": {
    // Select the osquery config plugin.
    "config_plugin": "filesystem",

    // Select the osquery logging plugin.
    "logger_plugin": "filesystem",

    // The log directory stores info, warning, and errors.
    // If the daemon uses the 'filesystem' logging retriever then the log_dir
    // will also contain the query results.
    //"logger_path": "/var/log/osquery",

    // Set 'disable_logging' to true to prevent writing any info, warning, error
    // logs. If a logging plugin is selected it will still write query results.
    //"disable_logging": "false",

    // Splay the scheduled interval for queries.
    // This is very helpful to prevent system performance impact when scheduling
    // large numbers of queries that run a smaller or similar intervals.
    //"schedule_splay_percent": "10",

    // A filesystem path for disk-based backing storage used for events and
    // query results differentials. See also 'use_in_memory_database'.
    //"database_path": "/var/osquery/osquery.db",

    // Comma-delimited list of table names to be disabled.
    // This allows osquery to be launched without certain tables.
    //"disable_tables": "foo_bar,time",

    "utc": "true"
  },

  // Define a schedule of queries:
  "schedule": {
    // This is a simple example query that outputs basic system information.
    "system_info": {
      // The exact query to run.
      "query": "SELECT hostname, cpu_brand, physical_memory FROM system_info;",
      // The interval in seconds to run this query, not an exact interval.
      "interval": 3600
    }
  },

  // Decorators are normal queries that append data to every query.
  "decorators": {
    "load": [
      "SELECT uuid AS host_uuid FROM system_info;",
      "SELECT user AS username FROM logged_in_users ORDER BY time DESC LIMIT 1;"
    ]
  },
  "packs": {
    // "osquery-monitoring": "/usr/share/osquery/packs/osquery-monitoring.conf",
    ....
  }, 
}

osquery.conf文件大致可以分为4个部分。

  • options,配置选项,Command Line Flags基本上对所有的配置选项都进行了说明。其实osquery.flags所配置也是这个部分。这也是之前说的osquery.conf可以认为是osquery.flags的超集的原因;
  • schedule,配置SQL语句。因为osqueryd是以daemon的方式运行,所以需要通过在schedule中定义SQL语句使其定期执行返回结果;
  • decorators,中文意思是“装饰”。在decorators中也是定义了一系列的SQL语句,执行得到的结果会附加在是在执行schedule中的结果的后面;所以我们看到在decorators我们取的是uuid和登录的username
  • packs,就是一系列SQL语句的合集;

0x04 配置说明

上一节中对osquery.conf中的配置进了一个简单的说明,在本节中将详细说明。

options

  • options就是配置。Command Line Flags基本上对所有的配置选项都进行了说明。我们可以进行多种配置,有兴趣的可以自行研究。本节仅仅说明几个常用的配置;
  • config_plugin,配置选项是filesystem。如果是通过osquery.conf管理osquery就是采用filesystem,还有一种选项是tls(这一种主要是通过API的方式来配置osquery)。
  • logger_plugin,配置选项是filesystem,这也是osquery的默认值。根据Logger plugins,还可以配置tls,syslog (for POSIX,windows_event_log (for Windows),kinesis,firehose,kafka_producer
  • database_path,默认值是/var/osquery/osquery.db。因为osquery内部会使用到数据,所以配置此目录是osquery的数据库文件位置。
  • disable_logging,是配置设置osquery的结果是否需要保存到本地,这个配置其实和logger_plugin:filesystem有点重复。
  • hostIdentifier,相当于表示每个主机的标识,比如可以采用hostname作为标识。

schedule

schedule是osqeuryd用于写SQL语句的标签。其中的一个示例如下所示:

"system_info": {
    // The exact query to run.
    "query": "SELECT hostname, cpu_brand, physical_memory FROM system_info;",
    // The interval in seconds to run this query, not an exact interval.
    "interval": 3600
}

其中system_info是定义的一个SQL任务的名字,也是一个JSON格式。在其中可以进行多项设置,包括:

  1. query,定义需要执行的SQL语句;
  2. interval,定时执行的时间,示例中是3600,表示每隔3600秒执行一次;
  3. snapshot,可选选项,可以配置为snapshot:true。osquery默认执行的是增量模式,使用了snapshot则是快照模式。比如执行select * from processes;,osqeury每次产生的结果是相比上一次变化的结果;如果采用的是snapshot,则会显示所有的进程的,不会与之前的结果进行对比;
  4. removed,可选选项,默认值是true,用来设置是否记录actionremove的日志。

当然还有一些其他的不常用选项,如platformversionsharddescription等等。

更多关于schedule的介绍可以参考schedule

decorators

正如其注释Decorators are normal queries that append data to every query所说,Decorators会把它的执行结果添加到schedule中的sql语句执行结果中。所以根据其作用,Decorators也不是必须存在的。在本例中Decorators存在两条记录:

SELECT uuid AS host_uuid FROM system_info;
SELECT user AS username FROM logged_in_users ORDER BY time DESC LIMIT 1;
  1. SELECT uuid AS host_uuid FROM system_info,从system_info获取uuid作为标识符1;
  2. SELECT user AS username FROM logged_in_users ORDER BY time DESC LIMIT 1;,从logged_in_users选择user(其实查询的是用户名)的第一项作为标识符2;

当然可以在Decorators写多条语句作为标识符,但是感觉没有必要;

packs

packs就是打包的SQL语句的合集,本示例中使用的/usr/share/osquery/packs/osquery-monitoring.conf,这是官方提供的一个监控系统信息的SQL语句的集合;

{
  "queries": {
    "schedule": {
      "query": "select name, interval, executions, output_size, wall_time, (user_time/executions) as avg_user_time, (system_time/executions) as avg_system_time, average_memory, last_executed from osquery_schedule;",
      "interval": 7200,
      "removed": false,
      "blacklist": false,
      "version": "1.6.0",
      "description": "Report performance for every query within packs and the general schedule."
    },
    "events": {
      "query": "select name, publisher, type, subscriptions, events, active from osquery_events;",
      "interval": 86400,
      "removed": false,
      "blacklist": false,
      "version": "1.5.3",
      "description": "Report event publisher health and track event counters."
    },
    "osquery_info": {
      "query": "select i.*, p.resident_size, p.user_time, p.system_time, time.minutes as counter from osquery_info i, processes p, time where p.pid = i.pid;",
      "interval": 600,
      "removed": false,
      "blacklist": false,
      "version": "1.2.2",
      "description": "A heartbeat counter that reports general performance (CPU, memory) and version."
    }
  }
}

packs中的配置和schedule的配置方法并没有什么区别。我们在packs中查询到的信息包括:

  • osquery_schedule拿到osqueryd设置的schedule的配置信息;
  • osquery_events中拿到osqueryd所支持的所有的event
  • processesosquery_info中拿到进程相关的信息;

使用packs的好处是可以将一系列相同功能的SQL语句放置在同一个文件中;

0x05 运行osqueryd

当以上配置完毕之后,我们就可以通过sudo osqueryd的方式启动;如果我们设置logger_plugin:filesystem,那么日志就会落在本地/var/log/osquery下。此目录下包含了多个文件,每个文件分别记录不同的信息。

osqueryd.results.log,osqueryd的增量日志的信息都会写入到此文件中;保存结果的形式是JSON形式。示例如下:

{"name":"auditd_process_info","hostIdentifier":"localhost.localdomain","calendarTime":"Wed Oct 24 13:07:12 2018 UTC","unixTime":1540386432,"epoch":0,"counter":0,"decorations":{"host_uuid":"99264D56-9A4E-E593-0B4E-872FBF3CD064","username":"username"},"columns":{"atime":"1540380461","auid":"4294967295","btime":"0","cmdline":"awk { sum += $1 }; END { print 0+sum }","ctime":"1538239175","cwd":"\"/\"","egid":"0","euid":"0","gid":"0","mode":"0100755","mtime":"1498686768","owner_gid":"0","owner_uid":"0","parent":"4086","path":"/usr/bin/gawk","pid":"4090","time":"1540386418","uid":"0","uptime":"1630"},"action":"added"}
{"name":"auditd_process_info","hostIdentifier":"localhost.localdomain","calendarTime":"Wed Oct 24 13:07:12 2018 UTC","unixTime":1540386432,"epoch":0,"counter":0,"decorations":{"host_uuid":"99264D56-9A4E-E593-0B4E-872FBF3CD064","username":"username"},"columns":{"atime":"1540380461","auid":"4294967295","btime":"0","cmdline":"sleep 60","ctime":"1538240835","cwd":"\"/\"","egid":"0","euid":"0","gid":"0","mode":"0100755","mtime":"1523421302","owner_gid":"0","owner_uid":"0","parent":"741","path":"/usr/bin/sleep","pid":"4091","time":"1540386418","uid":"0","uptime":"1630"},"action":"added"}

其中的added表示的就是相当于上一次增加的进程信息;每一次执行的结果都是一条JSON记录;

squeryd.snapshots.log,记录的是osqueryd中使用snapshot:true标记的SQL语句执行结果;

{"snapshot":[{"header":"Defaults","rule_details":"!visiblepw"},{"header":"Defaults","rule_details":"always_set_home"},{"header":"Defaults","rule_details":"match_group_by_gid"},{"header":"Defaults","rule_details":"env_reset"},{"header":"Defaults","rule_details":"env_keep = \"COLORS DISPLAY HOSTNAME HISTSIZE KDEDIR LS_COLORS\""},{"header":"Defaults","rule_details":"env_keep += \"MAIL PS1 PS2 QTDIR USERNAME LANG LC_ADDRESS LC_CTYPE\""},{"header":"Defaults","rule_details":"env_keep += \"LC_COLLATE LC_IDENTIFICATION LC_MEASUREMENT LC_MESSAGES\""},{"header":"Defaults","rule_details":"env_keep += \"LC_MONETARY LC_NAME LC_NUMERIC LC_PAPER LC_TELEPHONE\""},{"header":"Defaults","rule_details":"env_keep += \"LC_TIME LC_ALL LANGUAGE LINGUAS _XKB_CHARSET XAUTHORITY\""},{"header":"Defaults","rule_details":"secure_path = /sbin:/bin:/usr/sbin:/usr/bin"},{"header":"root","rule_details":"ALL=(ALL) ALL"},{"header":"%wheel","rule_details":"ALL=(ALL) ALL"}],"action":"snapshot","name":"sudoers","hostIdentifier":"localhost.localdomain","calendarTime":"Tue Oct  9 11:54:00 2018 UTC","unixTime":1539086040,"epoch":0,"counter":0,"decorations":{"host_uuid":"99264D56-9A4E-E593-0B4E-872FBF3CD064","username":"username"}}
{"snapshot":[{"header":"Defaults","rule_details":"!visiblepw"},{"header":"Defaults","rule_details":"always_set_home"},{"header":"Defaults","rule_details":"match_group_by_gid"},{"header":"Defaults","rule_details":"env_reset"},{"header":"Defaults","rule_details":"env_keep = \"COLORS DISPLAY HOSTNAME HISTSIZE KDEDIR LS_COLORS\""},{"header":"Defaults","rule_details":"env_keep += \"MAIL PS1 PS2 QTDIR USERNAME LANG LC_ADDRESS LC_CTYPE\""},{"header":"Defaults","rule_details":"env_keep += \"LC_COLLATE LC_IDENTIFICATION LC_MEASUREMENT LC_MESSAGES\""},{"header":"Defaults","rule_details":"env_keep += \"LC_MONETARY LC_NAME LC_NUMERIC LC_PAPER LC_TELEPHONE\""},{"header":"Defaults","rule_details":"env_keep += \"LC_TIME LC_ALL LANGUAGE LINGUAS _XKB_CHARSET XAUTHORITY\""},{"header":"Defaults","rule_details":"secure_path = /sbin:/bin:/usr/sbin:/usr/bin"},{"header":"root","rule_details":"ALL=(ALL) ALL"},{"header":"%wheel","rule_details":"ALL=(ALL) ALL"}],"action":"snapshot","name":"sudoers","hostIdentifier":"localhost.localdomain","calendarTime":"Tue Oct  9 11:54:30 2018 UTC","unixTime":1539086070,"epoch":0,"counter":0,"decorations":{"host_uuid":"99264D56-9A4E-E593-0B4E-872FBF3CD064","username":"username"}}

由于snapshot是快照模式,所以即使两次结果相同也会全部显示出来;

osqueryd.INFO,记录osqueryd中正在运行的情况。示例如下:

Log file created at: 2018/11/22 17:06:06
Running on machine: osquery.origin
Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
I1122 17:06:06.729902 22686 events.cpp:862] Event publisher not enabled: auditeventpublisher: Publisher disabled via configuration
I1122 17:06:06.730651 22686 events.cpp:862] Event publisher not enabled: syslog: Publisher disabled via configuration

osqueryd.WARNING,记录osquery的警告。示例如下:

Log file created at: 2018/10/09 19:53:45
Running on machine: localhost.localdomain
Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
E1009 19:53:45.471046 104258 events.cpp:987] Requested unknown/failed event publisher: auditeventpublisher
E1009 19:53:45.471606 104259 events.cpp:987] Requested unknown/failed event publisher: inotify
E1009 19:53:45.471634 104260 events.cpp:987] Requested unknown/failed event publisher: syslog
E1009 19:53:45.471658 104261 events.cpp:987] Requested unknown/failed event publisher: udev

osqueryd.ERROR,记录的是osquery的错误信息。示例如下:

Log file created at: 2018/10/09 19:53:45
Running on machine: localhost.localdomain
Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
E1009 19:53:45.471046 104258 events.cpp:987] Requested unknown/failed event publisher: auditeventpublisher
E1009 19:53:45.471606 104259 events.cpp:987] Requested unknown/failed event publisher: inotify
E1009 19:53:45.471634 104260 events.cpp:987] Requested unknown/failed event publisher: syslog
E1009 19:53:45.471658 104261 events.cpp:987] Requested unknown/failed event publisher: udev

在本例中错误信息和警告信息完全相同。在实际情况下,可能很多时候均不相同;

0x06 总结

本文主要是对osqueryd的常用配置进行了简要的说法。通过本文能够快速地利用上手osquery,由于篇幅的原因,有关osquery的很多东西没有介绍或者说明得很详细。官方的文档对osqueryd的配置已经说明得很是详尽了,如果对本文有任何的不解,可以去查阅相关的文档,也欢迎大家就相关问题与我讨论。

以上

osquery初识

0x01 说明

osquery是一个由FaceBook开源用于对系统进行查询、监控以及分析的一款软件。osquery对其的说明如下:

osquery exposes an operating system as a high-performance relational database. This allows you to write SQL-based queries to explore operating system data. With osquery, SQL tables represent abstract concepts such as running processes, loaded kernel modules, open network connections, browser plugins, hardware events or file hashes.

我们知道当我们在Linux中使用诸如ps、top、ls -l等等命令的时候,可以发现其实它们的输出结果的格式都是很固定的,很像一张表。或许是基于这样的想法,facebook开发了osquery。osquery将操作系统当作是一个高性能的关系型数据库。使用osquery,我们能够使用类似于SQL语句的方式去查询数据库中的信息,比如正在运行的进程信息,加载的内核模块,网络连接,浏览器插件等等信息(一切查询的信息的粒度取决于osquery的实现粒度了)。

osquery也广泛地支持多个平台,包括MacOS、CentOS、Ubuntu、Windows 10以及FreeBSD,具体所支持的版本的信息也可以在osquery主页查看。除此之外,osquery的配套文档/网站也是一应俱全,包括主页Githubreadthedocsslack

本篇文章以CentOS为例说明Osquery的安装以及使用。

0x02 安装

主页上面提供了不同操作系统的安装包,我们下载CentOS对应的rpm文件即可。在本例中文件名是osquery-3.3.0-1.linux.x86_64.rpm,使用命令sudo yum install osquery-3.3.0-1.linux.x86_64.rpm安装。安装成功之后会出现:

Installed:
  osquery.x86_64 0:3.3.0-1.linux                                                                                                                                                             
Complete!

0x03 运行

osquery存在两种运行模式,分别是osqueryi(交互式模式)、osqueryd(后台进程模式)。

  • osqueryi,与osqueryd完全独立,不需要以管理员的身份运行,能够及时地查看当前操作系统的状态信息。
  • osqueryd,我们能够利用osqueryd执行定时查询记录操作系统的变化,例如在第一次执行和第二次执行之间的进程变化(增加/减少),osqueryd会将进程执行的结果保存(文件或者是直接打到kafka中)。osqueryd还会利用操作系统的API来记录文件目录的变化、硬件事件、网络行为的变化等等。osqueryd在Linux中是以系统服务的方式来运行。

为了便于演示,我们使用osqueyi来展示osquery强大的功能。我们直接在terminal中输入osqueryi即可进入到osqueryi的交互模式中(osqueryi采用的是sqlite的shell的语法,所以我们也可以使用在sqlite中的所有的内置函数)。

[user@localhost Desktop]$ osqueryi
Using a virtual database. Need help, type '.help'
osquery> .help
Welcome to the osquery shell. Please explore your OS!
You are connected to a transient 'in-memory' virtual database.

.all [TABLE]     Select all from a table
.bail ON|OFF     Stop after hitting an error
.echo ON|OFF     Turn command echo on or off
.exit            Exit this program
.features        List osquery's features and their statuses
.headers ON|OFF  Turn display of headers on or off
.help            Show this message
.mode MODE       Set output mode where MODE is one of:
                   csv      Comma-separated values
                   column   Left-aligned columns see .width
                   line     One value per line
                   list     Values delimited by .separator string
                   pretty   Pretty printed SQL results (default)
.nullvalue STR   Use STRING in place of NULL values
.print STR...    Print literal STRING
.quit            Exit this program
.schema [TABLE]  Show the CREATE statements
.separator STR   Change separator used by output mode
.socket          Show the osquery extensions socket path
.show            Show the current values for various settings
.summary         Alias for the show meta command
.tables [TABLE]  List names of tables
.width [NUM1]+   Set column widths for "column" mode
.timer ON|OFF      Turn the CPU timer measurement on or off

通过.help,我们能够查看在osqueryi模式下的一些基本操作。比如.exit表示退出osqueryi,.mode切换osqueryi的输出结果,.show展示目前osqueryi的配置信息,.tables展示在当前的操作系统中能够支持的所有的表名。.schema [TABLE]显示具体的表的结构信息。

osquery> .show
osquery - being built, with love, at Facebook

osquery 3.3.0
using SQLite 3.19.3

General settings:
     Flagfile: 
       Config: filesystem (/etc/osquery/osquery.conf)
       Logger: filesystem (/var/log/osquery/)
  Distributed: tls
     Database: ephemeral
   Extensions: core
       Socket: /home/xingjun/.osquery/shell.em

Shell settings:
         echo: off
      headers: on
         mode: pretty
    nullvalue: ""
       output: stdout
    separator: "|"
        width: 

Non-default flags/options:
  database_path: /home/xingjun/.osquery/shell.db
  disable_database: true
  disable_events: true
  disable_logging: true
  disable_watchdog: true
  extensions_socket: /home/xingjun/.osquery/shell.em
  hash_delay: 0
  logtostderr: true
  stderrthreshold: 3

可以看到设置包括常规设置(General settings)、shell设置(Shell settings)、非默认选项(Non-default flags/options)。在常规设置中主要是显示了各种配置文件的位置(配置文件/存储日志文件的路径)。 在shell设置中包括了是否需要表头信息(headers),显示方式(mode: pretty),分隔符(separator: "|")。

.table可以查看在当前操作系统中所支持的所有的表,虽然在schema中列出了所有的表(包括了win平台,MacOS平台,Linux平台)。但是具体到某一个平台上面是不会包含其他平台上的表。下方显示的就是我在CentOS7下显示的表。

osquery> .table
  => acpi_tables
  => apt_sources
  => arp_cache
  => augeas
  => authorized_keys
  => block_devices
  => carbon_black_info
  => carves
  => chrome_extensions
  => cpu_time
  => cpuid
  => crontab
...

.schema [TABLE]可以用于查看具体的表的结构信息。如下所示:

osquery> .schema users
CREATE TABLE users(`uid` BIGINT, `gid` BIGINT, `uid_signed` BIGINT, `gid_signed` BIGINT, `username` TEXT, `description` TEXT, `directory` TEXT, `shell` TEXT, `uuid` TEXT, `type` TEXT HIDDEN, PRIMARY KEY (`uid`, `username`)) WITHOUT ROWID;
osquery> .schema processes
CREATE TABLE processes(`pid` BIGINT, `name` TEXT, `path` TEXT, `cmdline` TEXT, `state` TEXT, `cwd` TEXT, `root` TEXT, `uid` BIGINT, `gid` BIGINT, `euid` BIGINT, `egid` BIGINT, `suid` BIGINT, `sgid` BIGINT, `on_disk` INTEGER, `wired_size` BIGINT, `resident_size` BIGINT, `total_size` BIGINT, `user_time` BIGINT, `system_time` BIGINT, `disk_bytes_read` BIGINT, `disk_bytes_written` BIGINT, `start_time` BIGINT, `parent` BIGINT, `pgroup` BIGINT, `threads` INTEGER, `nice` INTEGER, `is_elevated_token` INTEGER HIDDEN, `upid` BIGINT HIDDEN, `uppid` BIGINT HIDDEN, `cpu_type` INTEGER HIDDEN, `cpu_subtype` INTEGER HIDDEN, `phys_footprint` BIGINT HIDDEN, PRIMARY KEY (`pid`)) WITHOUT ROWID;

上面通过.schema查看users和processes表的信息,结果输出的是他们对应的DDL。

0x03 基本使用

在本章节中,将会演示使用osqueryi来实时查询操作系统中的信息(为了方便展示查询结果使用的是.mode line模式)。

查看系统信息

osquery> select * from system_info;
          hostname = localhost
              uuid = 4ee0ad05-c2b2-47ce-aea1-c307e421fa88
          cpu_type = x86_64
       cpu_subtype = 158
         cpu_brand = Intel(R) Core(TM) i5-8400 CPU @ 2.80GHz
cpu_physical_cores = 1
 cpu_logical_cores = 1
     cpu_microcode = 0x84
   physical_memory = 2924228608
   hardware_vendor = 
    hardware_model = 
  hardware_version = 
   hardware_serial = 
     computer_name = localhost.localdomain
    local_hostname = localhost

查询的结果包括了CPU的型号,核数,内存大小,计算机名称等等;

查看OS版本

osquery> select * from os_version;
         name = CentOS Linux
      version = CentOS Linux release 7.4.1708 (Core)
        major = 7
        minor = 4
        patch = 1708
        build = 
     platform = rhel
platform_like = rhel
     codename =

可以看到我的本机的操作系统的版本是CentOS Linux release 7.4.1708 (Core)

查看内核信息版本

osquery> SELECT * FROM kernel_info;
  version = 3.10.0-693.el7.x86_64
arguments = ro crashkernel=auto rd.lvm.lv=centos/root rd.lvm.lv=centos/swap rhgb quiet LANG=en_US.UTF-8
     path = /vmlinuz-3.10.0-693.el7.x86_64
   device = /dev/mapper/centos-root

osquery> SELECT * FROM kernel_modules LIMIT 3;
   name = tcp_lp
   size = 12663
used_by = -
 status = Live
address = 0xffffffffc06cf000

   name = fuse
   size = 91874
used_by = -
 status = Live
address = 0xffffffffc06ae000

   name = xt_CHECKSUM
   size = 12549
used_by = -
 status = Live
address = 0xffffffffc06a9000

查询repo和pkg信息

osquery提供查询系统中的repo和pkg相关信息的表。在Ubuntu中对应的是apt相关的包信息,在CentOS中对应的是yum相关的包信息。本例均以yum包为例进行说明

osquery> SELECT * FROM yum_sources  limit 2;
    name = CentOS-$releasever - Base
 baseurl = 
 enabled = 
gpgcheck = 1
  gpgkey = file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7

    name = CentOS-$releasever - Updates
 baseurl = 
 enabled = 
gpgcheck = 1
  gpgkey = file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7

我们可以直接利用yum_sources来查看操作系统的yum源相关的信息。

osquery> SELECT name, version FROM rpm_packages order by name limit 3;
   name = GConf2
version = 3.2.6

   name = GeoIP
version = 1.5.0

   name = ModemManager
version = 1.6.0

利用rpm_packages查看系统中已经安装的rpm包信息。我们也可以通过name对我们需要查询的包进行过滤,如下:

osquery> SELECT name, version FROM rpm_packages where name="osquery";
   name = osquery
version = 3.3.0

挂载信息

我们可以使用mounts表来查询系统中的具体的驱动信息。例如我们可以如下的SQL语句进行查询:

SELECT * FROM mounts;
SELECT device, path, type, inodes_free, flags FROM mounts;

我们也可以使用where语句查询某一个具体的驱动信息,例如ext4或者是tmpfs信息。如下:

osquery> SELECT device, path, type, inodes_free, flags FROM mounts WHERE type="ext4";
osquery> SELECT device, path, type, inodes_free, flags FROM mounts WHERE type="tmpfs";
     device = tmpfs
       path = /dev/shm
       type = tmpfs
inodes_free = 356960
      flags = rw,seclabel,nosuid,nodev

     device = tmpfs
       path = /run
       type = tmpfs
inodes_free = 356386
      flags = rw,seclabel,nosuid,nodev,mode=755

     device = tmpfs
       path = /sys/fs/cgroup
       type = tmpfs
inodes_free = 356945
      flags = ro,seclabel,nosuid,nodev,noexec,mode=755

     device = tmpfs
       path = /run/user/42
       type = tmpfs
inodes_free = 356955
      flags = rw,seclabel,nosuid,nodev,relatime,size=285572k,mode=700,uid=42,gid=42

     device = tmpfs
       path = /run/user/1000
       type = tmpfs
inodes_free = 356939
      flags = rw,seclabel,nosuid,nodev,relatime,size=285572k,mode=700,uid=1000,gid=1000

内存信息

使用memory_info查看内存信息,如下:

osquery> select * from memory_info;
memory_total = 2924228608
 memory_free = 996024320
     buffers = 4280320
      cached = 899137536
 swap_cached = 0
      active = 985657344
    inactive = 629919744
  swap_total = 2684350464
   swap_free = 2684350464

网卡信息

使用interface_addresses查看网卡信息,如下:

osquery> SELECT * FROM interface_addresses;
     interface = lo
       address = 127.0.0.1
          mask = 255.0.0.0
     broadcast = 
point_to_point = 127.0.0.1
          type = 

     interface = virbr0
       address = 192.168.122.1
          mask = 255.255.255.0
     broadcast = 192.168.122.255
point_to_point = 
          type = 

     interface = lo
       address = ::1
          mask = ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff
     broadcast = 
point_to_point = 
          type =

还可以使用interface_details查看更加具体的网卡信息。

SELECT * FROM interface_details;
SELECT interface, mac, ipackets, opackets, ibytes, obytes FROM interface_details;

查询结果如下

osquery> SELECT * FROM interface_details;
  interface = lo
        mac = 00:00:00:00:00:00
       type = 4
        mtu = 65536
     metric = 0
      flags = 65609
   ipackets = 688
   opackets = 688
     ibytes = 59792
     obytes = 59792
    ierrors = 0
    oerrors = 0
     idrops = 0
     odrops = 0
 collisions = 0
last_change = -1
 link_speed = 
   pci_slot = 
    ....

系统启动时间

osquery> select * from uptime;
         days = 0
        hours = 2
      minutes = 23
      seconds = 51
total_seconds = 8631

查询用户信息

osquery提供了多个表用于查询用户的信息,包括使用users表检索系统中所有的用户,使用last表查看用户上次登录的信息,使用logged_in_users表查询具有活动shell的用户信息。

使用select * from users查看所有用户信息,使用类似于uid>1000的方式过滤用户。

osquery> select * from users where uid>1000;
        uid = 65534
        gid = 65534
 uid_signed = 65534
 gid_signed = 65534
   username = nfsnobody
description = Anonymous NFS User
  directory = /var/lib/nfs
      shell = /sbin/nologin
       uuid =

我们可以使用last表查询最终的登录信息,如SELECT * FROM last;。对于普通用户来说,其type值为7。那么我们的查询条件如下:

osquery> SELECT * FROM last where type=7;
username = user
     tty = :0
     pid = 12776
    type = 7
    time = 1539882439
    host = :0

username = user
     tty = pts/0
     pid = 13754
    type = 7
    time = 1539882466
    host = :0

其中的time是时间戳类型,转换为具体的日期之后就可以看到具体的登录时间了。

使用SELECT * FROM logged_in_users;查看当前已经登录的用户信息。

防火墙信息

我们可以使用iptables来查看具体的防火墙信息,如select * from iptables;,也可以进行过滤查询具体的防火墙信息。如SELECT chain, policy, src_ip, dst_ip FROM iptables WHERE chain="POSTROUTING" order by src_ip;

进程信息

我们可以使用processes来查询系统上进程的信息,包括pid,name,path,command等等。
可以使用select * from processes;或者查看具体的某几项信息,select pid,name,path,cmdline from processes;

osquery> select pid,name,path,cmdline from processes limit 2;
    pid = 1
   name = systemd
   path = 
cmdline = /usr/lib/systemd/systemd --switched-root --system --deserialize 21

    pid = 10
   name = watchdog/0
   path = 
cmdline =

检查计划任务

我们可以使用crontab来检查系统中的计划任务。

osquery> select * from crontab;
       event = 
      minute = 01
        hour = *
day_of_month = *
       month = *
 day_of_week = *
     command = root run-parts /etc/cron.hourly
        path = /etc/cron.d/0hourly

       event = 
      minute = 0
        hour = 1
day_of_month = *
       month = *
 day_of_week = Sun
     command = root /usr/sbin/raid-check
        path = /etc/cron.d/raid-check

其他

在Linux中还存在其他很多的表能够帮助我们更好地进行入侵检测相关的工作,包括process_events、socket_events、process_open_sockets等等,这些表可供我们进行入侵检测的确认工作。至于这些表的工作原理,有待阅读osquery的源代码进行进一步分析。

0x04 总结

本文主要是对Osquery的基础功能进行了介绍。Oquery的强大功能需要进一步地挖掘和发现。总体来说,Osquery将操作系统中的信息抽象成为一张张表,对于进行基线检查,系统监控是一个非常优雅的方式。当然由于Osquery在这方面的优势,也可以考虑将其作为HIDS的客户端,但是如果HIDS仅仅只有Osquery也显然是不够的。

以上