redis 源码探讨--AOF实现

redis是内存型的数据库，当机器宕机以后，数据就会丢失。

redis为了数据的持久化，提供了两种机制，AOF和RDB。

下一篇分析RDB。

打开AOF需要在配置文件里面打开

appendonly yes

会在配置文件的目录里面生成appendonly.aof文件

AOF是以明文记录操作的，比如收到"set testkey testvalue"，记录的明文是

*3
$3
set
$7
testkey
$9
testvalue

其中 *3 表示当前命令有三个部分，每个部分由“$+数字”开头，后面跟着的是具体的命令，键或值，数字表示这部分的命令，键，或者值一共有多少个字节。

在上一篇中处理redis命令的时候核心函数是调用了 call(c,CMD_CALL_FULL);

int processCommand(client *c) {
//处理集群，LUA，安全验证等一系列命令
....
	//在server.command的字典里面查找redisCommand的命令
    c->cmd = c->lastcmd = lookupCommand(c->argv[0]->ptr);
    ....
//执行普通命令
    if (c->flags & CLIENT_MULTI &&
        c->cmd->proc != execCommand && c->cmd->proc != discardCommand &&
        c->cmd->proc != multiCommand && c->cmd->proc != watchCommand)
    {
        queueMultiCommand(c);
        addReply(c,shared.queued);
    } else {
//处理命令核心逻辑
        call(c,CMD_CALL_FULL);
        c->woff = server.master_repl_offset;
        if (listLength(server.ready_keys))
            handleClientsBlockedOnKeys();
    }
    return C_OK;
}

处理完redis的命令以后就会进行aof的持久化工作，调用了propagate函数

void call(client *c, int flags) {
....
/* Propagate the command into the AOF and replication link */
    if (flags & CMD_CALL_PROPAGATE &&
        (c->flags & CLIENT_PREVENT_PROP) != CLIENT_PREVENT_PROP)
    {
        int propagate_flags = PROPAGATE_NONE;

        /* Check if the command operated changes in the data set. If so
         * set for replication / AOF propagation. */
        if (dirty) propagate_flags |= (PROPAGATE_AOF|PROPAGATE_REPL);

        /* If the client forced AOF / replication of the command, set
         * the flags regardless of the command effects on the data set. */
        if (c->flags & CLIENT_FORCE_REPL) propagate_flags |= PROPAGATE_REPL;
        if (c->flags & CLIENT_FORCE_AOF) propagate_flags |= PROPAGATE_AOF;

        /* However prevent AOF / replication propagation if the command
         * implementations called preventCommandPropagation() or similar,
         * or if we don't have the call() flags to do so. */
        if (c->flags & CLIENT_PREVENT_REPL_PROP ||
            !(flags & CMD_CALL_PROPAGATE_REPL))
                propagate_flags &= ~PROPAGATE_REPL;
        if (c->flags & CLIENT_PREVENT_AOF_PROP ||
            !(flags & CMD_CALL_PROPAGATE_AOF))
                propagate_flags &= ~PROPAGATE_AOF;

        /* Call propagate() only if at least one of AOF / replication
         * propagation is needed. Note that modules commands handle replication
         * in an explicit way, so we never replicate them automatically. */
        if (propagate_flags != PROPAGATE_NONE && !(c->cmd->flags & CMD_MODULE))
   	     //调用传播函数
            propagate(c->cmd,c->db->id,c->argv,c->argc,propagate_flags);
    }
}

在propagate函数里面调用了feedAppendOnlyFile把相关的操作写入到了server.aof的缓存里面

//判断是否开启AOF和是否是集群模式需要进行传播
void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
               int flags)
{
    if (server.aof_state != AOF_OFF && flags & PROPAGATE_AOF)
        feedAppendOnlyFile(cmd,dbid,argv,argc);
    if (flags & PROPAGATE_REPL)
        replicationFeedSlaves(server.slaves,dbid,argv,argc);
}

//按照AOF文件格式写入一个临时的简单动态字符串SDS，
//AOF文件的格式是：*3 表示当前命令有三个部分，每部分都是$+数字开头，后面是具体的命令
//键或值一共有多少个字节
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
   ......
   //按照格式进行字符串格式化
    if (dictid != server.aof_selected_db) {
        char seldb[64];

        snprintf(seldb,sizeof(seldb),"%d",dictid);
        buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
            (unsigned long)strlen(seldb),seldb);
        server.aof_selected_db = dictid;
    }
   ....
   //写入到server.aof_buf,等待刷入硬盘  
    if (server.aof_state == AOF_ON)
        server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf));

    /* If a background append only file rewriting is in progress we want to
     * accumulate the differences between the child DB and the current one
     * in a buffer, so that when the child process will do its work we
     * can append the differences to the new append only file. */
    if (server.aof_child_pid != -1)
        aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));
    sdsfree(buf);
}

刷盘的核心操作是在flushAppendOnlyFile函数里面，这里要区分三种落盘的方式，分别是

Alaways: 同步会写，每个命令执行完，同步的将日志落盘

Everysec:每秒落盘，每个写命令执行完毕以后，每隔一秒落盘

No:由操作系统决定何时落盘

#define AOF_WRITE_LOG_ERROR_RATE 30 /* Seconds between errors logging. */
//forces 0 不强制落盘
//		1 强制落盘
void flushAppendOnlyFile(int force) {
    ssize_t nwritten;
    int sync_in_progress = 0;
    mstime_t latency;
	//因为在AOF_FSYNC_EVERYSEC的模式下，fsync只有在aof buffer不为空的时候才会调用
	//为了使得用户的操作落盘的更为及时
    if (sdslen(server.aof_buf) == 0) {      
        if (server.aof_fsync == AOF_FSYNC_EVERYSEC &&
            server.aof_fsync_offset != server.aof_current_size &&
            server.unixtime > server.aof_last_fsync &&
            !(sync_in_progress = aofFsyncInProgress())) {
            goto try_fsync;
        } else {
            return;
        }
    }
	//在AOF_FSYNC_EVERYSEC模式下面，需要判断后台是不是有bio的线程在跑
	//此时加了锁的操作，如果有bio线程在跑那么现在由于不能获得锁，处于阻塞
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
        sync_in_progress = aofFsyncInProgress();

    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
        /* With this append fsync policy we do background fsyncing.
         * If the fsync is still in progress we can try to delay
         * the write for a couple of seconds. */
        if (sync_in_progress) {
            if (server.aof_flush_postponed_start == 0) {
                /* No previous write postponing, remember that we are
                 * postponing the flush and return. */
                server.aof_flush_postponed_start = server.unixtime;
                return;
            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                /* We were already waiting for fsync to finish, but for less
                 * than two seconds this is still ok. Postpone again. */
                return;
            }
            /* Otherwise fall trough, and go write since we can't wait
             * over two seconds. */
            server.aof_delayed_fsync++;
            serverLog(LL_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
        }
    }
    /* We want to perform a single write. This should be guaranteed atomic
     * at least if the filesystem we are writing is a real physical one.
     * While this will save us against the server being killed I don't think
     * there is much to do about the whole server stopping for power problems
     * or alike */

    if (server.aof_flush_sleep && sdslen(server.aof_buf)) {
        usleep(server.aof_flush_sleep);
    }
	//记录开始时间
    latencyStartMonitor(latency);
    //调用系统函数写入文件
    nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
    latencyEndMonitor(latency);
    /* We want to capture different events for delayed writes:
     * when the delay happens with a pending fsync, or with a saving child
     * active, and when the above two conditions are missing.
     * We also use an additional event name to save all samples which is
     * useful for graphing / monitoring purposes. */
    //记录操作时间日志
    if (sync_in_progress) {
        latencyAddSampleIfNeeded("aof-write-pending-fsync",latency);
    } else if (hasActiveChildProcess()) {
        latencyAddSampleIfNeeded("aof-write-active-child",latency);
    } else {
        latencyAddSampleIfNeeded("aof-write-alone",latency);
    }
    latencyAddSampleIfNeeded("aof-write",latency);

    /* We performed the write so reset the postponed flush sentinel to zero. */
    server.aof_flush_postponed_start = 0;
	//处理错误情况
    if (nwritten != (ssize_t)sdslen(server.aof_buf)) {
        static time_t last_write_error_log = 0;
        int can_log = 0;

        /* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */
        if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) {
            can_log = 1;
            last_write_error_log = server.unixtime;
        }

        /* Log the AOF write error and record the error code. */
        if (nwritten == -1) {
            if (can_log) {
                serverLog(LL_WARNING,"Error writing to the AOF file: %s",
                    strerror(errno));
                server.aof_last_write_errno = errno;
            }
        } else {
            if (can_log) {
                serverLog(LL_WARNING,"Short write while writing to "
                                       "the AOF file: (nwritten=%lld, "
                                       "expected=%lld)",
                                       (long long)nwritten,
                                       (long long)sdslen(server.aof_buf));
            }

            if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
                if (can_log) {
                    serverLog(LL_WARNING, "Could not remove short write "
                             "from the append-only file.  Redis may refuse "
                             "to load the AOF the next time it starts.  "
                             "ftruncate: %s", strerror(errno));
                }
            } else {
                /* If the ftruncate() succeeded we can set nwritten to
                 * -1 since there is no longer partial data into the AOF. */
                nwritten = -1;
            }
            server.aof_last_write_errno = ENOSPC;
        }

        /* Handle the AOF write error. */
        if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
            /* We can't recover when the fsync policy is ALWAYS since the
             * reply for the client is already in the output buffers, and we
             * have the contract with the user that on acknowledged write data
             * is synced on disk. */
            serverLog(LL_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");
            exit(1);
        } else {
            /* Recover from failed write leaving data into the buffer. However
             * set an error to stop accepting writes as long as the error
             * condition is not cleared. */
            server.aof_last_write_status = C_ERR;

            /* Trim the sds buffer if there was a partial write, and there
             * was no way to undo it with ftruncate(2). */
            if (nwritten > 0) {
                server.aof_current_size += nwritten;
                sdsrange(server.aof_buf,nwritten,-1);
            }
            return; /* We'll try again on the next call... */
        }
    } else {
        /* Successful write(2). If AOF was in error state, restore the
         * OK state and log the event. */
        if (server.aof_last_write_status == C_ERR) {
            serverLog(LL_WARNING,
                "AOF write error looks solved, Redis can write again.");
            server.aof_last_write_status = C_OK;
        }
    }
    server.aof_current_size += nwritten;

    /* Re-use AOF buffer when it is small enough. The maximum comes from the
     * arena size of 4k minus some overhead (but is otherwise arbitrary). */
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        sdsclear(server.aof_buf);
    } else {
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }
//因为是goto语句的跳转，如果正常执行也是会走到下面的
try_fsync:
    /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
     * children doing I/O in the background. */
    if (server.aof_no_fsync_on_rewrite && hasActiveChildProcess())
        return;

    /* Perform the fsync if needed. */
    //在AOF_FSYNC_ALWAYS的模式下，每次操作都会执行一次落盘操作
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
        /* redis_fsync is defined as fdatasync() for Linux in order to avoid
         * flushing metadata. */
        latencyStartMonitor(latency);
        //真正的落盘操作
        redis_fsync(server.aof_fd); /* Let's try to get this data on the disk */
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-fsync-always",latency);
        server.aof_fsync_offset = server.aof_current_size;
        server.aof_last_fsync = server.unixtime;
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        if (!sync_in_progress) {
        	//使用bio线程落盘，有可能会被阻塞，因为有加锁的操作
            aof_background_fsync(server.aof_fd);
            server.aof_fsync_offset = server.aof_current_size;
        }
        server.aof_last_fsync = server.unixtime;
    }
}

那么落盘函数是什么时候调用的呢？

第一个是beforesleep里面：

void beforeSleep(struct aeEventLoop *eventLoop) {
    ....
    /* Write the AOF buffer on disk */
    flushAppendOnlyFile(0);
    ...
}

beforesleep是每次在主循环里面都会调用的函数，这是第一个地方

int aeProcessEvents(aeEventLoop *eventLoop, int flags)
{
   ...
        if (eventLoop->beforesleep != NULL && flags & AE_CALL_BEFORE_SLEEP)
            eventLoop->beforesleep(eventLoop);
            ....

如果是每秒执行的模式下，是在servercron里面周期性的调用：

int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
   .....
        /* Trigger an AOF rewrite if needed. */
        if (server.aof_state == AOF_ON &&
            !hasActiveChildProcess() &&
            server.aof_rewrite_perc &&
            server.aof_current_size > server.aof_rewrite_min_size)
        {
            long long base = server.aof_rewrite_base_size ?
                server.aof_rewrite_base_size : 1;
            long long growth = (server.aof_current_size*100/base) - 100;
            if (growth >= server.aof_rewrite_perc) {
                serverLog(LL_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);
                rewriteAppendOnlyFileBackground();
            }
        }
    }


    //在每一次severcron执行的时候判断是否还有没有及时落盘的操作
    if (server.aof_flush_postponed_start) flushAppendOnlyFile(0);

	//每秒执行
    run_with_period(1000) {
        if (server.aof_last_write_status == C_ERR)
            flushAppendOnlyFile(0);
    }
   .....
 
}

AOF是以文件追加的方式记录日志的，当一个键值对被多条写命令反复操作时候，AOF会记录多条命令。AOF有个重写的机制可以减少日志，重写就是根据键值对最新的状态记录一条日志。由于要把整个数据库的最新日志都写回磁盘，这是一个非常耗时的操作。redis使用了后台线程bgrewriteaof来完成。

出发的时机，

1 当用户通过config命令修改appendonly no 变成 appendonly yes

2 用户调用BGREWRITEAOF

3 在servercorn里面判断当前用户调用了BGREWRITEAOF命令，但是BGSAVE后台进程已经执行完了，第二个是超过了配置文件里面配置了最小重写的大小。

重写函数的核心逻辑

//当用户调用BGREWRITEAOF会fork一个子进程进行重写工作，主线程仍然在继续工作不会被阻塞
//fork出来的字进程由于会拷贝所用主进程的内存，所以是一个拷贝，两处操作，子进程的重写工作
//会生成临时文件。
int rewriteAppendOnlyFileBackground(void) {
    pid_t childpid;

    if (hasActiveChildProcess()) return C_ERR;
    if (aofCreatePipes() != C_OK) return C_ERR;
    openChildInfoPipe();
    if ((childpid = redisFork()) == 0) {
        char tmpfile[256];

        /* Child */
        //子进程的标题和设置CPU亲和性
        redisSetProcTitle("redis-aof-rewrite");
        redisSetCpuAffinity(server.aof_rewrite_cpulist);
        //临时文件用来重写工作
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
        if (rewriteAppendOnlyFile(tmpfile) == C_OK) {
            sendChildCOWInfo(CHILD_INFO_TYPE_AOF, "AOF rewrite");
            exitFromChild(0);
        } else {
            exitFromChild(1);
        }
    } else {
        /* Parent */
        if (childpid == -1) {
            closeChildInfoPipe();
            serverLog(LL_WARNING,
                "Can't rewrite append only file in background: fork: %s",
                strerror(errno));
            aofClosePipes();
            return C_ERR;
        }
        serverLog(LL_NOTICE,
            "Background append only file rewriting started by pid %d",childpid);
        server.aof_rewrite_scheduled = 0;
        server.aof_rewrite_time_start = time(NULL);
        server.aof_child_pid = childpid;
        /* We set appendseldb to -1 in order to force the next call to the
         * feedAppendOnlyFile() to issue a SELECT command, so the differences
         * accumulated by the parent into server.aof_rewrite_buf will start
         * with a SELECT statement and it will be safe to merge. */
        server.aof_selected_db = -1;
        replicationScriptCacheFlush();
        return C_OK;
    }
    return C_OK; /* unreached */
}

redis 源码探讨--AOF实现

继续阅读

秒懂JVM的三大参数类型，就靠这十个小实验了初级—中级—高级三个级别的大厂面试真题阿里云——Java 实习生/初级美团——Java 中级蚂蚁金服——Java 高级基础篇JVM 篇MySQL 篇Redis 篇

CentOs7 安装redis4.0 遇到的坑一、CentOs7连接网络二、make文件的时候出错三、安装ruby的redis插件时候报错最后

docker 搭建Redis 问题解决

如何解决Redis缓存击穿、雪崩、穿透问题

java 版本的redis-stat不能运行在后台和daemon

redis管理常用命令

django短信验证码的后端实现

Redis订阅了一段时间后订阅失效了（ redisTemplate.convertAndSend）

2022秋招面试总结（cpp+java+测开）百度测开一面字节后端一面虾皮后端一面虾皮后端二面

数据迁移方法数据迁移原则数据迁移之双写方案数据迁移之级联同步方案

微服务-性能压测\缓存redis和分布式锁redisson和SpringCache

Nacos 2.0 升级前后性能对比压测

Spring数据和Redis

redis集群数据一致性_RedisRaft为Redis集群带来强大的数据一致性

supervisor 管理redis 和httpd 环境centos7

Redis简介一(单机版)发展历程Redis