diff --git a/src/aof.c b/src/aof.c index d59e4061..dbe7bfa6 100644 --- a/src/aof.c +++ b/src/aof.c @@ -226,6 +226,7 @@ int startAppendOnly(void) { * * However if force is set to 1 we'll write regardless of the background * fsync. */ +#define AOF_WRITE_LOG_ERROR_RATE 30 /* Seconds between errors logging. */ void flushAppendOnlyFile(int force) { ssize_t nwritten; int sync_in_progress = 0; @@ -267,27 +268,76 @@ void flushAppendOnlyFile(int force) { * or alike */ nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf)); if (nwritten != (signed)sdslen(server.aof_buf)) { - /* Ooops, we are in troubles. The best thing to do for now is - * aborting instead of giving the illusion that everything is - * working as expected. */ + static time_t last_write_error_log = 0; + int can_log = 0; + + /* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */ + if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) { + can_log = 1; + last_write_error_log = server.unixtime; + } + + /* Log the AOF write error and record the error code. */ if (nwritten == -1) { - redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno)); + if (can_log) { + redisLog(REDIS_WARNING,"Error writing to the AOF file: %s", + strerror(errno)); + server.aof_last_write_errno = errno; + } } else { - redisLog(REDIS_WARNING,"Exiting on short write while writing to " - "the append-only file: %s (nwritten=%ld, " - "expected=%ld)", - strerror(errno), - (long)nwritten, - (long)sdslen(server.aof_buf)); + if (can_log) { + redisLog(REDIS_WARNING,"Short write while writing to " + "the AOF file: (nwritten=%lld, " + "expected=%lld)", + (long long)nwritten, + (long long)sdslen(server.aof_buf)); + } if (ftruncate(server.aof_fd, server.aof_current_size) == -1) { - redisLog(REDIS_WARNING, "Could not remove short write " - "from the append-only file. Redis may refuse " - "to load the AOF the next time it starts. 
" - "ftruncate: %s", strerror(errno)); + if (can_log) { + redisLog(REDIS_WARNING, "Could not remove short write " + "from the append-only file. Redis may refuse " + "to load the AOF the next time it starts. " + "ftruncate: %s", strerror(errno)); + } + } else { + /* If the ftruncate() succeeded we can set nwritten to + * -1 since there is no longer partial data into the AOF. */ + nwritten = -1; } + server.aof_last_write_errno = ENOSPC; + } + + /* Handle the AOF write error. */ + if (server.aof_fsync == AOF_FSYNC_ALWAYS) { + /* We can't recover when the fsync policy is ALWAYS since the + * reply for the client is already in the output buffers, and we + * have the contract with the user that on acknowledged write data + * is synched on disk. */ + redisLog(REDIS_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting..."); + exit(1); + } else { + /* Recover from failed write leaving data into the buffer. However + * set an error to stop accepting writes as long as the error + * condition is not cleared. */ + server.aof_last_write_status = REDIS_ERR; + + /* Trim the sds buffer if there was a partial write, and there + * was no way to undo it with ftruncate(2). */ + if (nwritten > 0) { + server.aof_current_size += nwritten; + sdsrange(server.aof_buf,nwritten,-1); + } + return; /* We'll try again on the next call... */ + } + } else { + /* Successful write(2). If AOF was in error state, restore the + * OK state and log the event. 
*/ + if (server.aof_last_write_status == REDIS_ERR) { + redisLog(REDIS_WARNING, + "AOF write error looks solved, Redis can write again."); + server.aof_last_write_status = REDIS_OK; } - exit(1); } server.aof_current_size += nwritten; diff --git a/src/redis.c b/src/redis.c index 059c43d4..5299c69d 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1167,9 +1167,13 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { } - /* If we postponed an AOF buffer flush, let's try to do it every time the - * cron function is called. */ - if (server.aof_flush_postponed_start) flushAppendOnlyFile(0); + /* AOF: we may have postponed buffer flush, or were not able to + * write our buffer because of write(2) error. Try again here. */ + if (server.aof_flush_postponed_start || + server.aof_last_write_status == REDIS_ERR) + { + flushAppendOnlyFile(0); + } /* Close clients that need to be closed asynchronous */ freeClientsInAsyncFreeQueue(); @@ -1670,6 +1674,8 @@ void initServer() { server.unixtime = time(NULL); server.mstime = mstime(); server.lastbgsave_status = REDIS_OK; + server.aof_last_write_status = REDIS_OK; + server.aof_last_write_errno = 0; server.repl_good_slaves_count = 0; /* Create the serverCron() time event, that's our main way to process @@ -2035,15 +2041,22 @@ int processCommand(redisClient *c) { /* Don't accept write commands if there are problems persisting on disk * and if this is a master instance. 
*/ - if (server.stop_writes_on_bgsave_err && - server.saveparamslen > 0 - && server.lastbgsave_status == REDIS_ERR && + if (((server.stop_writes_on_bgsave_err && + server.saveparamslen > 0 && + server.lastbgsave_status == REDIS_ERR) || + server.aof_last_write_status == REDIS_ERR) && server.masterhost == NULL && (c->cmd->flags & REDIS_CMD_WRITE || c->cmd->proc == pingCommand)) { flagTransaction(c); - addReply(c, shared.bgsaveerr); + if (server.aof_last_write_status == REDIS_OK) + addReply(c, shared.bgsaveerr); + else + addReplySds(c, + sdscatprintf(sdsempty(), + "-MISCONF Errors writing to the AOF file: %s\r\n", + strerror(server.aof_last_write_errno))); return REDIS_OK; } @@ -2426,7 +2439,8 @@ sds genRedisInfoString(char *section) { "aof_rewrite_scheduled:%d\r\n" "aof_last_rewrite_time_sec:%jd\r\n" "aof_current_rewrite_time_sec:%jd\r\n" - "aof_last_bgrewrite_status:%s\r\n", + "aof_last_bgrewrite_status:%s\r\n" + "aof_last_write_status:%s\r\n", server.loading, server.dirty, server.rdb_child_pid != -1, @@ -2441,7 +2455,8 @@ sds genRedisInfoString(char *section) { (intmax_t)server.aof_rewrite_time_last, (intmax_t)((server.aof_child_pid == -1) ? -1 : time(NULL)-server.aof_rewrite_time_start), - (server.aof_lastbgrewrite_status == REDIS_OK) ? "ok" : "err"); + (server.aof_lastbgrewrite_status == REDIS_OK) ? "ok" : "err", + (server.aof_last_write_status == REDIS_OK) ? "ok" : "err"); if (server.aof_state != REDIS_AOF_OFF) { info = sdscatprintf(info, diff --git a/src/redis.h b/src/redis.h index 504d7947..d7a9eea0 100644 --- a/src/redis.h +++ b/src/redis.h @@ -691,6 +691,8 @@ struct redisServer { int aof_lastbgrewrite_status; /* REDIS_OK or REDIS_ERR */ unsigned long aof_delayed_fsync; /* delayed AOF fsync() counter */ int aof_rewrite_incremental_fsync;/* fsync incrementally while rewriting? 
*/ + int aof_last_write_status; /* REDIS_OK or REDIS_ERR */ + int aof_last_write_errno; /* Valid if aof_last_write_status is ERR */ /* RDB persistence */ long long dirty; /* Changes to DB from the last save */ long long dirty_before_bgsave; /* Used to restore dirty on failed BGSAVE */