diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d603f403047e..4c29be80436d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -173,15 +173,22 @@ static bool __same_bdev(struct f2fs_sb_info *sbi, */ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, struct writeback_control *wbc, - int npages, bool is_read) + int npages, bool is_read, + enum page_type type, enum temp_type temp) { struct bio *bio; bio = f2fs_bio_alloc(sbi, npages, true); f2fs_target_device(sbi, blk_addr, bio); - bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; - bio->bi_private = is_read ? NULL : sbi; + if (is_read) { + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = NULL; + } else { + bio->bi_end_io = f2fs_write_end_io; + bio->bi_private = sbi; + bio->bi_write_hint = io_type_to_rw_hint(sbi, type, temp); + } if (wbc) wbc_init_bio(wbc, bio); @@ -380,7 +387,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) /* Allocate a new bio */ bio = __bio_alloc(fio->sbi, fio->new_blkaddr, fio->io_wbc, - 1, is_read_io(fio->op)); + 1, is_read_io(fio->op), fio->type, fio->temp); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); @@ -443,7 +450,8 @@ alloc_new: goto out_fail; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, fio->io_wbc, - BIO_MAX_PAGES, false); + BIO_MAX_PAGES, false, + fio->type, fio->temp); io->fio = *fio; } @@ -867,8 +875,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_seg_type = NO_CHECK_TYPE; if (direct_io) { - /* map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); */ - map.m_seg_type = rw_hint_to_seg_type(WRITE_LIFE_NOT_SET); + map.m_seg_type = rw_hint_to_seg_type(iocb->ki_hint); flag = __force_buffered_io(inode, WRITE) ? 
F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; @@ -1150,8 +1157,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_DEFAULT, NULL, rw_hint_to_seg_type( - WRITE_LIFE_NOT_SET)); - /* inode->i_write_hint)); */ + inode->i_write_hint)); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -2292,10 +2298,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; int rw = iov_iter_rw(iter); int err; + enum rw_hint hint = iocb->ki_hint; + int whint_mode = sbi->whint_mode; err = check_direct_IO(inode, iter, offset); if (err) @@ -2306,11 +2315,16 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) trace_f2fs_direct_IO_enter(inode, offset, count, rw); + if (rw == WRITE && whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = WRITE_LIFE_NOT_SET; + down_read(&F2FS_I(inode)->dio_rwsem[rw]); err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio); up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { + if (whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = hint; if (err > 0) { f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, err); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1942b53b51eb..1fc80ba282dd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1035,6 +1035,11 @@ enum { MAX_TIME, }; +enum { + WHINT_MODE_OFF, /* not pass down write hints */ + WHINT_MODE_USER, /* try to pass down hints given by users */ +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -1218,6 +1223,8 @@ struct f2fs_sb_info { char *s_qf_names[MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif + /* For which write hints are passed 
down to block layer */ + int whint_mode; }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -2507,15 +2514,6 @@ static inline void *kvzalloc(size_t size, gfp_t flags) return ret; } -enum rw_hint { - WRITE_LIFE_NOT_SET = 0, - WRITE_LIFE_NONE = 1, /* RWH_WRITE_LIFE_NONE */ - WRITE_LIFE_SHORT = 2, /* RWH_WRITE_LIFE_SHORT */ - WRITE_LIFE_MEDIUM = 3, /* RWH_WRITE_LIFE_MEDIUM */ - WRITE_LIFE_LONG = 4, /* RWH_WRITE_LIFE_LONG */ - WRITE_LIFE_EXTREME = 5, /* RWH_WRITE_LIFE_EXTREME */ -}; - static inline int wbc_to_write_flags(struct writeback_control *wbc) { if (wbc->sync_mode == WB_SYNC_ALL) @@ -2806,6 +2804,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); int rw_hint_to_seg_type(enum rw_hint hint); +enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, + enum temp_type temp); /* * checkpoint.c diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 915984add03f..fbb5539d051a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2456,6 +2456,62 @@ int rw_hint_to_seg_type(enum rw_hint hint) } } +/* This returns write hints for each segment type. These hints will be + * passed down to block layer. There are mapping tables which depend on + * the mount option 'whint_mode'. + * + * 1) whint_mode=off. F2FS only passes down WRITE_LIFE_NOT_SET. + * + * 2) whint_mode=user-based. F2FS tries to pass down hints given by users. 
+ * + * User F2FS Block + * ---- ---- ----- + * META WRITE_LIFE_NOT_SET + * HOT_NODE " + * WARM_NODE " + * COLD_NODE " + * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME + * extension list " " + * + * -- buffered io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " " + * WRITE_LIFE_MEDIUM " " + * WRITE_LIFE_LONG " " + * + * -- direct io + * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME + * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT + * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET + * WRITE_LIFE_NONE " WRITE_LIFE_NONE + * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM + * WRITE_LIFE_LONG " WRITE_LIFE_LONG + * + */ + +enum rw_hint io_type_to_rw_hint(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) +{ + if (sbi->whint_mode == WHINT_MODE_USER) { + if (type == DATA) { + switch (temp) { + case COLD: + return WRITE_LIFE_EXTREME; + case HOT: + return WRITE_LIFE_SHORT; + default: + return WRITE_LIFE_NOT_SET; + } + } else { + return WRITE_LIFE_NOT_SET; + } + } else { + return WRITE_LIFE_NOT_SET; + } +} + static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) @@ -2644,6 +2700,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = META, + .temp = HOT, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = page->index, @@ -2692,6 +2749,8 @@ int rewrite_data_page(struct f2fs_io_info *fio) int err; fio->new_blkaddr = fio->old_blkaddr; + /* i/o temperature is needed for passing down write hints */ + __get_segment_type(fio); stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6afb5e249aa2..b5e6fd179e5b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -129,6 +129,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, + Opt_whint, Opt_err, }; @@ -182,6 
+183,7 @@ static match_table_t f2fs_tokens = { {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, + {Opt_whint, "whint_mode=%s"}, {Opt_err, NULL}, }; @@ -679,6 +681,22 @@ static int parse_options(struct super_block *sb, char *options) "quota operations not supported"); break; #endif + case Opt_whint: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (strlen(name) == 10 && + !strncmp(name, "user-based", 10)) { + sbi->whint_mode = WHINT_MODE_USER; + } else if (strlen(name) == 3 && + !strncmp(name, "off", 3)) { + sbi->whint_mode = WHINT_MODE_OFF; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -722,6 +740,12 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } + + /* Do not pass down write hints if the number of active logs is less + * than NR_CURSEG_TYPE. + */ + if (sbi->active_logs != NR_CURSEG_TYPE) + sbi->whint_mode = WHINT_MODE_OFF; return 0; } @@ -1232,6 +1256,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",prjquota"); #endif f2fs_show_quota_options(seq, sbi->sb); + if (sbi->whint_mode == WHINT_MODE_USER) + seq_printf(seq, ",whint_mode=%s", "user-based"); return 0; } @@ -1241,6 +1267,7 @@ static void default_options(struct f2fs_sb_info *sbi) /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS; + sbi->whint_mode = WHINT_MODE_OFF; set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1281,6 +1308,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); + int old_whint_mode = sbi->whint_mode; #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif @@ -1380,7 +1408,7 @@ static int 
f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & MS_RDONLY) { + if (*flags & MS_RDONLY || sbi->whint_mode != old_whint_mode) { writeback_inodes_sb(sb, WB_REASON_SYNC); sync_inodes_sb(sb); @@ -1430,6 +1458,7 @@ restore_opts: sbi->s_qf_names[i] = s_qf_names[i]; } #endif + sbi->whint_mode = old_whint_mode; sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; sb->s_flags = old_sb_flags; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index cd395ecec99d..99ad6101320b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -25,6 +25,7 @@ typedef void (bio_end_io_t) (struct bio *); struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; + unsigned short bi_write_hint; int bi_error; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. Use diff --git a/include/linux/fs.h b/include/linux/fs.h index 825e61a603d1..d384b50a2c42 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -315,6 +316,18 @@ struct page; struct address_space; struct writeback_control; +/* + * Write life time hint values. 
+ */ +enum rw_hint { + WRITE_LIFE_NOT_SET = 0, + WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE, + WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT, + WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, + WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, + WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, +}; + #define IOCB_EVENTFD (1 << 0) #define IOCB_APPEND (1 << 1) #define IOCB_DIRECT (1 << 2) @@ -329,6 +342,7 @@ struct kiocb { void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); void *private; int ki_flags; + enum rw_hint ki_hint; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) @@ -642,6 +656,7 @@ struct inode { spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; unsigned int i_blkbits; + enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED @@ -1069,8 +1084,6 @@ struct file_lock_context { #define OFFT_OFFSET_MAX INT_LIMIT(off_t) #endif -#include - extern void send_sigio(struct fown_struct *fown, int fd, int band); /* diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index beed138bd359..f85ed3a5ef4d 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -42,6 +42,27 @@ #define F_SEAL_WRITE 0x0008 /* prevent writes */ /* (1U << 31) is reserved for signed error codes */ +/* + * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the + * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on + * the specific file. + */ +#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) + +/* + * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be + * used to clear any hints previously set. 
+ */ +#define RWF_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NONE 1 +#define RWH_WRITE_LIFE_SHORT 2 +#define RWH_WRITE_LIFE_MEDIUM 3 +#define RWH_WRITE_LIFE_LONG 4 +#define RWH_WRITE_LIFE_EXTREME 5 + /* * Types of directory notifications that may be requested. */