1. VFS VFS以一组通用对象看待所有文件系统.
超级块(SuperBlock)
索引节点(inode)
目录项(dentry)
文件(struct file)
1.1. 超级块 sb 超级块代表一个已经安装的文件系统,存储该文件系统的有关信息(如类型、大小、状态等)
对基于磁盘
的文件系统, 该对象通常存放在磁盘的特定扇区上;
对非磁盘
的(如sysfs),现场创建保存在内存中
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 struct super_block { struct list_head s_list; dev_t s_dev; unsigned char s_blocksize_bits; unsigned long s_blocksize; loff_t s_maxbytes; struct file_system_type *s_type; const struct super_operations *s_op; const struct dquot_operations *dq_op; const struct quotactl_ops *s_qcop; const struct export_operations *s_export_op; unsigned long s_flags; unsigned long s_iflags; unsigned long s_magic; struct dentry *s_root; struct rw_semaphore s_umount; int s_count; atomic_t s_active; const struct xattr_handler **s_xattr; const struct fscrypt_operations *s_cop; struct hlist_bl_head s_anon; struct list_head s_mounts; struct block_device *s_bdev; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct hlist_node s_instances; unsigned int s_quota_types; struct quota_info s_dquot; struct sb_writers s_writers; char s_id[32 ]; u8 s_uuid[16 ]; void *s_fs_info; unsigned int s_max_links; fmode_t s_mode; u32 s_time_gran; struct mutex s_vfs_rename_mutex; char *s_subtype; char __rcu *s_options; const struct dentry_operations *s_d_op; int cleancache_poolid; struct shrinker s_shrink; atomic_long_t s_remove_count; int s_readonly_remount; struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; struct list_lru s_dentry_lru ____cacheline_aligned_in_smp; struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; struct work_struct destroy_work; struct mutex s_sync_lock; int s_stack_depth; spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; struct list_head s_inodes; };
1.1.1. 超级块操作 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *, int flags); int (*write_inode) (struct inode *, struct writeback_control *wbc); int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); int (*freeze_super) (struct super_block *); int (*freeze_fs) (struct super_block *); int (*thaw_super) (struct super_block *); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); int (*remount_fs2) (struct vfsmount *, struct super_block *, int *, char *); void *(*clone_mnt_data) (void *); void (*copy_mnt_data) (void *, void *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); int (*show_options2)(struct vfsmount *,struct seq_file *, struct dentry *); int (*show_devname)(struct seq_file *, struct dentry *); int (*show_path)(struct seq_file *, struct dentry *); int (*show_stats)(struct seq_file *, struct dentry *); #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int , char *, size_t , loff_t ); ssize_t (*quota_write)(struct super_block *, int , const char *, size_t , loff_t ); struct dquot **(*get_dquots)(struct inode *); #endif int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t ); long (*nr_cached_objects)(struct super_block *, struct shrink_control *); long (*free_cached_objects)(struct super_block *, struct shrink_control *); };
1.2. 索引节点 inode 用于存放内核在操作文件或目录时需要的全部信息. 具体文件系统的索引节点存储在磁盘上,使用时将其读入内存填充VFS的索引节点,之后VFS inode的任何修改回写到磁盘上.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 struct inode { umode_t i_mode; unsigned short i_opflags; kuid_t i_uid; kgid_t i_gid; unsigned int i_flags; #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif const struct inode_operations *i_op; struct super_block *i_sb; struct address_space *i_mapping; #ifdef CONFIG_SECURITY void *i_security; #endif unsigned long i_ino; union { const unsigned int i_nlink; unsigned int __i_nlink; }; dev_t i_rdev; loff_t i_size; struct timespec i_atime; struct timespec i_mtime; struct timespec i_ctime; spinlock_t i_lock; unsigned short i_bytes; unsigned int i_blkbits; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED seqcount_t i_size_seqcount; #endif unsigned long i_state; struct mutex i_mutex; unsigned long dirtied_when; unsigned long dirtied_time_when; struct hlist_node i_hash; struct list_head i_io_list; #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; int i_wb_frn_winner; u16 i_wb_frn_avg_time; u16 i_wb_frn_history; #endif struct list_head i_lru; struct list_head i_sb_list; union { struct hlist_head i_dentry; struct rcu_head i_rcu; }; u64 i_version; atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; #ifdef CONFIG_IMA atomic_t i_readcount; #endif const struct file_operations *i_fop; struct file_lock_context *i_flctx; struct address_space i_data; struct list_head i_devices; union { struct pipe_inode_info *i_pipe; struct block_device *i_bdev; struct cdev *i_cdev; char *i_link; }; __u32 i_generation; #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; struct hlist_head i_fsnotify_marks; #endif #if IS_ENABLED(CONFIG_FS_ENCRYPTION) struct fscrypt_info *i_crypt_info; #endif void *i_private; };
4个管理inode的链表:
inode_unused, 目前还没使用的inode
inode_in_use,目前正在使用的inode
超级块的s_dirty字段,将所有脏inode链接
仅靠inode_in_use链表查找效率不高, 因此对使用中的inode计算hash值; 不同inode的hash值可能相同, i_hash字段将hash值相同的多个inode链接在一起
1.2.1. 索引节点操作 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int ); const char * (*follow_link) (struct dentry *, void **); int (*permission) (struct inode *, int ); int (*permission2) (struct vfsmount *, struct inode *, int ); struct posix_acl * (*get_acl)(struct inode *, int ); int (*readlink) (struct dentry *, char __user *,int ); void (*put_link) (struct inode *, void *); int (*create) (struct inode *,struct dentry *, umode_t , bool ); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct inode *,struct dentry *,const char *); int (*mkdir) (struct inode *,struct dentry *,umode_t ); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct inode *,struct dentry *,umode_t ,dev_t ); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); int (*rename2) (struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int ); int (*setattr) (struct dentry *, struct iattr *); int (*setattr2) (struct vfsmount *, struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t ,int ); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t ); ssize_t (*listxattr) (struct dentry *, char *, size_t ); int (*removexattr) (struct dentry *, const char *); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); int (*update_time)(struct inode *, struct timespec *, int ); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t ); int (*set_acl)(struct inode *, struct posix_acl *, int ); } ____cacheline_aligned;
1.3. 目录项(dentry) 为方便查找操作, 引入目录. 每个目录项代表路径的一部分.
/home/test/test.c (test.c也是一个目录项)
目录项将路径中的每个部分与其对应的inode相连,沿着路径各部分的目录项进行搜索,最终找到目标文件的inode.
与超级块和索引节点不同, 目录项在磁盘上没有对应描述. 只存在于内存中(目录项缓存),仅仅为提高系统的性能存在.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 struct dentry { unsigned int d_flags; seqcount_t d_seq; struct hlist_bl_node d_hash; struct dentry *d_parent; struct qstr d_name; struct inode *d_inode; unsigned char d_iname[DNAME_INLINE_LEN]; struct lockref d_lockref; const struct dentry_operations *d_op; struct super_block *d_sb; unsigned long d_time; void *d_fsdata; struct list_head d_lru; struct list_head d_child; struct list_head d_subdirs; union { struct hlist_node d_alias; struct rcu_head d_rcu; } d_u; };
1.3.1. 几点小结
每个dentry通过d_hash
字段挂入dentry_hashtable中的某个链表里. 通过该哈希表管理所有目录项. 目录项一经创建, 就会加入到对应的链表里
引用计数为0的dentry通过d_lru挂入链表dentry_unused,等待释放或重新使用
每个dentry通过d_inode与一个inode关联,多个dentry可以与一个inode关联
指向同一个inode的dentry通过d_alias字段链接在一起,都挂入inode的i_dentry链表中
每个dentry通过d_parent字段指向其parent目录的dentry,通过d_child跟同一目录中其他文件的dentry链接在一起,都挂在parent目录dentry的d_subdirs链表中
每个dentry通过d_sb指向所属文件系统的超级块
1.3.2. dentry的状态
空闲状态(free): 不包含有效信息, 且未被VFS使用
未使用状态(unused): d_inode字段仍指向关联的inode, 但引用计数为0, 未被VFS使用, 没有进程访问. 在内存回收时可能被丢弃
使用状态(in use) 存在使用进程, 关联inode. 不会被丢弃
负状态(negative) 没有关联inode: 其关联的inode已被删除, 或者该dentry是在解析一个不存在的路径时创建的. 保留在缓存中以备后续查找使用
1.3.3. 目录项操作(dentry_oprations) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 struct dentry_operations { int (*d_revalidate)(struct dentry *, unsigned int ); int (*d_weak_revalidate)(struct dentry *, unsigned int ); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, const struct dentry *, unsigned int , const char *, const struct qstr *); int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); void (*d_prune)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int ); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(struct dentry *, bool ); struct inode *(*d_select_inode)(struct dentry *, unsigned ); struct dentry *(*d_real)(struct dentry *, struct inode *); void (*d_canonical_path)(const struct path *, struct path *); } ____cacheline_aligned;
1.4. 文件对象 文件对象描述进程已经打开的文件. 进程直接处理的是文件对象, 而不是其他三个对象. 因多个进程可以打开和操作同一个文件, 所以同一个文件可能存在多个对应的文件对象, 但对应的inode和dentry是唯一的.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 struct file { union { struct llist_node fu_llist; struct rcu_head fu_rcuhead; } f_u; struct path f_path; struct inode *f_inode; const struct file_operations *f_op; spinlock_t f_lock; atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; struct mutex f_pos_lock; loff_t f_pos; struct fown_struct f_owner; const struct cred *f_cred; struct file_ra_state f_ra; u64 f_version; #ifdef CONFIG_SECURITY void *f_security; #endif void *private_data; #ifdef CONFIG_EPOLL struct list_head f_ep_links; struct list_head f_tfile_llink; #endif struct address_space *f_mapping; } __attribute__((aligned (4 )));
内核从磁盘上将inode装入内存时, 与该inode相关的文件操作存放在i_fop字段,之后进程打开这个文件时,VFS通过inode中的i_fop初始化新文件对应的f_op字段.
1.4.1. 文件操作(file_operations f_op) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t , int ); ssize_t (*read) (struct file *, char __user *, size_t , loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t , loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); unsigned int (*poll) (struct file *, struct poll_table_struct *) ; long (*unlocked_ioctl) (struct file *, unsigned int , unsigned long ); long (*compat_ioctl) (struct file *, unsigned int , unsigned long ); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, loff_t , loff_t , int datasync); int (*aio_fsync) (struct kiocb *, int datasync); int (*fasync) (int , struct file *, int ); int (*lock) (struct file *, int , struct file_lock *); ssize_t (*sendpage) (struct file *, struct page *, int , size_t , loff_t *, int ); unsigned long (*get_unmapped_area) (struct file *, unsigned long , unsigned long , unsigned long , unsigned long ) ; int (*check_flags)(int ); int (*flock) (struct file *, int , struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t , unsigned int ); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t , unsigned int ); int (*setlease)(struct file *, long , struct file_lock **, void **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); #ifndef CONFIG_MMU unsigned (*mmap_capabilities)(struct file *); #endif };
1.5. 文件系统相关的数据结构 1.5.1. file_system_type(具体文件系统类型) ex: ext2_fs_type ext3_fs_type vfat_fs_type
Linux 支持的每种文件系统, 不管有多少个安装实例, 都有且只有一个file_system_type结构. 每当有一个文件系统被安装时,会有一个vfsmount
结构被创建, 代表该文件系统的一个安装实例, 也代表了该文件系统的一个安装点
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 struct file_system_type { const char *name; int fs_flags; #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 #define FS_USERNS_DEV_MOUNT 16 #define FS_USERNS_VISIBLE 32 #define FS_RENAME_DOES_D_MOVE 32768 struct dentry *(*mount) (struct file_system_type *, int , const char *, void *); struct dentry *(*mount2) (struct vfsmount *, struct file_system_type *, int , const char *, void *); void *(*alloc_mnt_data) (void ); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; struct hlist_head fs_supers; struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; struct lock_class_key s_vfs_rename_key; struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; struct lock_class_key i_mutex_dir_key; };
1.5.2. vfsmount实例 1 2 3 4 5 6 7 8 9 struct vfsmount { struct dentry *mnt_root; struct super_block *mnt_sb; int mnt_flags; void *data; };
1.5.3. 与进程相关的数据结构 每个进程都有自己的根目录和当前工作目录,内核使用了struct fs_struct记录这些信息, 进程描述符的fs字段指向该进程的fs_struct结构
1 2 3 4 5 6 7 8 9 10 struct fs_struct { int users; spinlock_t lock; seqcount_t seq; int umask; int in_exec; struct path root, pwd; };
除了根目录/当前工作目录, 进程还需要记录自己打开的文件. 进程已经打开的文件用struct files_struct来记录, 进程描述符
的files字段指向该进程的files_struct结构.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 struct files_struct { atomic_t count; bool resize_in_progress; wait_queue_head_t resize_wait; struct fdtable __rcu *fdt; struct fdtable fdtab; spinlock_t file_lock ____cacheline_aligned_in_smp; int next_fd; unsigned long close_on_exec_init[1 ]; unsigned long open_fds_init[1 ]; unsigned long full_fds_bits_init[1 ]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; };
文件描述符fd等于文件对象在数组fd_array中的下标. 通常fd_array中的前3个分别是stdin, stdout, stderr 标准描述符.
内核将fd/max_fds以及其他字段组织在一起, 成了fdtable,称为文件描述符表. 当进程打开的文件数目超过NR_OPEN_DEFAULT(即BITS_PER_LONG, 32位系统上为32, 64位系统上为64)时,内核调用expand_fdtable()生成一个新的文件描述符表, 并让fdt指向这个新表
1 2 3 4 5 6 7 8 9 10 11 struct fdtable { unsigned int max_fds; struct file __rcu **fd; unsigned long *close_on_exec; unsigned long *open_fds; unsigned long *full_fds_bits; struct rcu_head rcu; };
1.5.4. 路径查找辅助结构 nameidata用于在路径查找的过程中记录中间信息和查找结果
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 struct nameidata { struct path path; struct qstr last; struct path root; struct inode *inode; unsigned int flags; unsigned seq, m_seq; int last_type; unsigned depth; int total_link_count; struct saved { struct path link; void *cookie; const char *name; struct inode *inode; unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; struct nameidata *saved; unsigned root_seq; int dfd; };
1.6. VFS的缓存机制 内核启动时,通过vfs_caches_init()创建inode/dentry/file文件对象/vfsmount缓存提高操作系统性能
1.6.1. inode缓存
从内存中申请或释放一个inode对象,必须通过kmem_cache_alloc和kmem_cache_free进行
将inode对象插入不同的链表, 具有相同hash值的inode对象在同一链表中. 当访问一个inode时,先在缓存hash表中查询,如果查到,引用计数+1, 如没有, 需要找到一个空闲的inode, 从底层的文件系统中读取信息填充该inode, 插入到对应的链表中.
1.6.2. 目录项缓存 执行文件操作时, VFS需要解析文件路径中的每一部分, 并为之构造目录项对象, 重复访问同一文件时或包含相同的目录项时,直接从内存中获得对应的dentry
目录项的缓存机制与inode缓存类似, 不重复介绍. 申请dentry时需要关联inode.
1.6.3. 缓冲区缓存 对磁盘文件访问, 最终转化为对磁盘操作. 扇区是块设备的基本单元, 也是最小的寻址单元. 内核在扇区上抽象出了块
的概念, 块的大小是扇区大小的2^n倍, 且不能超过页面大小, 通常为512B/1KB/4KB
块被作为文件系统的最小寻址单元, 一个磁盘块被调入内存时, 需存储在对应的内存上的缓冲区中.
新的kernel版本上, page cache中包含buffer cache. page cache中的一个页通常为4KB, 根据配置的块大小, 一个页可以包含一个或多个buffer(块大小为512B时最多8个).
1.7. 文件系统的注册与安装 将指定文件系统的file_system_type对象向内核注册.已注册文件系统的file_system_type对象形成链表.
1.7.1. 文件系统安装 1 2 3 4 5 6 7 8 9 10 11 12 13 14 struct mnt_namespace { atomic_t count; struct ns_common ns; struct mount * root; struct list_head list; struct user_namespace *user_ns; u64 seq; wait_queue_head_t poll; u64 event; unsigned int mounts; unsigned int pending_mounts; };
默认情况下, 所有进程共享同样的namespace, 即看到的是同样的结构.但如调用clone()时使用CLONE_NEWNS标志, 进程会获得一个新的namespace. 只有保证是相同的namespace, 看到的文件结构才一致.
mount是基于进程的namespace进行安装的. 同样的namespace共享安装