0%

文件系统初探

1. VFS

VFS以一组通用对象看待所有文件系统.

  • 超级块(SuperBlock)
  • 索引节点(inode)
  • 目录项(dentry)
  • 文件(struct file)

1.1. 超级块 sb

超级块代表一个已经安装的文件系统,存储该文件系统的有关信息(如类型、大小、状态等)

  1. 对基于磁盘的文件系统, 该对象通常存放在磁盘的特定扇区上;
  2. 非磁盘的(如sysfs),现场创建保存在内存中
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
/*
 * struct super_block - VFS superblock object.
 * Per the surrounding notes it represents one mounted filesystem and stores
 * information about it (type, size, state, ...).
 */
struct super_block {
// Links superblocks into a list
struct list_head s_list; /* Keep this first */
// Identifier of the device on which this filesystem resides
dev_t s_dev; /* search index; _not_ kdev_t */
// Block size expressed in bits (log2 of the size), e.g. 10 for 1024-byte blocks
unsigned char s_blocksize_bits;
// Block size, in bytes
unsigned long s_blocksize;
loff_t s_maxbytes; /* Max file size */
// Filesystem type
struct file_system_type *s_type;
// Superblock operations
const struct super_operations *s_op;
// Disk-quota handling methods used by the VFS
const struct dquot_operations *dq_op;
// Quota configuration methods; handle requests coming from user space
const struct quotactl_ops *s_qcop;
// Export methods (directories shared by an NFS server are called exported directories)
const struct export_operations *s_export_op;
// Mount flags
unsigned long s_flags;
unsigned long s_iflags; /* internal SB_I_* flags */
// Magic number, used to identify the filesystem
unsigned long s_magic;
// Dentry of the filesystem's root directory; any file can be reached from the
// superblock provided the location of the root directory is known
struct dentry *s_root;
// Semaphore used for unmounting
struct rw_semaphore s_umount;
// Reference count
int s_count;
// Reference count
// NOTE(review): the original note is identical to the one on s_count;
// presumably this is the count of active references - confirm
atomic_t s_active;
// Pointer to the superblock's extended-attribute handler structures
const struct xattr_handler **s_xattr;
// Encryption-related operations
const struct fscrypt_operations *s_cop;
// List of anonymous dentries to be exported
struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */
struct list_head s_mounts; /* list of mounts; _not_ for fs use */
struct block_device *s_bdev;
struct backing_dev_info *s_bdi;
struct mtd_info *s_mtd;
struct hlist_node s_instances;
unsigned int s_quota_types; /* Bitmask of supported quota types */
struct quota_info s_dquot; /* Diskquota specific options */

struct sb_writers s_writers;

char s_id[32]; /* Informational name */
u8 s_uuid[16]; /* UUID */

void *s_fs_info; /* Filesystem private info */
unsigned int s_max_links;
fmode_t s_mode;

/* Granularity of c/m/atime in ns.
Cannot be worse than a second */
u32 s_time_gran;

/*
* The next field is for VFS *only*. No filesystems have any business
* even looking at it. You had been warned.
*/
struct mutex s_vfs_rename_mutex; /* Kludge */

/*
* Filesystem subtype. If non-empty the filesystem type field
* in /proc/mounts will be "type.subtype"
*/
char *s_subtype;

/*
* Saved mount options for lazy filesystems using
* generic_show_options()
*/
char __rcu *s_options;
const struct dentry_operations *s_d_op; /* default d_op for dentries */

/*
* Saved pool identifier for cleancache (-1 means none)
*/
int cleancache_poolid;

struct shrinker s_shrink; /* per-sb shrinker handle */

/* Number of inodes with nlink == 0 but still referenced */
atomic_long_t s_remove_count;

/* Being remounted read-only */
int s_readonly_remount;

/* AIO completions deferred from interrupt context */
struct workqueue_struct *s_dio_done_wq;
struct hlist_head s_pins;

/*
* Keep the lru lists last in the structure so they always sit on their
* own individual cachelines.
*/
struct list_lru s_dentry_lru ____cacheline_aligned_in_smp;
struct list_lru s_inode_lru ____cacheline_aligned_in_smp;
struct rcu_head rcu;
struct work_struct destroy_work;

struct mutex s_sync_lock; /* sync serialisation lock */

/*
* Indicates how deep in a filesystem stack this SB is
*/
int s_stack_depth;

/* s_inode_list_lock protects s_inodes */
spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp;
struct list_head s_inodes; /* all inodes */
};

1.1.1. 超级块操作

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/*
 * struct super_operations - table of superblock callbacks.
 * Every member is a function pointer; most take the struct super_block,
 * inode or dentry they act on as an argument.
 */
struct super_operations {
/* inode lifecycle callbacks */
struct inode *(*alloc_inode)(struct super_block *sb);
void (*destroy_inode)(struct inode *);
void (*dirty_inode) (struct inode *, int flags);
int (*write_inode) (struct inode *, struct writeback_control *wbc);
int (*drop_inode) (struct inode *);
void (*evict_inode) (struct inode *);
/* whole-superblock callbacks */
void (*put_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
int (*freeze_super) (struct super_block *);
int (*freeze_fs) (struct super_block *);
int (*thaw_super) (struct super_block *);
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
/* NOTE(review): the *2 / *_mnt_data variants additionally take a vfsmount or
 * opaque mount data; presumably vendor-tree extensions - confirm */
int (*remount_fs2) (struct vfsmount *, struct super_block *, int *, char *);
void *(*clone_mnt_data) (void *);
void (*copy_mnt_data) (void *, void *);
void (*umount_begin) (struct super_block *);

/* show_* callbacks write into the given seq_file */
int (*show_options)(struct seq_file *, struct dentry *);
int (*show_options2)(struct vfsmount *,struct seq_file *, struct dentry *);
int (*show_devname)(struct seq_file *, struct dentry *);
int (*show_path)(struct seq_file *, struct dentry *);
int (*show_stats)(struct seq_file *, struct dentry *);
/* quota callbacks exist only when CONFIG_QUOTA is defined */
#ifdef CONFIG_QUOTA
ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
struct dquot **(*get_dquots)(struct inode *);
#endif
int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
/* cached-object callbacks take a struct shrink_control */
long (*nr_cached_objects)(struct super_block *,
struct shrink_control *);
long (*free_cached_objects)(struct super_block *,
struct shrink_control *);
};

1.2. 索引节点 inode

用于存放内核在操作文件或目录时需要的全部信息.
具体文件系统的索引节点存储在磁盘上,使用时将其读入内存填充VFS的索引节点,之后VFS inode的任何修改回写到磁盘上.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/*
 * struct inode - VFS index node.
 * Per the surrounding notes it holds all the information the kernel needs to
 * operate on a file or directory; an on-disk inode is read in to fill this
 * structure and later modifications are written back to disk.
 */
struct inode {
umode_t i_mode;
unsigned short i_opflags;
kuid_t i_uid;
kgid_t i_gid;
unsigned int i_flags;

#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
#endif

/* inode operations table */
const struct inode_operations *i_op;
struct super_block *i_sb;
struct address_space *i_mapping;

#ifdef CONFIG_SECURITY
void *i_security;
#endif

/* Stat data, not accessed from path walking */
unsigned long i_ino;
/*
* Filesystems may only read i_nlink directly. They shall use the
* following functions for modification:
*
* (set|clear|inc|drop)_nlink
* inode_(inc|dec)_link_count
*/
union {
const unsigned int i_nlink;
unsigned int __i_nlink;
};
dev_t i_rdev;
loff_t i_size;
struct timespec i_atime;
struct timespec i_mtime;
struct timespec i_ctime;
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned short i_bytes;
unsigned int i_blkbits;
blkcnt_t i_blocks;

#ifdef __NEED_I_SIZE_ORDERED
seqcount_t i_size_seqcount;
#endif

/* Misc */
unsigned long i_state;
struct mutex i_mutex;

unsigned long dirtied_when; /* jiffies of first dirtying */
unsigned long dirtied_time_when;

/* per the notes following this listing: links in-use inodes that share a hash value */
struct hlist_node i_hash;
struct list_head i_io_list; /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback *i_wb; /* the associated cgroup wb */

/* foreign inode detection, see wbc_detach_inode() */
int i_wb_frn_winner;
u16 i_wb_frn_avg_time;
u16 i_wb_frn_history;
#endif
struct list_head i_lru; /* inode LRU list */
struct list_head i_sb_list;
union {
/* per the dentry notes in this document: head of the list of dentries
 * (linked via their d_alias) that refer to this inode */
struct hlist_head i_dentry;
struct rcu_head i_rcu;
};
u64 i_version;
atomic_t i_count;
atomic_t i_dio_count;
atomic_t i_writecount;
#ifdef CONFIG_IMA
atomic_t i_readcount; /* struct files open RO */
#endif
const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
struct file_lock_context *i_flctx;
struct address_space i_data;
struct list_head i_devices;
union {
struct pipe_inode_info *i_pipe;
struct block_device *i_bdev;
struct cdev *i_cdev;
char *i_link;
};

__u32 i_generation;

#ifdef CONFIG_FSNOTIFY
__u32 i_fsnotify_mask; /* all events this inode cares about */
struct hlist_head i_fsnotify_marks;
#endif

#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
struct fscrypt_info *i_crypt_info;
#endif
void *i_private; /* fs or device private pointer */
};

4个管理inode的链表:

  1. inode_unused, 目前还没使用的inode
  2. inode_in_use,目前正在使用的inode
  3. 超级块的s_dirty字段,将所有脏inode链接
  4. 仅靠inode_in_use链表查找效率不高, 因此对使用中的inode计算hash值; hash值可能重复, i_hash将具有相同hash值的多个inode链接在一起

1.2.1. 索引节点操作

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
/*
 * struct inode_operations - table of inode callbacks.
 * Every member is a function pointer operating on inodes and/or dentries.
 */
struct inode_operations {
// Look up an entry in the given directory
struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
const char * (*follow_link) (struct dentry *, void **);
int (*permission) (struct inode *, int);
int (*permission2) (struct vfsmount *, struct inode *, int);
struct posix_acl * (*get_acl)(struct inode *, int);

int (*readlink) (struct dentry *, char __user *,int);
void (*put_link) (struct inode *, void *);

int (*create) (struct inode *,struct dentry *, umode_t, bool);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct inode *,struct dentry *,const char *);
int (*mkdir) (struct inode *,struct dentry *,umode_t);
int (*rmdir) (struct inode *,struct dentry *);
int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *);
int (*rename2) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
int (*setattr) (struct dentry *, struct iattr *);
int (*setattr2) (struct vfsmount *, struct dentry *, struct iattr *);
int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
/* Set a specific extended attribute on the given file. Extended attributes
 * (xattr) let users associate a file with information that the filesystem
 * does not interpret (e.g. the file's author or encoding), as opposed to the
 * regular file attributes strictly defined by the filesystem, such as the
 * creation and modification times.
*/
int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*removexattr) (struct dentry *, const char *);
int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
u64 len);
int (*update_time)(struct inode *, struct timespec *, int);
int (*atomic_open)(struct inode *, struct dentry *,
struct file *, unsigned open_flag,
umode_t create_mode, int *opened);
int (*tmpfile) (struct inode *, struct dentry *, umode_t);
int (*set_acl)(struct inode *, struct posix_acl *, int);
} ____cacheline_aligned;

1.3. 目录项(dentry)

为方便查找操作, 引入目录. 每个目录项代表路径的一部分.

/home/test/test.c (test.c也是一个目录项)

目录项将路径中的每个部分与其对应的inode相连,沿着路径各部分的目录项进行搜索,最终找到目标文件的inode.

与超级块和索引节点不同, 目录项在磁盘上没有对应的描述, 只存在于内存中(目录项缓存), 仅仅为提高系统性能而存在.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/*
 * struct dentry - VFS directory entry.
 * Per the surrounding notes each dentry represents one component of a path
 * and connects that component to its inode.
 */
struct dentry {
/* RCU lookup touched fields */
unsigned int d_flags; /* protected by d_lock */
seqcount_t d_seq; /* per dentry seqlock */
// Node linking this dentry into the dentry hash list.
// NOTE(review): the original note says it "links all dentries and stores a
// parent pointer and a next pointer" - verify against hlist_bl_node.
struct hlist_bl_node d_hash; /* lookup hash list */
// Parent directory entry
struct dentry *d_parent; /* parent directory */
// Name of this directory entry
struct qstr d_name;
// Inode associated with this directory entry
struct inode *d_inode; /* Where the name belongs to - NULL is
* negative */
unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */

/* Ref lookup also touches following */
struct lockref d_lockref; /* per-dentry lock and refcount */
// dentry operations
const struct dentry_operations *d_op;
// Superblock of the filesystem this dentry belongs to
struct super_block *d_sb; /* The root of the dentry tree */
unsigned long d_time; /* used by d_revalidate */
void *d_fsdata; /* fs-specific data */
// List of least-recently-used dentries
struct list_head d_lru; /* LRU list */
// Links this dentry into its parent's d_subdirs list
struct list_head d_child; /* child of parent list */
// Head of the list of this dentry's children
struct list_head d_subdirs; /* our children */
/*
* d_alias and d_rcu can share memory
*/
union {
// One inode may correspond to several dentries; all dentries associated with
// that inode are hung on the inode's i_dentry list through d_alias
struct hlist_node d_alias; /* inode alias list */
struct rcu_head d_rcu;
} d_u;
};

1.3.1. 几点小结

  • 每个dentry通过d_hash字段挂入dentry_hashtable中的某个链表里. 通过该链表管理所有目录.目录一经创建, 就会加入到该链表里
  • 引用计数为0的dentry通过d_lru挂入链表dentry_unused,等待释放或重新使用
  • 每个dentry通过d_inode与一个inode关联,多个dentry可以与一个inode关联
  • 指向同一个inode的dentry通过d_alias字段链接在一起,都挂入inode的i_dentry链表中
  • 每个dentry通过d_parent字段指向其parent目录的dentry,通过d_child跟同一目录中其他文件的dentry链接在一起,都挂在parent目录dentry的d_subdirs链表中
  • 每个dentry通过d_sb指向所属文件系统的超级块

1.3.2. dentry的状态

  • 空闲状态(free): 不包含有效信息, 且未被VFS使用
  • 未使用状态(unused): d_inode字段仍指向关联的inode, 但引用计数为0, 未被VFS使用, 没有进程访问. 在内存回收时可能被丢弃
  • 使用状态(in use) 存在使用进程, 关联inode. 不会被丢弃
  • 负状态(negative) 没有关联inode, 由于其关联的inode被删除或解析一个不存在的文件创建,留作使用

1.3.3. 目录项操作(dentry_operations)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/*
 * struct dentry_operations - table of dentry callbacks.
 * Every member is a function pointer; nearly all take the dentry they act on.
 */
struct dentry_operations {
// Check whether the dentry is still valid
int (*d_revalidate)(struct dentry *, unsigned int);
int (*d_weak_revalidate)(struct dentry *, unsigned int);
int (*d_hash)(const struct dentry *, struct qstr *);
int (*d_compare)(const struct dentry *, const struct dentry *,
unsigned int, const char *, const struct qstr *);
int (*d_delete)(const struct dentry *);
void (*d_release)(struct dentry *);
void (*d_prune)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
char *(*d_dname)(struct dentry *, char *, int);
struct vfsmount *(*d_automount)(struct path *);
int (*d_manage)(struct dentry *, bool);
struct inode *(*d_select_inode)(struct dentry *, unsigned);
struct dentry *(*d_real)(struct dentry *, struct inode *);
void (*d_canonical_path)(const struct path *, struct path *);
} ____cacheline_aligned;

1.4. 文件对象

文件对象描述进程已经打开的文件. 进程直接处理的是文件对象, 而不是其他三个对象. 因多个进程可以打开和操作同一个文件, 所以同一个文件可能存在多个对应的文件对象, 但对应的inode和dentry是唯一的.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/*
 * struct file - VFS file object.
 * Per the surrounding notes it describes a file a process has opened; one
 * on-disk file may have several file objects.
 */
struct file {
union {
// Original note: all open files of a filesystem are linked through this field
// into the s_files list of the filesystem's superblock.
// NOTE(review): the struct super_block listed in this document has no s_files
// member - this note may describe an older kernel; confirm.
struct llist_node fu_llist;
struct rcu_head fu_rcuhead;
} f_u;
struct path f_path;
// Associated inode
struct inode *f_inode; /* cached value */
// File operations
const struct file_operations *f_op;

/*
* Protects f_ep_links, f_flags.
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
atomic_long_t f_count;
// Flags specified when the file was opened
unsigned int f_flags;
fmode_t f_mode;
struct mutex f_pos_lock;
// Current file offset; each read/write starts from this position
loff_t f_pos;
struct fown_struct f_owner;
const struct cred *f_cred;
// File read-ahead state
struct file_ra_state f_ra;
// Incremented when f_pos changes (per the original note - TODO confirm)
u64 f_version;
#ifdef CONFIG_SECURITY
void *f_security;
#endif
/* needed for tty driver, and maybe others */
// Private data for use by the filesystem or a driver
void *private_data;

#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
struct list_head f_ep_links;
struct list_head f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
// See the i_mapping field of struct inode
struct address_space *f_mapping;
} __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */

内核从磁盘上将inode装入内存时, 与该inode相关的文件操作存放在i_fop字段,之后进程打开这个文件时,VFS通过inode中的i_fop初始化新文件对应的f_op字段.

1.4.1. 文件操作(file_operations f_op)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
/*
 * struct file_operations - table of file callbacks.
 * Per the surrounding notes an inode's i_fop is used to initialise the f_op
 * of a newly opened file object.
 */
struct file_operations {
struct module *owner;
loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
// Map the given file into the given address space; invoked by mmap()
int (*mmap) (struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
// Invoked when close() is called; behaviour depends on the specific filesystem
int (*flush) (struct file *, fl_owner_t id);
// Invoked to release the file object when its reference count drops to 0
int (*release) (struct inode *, struct file *);
// Write all cached data of the file to disk; invoked by the fsync() and
// fdatasync() system calls. fdatasync() only affects the file's data, while
// fsync() also synchronises the file's attributes
int (*fsync) (struct file *, loff_t, loff_t, int datasync);
int (*aio_fsync) (struct kiocb *, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
int (*flock) (struct file *, int, struct file_lock *);
// Move data from a pipe to a file; invoked by the splice() system call
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
// Move data from a file to a pipe; invoked by the splice() system call
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
// Set a lease on an open file: a mechanism by which the process holding the
// lease is notified when another process tries to open or read/write the file
int (*setlease)(struct file *, long, struct file_lock **, void **);
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *);
#endif
};

1.5. 文件系统相关的数据结构

1.5.1. file_system_type(具体文件系统类型)

ex: ext2_fs_type ext3_fs_type vfat_fs_type

Linux 支持的文件系统, 都会有且只有一个file_system_type结构, 每当有一个文件系统被安装时,会有一个vfsmount结构被创建, 代表该文件系统的一个安装实例, 也代表了该文件系统的一个安装点

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/*
 * struct file_system_type - describes one concrete filesystem type
 * (the surrounding notes give ext2_fs_type, ext3_fs_type and vfat_fs_type as
 * examples); each supported filesystem has exactly one such structure.
 */
struct file_system_type {
const char *name;
int fs_flags;
/* bit values for fs_flags */
#define FS_REQUIRES_DEV 1
#define FS_BINARY_MOUNTDATA 2
#define FS_HAS_SUBTYPE 4
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
#define FS_USERNS_DEV_MOUNT 16 /* A userns mount does not imply MNT_NODEV */
#define FS_USERNS_VISIBLE 32 /* FS must already be visible */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);
struct dentry *(*mount2) (struct vfsmount *, struct file_system_type *, int,
const char *, void *);
void *(*alloc_mnt_data) (void);
void (*kill_sb) (struct super_block *);
struct module *owner;
// Different filesystem types are chained into a list through next
struct file_system_type * next;
// Superblocks of the same filesystem type are linked together through their
// s_instances field and hung on this fs_supers list
struct hlist_head fs_supers;

struct lock_class_key s_lock_key;
struct lock_class_key s_umount_key;
struct lock_class_key s_vfs_rename_key;
struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];

struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key;
struct lock_class_key i_mutex_dir_key;
};

1.5.2. vfsmount实例

1
2
3
4
5
6
7
8
9
/*
 * struct vfsmount - per the surrounding notes, created each time a
 * filesystem is mounted; represents one mounted instance / mount point.
 */
struct vfsmount {
// Dentry of the root directory of this filesystem.
// NOTE(review): the original note adds that it is "usually the same as the
// mountpoint" - confirm; the mount point may be a dentry of the parent filesystem.
struct dentry *mnt_root; /* root of the mounted tree */
// Points to the superblock of the mounted filesystem
struct super_block *mnt_sb; /* pointer to superblock */
// Mount flags
int mnt_flags;
void *data;
};

1.5.3. 与进程相关的数据结构

每个进程都有自己的根目录和当前工作目录,内核使用了struct fs_struct记录这些信息, 进程描述符的fs字段指向该进程的fs_struct结构

1
2
3
4
5
6
7
8
9
10
/*
 * struct fs_struct - per the surrounding notes, records a process's root
 * directory and current working directory; the process descriptor's fs
 * field points to it.
 */
struct fs_struct {
int users;
spinlock_t lock;
seqcount_t seq;
// Default file access permissions set when opening a file (the umask)
int umask;
int in_exec;
// pwd refers to the current working directory (root: the root directory)
struct path root, pwd;
};

除了根目录/当前工作目录, 进程还需要记录自己打开的文件. 进程已经打开的文件用struct files_struct来记录, 进程描述符的files字段指向该进程的files_struct结构.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * struct files_struct - per the surrounding notes, records the files a
 * process has opened; the process descriptor's files field points to it.
 */
struct files_struct {
/*
* read mostly part
*/
atomic_t count;
bool resize_in_progress;
wait_queue_head_t resize_wait;
// fdtab is the initial file-descriptor table; fdt initially points to fdtab
struct fdtable __rcu *fdt;
struct fdtable fdtab;
/*
* written part on a separate cache line in SMP
*/
spinlock_t file_lock ____cacheline_aligned_in_smp;
// Next available file descriptor: the smallest one among the recently
// closed descriptors (per the original note)
int next_fd;
// File descriptors that must be closed on exec
unsigned long close_on_exec_init[1];
// File descriptors currently open
unsigned long open_fds_init[1];
unsigned long full_fds_bits_init[1];
// Initial array of file objects
struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

文件描述符fd等于文件对象在数组fd_array中的下标. fd_array中的前3个分别对应stdin、stdout、stderr三个标准描述符.

内核将fd、max_fds以及其他字段组织在一起, 构成fdtable, 称为文件描述符表. 当进程打开的文件数目超过初始数组fd_array的容量(NR_OPEN_DEFAULT, 原文以32为例)时, 内核调用expand_fdtable()生成一个新的文件描述符表, 并让fdt指向它.

1
2
3
4
5
6
7
8
9
10
11
/*
 * struct fdtable - file-descriptor table; per the surrounding notes it
 * groups fd, max_fds and related fields together.
 */
struct fdtable {
// Maximum number of file objects
unsigned int max_fds;
// Points to the current array of file objects; initially points to fd_array
struct file __rcu **fd; /* current fd array */
unsigned long *close_on_exec;
// File descriptors currently open; initially points to open_fds_init
unsigned long *open_fds;
unsigned long *full_fds_bits;
struct rcu_head rcu;
};

1.5.4. 路径查找辅助结构

nameidata用于在路径查找的过程中记录中间信息和查找结果

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/*
 * struct nameidata - per the surrounding notes, records intermediate
 * information and the result during path lookup.
 */
struct nameidata {
// The last path object resolved
struct path path;
// Name of the current directory entry (path component)
struct qstr last;
// Root directory
struct path root;
// Inode associated with the parent directory
struct inode *inode; /* path.dentry.d_inode */
unsigned int flags;
unsigned seq, m_seq;
// Type of the last path component; one of LAST_NORM/LAST_DOT/LAST_DOTDOT/LAST_BIND
int last_type;
// Nesting depth of symbolic links
unsigned depth;
int total_link_count;
/* per-level saved state; stack and the embedded internal[] array share this type */
struct saved {
struct path link;
void *cookie;
const char *name;
struct inode *inode;
unsigned seq;
} *stack, internal[EMBEDDED_LEVELS];
struct filename *name;
struct nameidata *saved;
unsigned root_seq;
int dfd;
};

1.6. VFS的缓存机制

内核启动时,通过vfs_caches_init()创建inode/dentry/file文件对象/vfsmount缓存提高操作系统性能

1.6.1. inode缓存

  • 从内存中申请或释放一个inode对象,必须通过kmem_cache_alloc和kmem_cache_free进行
  • 将inode对象插入不同的链表, 具有相同hash值的inode对象在同一链表中. 当访问一个inode时,先在缓存hash表中查询,如果查到,引用计数+1, 如没有, 需要找到一个空闲的inode, 从底层的文件系统中读取信息填充该inode, 插入到对应的链表中.

1.6.2. 目录项缓存

执行文件操作时, VFS需要解析文件路径中的每一部分, 并为之构造目录项对象, 重复访问同一文件时或包含相同的目录项时,直接从内存中获得对应的dentry

目录项的缓存机制与inode缓存类似, 不重复介绍. 申请dentry时需要关联inode.

1.6.3. 缓冲区缓存

对磁盘文件的访问, 最终转化为对磁盘的操作. 扇区是块设备的基本单元, 也是最小的寻址单元. 内核在扇区之上抽象出了块的概念, 块的大小是扇区的n倍, 且不能超过页面大小, 通常为512B/1KB/4KB.

块被作为文件系统的最小寻址单元, 一个磁盘块被调入内存时, 需存储在对应的内存上的缓冲区中.

新的kernel版本上, page cache中包含buffer cache. page cache的页大小为4k, 根据配置的块大小, 一个页可以包含一个或多个(最多8个, 对应块大小512b)buffer cache.

1.7. 文件系统的注册与安装

将指定文件系统的file_system_type对象向内核注册.已注册文件系统的file_system_type对象形成链表.

1.7.1. 文件系统安装

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * struct mnt_namespace - mount namespace; per the surrounding notes,
 * processes sharing a namespace see the same mounted file hierarchy.
 */
struct mnt_namespace {
atomic_t count;
struct ns_common ns;
// Mount object of the root directory of this namespace (the original note
// calls it the vfsmount object; the declared type is struct mount *)
struct mount * root;
// Links all mounted filesystems that belong to this namespace
struct list_head list;
struct user_namespace *user_ns;
u64 seq; /* Sequence number to prevent loops */
wait_queue_head_t poll;
u64 event;
unsigned int mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
};

默认情况下, 所有进程共享同样的namespace, 即看到的是同样的结构.但如调用clone()时使用CLONE_NEWNS标志, 进程会获得一个新的namespace. 只有保证是相同的namespace, 看到的文件结构才一致.

mount是基于进程的namespace进行安装的. 同样的namespace共享安装