summaryrefslogtreecommitdiff
path: root/share/man/man9/vnode.9
blob: 615db0c571ad6bcf24b59d18973143095f6516ae (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
.\"     $OpenBSD: vnode.9,v 1.20 2005/09/22 14:33:08 jmc Exp $
.\"
.\" Copyright (c) 2001 Constantine Sapuntzakis
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\"
.\" 1. Redistributions of source code must retain the above copyright
.\"    notice, this list of conditions and the following disclaimer.
.\" 2. The name of the author may not be used to endorse or promote products
.\"    derived from this software without specific prior written permission.
.\"
.\" THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
.\" INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
.\" AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
.\" THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
.\" EXEMPLARY, OR CONSEQUENTIAL  DAMAGES (INCLUDING, BUT NOT LIMITED TO,
.\" PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
.\" OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
.\" WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
.\" OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
.\" ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.\"
.Dd September 16, 2004
.Dt VNODE 9
.Os
.Sh NAME
.Nm vnode
.Nd an overview of vnodes
.Sh DESCRIPTION
A
.Em vnode
is an object in kernel memory that speaks the
.Ux
file interface (open, read, write, close, readdir, etc.).
Vnodes can represent files, directories, FIFOs, domain sockets, block devices,
and character devices.
.Pp
Each vnode has a set of methods which start with the string
.Dq VOP_ .
These methods include
.Fn VOP_OPEN ,
.Fn VOP_READ ,
.Fn VOP_WRITE ,
.Fn VOP_RENAME ,
.Fn VOP_CLOSE ,
and
.Fn VOP_MKDIR .
Many of these methods correspond closely to the equivalent
file system call \-
.Xr open 2 ,
.Xr read 2 ,
.Xr write 2 ,
.Xr rename 2 ,
etc.
Each file system (FFS, NFS, etc.) provides implementations for these methods.
.Pp
The Virtual File System library (see
.Xr vfs 9 )
maintains a pool of vnodes.
File systems cannot allocate their own vnodes; they must use the functions
provided by the VFS to create and manage vnodes.
.Pp
The definition of a vnode is as follows:
.Bd -literal
struct vnode {
	struct uvm_vnode v_uvm;		/* uvm(9) data */
	int	(**v_op)(void *);	/* vnode operations vector */
	enum	vtype v_type;		/* vnode type */
	u_int	v_flag;			/* vnode flags (see below) */
	u_int	v_usecount;		/* reference count of users */
	u_int	v_writecount;		/* reference count of writers */
	/* Flags that can be read/written in interrupts */
	u_int	v_bioflag;		/* flags used by intr handlers */
	u_int	v_holdcnt;		/* buffer references */
	u_int	v_id;			/* capability identifier */
	struct	mount *v_mount;		/* ptr to vfs we are in */
	TAILQ_ENTRY(vnode) v_freelist;	/* vnode freelist */
	LIST_ENTRY(vnode) v_mntvnodes;	/* vnodes for mount point */
	struct	buflists v_cleanblkhd;	/* clean blocklist head */
	struct	buflists v_dirtyblkhd;	/* dirty blocklist head */
	u_int	v_numoutput;		/* num of writes in progress */
	LIST_ENTRY(vnode) v_synclist;	/* vnode with dirty buffers */
	union {
	  struct mount    *vu_mountedhere;/* ptr to mounted vfs (VDIR) */
	  struct socket   *vu_socket;	/* UNIX IPC (VSOCK) */
	  struct specinfo *vu_specinfo;	/* device (VCHR, VBLK) */
	  struct fifoinfo *vu_fifoinfo;	/* fifo (VFIFO) */
	} v_un;

	struct	simplelock v_interlock;	/* lock on usecount and flag */
	struct	lock v_lock;		/* used for non-locking fs's */
	struct	lock *v_vnlock;		/* pointer to vnode lock */
	enum	vtagtype v_tag;		/* type of underlying data */
	void	*v_data;		/* private data for fs */
	struct {
	  struct simplelock vsi_lock;	/* lock to protect below */
	  struct selinfo vsi_selinfo;	/* identity of poller(s) */
	} v_selectinfo;
};
#define v_mountedhere	v_un.vu_mountedhere
#define v_socket	v_un.vu_socket
#define v_specinfo	v_un.vu_specinfo
#define v_fifoinfo	v_un.vu_fifoinfo
.Ed
.Ss Vnode life cycle
When a client of the VFS requests a new vnode, the vnode allocation
code can reuse an old vnode object that is no longer in use.
Whether a vnode is in use is tracked by the vnode reference count
.Pq Va v_usecount .
By convention, each open file handle holds a reference
as do VM objects backed by files.
A vnode with a reference count of 1 or more will not be deallocated or
reused to point to a different file.
So, if you want to ensure that your vnode doesn't become a different
file under you, you had better be sure you have a reference to it.
A vnode that points to a valid file and has a reference count of 1 or more
is called
.Em active .
.Pp
When a vnode's reference count drops to zero, it becomes
.Em inactive ,
that is, a candidate for reuse.
An inactive vnode still refers to a valid file and one can try to
reactivate it using
.Xr vget 9
(this is used a lot by caches).
.Pp
Before the VFS can reuse an inactive vnode to refer to another file,
it must clean all information pertaining to the old file.
A cleaned out vnode is called a
.Em reclaimed
vnode.
.Pp
To support forcible unmounts and the
.Xr revoke 2
system call, the VFS may reclaim a vnode with a positive reference
count.
The reclaimed vnode is given to the dead file system, which
returns errors for most operations.
The reclaimed vnode will not be
reused for another file until its reference count hits zero.
.Ss Vnode pool
The
.Xr getnewvnode 9
call allocates a vnode from the pool, possibly reusing an
inactive vnode, and returns it to the caller.
The vnode returned has a reference count
.Pq Va v_usecount
of 1.
.Pp
The
.Xr vref 9
call increments the reference count on the vnode.
It may only be used on a vnode with a reference count of 1 or greater.
The
.Xr vrele 9
and
.Xr vput 9
calls decrement the reference count.
In addition, the
.Xr vput 9
call also releases the vnode lock.
.Pp
The
.Xr vget 9
call, when used on an inactive vnode, will make the vnode active
by bumping the reference count to one.
When called on an active vnode,
.Fn vget
increases the reference count by one.
However, if the vnode is being reclaimed concurrently, then
.Fn vget
will fail and return an error.
.Pp
The
.Xr vgone 9
and
.Xr vgonel 9
calls
orchestrate the reclamation of a vnode.
They can be called on both active and inactive vnodes.
.Pp
When transitioning a vnode to the reclaimed state, the VFS will call the
.Xr VOP_RECLAIM 9
method.
File systems use this method to free any file-system-specific data
they attached to the vnode.
.Ss Vnode locks
The vnode actually has three different types of lock: the vnode lock,
the vnode interlock, and the vnode reclamation lock
.Pq Dv VXLOCK .
.Ss The vnode lock
The vnode lock and its consistent use accomplish the following:
.Bl -bullet
.It
It keeps a locked vnode from changing across certain pairs of VOP_ calls,
thus preserving cached data.
For example, it keeps the directory from
changing between a
.Xr VOP_LOOKUP 9
call and a
.Xr VOP_CREATE 9 .
The
.Fn VOP_LOOKUP
call makes sure the name doesn't already exist in the
directory and finds free room in the directory for the new entry.
The
.Fn VOP_CREATE
call can then go ahead and create the file without checking if
it already exists or looking for free space.
.It
Some file systems rely on it to ensure that only one
.Dq thread
at a time
is calling VOP_ vnode operations on a given file or directory.
Otherwise, the file system's behavior is undefined.
.It
On rare occasions, code will hold the vnode lock so that a series of
VOP_ operations occurs as an atomic unit.
(Of course, this doesn't work with network file systems like NFSv2 that don't
have any notion of bundling a bunch of operations into an atomic unit.)
.It
While the vnode lock is held, the vnode will not be reclaimed.
.El
.Pp
There is a discipline to using the vnode lock.
Some VOP_ operations require that the vnode lock be held before they
are called.
A description of this rather arcane locking discipline is in
.Pa sys/kern/vnode_if.src .
.Pp
The vnode lock is acquired by calling
.Xr vn_lock 9
and released by calling
.Xr VOP_UNLOCK 9 .
.Pp
A process is allowed to sleep while holding the vnode lock.
.Pp
The implementation of the vnode lock is the responsibility of the individual
file systems.
Not all file systems implement it.
.Pp
To prevent deadlocks, when acquiring locks on multiple vnodes, the lock
on the parent directory must be acquired before the lock on the child
directory.
.Ss Vnode interlock
The vnode interlock
.Pq Va v_interlock
is a simplelock (see
.Xr simple_lock 9 ) .
It is useful on multi-processor systems for acquiring a quick exclusive
lock on the contents of the vnode.
It MUST NOT be held while sleeping.
.Pp
This field protects the
.Va v_flag , v_writecount , v_usecount ,
and
.Va v_holdcnt
fields from concurrent access.
See
.Xr lock 9
for more details on lock synchronization in interrupt context.
.\" Other splbio/interrupt issues?
.Pp
Operations on this lock are a no-op on uniprocessor systems.
.Ss Other vnode synchronization
The vnode reclamation lock
.Pq Dv VXLOCK
is used to prevent multiple
processes from entering the vnode reclamation code.
It is also used as a flag to indicate that reclamation is in progress.
The
.Dv VXWANT
flag is set by processes that wish to be woken up when reclamation
is finished.
.Pp
The
.Xr vwaitforio 9
call is used to wait for all outstanding write I/Os associated with a
vnode to complete.
.Ss Version number/capability
The vnode capability,
.Va v_id ,
is a 32-bit version number on the vnode.
Every time a vnode is reassigned to a new file, the vnode capability
is changed.
This is used by code that wishes to keep pointers to vnodes but doesn't want
to hold a reference (e.g., caches).
The code keeps both a vnode pointer and a copy of the capability.
The code can later compare the vnode's capability to its copy and see
if the vnode still points to the same file.
.Pp
Note: for this to work, memory assigned to hold a
.Vt struct vnode
can
only be used for another purpose when all pointers to it have disappeared.
Since the vnode pool has no way of knowing when all pointers have
disappeared, it never frees memory it has allocated for vnodes.
.Ss Vnode fields
Most of the fields of the vnode structure should be treated as opaque
and only manipulated through the proper APIs.
This section describes the fields that are manipulated directly.
.Pp
The
.Va v_flag
attribute contains miscellaneous flags related to various functions.
They are summarized in the following table:
.Pp
.Bl -tag -width 10n -compact -offset indent
.It Dv VROOT
This vnode is the root of its file system.
.It Dv VTEXT
This vnode is a pure text prototype.
.It Dv VSYSTEM
This vnode is being used by the kernel.
.It Dv VISTTY
This vnode represents a
.Xr tty 4 .
.It Dv VXLOCK
This vnode is locked to change its underlying type.
.It Dv VXWANT
A process is waiting for this vnode.
.It Dv VALIASED
This vnode has an alias.
.It Dv VLAYER
This vnode is on a layered file system.
.It Dv VLOCKSWORK
This vnode's underlying file system supports locking discipline.
.El
.Pp
The
.Va v_tag
attribute indicates what file system the vnode belongs to.
Very little code actually uses this attribute and its use is deprecated.
Programmers should seriously consider using more object-oriented approaches
(e.g. function tables).
There is no safe way of defining new
.Va v_tag Ns 's
for loadable file systems.
The
.Va v_tag
attribute is read-only.
.Pp
The
.Va v_type
attribute indicates what type of file (e.g. directory,
regular, FIFO) this vnode is.
This is used by the generic code for various checks.
For example, the
.Xr read 2
system call returns an error when a read is attempted on a directory.
.Pp
Possible types are:
.Pp
.Bl -tag -width 10n -offset indent -compact
.It Dv VNON
This vnode has no type.
.It Dv VREG
This vnode represents a regular file.
.It Dv VDIR
This vnode represents a directory.
.It Dv VBLK
This vnode represents a block device.
.It Dv VCHR
This vnode represents a character device.
.It Dv VLNK
This vnode represents a symbolic link.
.It Dv VSOCK
This vnode represents a socket.
.It Dv VFIFO
This vnode represents a named pipe.
.It Dv VBAD
This vnode represents a bad or dead file.
.El
.Pp
The
.Va v_data
attribute allows a file system to attach a piece of file
system specific memory to the vnode.
This contains information about the file that is specific to
the file system (such as an inode pointer in the case of FFS).
.Pp
The
.Va v_numoutput
attribute indicates the number of pending synchronous
and asynchronous writes on the vnode.
It does not track the number of dirty buffers attached to the vnode.
The attribute is used by code like
.Xr fsync 2
to wait for all writes
to complete before returning to the user.
This attribute must be manipulated at
.Xr splbio 9 .
.Pp
The
.Va v_writecount
attribute tracks the number of write calls pending
on the vnode.
.Ss Rules
The vast majority of vnode functions may not be called from interrupt
context.
The exceptions are
.Fn bgetvp
and
.Fn brelvp .
The following fields of the vnode are manipulated at interrupt level:
.Va v_numoutput , v_holdcnt , v_dirtyblkhd ,
.Va v_cleanblkhd , v_bioflag , v_freelist ,
and
.Va v_synclist .
Any access to these fields should be protected by
.Xr splbio 9 .
.Sh SEE ALSO
.Xr uvm 9 ,
.Xr vaccess 9 ,
.Xr vclean 9 ,
.Xr vcount 9 ,
.Xr vdevgone 9 ,
.Xr vfinddev 9 ,
.Xr vflush 9 ,
.Xr vflushbuf 9 ,
.Xr vfs 9 ,
.Xr vget 9 ,
.Xr vgone 9 ,
.Xr vhold 9 ,
.Xr vinvalbuf 9 ,
.Xr vn_lock 9 ,
.Xr VOP_LOOKUP 9 ,
.Xr vput 9 ,
.Xr vrecycle 9 ,
.Xr vref 9 ,
.Xr vrele 9 ,
.Xr vwaitforio 9 ,
.Xr vwakeup 9
.Sh HISTORY
This document first appeared in
.Ox 2.9 .