Discussion:
[RFC 15/32] cifs: convert to struct inode_time
Arnd Bergmann
2014-05-30 20:01:39 UTC
Permalink
cifs uses multiple time formats for inode timestamps, which will work
at least another 92 years, but the VFS uses struct timespec for timestamps,
which is only good until 2038 on 32-bit CPUs.

This gets us one small step closer to lifting the VFS limit by using
struct inode_time in cifs. After 2106, users of the old protocol versions
will have to move to the latest version.

Signed-off-by: Arnd Bergmann <***@arndb.de>
Cc: Steve French <***@samba.org>
Cc: linux-***@vger.kernel.org
Cc: samba-***@lists.samba.org
---
fs/cifs/cache.c | 6 +++---
fs/cifs/cifsglob.h | 6 +++---
fs/cifs/cifsproto.h | 6 +++---
fs/cifs/cifssmb.c | 5 +++--
fs/cifs/inode.c | 2 +-
fs/cifs/netmisc.c | 15 ++++++++-------
6 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 6c665bf..5343f38 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -221,9 +221,9 @@ const struct fscache_cookie_def cifs_fscache_super_index_def = {
* Auxiliary data attached to CIFS inode within the cache
*/
struct cifs_fscache_inode_auxdata {
- struct timespec last_write_time;
- struct timespec last_change_time;
- u64 eof;
+ struct inode_time last_write_time;
+ struct inode_time last_change_time;
+ u64 eof;
};

static uint16_t cifs_fscache_inode_get_key(const void *cookie_netfs_data,
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index de6aed8..f944c44 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1344,9 +1344,9 @@ struct cifs_fattr {
dev_t cf_rdev;
unsigned int cf_nlink;
unsigned int cf_dtype;
- struct timespec cf_atime;
- struct timespec cf_mtime;
- struct timespec cf_ctime;
+ struct inode_time cf_atime;
+ struct inode_time cf_mtime;
+ struct inode_time cf_ctime;
};

static inline void free_dfs_info_param(struct dfs_info3_param *param)
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index ca7980a..ad476c6 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -122,9 +122,9 @@ extern enum securityEnum select_sectype(struct TCP_Server_Info *server,
enum securityEnum requested);
extern int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_cp);
-extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
-extern u64 cifs_UnixTimeToNT(struct timespec);
-extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
+extern struct inode_time cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
+extern u64 cifs_UnixTimeToNT(struct inode_time);
+extern struct inode_time cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
int offset);
extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
extern int cifs_get_writer(struct cifsInodeInfo *cinode);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c3dc52e..4452be7 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -482,7 +482,7 @@ decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr)
* this requirement.
*/
int val, seconds, remain, result;
- struct timespec ts, utc;
+ struct inode_time ts, utc;
utc = CURRENT_TIME;
ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
rsp->SrvTime.Time, 0);
@@ -3952,7 +3952,8 @@ QInfRetry:
if (rc) {
cifs_dbg(FYI, "Send error in QueryInfo = %d\n", rc);
} else if (data) {
- struct timespec ts;
+ struct inode_time ts;
+ /* FIXME: 32-bit time? */
__u32 time = le32_to_cpu(pSMBr->last_write_time);

/* decode response */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9ff8df8..30ff02f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -109,7 +109,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
}

/* revalidate if mtime or size have changed */
- if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
+ if (inode_time_equal(&inode->i_mtime, &fattr->cf_mtime) &&
cifs_i->server_eof == fattr->cf_eof) {
cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
__func__, cifs_i->uniqueid);
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 6834b9c..40bcbcb 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -918,10 +918,10 @@ smbCalcSize(void *buf)
* Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
* into Unix UTC (based 1970-01-01, in seconds).
*/
-struct timespec
+struct inode_time
cifs_NTtimeToUnix(__le64 ntutc)
{
- struct timespec ts;
+ struct inode_time ts;
/* BB what about the timezone? BB */

/* Subtract the NTFS time offset, then convert to 1s intervals. */
@@ -935,7 +935,7 @@ cifs_NTtimeToUnix(__le64 ntutc)

/* Convert the Unix UTC into NT UTC. */
u64
-cifs_UnixTimeToNT(struct timespec t)
+cifs_UnixTimeToNT(struct inode_time t)
{
/* Convert to 100ns intervals and then add the NTFS time offset. */
return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET;
@@ -945,10 +945,11 @@ static const int total_days_of_prev_months[] = {
0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334
};

-struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
+struct inode_time cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
{
- struct timespec ts;
- int sec, min, days, month, year;
+ struct inode_time ts;
+ long long sec;
+ int min, days, month, year;
u16 date = le16_to_cpu(le_date);
u16 time = le16_to_cpu(le_time);
SMB_TIME *st = (SMB_TIME *)&time;
@@ -959,7 +960,7 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
sec = 2 * st->TwoSeconds;
min = st->Minutes;
if ((sec > 59) || (min > 59))
- cifs_dbg(VFS, "illegal time min %d sec %d\n", min, sec);
+ cifs_dbg(VFS, "illegal time min %d sec %lld\n", min, sec);
sec += (min * 60);
sec += 60 * 60 * st->Hours;
if (st->Hours > 24)
--
1.8.3.2
Vyacheslav Dubeyko
2014-05-31 14:30:49 UTC
Permalink
Hi Arnd,

On Fri, 2014-05-30 at 22:01 +0200, Arnd Bergmann wrote:

[snip]
fs: introduce new 'struct inode_time'
uapi: add struct __kernel_timespec{32,64}
fs: introduce sys_utimens64at
fs: introduce sys_newfstat64/sys_newfstatat64
arch: hook up new stat and utimes syscalls
isofs: fix timestamps beyond 2027
fs/nfs: convert to struct inode_time
fs/ceph: convert to 'struct inode_time'
fs/pstore: convert to struct inode_time
fs/coda: convert to struct inode_time
xfs: convert to struct inode_time
btrfs: convert to struct inode_time
ext3: convert to struct inode_time
ext4: convert to struct inode_time
cifs: convert to struct inode_time
ntfs: convert to struct inode_time
ubifs: convert to struct inode_time
ocfs2: convert to struct inode_time
fs/fat: convert to struct inode_time
afs: convert to struct inode_time
udf: convert to struct inode_time
fs: convert simple fs to inode_time
logfs: convert to struct inode_time
hfs, hfsplus: convert to struct inode_time
gfs2: convert to struct inode_time
reiserfs: convert to struct inode_time
jffs2: convert to struct inode_time
adfs: convert to struct inode_time
f2fs: convert to struct inode_time
fuse: convert to struct inode_time
scsi: fnic: use current_kernel_time() for timestamp
fs: use new inode_time definition unconditionally
By the way, what about NILFS2? Is NILFS2 ready for suggested approach
without any changes?

Thanks,
Vyacheslav Dubeyko.


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Arnd Bergmann
2014-06-03 12:21:29 UTC
Permalink
Post by Vyacheslav Dubeyko
By the way, what about NILFS2? Is NILFS2 ready for suggested approach
without any changes?
nilfs2 and a lot of other file systems don't need any changes for
this, because they don't assign the inode time stamp fields to
a 'struct timespec'.

FWIW, nilfs2 uses a 64-bit seconds value, which is always safe and
can represent the full range of user space timespec on all machines.

Arnd
Richard Cochran
2014-05-31 14:51:15 UTC
Permalink
I picked this because it is a fairly isolated problem, as the
inode time stamps are rarely assigned to any other time values.
As a byproduct of this work, I documented for each of the file
systems we support how long the on-disk format can work[1].
Why are some of the time stamp expiration dates marked as "never"?

Thanks,
Richard
Arnd Bergmann
2014-05-31 15:23:02 UTC
Permalink
Post by Richard Cochran
I picked this because it is a fairly isolated problem, as the
inode time stamps are rarely assigned to any other time values.
As a byproduct of this work, I documented for each of the file
systems we support how long the on-disk format can work[1].
Why are some of the time stamp expiration dates marked as "never"?
It's an approximation:
with 64-bit timestamps, you can represent close to 300 billion
years, which is way past the time that our planet can sustain
life of any form[1].

Arnd

[1] http://en.wikipedia.org/wiki/Timeline_of_the_far_future
Geert Uytterhoeven
2014-05-31 16:20:43 UTC
Permalink
Post by Arnd Bergmann
Post by Richard Cochran
I picked this because it is a fairly isolated problem, as the
inode time stamps are rarely assigned to any other time values.
As a byproduct of this work, I documented for each of the file
systems we support how long the on-disk format can work[1].
Why are some of the time stamp expiration dates marked as "never"?
with 64-bit timestamps, you can represent close to 300 billion
years, which is way past the time that our planet can sustain
life of any form[1].
FWIW, the 48-bit second limit of befs marked never happens sooner
than the 32-bit day limit of affs marked as Y11760870.

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ***@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
Richard Cochran
2014-05-31 18:22:37 UTC
Permalink
(Approximately never ;)
Post by Arnd Bergmann
with 64-bit timestamps, you can represent close to 300 billion
years, which is way past the time that our planet can sustain
life of any form[1].
Did you mean mean 64 bits worth of seconds?

2^64 / (3600*24*365) = 584,942,417,355

That is more than 300 billion years, and still, it is not quite the
same as "never".

In any case, that term is not too helpful in the comparison table,
IMHO. One could think that some sort of clever running count relative
to the last mount time was implied.

Thanks,
Richard

[1] You are forgetting the immortal robotic overlords.
H. Peter Anvin
2014-05-31 19:34:12 UTC
Permalink
Typically they are using 64-bit signed seconds.
Post by Richard Cochran
(Approximately never ;)
Post by Arnd Bergmann
with 64-bit timestamps, you can represent close to 300 billion
years, which is way past the time that our planet can sustain
life of any form[1].
Did you mean mean 64 bits worth of seconds?
2^64 / (3600*24*365) = 584,942,417,355
That is more than 300 billion years, and still, it is not quite the
same as "never".
In any case, that term is not too helpful in the comparison table,
IMHO. One could think that some sort of clever running count relative
to the last mount time was implied.
Thanks,
Richard
[1] You are forgetting the immortal robotic overlords.
--
Sent from my mobile phone. Please pardon brevity and lack of formatting.
Richard Cochran
2014-06-01 04:46:09 UTC
Permalink
Post by H. Peter Anvin
Typically they are using 64-bit signed seconds.
Okay, that is what I wanted to know.

Thanks,
Richard
Richard Cochran
2014-06-01 04:44:36 UTC
Permalink
Post by Richard Cochran
Why are some of the time stamp expiration dates marked as "never"?
Also, the term "never" might mean using arbitrarily long integers
as in ASN.1.

Thanks,
Richard
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Joseph S. Myers
2014-06-02 13:52:19 UTC
Permalink
a) is this the right approach in general? The previous discussion
pointed this way, but there may be other opinions.
The syscall changes seem like the sort of thing I'd expect, although
patches adding new syscalls or otherwise affecting the kernel/userspace
interface (as opposed to those relating to an individual filesystem)
should go to linux-api as well as other relevant lists.
--
Joseph S. Myers
***@codesourcery.com
Arnd Bergmann
2014-06-02 19:19:55 UTC
Permalink
Post by Joseph S. Myers
a) is this the right approach in general? The previous discussion
pointed this way, but there may be other opinions.
The syscall changes seem like the sort of thing I'd expect, although
patches adding new syscalls or otherwise affecting the kernel/userspace
interface (as opposed to those relating to an individual filesystem)
should go to linux-api as well as other relevant lists.
Ok. Sorry about missing linux-api, I confused it with linux-arch, which
may not be as relevant here, except for the one question whether we
actually want to have the new ABI on all 32-bit architectures or only
as an opt-in for those that expect to stay around for another 24 years.

Two more questions for you:

- are you (and others) happy with adding this type of stat syscall
(fstatat64/fstat64) as opposed to the more generic xstat that has
been discussed in the past and that never made it through the bike-
shedding discussion?

- once we have enough buy-in from reviewers to merge this initial
series, should we proceed to define rest of the syscall ABI
(minus driver ioctls) so glibc and kernel can do the conversion
on top of that, or should we better try to do things one syscall
family at a time and actually get the kernel to handle them
correctly internally?

Arnd
H. Peter Anvin
2014-06-02 19:26:22 UTC
Permalink
Post by Arnd Bergmann
Post by Joseph S. Myers
a) is this the right approach in general? The previous discussion
pointed this way, but there may be other opinions.
The syscall changes seem like the sort of thing I'd expect, although
patches adding new syscalls or otherwise affecting the kernel/userspace
interface (as opposed to those relating to an individual filesystem)
should go to linux-api as well as other relevant lists.
Ok. Sorry about missing linux-api, I confused it with linux-arch, which
may not be as relevant here, except for the one question whether we
actually want to have the new ABI on all 32-bit architectures or only
as an opt-in for those that expect to stay around for another 24 years.
- are you (and others) happy with adding this type of stat syscall
(fstatat64/fstat64) as opposed to the more generic xstat that has
been discussed in the past and that never made it through the bike-
shedding discussion?
- once we have enough buy-in from reviewers to merge this initial
series, should we proceed to define rest of the syscall ABI
(minus driver ioctls) so glibc and kernel can do the conversion
on top of that, or should we better try to do things one syscall
family at a time and actually get the kernel to handle them
correctly internally?
The bit that is really going to hurt is every single ioctl that uses a
timespec.

Honestly, though, I really don't understand the point with "struct
inode_time". It seems like the zeroeth-order thing is to change the
kernel internal version of struct timespec to have a 64-bit time... it
isn't just about inodes. We then should be explicit about the external
uses of time, and use accessors.

-hpa
Arnd Bergmann
2014-06-02 19:55:52 UTC
Permalink
Post by H. Peter Anvin
Post by Arnd Bergmann
Post by Joseph S. Myers
a) is this the right approach in general? The previous discussion
pointed this way, but there may be other opinions.
The syscall changes seem like the sort of thing I'd expect, although
patches adding new syscalls or otherwise affecting the kernel/userspace
interface (as opposed to those relating to an individual filesystem)
should go to linux-api as well as other relevant lists.
Ok. Sorry about missing linux-api, I confused it with linux-arch, which
may not be as relevant here, except for the one question whether we
actually want to have the new ABI on all 32-bit architectures or only
as an opt-in for those that expect to stay around for another 24 years.
- are you (and others) happy with adding this type of stat syscall
(fstatat64/fstat64) as opposed to the more generic xstat that has
been discussed in the past and that never made it through the bike-
shedding discussion?
- once we have enough buy-in from reviewers to merge this initial
series, should we proceed to define rest of the syscall ABI
(minus driver ioctls) so glibc and kernel can do the conversion
on top of that, or should we better try to do things one syscall
family at a time and actually get the kernel to handle them
correctly internally?
The bit that is really going to hurt is every single ioctl that uses a
timespec.
Honestly, though, I really don't understand the point with "struct
inode_time". It seems like the zeroeth-order thing is to change the
kernel internal version of struct timespec to have a 64-bit time... it
isn't just about inodes. We then should be explicit about the external
uses of time, and use accessors.
I picked these because they are fairly isolated from all other uses,
in particular since inode times are the only things where we really
care about times in the distant past or future (decades away as opposed
to things that happened between boot and shutdown).

For other kernel-internal uses, we may be better off migrating to
a completely different representation, such as nanoseconds since
boot or the architecture specific ktime_t, but this is really something
to decide for each subsystem.

I just tried building an arm32 kernel with a s64 time_t, and that
failed horribly, I get linker errors for missing 64-bit divides
and lots of warnings for code that expects time_t pointers to
functions taking a 'long' or vice versa. I also think the only
way to maintain ABI compatibility is to separate the internal uses
from the interface, which means auditing all code in the end.

Arnd
H. Peter Anvin
2014-06-02 21:57:26 UTC
Permalink
Post by Arnd Bergmann
Post by H. Peter Anvin
The bit that is really going to hurt is every single ioctl that uses a
timespec.
Honestly, though, I really don't understand the point with "struct
inode_time". It seems like the zeroeth-order thing is to change the
kernel internal version of struct timespec to have a 64-bit time... it
isn't just about inodes. We then should be explicit about the external
uses of time, and use accessors.
I picked these because they are fairly isolated from all other uses,
in particular since inode times are the only things where we really
care about times in the distant past or future (decades away as opposed
to things that happened between boot and shutdown).
If nothing else, I would expect to be able to set the system time to
weird values for testing. So I'm not so sure I agree with that...
Post by Arnd Bergmann
For other kernel-internal uses, we may be better off migrating to
a completely different representation, such as nanoseconds since
boot or the architecture specific ktime_t, but this is really something
to decide for each subsystem.
Having a bunch of different time representations in the kernel seems
like a real headache...

-hpa
Arnd Bergmann
2014-06-03 14:22:19 UTC
Permalink
Post by H. Peter Anvin
Post by Arnd Bergmann
Post by H. Peter Anvin
The bit that is really going to hurt is every single ioctl that uses a
timespec.
Honestly, though, I really don't understand the point with "struct
inode_time". It seems like the zeroeth-order thing is to change the
kernel internal version of struct timespec to have a 64-bit time... it
isn't just about inodes. We then should be explicit about the external
uses of time, and use accessors.
I picked these because they are fairly isolated from all other uses,
in particular since inode times are the only things where we really
care about times in the distant past or future (decades away as opposed
to things that happened between boot and shutdown).
If nothing else, I would expect to be able to set the system time to
weird values for testing. So I'm not so sure I agree with that...
I think John Stultz and Thomas Gleixner have already started looking
at how the timekeeping code can be updated. Once that is done, we should
be able to add a functional 64-bit gettimeofday/settimeofday syscall
pair. While I definitely agree this is one of the most basic things to
have, it's also not an area of the kernel that is easy to change.
Post by H. Peter Anvin
Post by Arnd Bergmann
For other kernel-internal uses, we may be better off migrating to
a completely different representation, such as nanoseconds since
boot or the architecture specific ktime_t, but this is really something
to decide for each subsystem.
Having a bunch of different time representations in the kernel seems
like a real headache...
We already have time_t, ktime_t, timeval, timespec, compat_timespec,
clock_t, cputime_t, cputime64_t, tm, nanoseconds, jiffies, jiffies64,
and lots of driver or file system specific representations. I'm all for
removing a bunch of these from the kernel, but my feeling is that this is
one of the cases where we first have to add new ones in order to remove
those that are already there.
To complicate things further, we also have various times bases
(realtime/utc, realtime/tai, monotonic, monotonic_raw, boottime, ...),
and at least for the timespec values we pass around, it's not always
obvious which one is used, of if that's the right one.

We probably don't want to add a lot of new representations, and it's
possible that we can change most of the internal code we have to
ktime_t and then convert that to whatever user space wants at the
interfaces.

The possible uses I can see for non-ktime_t types in the kernel are:
* inodes need 96 bit timestamps to represent the full range of values
that can be stored in a file system, you made a convincing argument
for that. Almost everything else can fit into 64 bit on a 32-bit
kernel, in theory also on a 64-bit kernel if we want that.
* A number of interfaces pass relative timespecs: nanosleep(), poll(),
select(), sigtimedwait(), alarm(), futex() and probably more. There is
nothing wrong with the use of timespec here, and it may be good to
annotate that by using a new type (e.g. struct timeout) that is defined
as compatible with the current timespec.
* For new user interfaces, we need a new type such as the
__kernel_timespec64 I introduced, so it doesn't clash with the normal
user timespec that may be smaller, depending on the libc.
* A lot of drivers will need new ioctl commands, and for drivers that
just need time stamps (audio, v4l, sockets, ...) it may be more
efficient and more correct to use a new timestamp_t (e.g. boot time
64-bit nanoseconds) than __kernel_timespec64, which is not normally
monotonic and requires a normalization step. If we end up introducing
such a type in the user interface, we can also start using it in the
kernel.

Arnd
Joseph S. Myers
2014-06-03 14:33:10 UTC
Permalink
Post by Arnd Bergmann
I think John Stultz and Thomas Gleixner have already started looking
at how the timekeeping code can be updated. Once that is done, we should
be able to add a functional 64-bit gettimeofday/settimeofday syscall
pair. While I definitely agree this is one of the most basic things to
have, it's also not an area of the kernel that is easy to change.
64-bit clock_gettime / clock_settime instead of gettimeofday /
settimeofday should avoid the need for the kernel to have a 64-bit version
of struct timeval. (Userspace 64-bit gettimeofday / settimeofday would
need to use a combination of the syscalls if the tz pointer is non-NULL.)
--
Joseph S. Myers
***@codesourcery.com
Arnd Bergmann
2014-06-03 14:37:46 UTC
Permalink
Post by Joseph S. Myers
Post by Arnd Bergmann
I think John Stultz and Thomas Gleixner have already started looking
at how the timekeeping code can be updated. Once that is done, we should
be able to add a functional 64-bit gettimeofday/settimeofday syscall
pair. While I definitely agree this is one of the most basic things to
have, it's also not an area of the kernel that is easy to change.
64-bit clock_gettime / clock_settime instead of gettimeofday /
settimeofday should avoid the need for the kernel to have a 64-bit version
of struct timeval. (Userspace 64-bit gettimeofday / settimeofday would
need to use a combination of the syscalls if the tz pointer is non-NULL.)
Yes, that's what I meant.

Arnd
Dave Chinner
2014-06-03 21:38:02 UTC
Permalink
Post by Arnd Bergmann
* inodes need 96 bit timestamps to represent the full range of values
that can be stored in a file system, you made a convincing argument
for that. Almost everything else can fit into 64 bit on a 32-bit
kernel, in theory also on a 64-bit kernel if we want that.
Just ot be pedantic, inodes don't *need* 96 bit timestamps - some
filesystems can *support up to* 96 bit timestamps. If the kernel
only supports 64 bit timestamps and that's all the kernel can
represent, then the upper bits of the 96 bit on-disk inode
timestamps simply remain zero.

If you move the filesystem between kernels with different time
ranges, then the filesystem needs to be able to tell the kernel what
it's supported range is. This is where having the VFS limit the
range of supported timestamps is important: the limit is the
min(kernel range, filesystem range). This allows the filesystems
to be indepenent of the kernel time representation, and the kernel
to be independent of the physical filesystem time encoding....

Cheers,

Dave.
--
Dave Chinner
david-***@public.gmane.org
Arnd Bergmann
2014-06-04 15:03:47 UTC
Permalink
Post by Arnd Bergmann
* inodes need 96 bit timestamps to represent the full range of values
that can be stored in a file system, you made a convincing argument
for that. Almost everything else can fit into 64 bit on a 32-bit
kernel, in theory also on a 64-bit kernel if we want that.
Just ot be pedantic, inodes don't need 96 bit timestamps - some
filesystems can *support up to* 96 bit timestamps. If the kernel
only supports 64 bit timestamps and that's all the kernel can
represent, then the upper bits of the 96 bit on-disk inode
timestamps simply remain zero.
I meant the reverse: since we have file systems that can store
96-bit timestamps when using 64-bit kernels, we need to extend
32-bit kernels to have the same internal representation so we
can actually read those file systems correctly.
If you move the filesystem between kernels with different time
ranges, then the filesystem needs to be able to tell the kernel what
it's supported range is. This is where having the VFS limit the
range of supported timestamps is important: the limit is the
min(kernel range, filesystem range). This allows the filesystems
to be indepenent of the kernel time representation, and the kernel
to be independent of the physical filesystem time encoding....
I agree it makes sense to let the kernel know about the limits
of the file system it accesses, but for the reverse, we're probably
better off just making the kernel representation large enough (i.e.
96 bits) so it can work with any known file system. We need another
check at the user space boundary to turn that into a value that the
user can understand, but that's another problem.

Arnd
Nicolas Pitre
2014-06-04 17:30:32 UTC
Permalink
Post by Arnd Bergmann
Just ot be pedantic, inodes don't need 96 bit timestamps - some
filesystems can *support up to* 96 bit timestamps. If the kernel
only supports 64 bit timestamps and that's all the kernel can
represent, then the upper bits of the 96 bit on-disk inode
timestamps simply remain zero.
I meant the reverse: since we have file systems that can store
96-bit timestamps when using 64-bit kernels, we need to extend
32-bit kernels to have the same internal representation so we
can actually read those file systems correctly.
If you move the filesystem between kernels with different time
ranges, then the filesystem needs to be able to tell the kernel what
it's supported range is. This is where having the VFS limit the
range of supported timestamps is important: the limit is the
min(kernel range, filesystem range). This allows the filesystems
to be indepenent of the kernel time representation, and the kernel
to be independent of the physical filesystem time encoding....
I agree it makes sense to let the kernel know about the limits
of the file system it accesses, but for the reverse, we're probably
better off just making the kernel representation large enough (i.e.
96 bits) so it can work with any known file system.
Depends... 96 bit handling may get prohibitive on 32-bit archs.

The important point here is for the kernel to be able to represent the
time _range_ used by any known filesystem, not necessarily the time
_precision_.

For example, a 64 bit representation can be made of 40 bits for seconds
spanning 34865 years, and 24 bits for fractional seconds providing
precision down to 60 nanosecs. That ought to be plenty good on 32 bit
systems while still being cheap to handle.


Nicolas
Arnd Bergmann
2014-06-04 19:24:42 UTC
Permalink
Post by Nicolas Pitre
Post by Arnd Bergmann
Just ot be pedantic, inodes don't need 96 bit timestamps - some
filesystems can *support up to* 96 bit timestamps. If the kernel
only supports 64 bit timestamps and that's all the kernel can
represent, then the upper bits of the 96 bit on-disk inode
timestamps simply remain zero.
I meant the reverse: since we have file systems that can store
96-bit timestamps when using 64-bit kernels, we need to extend
32-bit kernels to have the same internal representation so we
can actually read those file systems correctly.
If you move the filesystem between kernels with different time
ranges, then the filesystem needs to be able to tell the kernel what
it's supported range is. This is where having the VFS limit the
range of supported timestamps is important: the limit is the
min(kernel range, filesystem range). This allows the filesystems
to be indepenent of the kernel time representation, and the kernel
to be independent of the physical filesystem time encoding....
I agree it makes sense to let the kernel know about the limits
of the file system it accesses, but for the reverse, we're probably
better off just making the kernel representation large enough (i.e.
96 bits) so it can work with any known file system.
Depends... 96 bit handling may get prohibitive on 32-bit archs.
The important point here is for the kernel to be able to represent the
time _range_ used by any known filesystem, not necessarily the time
_precision_.
For example, a 64 bit representation can be made of 40 bits for seconds
spanning 34865 years, and 24 bits for fractional seconds providing
precision down to 60 nanosecs. That ought to be plenty good on 32 bit
systems while still being cheap to handle.
I have checked earlier that we don't do any computation on inode
time stamps in common code, we just pass them around, so there is
very little runtime overhead. There is a small bit of space overhead
(12 byte) per inode, but that structure is already on the order of
500 bytes.

For other timekeeping stuff in the kernel, I agree that using some
64-bit representation (nanoseconds, 32/32 unsigned seconds/nanoseconds,
...) has advantages, that's exactly the point I was making earlier
against simply extending the internal time_t/timespec to 64-bit
seconds for everything.

Arnd
H. Peter Anvin
2014-06-05 00:10:24 UTC
Permalink
Post by Arnd Bergmann
For other timekeeping stuff in the kernel, I agree that using some
64-bit representation (nanoseconds, 32/32 unsigned seconds/nanoseconds,
...) has advantages, that's exactly the point I was making earlier
against simply extending the internal time_t/timespec to 64-bit
seconds for everything.
How much of a performance issue is it to make time_t 64 bits, and for
the bits there are, how hard are they to fix?

-hpa
Arnd Bergmann
2014-06-10 09:54:14 UTC
Permalink
Post by H. Peter Anvin
Post by Arnd Bergmann
For other timekeeping stuff in the kernel, I agree that using some
64-bit representation (nanoseconds, 32/32 unsigned seconds/nanoseconds,
...) has advantages, that's exactly the point I was making earlier
against simply extending the internal time_t/timespec to 64-bit
seconds for everything.
How much of a performance issue is it to make time_t 64 bits, and for
the bits there are, how hard are they to fix?
Probably very little overhead for most uses, it's more the regression
potential in the less common parts of the kernel I'm worried about.

There is a significant but not overwhelming number of uses of the
main problematic types in the kernel:

***@wuerfel:~/arm-soc$ git grep -wl time_t | wc
188 188 5566
***@wuerfel:~/arm-soc$ git grep -wl timeval | wc
320 320 10353
***@wuerfel:~/arm-soc$ git grep -wl timespec | wc
406 406 10886

I believe we have to audit all of them anyway if we want to change
the kernel to less problematic types and introduce new user
interfaces.

IMHO this work is helped if we change the uses to a new type
as we find the problems. This lets us do the work one subsystem
at a time and avoid accidental ABI changes. I don't care much what
type that will be, and having a 96-bit type will certainly work
well in a lot of cases, but I don't see a strong reason to use
that over other types, especially when they can be more efficient.

Arnd

Joseph S. Myers
2014-06-02 21:02:15 UTC
Permalink
Post by Arnd Bergmann
Ok. Sorry about missing linux-api, I confused it with linux-arch, which
may not be as relevant here, except for the one question whether we
actually want to have the new ABI on all 32-bit architectures or only
as an opt-in for those that expect to stay around for another 24 years.
For glibc I think it will make the most sense to add the support for
64-bit time_t across all architectures that currently have 32-bit time_t
(with the new interfaces having fallback support to implementation in
terms of the 32-bit kernel interfaces, if the 64-bit syscalls are
unavailable either at runtime or in the kernel headers against which glibc
is compiled - this fallback code will of course need to check for overflow
when passing a time value to the kernel, hopefully with error handling
consistent with whatever the kernel ends up doing when a filesystem can't
support a timestamp). If some architectures don't provide the new
interfaces in the kernel then that will mean the fallback code in glibc
can't be removed until glibc support for those architectures is removed
(as opposed to removing it when glibc no longer supports kernels predating
the kernel support).
Post by Arnd Bergmann
- are you (and others) happy with adding this type of stat syscall
(fstatat64/fstat64) as opposed to the more generic xstat that has
been discussed in the past and that never made it through the bike-
shedding discussion?
I am.
Post by Arnd Bergmann
- once we have enough buy-in from reviewers to merge this initial
series, should we proceed to define rest of the syscall ABI
(minus driver ioctls) so glibc and kernel can do the conversion
on top of that, or should we better try to do things one syscall
family at a time and actually get the kernel to handle them
correctly internally?
I don't have any comments on that ordering question.
--
Joseph S. Myers
***@codesourcery.com
--
To unsubscribe from this list: send the line "unsubscribe reiserfs-devel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Arnd Bergmann
2014-06-04 15:05:27 UTC
Permalink
Post by Joseph S. Myers
Post by Arnd Bergmann
Ok. Sorry about missing linux-api, I confused it with linux-arch, which
may not be as relevant here, except for the one question whether we
actually want to have the new ABI on all 32-bit architectures or only
as an opt-in for those that expect to stay around for another 24 years.
For glibc I think it will make the most sense to add the support for
64-bit time_t across all architectures that currently have 32-bit time_t
(with the new interfaces having fallback support to implementation in
terms of the 32-bit kernel interfaces, if the 64-bit syscalls are
unavailable either at runtime or in the kernel headers against which glibc
is compiled - this fallback code will of course need to check for overflow
when passing a time value to the kernel, hopefully with error handling
consistent with whatever the kernel ends up doing when a filesystem can't
support a timestamp). If some architectures don't provide the new
interfaces in the kernel then that will mean the fallback code in glibc
can't be removed until glibc support for those architectures is removed
(as opposed to removing it when glibc no longer supports kernels predating
the kernel support).
Ok, that's a good reason to just provide the new interfaces on all
architectures right away. Thanks for the insight!

Arnd
Loading...