Add as a feature case-insensitive directories (the casefold feature)

using Unicode 12.1.  Also, the usual largish number of cleanups and bug
 fixes.
 -----BEGIN PGP SIGNATURE-----
 
 iQEzBAABCAAdFiEEK2m5VNv+CHkogTfJ8vlZVpUNgaMFAlzSDXYACgkQ8vlZVpUN
 gaPQ3Qf/Sh0NqHbmbdW1J52oh4GqUKUhUezEac40yZcZBU4p3PFPZ5Ji83kAQV5r
 JgHx5YW4AYHs59UkRVq/er7wKEFJxAE8weUq90WYLE1Z/EjojDE8JHSsK00obKNN
 rJOm5qX/gy5C7PVUSWkSuAZQPMSGrmH5U5ie0nrI7bFWnr7T5CQkWarspUq53JBG
 RP910mPTT/otE7iTgUzjDeAMKfaSdtRhcJn/uTQ+2YZ1BJsHBHJHDnfQtd3CttHs
 ncTUaqPnhWqOKJV2Y9TDyAWYeSbn30cF0dpBM38N4u6YwaUwrBp/kPI0tes97SgY
 lZM4VEAW6iF+18uLSyv7D0Mpba9qQg==
 =9R7U
 -----END PGP SIGNATURE-----

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Add as a feature case-insensitive directories (the casefold feature)
  using Unicode 12.1.

  Also, the usual largish number of cleanups and bug fixes"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (25 commits)
  ext4: export /sys/fs/ext4/feature/casefold if Unicode support is present
  ext4: fix ext4_show_options for file systems w/o journal
  unicode: refactor the rule for regenerating utf8data.h
  docs: ext4.rst: document case-insensitive directories
  ext4: Support case-insensitive file name lookups
  ext4: include charset encoding information in the superblock
  MAINTAINERS: add Unicode subsystem entry
  unicode: update unicode database unicode version 12.1.0
  unicode: introduce test module for normalized utf8 implementation
  unicode: implement higher level API for string handling
  unicode: reduce the size of utf8data[]
  unicode: introduce code for UTF-8 normalization
  unicode: introduce UTF-8 character database
  ext4: actually request zeroing of inode table after grow
  ext4: cond_resched in work-heavy group loops
  ext4: fix use-after-free race with debug_want_extra_isize
  ext4: avoid drop reference to iloc.bh twice
  ext4: ignore e_value_offs for xattrs with value-in-ea-inode
  ext4: protect journal inode's blocks using block_validity
  ext4: use BUG() instead of BUG_ON(1)
  ...
This commit is contained in:
Linus Torvalds 2019-05-07 21:12:44 -07:00
commit 5abe37954e
35 changed files with 9591 additions and 59 deletions

View File

@ -91,10 +91,48 @@ Currently Available
* large block (up to pagesize) support
* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
the ordering)
* Case-insensitive file name lookups
[1] Filesystems with a block size of 1k may see a limit imposed by the
directory hash tree having a maximum depth of two.
case-insensitive file name lookups
======================================================
The case-insensitive file name lookup feature is supported on a
per-directory basis, allowing the user to mix case-insensitive and
case-sensitive directories in the same filesystem. It is enabled by
flipping the +F inode attribute of an empty directory. The
case-insensitive string match operation is only defined when we know how
text in encoded in a byte sequence. For that reason, in order to enable
case-insensitive directories, the filesystem must have the
casefold feature, which stores the filesystem-wide encoding
model used. By default, the charset adopted is the latest version of
Unicode (12.1.0, by the time of this writing), encoded in the UTF-8
form. The comparison algorithm is implemented by normalizing the
strings to the Canonical decomposition form, as defined by Unicode,
followed by a byte per byte comparison.
The case-awareness is name-preserving on the disk, meaning that the file
name provided by userspace is a byte-per-byte match to what is actually
written in the disk. The Unicode normalization format used by the
kernel is thus an internal representation, and not exposed to the
userspace nor to the disk, with the important exception of disk hashes,
used on large case-insensitive directories with DX feature. On DX
directories, the hash must be calculated using the casefolded version of
the filename, meaning that the normalization format used actually has an
impact on where the directory entry is stored.
When we change from viewing filenames as opaque byte sequences to seeing
them as encoded strings we need to address what happens when a program
tries to create a file with an invalid name. The Unicode subsystem
within the kernel leaves the decision of what to do in this case to the
filesystem, which select its preferred behavior by enabling/disabling
the strict mode. When Ext4 encounters one of those strings and the
filesystem did not require strict mode, it falls back to considering the
entire string as an opaque byte sequence, which still allows the user to
operate on that file, but the case-insensitive lookups won't work.
Options
=======

View File

@ -176,6 +176,7 @@ mkprep
mkregtable
mktables
mktree
mkutf8data
modpost
modules.builtin
modules.order
@ -254,6 +255,7 @@ vsyscall_32.lds
wanxlfw.inc
uImage
unifdef
utf8data.h
wakeup.bin
wakeup.elf
wakeup.lds

View File

@ -15984,6 +15984,12 @@ F: drivers/uwb/
F: include/linux/uwb.h
F: include/linux/uwb/
UNICODE SUBSYSTEM:
M: Gabriel Krisman Bertazi <krisman@collabora.com>
L: linux-fsdevel@vger.kernel.org
S: Supported
F: fs/unicode/
UNICORE32 ARCHITECTURE:
M: Guan Xuetao <gxt@pku.edu.cn>
W: http://mprc.pku.edu.cn/~guanxuetao/linux

View File

@ -317,5 +317,6 @@ endif # NETWORK_FILESYSTEMS
source "fs/nls/Kconfig"
source "fs/dlm/Kconfig"
source "fs/unicode/Kconfig"
endmenu

View File

@ -92,6 +92,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs/
obj-$(CONFIG_NFSD) += nfsd/
obj-$(CONFIG_LOCKD) += lockd/
obj-$(CONFIG_NLS) += nls/
obj-$(CONFIG_UNICODE) += unicode/
obj-$(CONFIG_SYSV_FS) += sysv/
obj-$(CONFIG_CIFS) += cifs/
obj-$(CONFIG_HPFS_FS) += hpfs/

View File

@ -137,6 +137,48 @@ static void debug_print_tree(struct ext4_sb_info *sbi)
printk(KERN_CONT "\n");
}
static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino)
{
struct inode *inode;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_map_blocks map;
u32 i = 0, err = 0, num, n;
if ((ino < EXT4_ROOT_INO) ||
(ino > le32_to_cpu(sbi->s_es->s_inodes_count)))
return -EINVAL;
inode = ext4_iget(sb, ino, EXT4_IGET_SPECIAL);
if (IS_ERR(inode))
return PTR_ERR(inode);
num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
while (i < num) {
map.m_lblk = i;
map.m_len = num - i;
n = ext4_map_blocks(NULL, inode, &map, 0);
if (n < 0) {
err = n;
break;
}
if (n == 0) {
i++;
} else {
if (!ext4_data_block_valid(sbi, map.m_pblk, n)) {
ext4_error(sb, "blocks %llu-%llu from inode %u "
"overlap system zone", map.m_pblk,
map.m_pblk + map.m_len - 1, ino);
err = -EFSCORRUPTED;
break;
}
err = add_system_zone(sbi, map.m_pblk, n);
if (err < 0)
break;
i += n;
}
}
iput(inode);
return err;
}
int ext4_setup_system_zone(struct super_block *sb)
{
ext4_group_t ngroups = ext4_get_groups_count(sb);
@ -155,6 +197,7 @@ int ext4_setup_system_zone(struct super_block *sb)
return 0;
for (i=0; i < ngroups; i++) {
cond_resched();
if (ext4_bg_has_super(sb, i) &&
((i < 5) || ((i % flex_size) == 0)))
add_system_zone(sbi, ext4_group_first_block_no(sb, i),
@ -171,6 +214,12 @@ int ext4_setup_system_zone(struct super_block *sb)
if (ret)
return ret;
}
if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) {
ret = ext4_protect_reserved_inode(sb,
le32_to_cpu(sbi->s_es->s_journal_inum));
if (ret)
return ret;
}
if (test_opt(sb, DEBUG))
debug_print_tree(sbi);

View File

@ -26,6 +26,7 @@
#include <linux/buffer_head.h>
#include <linux/slab.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include "ext4.h"
#include "xattr.h"
@ -660,3 +661,50 @@ const struct file_operations ext4_dir_operations = {
.open = ext4_dir_open,
.release = ext4_release_dir,
};
#ifdef CONFIG_UNICODE
static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name)
{
struct qstr qstr = {.name = str, .len = len };
if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) {
if (len != name->len)
return -1;
return !memcmp(str, name, len);
}
return ext4_ci_compare(dentry->d_parent->d_inode, name, &qstr);
}
static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
{
const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb);
const struct unicode_map *um = sbi->s_encoding;
unsigned char *norm;
int len, ret = 0;
if (!IS_CASEFOLDED(dentry->d_inode))
return 0;
norm = kmalloc(PATH_MAX, GFP_ATOMIC);
if (!norm)
return -ENOMEM;
len = utf8_casefold(um, str, norm, PATH_MAX);
if (len < 0) {
if (ext4_has_strict_mode(sbi))
ret = -EINVAL;
goto out;
}
str->hash = full_name_hash(dentry, norm, len);
out:
kfree(norm);
return ret;
}
const struct dentry_operations ext4_dentry_ops = {
.d_hash = ext4_d_hash,
.d_compare = ext4_d_compare,
};
#endif

View File

@ -399,10 +399,11 @@ struct flex_groups {
#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */
#define EXT4_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */
#define EXT4_FL_USER_VISIBLE 0x704BDFFF /* User visible flags */
#define EXT4_FL_USER_MODIFIABLE 0x604BC0FF /* User modifiable flags */
/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */
#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \
@ -417,10 +418,10 @@ struct flex_groups {
EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
EXT4_PROJINHERIT_FL)
EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL)
/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL))
/* Flags that are appropriate for non-directories/regular files. */
#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
@ -1313,7 +1314,9 @@ struct ext4_super_block {
__u8 s_first_error_time_hi;
__u8 s_last_error_time_hi;
__u8 s_pad[2];
__le32 s_reserved[96]; /* Padding to the end of the block */
__le16 s_encoding; /* Filename charset encoding */
__le16 s_encoding_flags; /* Filename charset encoding flags */
__le32 s_reserved[95]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@ -1338,6 +1341,16 @@ struct ext4_super_block {
/* Number of quota types we support */
#define EXT4_MAXQUOTAS 3
#define EXT4_ENC_UTF8_12_1 1
/*
* Flags for ext4_sb_info.s_encoding_flags.
*/
#define EXT4_ENC_STRICT_MODE_FL (1 << 0)
#define ext4_has_strict_mode(sbi) \
(sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL)
/*
* fourth extended-fs super-block data in memory
*/
@ -1387,6 +1400,10 @@ struct ext4_sb_info {
struct kobject s_kobj;
struct completion s_kobj_unregister;
struct super_block *s_sb;
#ifdef CONFIG_UNICODE
struct unicode_map *s_encoding;
__u16 s_encoding_flags;
#endif
/* Journaling */
struct journal_s *s_journal;
@ -1592,9 +1609,6 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_SB(sb) (sb)
#endif
/*
* Returns true if the inode is inode is encrypted
*/
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
/*
@ -1663,6 +1677,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000
#define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000
extern void ext4_update_dynamic_rev(struct super_block *sb);
@ -1756,6 +1771,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED)
EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR)
EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA)
EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT)
EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@ -1783,6 +1799,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT)
EXT4_FEATURE_INCOMPAT_MMP | \
EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
EXT4_FEATURE_INCOMPAT_ENCRYPT | \
EXT4_FEATURE_INCOMPAT_CASEFOLD | \
EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
EXT4_FEATURE_INCOMPAT_LARGEDIR)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
@ -2376,8 +2393,8 @@ extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
/* hash.c */
extern int ext4fs_dirhash(const char *name, int len, struct
dx_hash_info *hinfo);
extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo);
/* ialloc.c */
extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t,
@ -2973,6 +2990,10 @@ static inline void ext4_unlock_group(struct super_block *sb,
/* dir.c */
extern const struct file_operations ext4_dir_operations;
#ifdef CONFIG_UNICODE
extern const struct dentry_operations ext4_dentry_ops;
#endif
/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
@ -3065,6 +3086,10 @@ extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
extern int ext4_handle_dirty_dirent_node(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
extern int ext4_ci_compare(const struct inode *parent,
const struct qstr *name,
const struct qstr *entry);
#define S_SHIFT 12
static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
[S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,

View File

@ -711,7 +711,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
* We don't need to check unwritten extent because
* indirect-based file doesn't have it.
*/
BUG_ON(1);
BUG();
}
} else if (retval == 0) {
if (ext4_es_is_written(es)) {
@ -780,7 +780,7 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
}
p = &(*p)->rb_right;
} else {
BUG_ON(1);
BUG();
return -EINVAL;
}
}

View File

@ -6,6 +6,7 @@
*/
#include <linux/fs.h>
#include <linux/unicode.h>
#include <linux/compiler.h>
#include <linux/bitops.h>
#include "ext4.h"
@ -196,7 +197,8 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
* represented, and whether or not the returned hash is 32 bits or 64
* bits. 32 bit hashes will return 0 for the minor hash.
*/
int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
static int __ext4fs_dirhash(const char *name, int len,
struct dx_hash_info *hinfo)
{
__u32 hash;
__u32 minor_hash = 0;
@ -268,3 +270,33 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
hinfo->minor_hash = minor_hash;
return 0;
}
int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo)
{
#ifdef CONFIG_UNICODE
const struct unicode_map *um = EXT4_SB(dir->i_sb)->s_encoding;
int r, dlen;
unsigned char *buff;
struct qstr qstr = {.name = name, .len = len };
if (len && IS_CASEFOLDED(dir)) {
buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
if (!buff)
return -ENOMEM;
dlen = utf8_casefold(um, &qstr, buff, PATH_MAX);
if (dlen < 0) {
kfree(buff);
goto opaque_seq;
}
r = __ext4fs_dirhash(buff, dlen, hinfo);
kfree(buff);
return r;
}
opaque_seq:
#endif
return __ext4fs_dirhash(name, len, hinfo);
}

View File

@ -455,7 +455,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
if (qstr) {
hinfo.hash_version = DX_HASH_HALF_MD4;
hinfo.seed = sbi->s_hash_seed;
ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
grp = hinfo.hash;
} else
grp = prandom_u32();

View File

@ -1407,7 +1407,7 @@ int htree_inlinedir_to_tree(struct file *dir_file,
}
}
ext4fs_dirhash(de->name, de->name_len, hinfo);
ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
(hinfo->minor_hash < start_minor_hash)))

View File

@ -399,6 +399,10 @@ static int __check_block_validity(struct inode *inode, const char *func,
unsigned int line,
struct ext4_map_blocks *map)
{
if (ext4_has_feature_journal(inode->i_sb) &&
(inode->i_ino ==
le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
return 0;
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
@ -541,7 +545,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
map->m_len = retval;
retval = 0;
} else {
BUG_ON(1);
BUG();
}
#ifdef ES_AGGRESSIVE_TEST
ext4_map_blocks_es_recheck(handle, inode, map,
@ -1876,7 +1880,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
else if (ext4_es_is_unwritten(&es))
map->m_flags |= EXT4_MAP_UNWRITTEN;
else
BUG_ON(1);
BUG();
#ifdef ES_AGGRESSIVE_TEST
ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
@ -4738,9 +4742,11 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_DAX;
if (flags & EXT4_ENCRYPT_FL)
new_fl |= S_ENCRYPTED;
if (flags & EXT4_CASEFOLD_FL)
new_fl |= S_CASEFOLD;
inode_set_flags(inode, new_fl,
S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
S_ENCRYPTED);
S_ENCRYPTED|S_CASEFOLD);
}
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,

View File

@ -278,6 +278,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
struct ext4_iloc iloc;
unsigned int oldflags, mask, i;
unsigned int jflag;
struct super_block *sb = inode->i_sb;
/* Is it quota file? Do not allow user to mess with it */
if (ext4_is_quota_file(inode))
@ -322,6 +323,23 @@ static int ext4_ioctl_setflags(struct inode *inode,
goto flags_out;
}
if ((flags ^ oldflags) & EXT4_CASEFOLD_FL) {
if (!ext4_has_feature_casefold(sb)) {
err = -EOPNOTSUPP;
goto flags_out;
}
if (!S_ISDIR(inode->i_mode)) {
err = -ENOTDIR;
goto flags_out;
}
if (!ext4_empty_dir(inode)) {
err = -ENOTEMPTY;
goto flags_out;
}
}
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
@ -978,7 +996,7 @@ mext_out:
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
if (!err && (o_group > EXT4_SB(sb)->s_groups_count) &&
if (!err && (o_group < EXT4_SB(sb)->s_groups_count) &&
ext4_has_group_desc_csum(sb) &&
test_opt(sb, INIT_INODE_TABLE))
err = ext4_register_li_request(sb, o_group);

View File

@ -1539,7 +1539,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
ex->fe_len += 1 << order;
}
if (ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))) {
if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
/* Should never happen! (but apparently sometimes does?!?) */
WARN_ON(1);
ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent "
@ -2490,6 +2490,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
for (i = 0; i < ngroups; i++) {
cond_resched();
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
@ -2705,6 +2706,7 @@ int ext4_mb_release(struct super_block *sb)
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
cond_resched();
grinfo = ext4_get_group_info(sb, i);
#ifdef DOUBLE_CHECK
kfree(grinfo->bb_bitmap);

View File

@ -35,6 +35,7 @@
#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include "ext4.h"
#include "ext4_jbd2.h"
@ -629,7 +630,7 @@ static struct stats dx_show_leaf(struct inode *dir,
}
if (!fscrypt_has_encryption_key(dir)) {
/* Directory is not encrypted */
ext4fs_dirhash(de->name,
ext4fs_dirhash(dir, de->name,
de->name_len, &h);
printk("%*.s:(U)%x.%u ", len,
name, h.hash,
@ -662,8 +663,8 @@ static struct stats dx_show_leaf(struct inode *dir,
name = fname_crypto_str.name;
len = fname_crypto_str.len;
}
ext4fs_dirhash(de->name, de->name_len,
&h);
ext4fs_dirhash(dir, de->name,
de->name_len, &h);
printk("%*.s:(E)%x.%u ", len, name,
h.hash, (unsigned) ((char *) de
- base));
@ -673,7 +674,7 @@ static struct stats dx_show_leaf(struct inode *dir,
#else
int len = de->name_len;
char *name = de->name;
ext4fs_dirhash(de->name, de->name_len, &h);
ext4fs_dirhash(dir, de->name, de->name_len, &h);
printk("%*.s:%x.%u ", len, name, h.hash,
(unsigned) ((char *) de - base));
#endif
@ -762,7 +763,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
if (fname && fname_name(fname))
ext4fs_dirhash(fname_name(fname), fname_len(fname), hinfo);
ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo);
hash = hinfo->hash;
if (root->info.unused_flags & 1) {
@ -1008,7 +1009,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
/* silently ignore the rest of the block */
break;
}
ext4fs_dirhash(de->name, de->name_len, hinfo);
ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
(hinfo->minor_hash < start_minor_hash)))
@ -1197,7 +1198,7 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
while ((char *) de < base + blocksize) {
if (de->name_len && de->inode) {
ext4fs_dirhash(de->name, de->name_len, &h);
ext4fs_dirhash(dir, de->name, de->name_len, &h);
map_tail--;
map_tail->hash = h.hash;
map_tail->offs = ((char *) de - base)>>2;
@ -1252,15 +1253,52 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
dx_set_count(entries, count + 1);
}
#ifdef CONFIG_UNICODE
/*
* Test whether a case-insensitive directory entry matches the filename
* being searched for.
*
* Returns: 0 if the directory entry matches, more than 0 if it
* doesn't match or less than zero on error.
*/
int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
const struct qstr *entry)
{
const struct ext4_sb_info *sbi = EXT4_SB(parent->i_sb);
const struct unicode_map *um = sbi->s_encoding;
int ret;
ret = utf8_strncasecmp(um, name, entry);
if (ret < 0) {
/* Handle invalid character sequence as either an error
* or as an opaque byte sequence.
*/
if (ext4_has_strict_mode(sbi))
return -EINVAL;
if (name->len != entry->len)
return 1;
return !!memcmp(name->name, entry->name, name->len);
}
return ret;
}
#endif
/*
* Test whether a directory entry matches the filename being searched for.
*
* Return: %true if the directory entry matches, otherwise %false.
*/
static inline bool ext4_match(const struct ext4_filename *fname,
static inline bool ext4_match(const struct inode *parent,
const struct ext4_filename *fname,
const struct ext4_dir_entry_2 *de)
{
struct fscrypt_name f;
#ifdef CONFIG_UNICODE
const struct qstr entry = {.name = de->name, .len = de->name_len};
#endif
if (!de->inode)
return false;
@ -1270,6 +1308,12 @@ static inline bool ext4_match(const struct ext4_filename *fname,
#ifdef CONFIG_FS_ENCRYPTION
f.crypto_buf = fname->crypto_buf;
#endif
#ifdef CONFIG_UNICODE
if (EXT4_SB(parent->i_sb)->s_encoding && IS_CASEFOLDED(parent))
return (ext4_ci_compare(parent, fname->usr_fname, &entry) == 0);
#endif
return fscrypt_match_name(&f, de->name, de->name_len);
}
@ -1290,7 +1334,7 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
if ((char *) de + de->name_len <= dlimit &&
ext4_match(fname, de)) {
ext4_match(dir, fname, de)) {
/* found a match - just to be sure, do
* a full check */
if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
@ -1588,6 +1632,17 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
return ERR_PTR(-EPERM);
}
}
#ifdef CONFIG_UNICODE
if (!inode && IS_CASEFOLDED(dir)) {
/* Eventually we want to call d_add_ci(dentry, NULL)
* for negative dentries in the encoding case as
* well. For now, prevent the negative dentry
* from being cached.
*/
return NULL;
}
#endif
return d_splice_alias(inode, dentry);
}
@ -1798,7 +1853,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
if (ext4_check_dir_entry(dir, NULL, de, bh,
buf, buf_size, offset))
return -EFSCORRUPTED;
if (ext4_match(fname, de))
if (ext4_match(dir, fname, de))
return -EEXIST;
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
@ -1983,7 +2038,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
if (fname->hinfo.hash_version <= DX_HASH_TEA)
fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
ext4fs_dirhash(fname_name(fname), fname_len(fname), &fname->hinfo);
ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo);
memset(frames, 0, sizeof(frames));
frame = frames;
@ -2036,6 +2091,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
struct ext4_dir_entry_2 *de;
struct ext4_dir_entry_tail *t;
struct super_block *sb;
struct ext4_sb_info *sbi;
struct ext4_filename fname;
int retval;
int dx_fallback=0;
@ -2047,10 +2103,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
sbi = EXT4_SB(sb);
blocksize = sb->s_blocksize;
if (!dentry->d_name.len)
return -EINVAL;
#ifdef CONFIG_UNICODE
if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) &&
utf8_validate(sbi->s_encoding, &dentry->d_name))
return -EINVAL;
#endif
retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname);
if (retval)
return retval;
@ -2975,6 +3038,17 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
ext4_update_dx_flag(dir);
ext4_mark_inode_dirty(handle, dir);
#ifdef CONFIG_UNICODE
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
* negative dentries at ext4_lookup(), when it is better
* supported by the VFS for the CI case.
*/
if (IS_CASEFOLDED(dir))
d_invalidate(dentry);
#endif
end_rmdir:
brelse(bh);
if (handle)
@ -3044,6 +3118,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
inode->i_ctime = current_time(inode);
ext4_mark_inode_dirty(handle, inode);
#ifdef CONFIG_UNICODE
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
* negative dentries at ext4_lookup(), when it is better
* supported by the VFS for the CI case.
*/
if (IS_CASEFOLDED(dir))
d_invalidate(dentry);
#endif
end_unlink:
brelse(bh);
if (handle)

View File

@ -126,9 +126,10 @@ int ext4_mpage_readpages(struct address_space *mapping,
int fully_mapped = 1;
unsigned first_hole = blocks_per_page;
prefetchw(&page->flags);
if (pages) {
page = lru_to_page(pages);
prefetchw(&page->flags);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping, page->index,
readahead_gfp_mask(mapping)))

View File

@ -874,6 +874,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
if (unlikely(err)) {
ext4_std_error(sb, err);
iloc.bh = NULL;
goto errout;
}
brelse(dind);

View File

@ -42,6 +42,7 @@
#include <linux/cleancache.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@ -1054,6 +1055,9 @@ static void ext4_put_super(struct super_block *sb)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
#ifdef CONFIG_UNICODE
utf8_unload(sbi->s_encoding);
#endif
kfree(sbi);
}
@ -1749,6 +1753,36 @@ static const struct mount_opts {
{Opt_err, 0, 0}
};
#ifdef CONFIG_UNICODE
static const struct ext4_sb_encodings {
__u16 magic;
char *name;
char *version;
} ext4_sb_encoding_map[] = {
{EXT4_ENC_UTF8_12_1, "utf8", "12.1.0"},
};
static int ext4_sb_read_encoding(const struct ext4_super_block *es,
const struct ext4_sb_encodings **encoding,
__u16 *flags)
{
__u16 magic = le16_to_cpu(es->s_encoding);
int i;
for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
if (magic == ext4_sb_encoding_map[i].magic)
break;
if (i >= ARRAY_SIZE(ext4_sb_encoding_map))
return -EINVAL;
*encoding = &ext4_sb_encoding_map[i];
*flags = le16_to_cpu(es->s_encoding_flags);
return 0;
}
#endif
static int handle_mount_opt(struct super_block *sb, char *opt, int token,
substring_t *args, unsigned long *journal_devnum,
unsigned int *journal_ioprio, int is_remount)
@ -2875,6 +2909,15 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
#ifndef CONFIG_UNICODE
if (ext4_has_feature_casefold(sb)) {
ext4_msg(sb, KERN_ERR,
"Filesystem with casefold feature cannot be "
"mounted without CONFIG_UNICODE");
return 0;
}
#endif
if (readonly)
return 1;
@ -3496,6 +3539,37 @@ int ext4_calculate_overhead(struct super_block *sb)
return 0;
}
static void ext4_clamp_want_extra_isize(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
/* determine the minimum size of new large inodes, if present */
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
sbi->s_want_extra_isize == 0) {
sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
if (ext4_has_feature_extra_isize(sb)) {
if (sbi->s_want_extra_isize <
le16_to_cpu(es->s_want_extra_isize))
sbi->s_want_extra_isize =
le16_to_cpu(es->s_want_extra_isize);
if (sbi->s_want_extra_isize <
le16_to_cpu(es->s_min_extra_isize))
sbi->s_want_extra_isize =
le16_to_cpu(es->s_min_extra_isize);
}
}
/* Check if enough inode space is available */
if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
sbi->s_inode_size) {
sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
ext4_msg(sb, KERN_INFO,
"required extra inode space not available");
}
}
static void ext4_set_resv_clusters(struct super_block *sb)
{
ext4_fsblk_t resv_clusters;
@ -3722,6 +3796,43 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
&journal_ioprio, 0))
goto failed_mount;
#ifdef CONFIG_UNICODE
if (ext4_has_feature_casefold(sb) && !sbi->s_encoding) {
const struct ext4_sb_encodings *encoding_info;
struct unicode_map *encoding;
__u16 encoding_flags;
if (ext4_has_feature_encrypt(sb)) {
ext4_msg(sb, KERN_ERR,
"Can't mount with encoding and encryption");
goto failed_mount;
}
if (ext4_sb_read_encoding(es, &encoding_info,
&encoding_flags)) {
ext4_msg(sb, KERN_ERR,
"Encoding requested by superblock is unknown");
goto failed_mount;
}
encoding = utf8_load(encoding_info->version);
if (IS_ERR(encoding)) {
ext4_msg(sb, KERN_ERR,
"can't mount with superblock charset: %s-%s "
"not supported by the kernel. flags: 0x%x.",
encoding_info->name, encoding_info->version,
encoding_flags);
goto failed_mount;
}
ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
"%s-%s with flags 0x%hx", encoding_info->name,
encoding_info->version?:"\b", encoding_flags);
sbi->s_encoding = encoding;
sbi->s_encoding_flags = encoding_flags;
}
#endif
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
printk_once(KERN_WARNING "EXT4-fs: Warning: mounting "
"with data=journal disables delayed "
@ -4219,7 +4330,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"data=, fs mounted w/o journal");
goto failed_mount_wq;
}
sbi->s_def_mount_opt &= EXT4_MOUNT_JOURNAL_CHECKSUM;
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
sbi->s_journal = NULL;
@ -4354,6 +4465,12 @@ no_journal:
iput(root);
goto failed_mount4;
}
#ifdef CONFIG_UNICODE
if (sbi->s_encoding)
sb->s_d_op = &ext4_dentry_ops;
#endif
sb->s_root = d_make_root(root);
if (!sb->s_root) {
ext4_msg(sb, KERN_ERR, "get root dentry failed");
@ -4368,30 +4485,7 @@ no_journal:
} else if (ret)
goto failed_mount4a;
/* determine the minimum size of new large inodes, if present */
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
sbi->s_want_extra_isize == 0) {
sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
if (ext4_has_feature_extra_isize(sb)) {
if (sbi->s_want_extra_isize <
le16_to_cpu(es->s_want_extra_isize))
sbi->s_want_extra_isize =
le16_to_cpu(es->s_want_extra_isize);
if (sbi->s_want_extra_isize <
le16_to_cpu(es->s_min_extra_isize))
sbi->s_want_extra_isize =
le16_to_cpu(es->s_min_extra_isize);
}
}
/* Check if enough inode space is available */
if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
sbi->s_inode_size) {
sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
ext4_msg(sb, KERN_INFO, "required extra inode space not"
"available");
}
ext4_clamp_want_extra_isize(sb);
ext4_set_resv_clusters(sb);
@ -4559,6 +4653,11 @@ failed_mount2:
failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
#ifdef CONFIG_UNICODE
utf8_unload(sbi->s_encoding);
#endif
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
@ -5175,6 +5274,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
ext4_clamp_want_extra_isize(sb);
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "changing journal_checksum "

View File

@ -238,6 +238,9 @@ EXT4_ATTR_FEATURE(meta_bg_resize);
#ifdef CONFIG_FS_ENCRYPTION
EXT4_ATTR_FEATURE(encryption);
#endif
#ifdef CONFIG_UNICODE
EXT4_ATTR_FEATURE(casefold);
#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
static struct attribute *ext4_feat_attrs[] = {
@ -246,6 +249,9 @@ static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(meta_bg_resize),
#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
#endif
#ifdef CONFIG_UNICODE
ATTR_LIST(casefold),
#endif
ATTR_LIST(metadata_csum_seed),
NULL,

View File

@ -1696,7 +1696,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
/* No failures allowed past this point. */
if (!s->not_found && here->e_value_size && here->e_value_offs) {
if (!s->not_found && here->e_value_size && !here->e_value_inum) {
/* Remove the old value. */
void *first_val = s->base + min_offs;
size_t offs = le16_to_cpu(here->e_value_offs);

View File

@ -132,7 +132,6 @@ void __jbd2_log_wait_for_space(journal_t *journal)
return;
}
spin_lock(&journal->j_list_lock);
nblocks = jbd2_space_needed(journal);
space_left = jbd2_log_space_left(journal);
if (space_left < nblocks) {
int chkpt = journal->j_checkpoint_transactions != NULL;

View File

@ -1350,6 +1350,10 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
journal_superblock_t *sb = journal->j_superblock;
int ret;
/* Buffer got discarded which means block device got invalidated */
if (!buffer_mapped(bh))
return -EIO;
trace_jbd2_write_superblock(journal, write_flags);
if (!(journal->j_flags & JBD2_BARRIER))
write_flags &= ~(REQ_FUA | REQ_PREFLUSH);

2
fs/unicode/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
mkutf8data
utf8data.h

13
fs/unicode/Kconfig Normal file
View File

@ -0,0 +1,13 @@
#
# UTF-8 normalization
#
config UNICODE
bool "UTF-8 normalization and casefolding support"
help
Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
support.
config UNICODE_NORMALIZATION_SELFTEST
tristate "Test UTF-8 normalization support"
depends on UNICODE
default n

38
fs/unicode/Makefile Normal file
View File

@ -0,0 +1,38 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_UNICODE) += unicode.o
obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
unicode-y := utf8-norm.o utf8-core.o
$(obj)/utf8-norm.o: $(obj)/utf8data.h
# In the normal build, the checked-in utf8data.h is just shipped.
#
# To generate utf8data.h from UCD, put *.txt files in this directory
# and pass REGENERATE_UTF8DATA=1 from the command line.
ifdef REGENERATE_UTF8DATA
quiet_cmd_utf8data = GEN $@
cmd_utf8data = $< \
-a $(srctree)/$(src)/DerivedAge.txt \
-c $(srctree)/$(src)/DerivedCombiningClass.txt \
-p $(srctree)/$(src)/DerivedCoreProperties.txt \
-d $(srctree)/$(src)/UnicodeData.txt \
-f $(srctree)/$(src)/CaseFolding.txt \
-n $(srctree)/$(src)/NormalizationCorrections.txt \
-t $(srctree)/$(src)/NormalizationTest.txt \
-o $@
$(obj)/utf8data.h: $(obj)/mkutf8data $(filter %.txt, $(cmd_utf8data)) FORCE
$(call if_changed,utf8data)
else
$(obj)/utf8data.h: $(src)/utf8data.h_shipped FORCE
$(call if_changed,shipped)
endif
targets += utf8data.h
hostprogs-y += mkutf8data

View File

@ -0,0 +1,71 @@
The utf8data.h file in this directory is generated from the Unicode
Character Database for version 12.1.0 of the Unicode standard.
The full set of files can be found here:
http://www.unicode.org/Public/12.1.0/ucd/
Note!
The URL's listed below are not stable. That's because Unicode 12.1.0
has not been officially released yet; it is scheduled to be released
on May 8, 2019. We taking Unicode 12.1.0 a few weeks early because it
contains a new Japanese character which is required in order to
specify Japenese dates after May 1, 2019, when Crown Prince Naruhito
ascends to the Chrysanthemum Throne. (Isn't internationalization fun?
The abdication of Emperor Akihito of Japan is requiring dozens of
software packages to be updated with only a month's notice. :-)
We will update the URL's (and any needed changes to the checksums)
after the final Unicode 12.1.0 is released.
Individual source links:
https://www.unicode.org/Public/12.1.0/ucd/CaseFolding-12.1.0d2.txt
https://www.unicode.org/Public/12.1.0/ucd/DerivedAge-12.1.0d3.txt
https://www.unicode.org/Public/12.1.0/ucd/extracted/DerivedCombiningClass-12.1.0d2.txt
https://www.unicode.org/Public/12.1.0/ucd/DerivedCoreProperties-12.1.0d2.txt
https://www.unicode.org/Public/12.1.0/ucd/NormalizationCorrections-12.1.0d1.txt
https://www.unicode.org/Public/12.1.0/ucd/NormalizationTest-12.1.0d3.txt
https://www.unicode.org/Public/12.1.0/ucd/UnicodeData-12.1.0d2.txt
md5sums (verify by running "md5sum -c README.utf8data"):
900e76da1d822a160fd6b8c0b1d70094 CaseFolding.txt
131256380bff4fea8ad4a851616f2f10 DerivedAge.txt
e731a4089b30002144e107e3d6f8d1fa DerivedCombiningClass.txt
a47c9fbd7ff92a9b261ba9831e68778a DerivedCoreProperties.txt
fcab6dad15e440879d92f315978f93d3 NormalizationCorrections.txt
f9ff1c55a60decf436100f791b44aa98 NormalizationTest.txt
755f6af699f8c8d2d958da411f78f6c6 UnicodeData.txt
sha1sums (verify by running "sha1sum -c README.utf8data"):
dc9245f6803c4ac99555c361f5052e0b13eb779b CaseFolding.txt
3281104f237184cdb5d869e86eb8573678ada7da DerivedAge.txt
2f5f995ccb96e0fa84b15151b35d5e2681535175 DerivedCombiningClass.txt
5b8698a3fcd5018e1987f296b02e2c17e696415e DerivedCoreProperties.txt
cd83935fbc012345d8792d2c704f69497e753835 NormalizationCorrections.txt
ea419aae505b337b0d99a83fa83fe58ddff7c19f NormalizationTest.txt
dc973c0fc93d6f09d9ab9f70d1c9f89c447f0526 UnicodeData.txt
To update to the newer version of the Unicode standard, the latest
released version of the UCD can be found here:
http://www.unicode.org/Public/UCD/latest/
Then, build under fs/unicode/ with REGENERATE_UTF8DATA=1:
make REGENERATE_UTF8DATA=1 fs/unicode/
After sanity checking the newly generated utf8data.h file (the
version generated from the 12.1.0 UCD should be 4,109 lines long, and
have a total size of 324k) and/or comparing it with the older version
of utf8data.h_shipped, rename it to utf8data.h_shipped.
If you are a kernel developer updating to a newer version of the
Unicode Character Database, please update this README.utf8data file
with the version of the UCD that was used, the md5sum and sha1sums of
the *.txt files, before checking in the new versions of the utf8data.h
and README.utf8data files.

3419
fs/unicode/mkutf8data.c Normal file

File diff suppressed because it is too large Load Diff

187
fs/unicode/utf8-core.c Normal file
View File

@ -0,0 +1,187 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/parser.h>
#include <linux/errno.h>
#include <linux/unicode.h>
#include "utf8n.h"
int utf8_validate(const struct unicode_map *um, const struct qstr *str)
{
const struct utf8data *data = utf8nfdi(um->version);
if (utf8nlen(data, str->name, str->len) < 0)
return -1;
return 0;
}
EXPORT_SYMBOL(utf8_validate);
int utf8_strncmp(const struct unicode_map *um,
const struct qstr *s1, const struct qstr *s2)
{
const struct utf8data *data = utf8nfdi(um->version);
struct utf8cursor cur1, cur2;
int c1, c2;
if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
return -EINVAL;
if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
return -EINVAL;
do {
c1 = utf8byte(&cur1);
c2 = utf8byte(&cur2);
if (c1 < 0 || c2 < 0)
return -EINVAL;
if (c1 != c2)
return 1;
} while (c1);
return 0;
}
EXPORT_SYMBOL(utf8_strncmp);
int utf8_strncasecmp(const struct unicode_map *um,
const struct qstr *s1, const struct qstr *s2)
{
const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur1, cur2;
int c1, c2;
if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
return -EINVAL;
if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
return -EINVAL;
do {
c1 = utf8byte(&cur1);
c2 = utf8byte(&cur2);
if (c1 < 0 || c2 < 0)
return -EINVAL;
if (c1 != c2)
return 1;
} while (c1);
return 0;
}
EXPORT_SYMBOL(utf8_strncasecmp);
int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen)
{
const struct utf8data *data = utf8nfdicf(um->version);
struct utf8cursor cur;
size_t nlen = 0;
if (utf8ncursor(&cur, data, str->name, str->len) < 0)
return -EINVAL;
for (nlen = 0; nlen < dlen; nlen++) {
int c = utf8byte(&cur);
dest[nlen] = c;
if (!c)
return nlen;
if (c == -1)
break;
}
return -EINVAL;
}
EXPORT_SYMBOL(utf8_casefold);
int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen)
{
const struct utf8data *data = utf8nfdi(um->version);
struct utf8cursor cur;
ssize_t nlen = 0;
if (utf8ncursor(&cur, data, str->name, str->len) < 0)
return -EINVAL;
for (nlen = 0; nlen < dlen; nlen++) {
int c = utf8byte(&cur);
dest[nlen] = c;
if (!c)
return nlen;
if (c == -1)
break;
}
return -EINVAL;
}
EXPORT_SYMBOL(utf8_normalize);
static int utf8_parse_version(const char *version, unsigned int *maj,
unsigned int *min, unsigned int *rev)
{
substring_t args[3];
char version_string[12];
const struct match_token token[] = {
{1, "%d.%d.%d"},
{0, NULL}
};
strncpy(version_string, version, sizeof(version_string));
if (match_token(version_string, token, args) != 1)
return -EINVAL;
if (match_int(&args[0], maj) || match_int(&args[1], min) ||
match_int(&args[2], rev))
return -EINVAL;
return 0;
}
struct unicode_map *utf8_load(const char *version)
{
struct unicode_map *um = NULL;
int unicode_version;
if (version) {
unsigned int maj, min, rev;
if (utf8_parse_version(version, &maj, &min, &rev) < 0)
return ERR_PTR(-EINVAL);
if (!utf8version_is_supported(maj, min, rev))
return ERR_PTR(-EINVAL);
unicode_version = UNICODE_AGE(maj, min, rev);
} else {
unicode_version = utf8version_latest();
printk(KERN_WARNING"UTF-8 version not specified. "
"Assuming latest supported version (%d.%d.%d).",
(unicode_version >> 16) & 0xff,
(unicode_version >> 8) & 0xff,
(unicode_version & 0xff));
}
um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
if (!um)
return ERR_PTR(-ENOMEM);
um->charset = "UTF-8";
um->version = unicode_version;
return um;
}
EXPORT_SYMBOL(utf8_load);
void utf8_unload(struct unicode_map *um)
{
kfree(um);
}
EXPORT_SYMBOL(utf8_unload);
MODULE_LICENSE("GPL v2");

799
fs/unicode/utf8-norm.c Normal file
View File

@ -0,0 +1,799 @@
/*
* Copyright (c) 2014 SGI.
* All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#include "utf8n.h"
struct utf8data {
unsigned int maxage;
unsigned int offset;
};
#define __INCLUDED_FROM_UTF8NORM_C__
#include "utf8data.h"
#undef __INCLUDED_FROM_UTF8NORM_C__
int utf8version_is_supported(u8 maj, u8 min, u8 rev)
{
int i = ARRAY_SIZE(utf8agetab) - 1;
unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
while (i >= 0 && utf8agetab[i] != 0) {
if (sb_utf8version == utf8agetab[i])
return 1;
i--;
}
return 0;
}
EXPORT_SYMBOL(utf8version_is_supported);
int utf8version_latest(void)
{
return utf8vers;
}
EXPORT_SYMBOL(utf8version_latest);
/*
* UTF-8 valid ranges.
*
* The UTF-8 encoding spreads the bits of a 32bit word over several
* bytes. This table gives the ranges that can be held and how they'd
* be represented.
*
* 0x00000000 0x0000007F: 0xxxxxxx
* 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
* 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
* 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*
* There is an additional requirement on UTF-8, in that only the
* shortest representation of a 32bit value is to be used. A decoder
* must not decode sequences that do not satisfy this requirement.
* Thus the allowed ranges have a lower bound.
*
* 0x00000000 0x0000007F: 0xxxxxxx
* 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
* 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
* 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*
* Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
* 17 planes of 65536 values. This limits the sequences actually seen
* even more, to just the following.
*
* 0 - 0x7F: 0 - 0x7F
* 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF
* 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF
* 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
*
* Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
*
* Note that the longest sequence seen with valid usage is 4 bytes,
* the same a single UTF-32 character. This makes the UTF-8
* representation of Unicode strictly smaller than UTF-32.
*
* The shortest sequence requirement was introduced by:
* Corrigendum #1: UTF-8 Shortest Form
* It can be found here:
* http://www.unicode.org/versions/corrigendum1.html
*
*/
/*
* Return the number of bytes used by the current UTF-8 sequence.
* Assumes the input points to the first byte of a valid UTF-8
* sequence.
*/
static inline int utf8clen(const char *s)
{
unsigned char c = *s;
return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
}
/*
* Decode a 3-byte UTF-8 sequence.
*/
static unsigned int
utf8decode3(const char *str)
{
unsigned int uc;
uc = *str++ & 0x0F;
uc <<= 6;
uc |= *str++ & 0x3F;
uc <<= 6;
uc |= *str++ & 0x3F;
return uc;
}
/*
* Encode a 3-byte UTF-8 sequence.
*/
static int
utf8encode3(char *str, unsigned int val)
{
str[2] = (val & 0x3F) | 0x80;
val >>= 6;
str[1] = (val & 0x3F) | 0x80;
val >>= 6;
str[0] = val | 0xE0;
return 3;
}
/*
* utf8trie_t
*
* A compact binary tree, used to decode UTF-8 characters.
*
* Internal nodes are one byte for the node itself, and up to three
* bytes for an offset into the tree. The first byte contains the
* following information:
* NEXTBYTE - flag - advance to next byte if set
* BITNUM - 3 bit field - the bit number to tested
* OFFLEN - 2 bit field - number of bytes in the offset
* if offlen == 0 (non-branching node)
* RIGHTPATH - 1 bit field - set if the following node is for the
* right-hand path (tested bit is set)
* TRIENODE - 1 bit field - set if the following node is an internal
* node, otherwise it is a leaf node
* if offlen != 0 (branching node)
* LEFTNODE - 1 bit field - set if the left-hand node is internal
* RIGHTNODE - 1 bit field - set if the right-hand node is internal
*
* Due to the way utf8 works, there cannot be branching nodes with
* NEXTBYTE set, and moreover those nodes always have a righthand
* descendant.
*/
typedef const unsigned char utf8trie_t;
#define BITNUM 0x07
#define NEXTBYTE 0x08
#define OFFLEN 0x30
#define OFFLEN_SHIFT 4
#define RIGHTPATH 0x40
#define TRIENODE 0x80
#define RIGHTNODE 0x40
#define LEFTNODE 0x80
/*
* utf8leaf_t
*
* The leaves of the trie are embedded in the trie, and so the same
* underlying datatype: unsigned char.
*
* leaf[0]: The unicode version, stored as a generation number that is
* an index into utf8agetab[]. With this we can filter code
* points based on the unicode version in which they were
* defined. The CCC of a non-defined code point is 0.
* leaf[1]: Canonical Combining Class. During normalization, we need
* to do a stable sort into ascending order of all characters
* with a non-zero CCC that occur between two characters with
* a CCC of 0, or at the begin or end of a string.
* The unicode standard guarantees that all CCC values are
* between 0 and 254 inclusive, which leaves 255 available as
* a special value.
* Code points with CCC 0 are known as stoppers.
* leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
* start of a NUL-terminated string that is the decomposition
* of the character.
* The CCC of a decomposable character is the same as the CCC
* of the first character of its decomposition.
* Some characters decompose as the empty string: these are
* characters with the Default_Ignorable_Code_Point property.
* These do affect normalization, as they all have CCC 0.
*
* The decompositions in the trie have been fully expanded, with the
* exception of Hangul syllables, which are decomposed algorithmically.
*
* Casefolding, if applicable, is also done using decompositions.
*
* The trie is constructed in such a way that leaves exist for all
* UTF-8 sequences that match the criteria from the "UTF-8 valid
* ranges" comment above, and only for those sequences. Therefore a
* lookup in the trie can be used to validate the UTF-8 input.
*/
typedef const unsigned char utf8leaf_t;
#define LEAF_GEN(LEAF) ((LEAF)[0])
#define LEAF_CCC(LEAF) ((LEAF)[1])
#define LEAF_STR(LEAF) ((const char *)((LEAF) + 2))
#define MINCCC (0)
#define MAXCCC (254)
#define STOPPER (0)
#define DECOMPOSE (255)
/* Marker for hangul syllable decomposition. */
#define HANGUL ((char)(255))
/* Size of the synthesized leaf used for Hangul syllable decomposition. */
#define UTF8HANGULLEAF (12)
/*
* Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
*
* AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
* D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
*
* SBase = 0xAC00
* LBase = 0x1100
* VBase = 0x1161
* TBase = 0x11A7
* LCount = 19
* VCount = 21
* TCount = 28
* NCount = 588 (VCount * TCount)
* SCount = 11172 (LCount * NCount)
*
* Decomposition:
* SIndex = s - SBase
*
* LV (Canonical/Full)
* LIndex = SIndex / NCount
* VIndex = (Sindex % NCount) / TCount
* LPart = LBase + LIndex
* VPart = VBase + VIndex
*
* LVT (Canonical)
* LVIndex = (SIndex / TCount) * TCount
* TIndex = (Sindex % TCount)
* LVPart = SBase + LVIndex
* TPart = TBase + TIndex
*
* LVT (Full)
* LIndex = SIndex / NCount
* VIndex = (Sindex % NCount) / TCount
* TIndex = (Sindex % TCount)
* LPart = LBase + LIndex
* VPart = VBase + VIndex
* if (TIndex == 0) {
* d = <LPart, VPart>
* } else {
* TPart = TBase + TIndex
* d = <LPart, TPart, VPart>
* }
*/
/* Constants */
#define SB (0xAC00)
#define LB (0x1100)
#define VB (0x1161)
#define TB (0x11A7)
#define LC (19)
#define VC (21)
#define TC (28)
#define NC (VC * TC)
#define SC (LC * NC)
/* Algorithmic decomposition of hangul syllable. */
static utf8leaf_t *
utf8hangul(const char *str, unsigned char *hangul)
{
unsigned int si;
unsigned int li;
unsigned int vi;
unsigned int ti;
unsigned char *h;
/* Calculate the SI, LI, VI, and TI values. */
si = utf8decode3(str) - SB;
li = si / NC;
vi = (si % NC) / TC;
ti = si % TC;
/* Fill in base of leaf. */
h = hangul;
LEAF_GEN(h) = 2;
LEAF_CCC(h) = DECOMPOSE;
h += 2;
/* Add LPart, a 3-byte UTF-8 sequence. */
h += utf8encode3((char *)h, li + LB);
/* Add VPart, a 3-byte UTF-8 sequence. */
h += utf8encode3((char *)h, vi + VB);
/* Add TPart if required, also a 3-byte UTF-8 sequence. */
if (ti)
h += utf8encode3((char *)h, ti + TB);
/* Terminate string. */
h[0] = '\0';
return hangul;
}
/*
* Use trie to scan s, touching at most len bytes.
* Returns the leaf if one exists, NULL otherwise.
*
* A non-NULL return guarantees that the UTF-8 sequence starting at s
* is well-formed and corresponds to a known unicode code point. The
* shorthand for this will be "is valid UTF-8 unicode".
*/
static utf8leaf_t *utf8nlookup(const struct utf8data *data,
unsigned char *hangul, const char *s, size_t len)
{
utf8trie_t *trie = NULL;
int offlen;
int offset;
int mask;
int node;
if (!data)
return NULL;
if (len == 0)
return NULL;
trie = utf8data + data->offset;
node = 1;
while (node) {
offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
if (*trie & NEXTBYTE) {
if (--len == 0)
return NULL;
s++;
}
mask = 1 << (*trie & BITNUM);
if (*s & mask) {
/* Right leg */
if (offlen) {
/* Right node at offset of trie */
node = (*trie & RIGHTNODE);
offset = trie[offlen];
while (--offlen) {
offset <<= 8;
offset |= trie[offlen];
}
trie += offset;
} else if (*trie & RIGHTPATH) {
/* Right node after this node */
node = (*trie & TRIENODE);
trie++;
} else {
/* No right node. */
return NULL;
}
} else {
/* Left leg */
if (offlen) {
/* Left node after this node. */
node = (*trie & LEFTNODE);
trie += offlen + 1;
} else if (*trie & RIGHTPATH) {
/* No left node. */
return NULL;
} else {
/* Left node after this node */
node = (*trie & TRIENODE);
trie++;
}
}
}
/*
* Hangul decomposition is done algorithmically. These are the
* codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
* always 3 bytes long, so s has been advanced twice, and the
* start of the sequence is at s-2.
*/
if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
trie = utf8hangul(s - 2, hangul);
return trie;
}
/*
* Use trie to scan s.
* Returns the leaf if one exists, NULL otherwise.
*
* Forwards to utf8nlookup().
*/
static utf8leaf_t *utf8lookup(const struct utf8data *data,
unsigned char *hangul, const char *s)
{
return utf8nlookup(data, hangul, s, (size_t)-1);
}
/*
* Maximum age of any character in s.
* Return -1 if s is not valid UTF-8 unicode.
* Return 0 if only non-assigned code points are used.
*/
int utf8agemax(const struct utf8data *data, const char *s)
{
utf8leaf_t *leaf;
int age = 0;
int leaf_age;
unsigned char hangul[UTF8HANGULLEAF];
if (!data)
return -1;
while (*s) {
leaf = utf8lookup(data, hangul, s);
if (!leaf)
return -1;
leaf_age = utf8agetab[LEAF_GEN(leaf)];
if (leaf_age <= data->maxage && leaf_age > age)
age = leaf_age;
s += utf8clen(s);
}
return age;
}
EXPORT_SYMBOL(utf8agemax);
/*
* Minimum age of any character in s.
* Return -1 if s is not valid UTF-8 unicode.
* Return 0 if non-assigned code points are used.
*/
int utf8agemin(const struct utf8data *data, const char *s)
{
utf8leaf_t *leaf;
int age;
int leaf_age;
unsigned char hangul[UTF8HANGULLEAF];
if (!data)
return -1;
age = data->maxage;
while (*s) {
leaf = utf8lookup(data, hangul, s);
if (!leaf)
return -1;
leaf_age = utf8agetab[LEAF_GEN(leaf)];
if (leaf_age <= data->maxage && leaf_age < age)
age = leaf_age;
s += utf8clen(s);
}
return age;
}
EXPORT_SYMBOL(utf8agemin);
/*
* Maximum age of any character in s, touch at most len bytes.
* Return -1 if s is not valid UTF-8 unicode.
*/
int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
{
utf8leaf_t *leaf;
int age = 0;
int leaf_age;
unsigned char hangul[UTF8HANGULLEAF];
if (!data)
return -1;
while (len && *s) {
leaf = utf8nlookup(data, hangul, s, len);
if (!leaf)
return -1;
leaf_age = utf8agetab[LEAF_GEN(leaf)];
if (leaf_age <= data->maxage && leaf_age > age)
age = leaf_age;
len -= utf8clen(s);
s += utf8clen(s);
}
return age;
}
EXPORT_SYMBOL(utf8nagemax);
/*
* Maximum age of any character in s, touch at most len bytes.
* Return -1 if s is not valid UTF-8 unicode.
*/
int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
{
utf8leaf_t *leaf;
int leaf_age;
int age;
unsigned char hangul[UTF8HANGULLEAF];
if (!data)
return -1;
age = data->maxage;
while (len && *s) {
leaf = utf8nlookup(data, hangul, s, len);
if (!leaf)
return -1;
leaf_age = utf8agetab[LEAF_GEN(leaf)];
if (leaf_age <= data->maxage && leaf_age < age)
age = leaf_age;
len -= utf8clen(s);
s += utf8clen(s);
}
return age;
}
EXPORT_SYMBOL(utf8nagemin);
/*
* Length of the normalization of s.
* Return -1 if s is not valid UTF-8 unicode.
*
* A string of Default_Ignorable_Code_Point has length 0.
*/
ssize_t utf8len(const struct utf8data *data, const char *s)
{
utf8leaf_t *leaf;
size_t ret = 0;
unsigned char hangul[UTF8HANGULLEAF];
if (!data)
return -1;
while (*s) {
leaf = utf8lookup(data, hangul, s);
if (!leaf)
return -1;
if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
ret += utf8clen(s);
else if (LEAF_CCC(leaf) == DECOMPOSE)
ret += strlen(LEAF_STR(leaf));
else
ret += utf8clen(s);
s += utf8clen(s);
}
return ret;
}
EXPORT_SYMBOL(utf8len);
/*
* Length of the normalization of s, touch at most len bytes.
* Return -1 if s is not valid UTF-8 unicode.
*/
ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
{
utf8leaf_t *leaf;
size_t ret = 0;
unsigned char hangul[UTF8HANGULLEAF];
if (!data)
return -1;
while (len && *s) {
leaf = utf8nlookup(data, hangul, s, len);
if (!leaf)
return -1;
if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
ret += utf8clen(s);
else if (LEAF_CCC(leaf) == DECOMPOSE)
ret += strlen(LEAF_STR(leaf));
else
ret += utf8clen(s);
len -= utf8clen(s);
s += utf8clen(s);
}
return ret;
}
EXPORT_SYMBOL(utf8nlen);
/*
* Set up an utf8cursor for use by utf8byte().
*
* u8c : pointer to cursor.
* data : const struct utf8data to use for normalization.
* s : string.
* len : length of s.
*
* Returns -1 on error, 0 on success.
*/
int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
const char *s, size_t len)
{
if (!data)
return -1;
if (!s)
return -1;
u8c->data = data;
u8c->s = s;
u8c->p = NULL;
u8c->ss = NULL;
u8c->sp = NULL;
u8c->len = len;
u8c->slen = 0;
u8c->ccc = STOPPER;
u8c->nccc = STOPPER;
/* Check we didn't clobber the maximum length. */
if (u8c->len != len)
return -1;
/* The first byte of s may not be an utf8 continuation. */
if (len > 0 && (*s & 0xC0) == 0x80)
return -1;
return 0;
}
EXPORT_SYMBOL(utf8ncursor);
/*
* Set up an utf8cursor for use by utf8byte().
*
* u8c : pointer to cursor.
* data : const struct utf8data to use for normalization.
* s : NUL-terminated string.
*
* Returns -1 on error, 0 on success.
*/
int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
const char *s)
{
return utf8ncursor(u8c, data, s, (unsigned int)-1);
}
EXPORT_SYMBOL(utf8cursor);
/*
* Get one byte from the normalized form of the string described by u8c.
*
* Returns the byte cast to an unsigned char on succes, and -1 on failure.
*
* The cursor keeps track of the location in the string in u8c->s.
* When a character is decomposed, the current location is stored in
* u8c->p, and u8c->s is set to the start of the decomposition. Note
* that bytes from a decomposition do not count against u8c->len.
*
* Characters are emitted if they match the current CCC in u8c->ccc.
* Hitting end-of-string while u8c->ccc == STOPPER means we're done,
* and the function returns 0 in that case.
*
* Sorting by CCC is done by repeatedly scanning the string. The
* values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
* the start of the scan. The first pass finds the lowest CCC to be
* emitted and stores it in u8c->nccc, the second pass emits the
* characters with this CCC and finds the next lowest CCC. This limits
* the number of passes to 1 + the number of different CCCs in the
* sequence being scanned.
*
* Therefore:
* u8c->p != NULL -> a decomposition is being scanned.
* u8c->ss != NULL -> this is a repeating scan.
* u8c->ccc == -1 -> this is the first scan of a repeating scan.
*/
int utf8byte(struct utf8cursor *u8c)
{
utf8leaf_t *leaf;
int ccc;
for (;;) {
/* Check for the end of a decomposed character. */
if (u8c->p && *u8c->s == '\0') {
u8c->s = u8c->p;
u8c->p = NULL;
}
/* Check for end-of-string. */
if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
/* There is no next byte. */
if (u8c->ccc == STOPPER)
return 0;
/* End-of-string during a scan counts as a stopper. */
ccc = STOPPER;
goto ccc_mismatch;
} else if ((*u8c->s & 0xC0) == 0x80) {
/* This is a continuation of the current character. */
if (!u8c->p)
u8c->len--;
return (unsigned char)*u8c->s++;
}
/* Look up the data for the current character. */
if (u8c->p) {
leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
} else {
leaf = utf8nlookup(u8c->data, u8c->hangul,
u8c->s, u8c->len);
}
/* No leaf found implies that the input is a binary blob. */
if (!leaf)
return -1;
ccc = LEAF_CCC(leaf);
/* Characters that are too new have CCC 0. */
if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
ccc = STOPPER;
} else if (ccc == DECOMPOSE) {
u8c->len -= utf8clen(u8c->s);
u8c->p = u8c->s + utf8clen(u8c->s);
u8c->s = LEAF_STR(leaf);
/* Empty decomposition implies CCC 0. */
if (*u8c->s == '\0') {
if (u8c->ccc == STOPPER)
continue;
ccc = STOPPER;
goto ccc_mismatch;
}
leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
ccc = LEAF_CCC(leaf);
}
/*
* If this is not a stopper, then see if it updates
* the next canonical class to be emitted.
*/
if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
u8c->nccc = ccc;
/*
* Return the current byte if this is the current
* combining class.
*/
if (ccc == u8c->ccc) {
if (!u8c->p)
u8c->len--;
return (unsigned char)*u8c->s++;
}
/* Current combining class mismatch. */
ccc_mismatch:
if (u8c->nccc == STOPPER) {
/*
* Scan forward for the first canonical class
* to be emitted. Save the position from
* which to restart.
*/
u8c->ccc = MINCCC - 1;
u8c->nccc = ccc;
u8c->sp = u8c->p;
u8c->ss = u8c->s;
u8c->slen = u8c->len;
if (!u8c->p)
u8c->len -= utf8clen(u8c->s);
u8c->s += utf8clen(u8c->s);
} else if (ccc != STOPPER) {
/* Not a stopper, and not the ccc we're emitting. */
if (!u8c->p)
u8c->len -= utf8clen(u8c->s);
u8c->s += utf8clen(u8c->s);
} else if (u8c->nccc != MAXCCC + 1) {
/* At a stopper, restart for next ccc. */
u8c->ccc = u8c->nccc;
u8c->nccc = MAXCCC + 1;
u8c->s = u8c->ss;
u8c->p = u8c->sp;
u8c->len = u8c->slen;
} else {
/* All done, proceed from here. */
u8c->ccc = STOPPER;
u8c->nccc = STOPPER;
u8c->sp = NULL;
u8c->ss = NULL;
u8c->slen = 0;
}
}
}
EXPORT_SYMBOL(utf8byte);
const struct utf8data *utf8nfdi(unsigned int maxage)
{
int i = ARRAY_SIZE(utf8nfdidata) - 1;
while (maxage < utf8nfdidata[i].maxage)
i--;
if (maxage > utf8nfdidata[i].maxage)
return NULL;
return &utf8nfdidata[i];
}
EXPORT_SYMBOL(utf8nfdi);
const struct utf8data *utf8nfdicf(unsigned int maxage)
{
int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
while (maxage < utf8nfdicfdata[i].maxage)
i--;
if (maxage > utf8nfdicfdata[i].maxage)
return NULL;
return &utf8nfdicfdata[i];
}
EXPORT_SYMBOL(utf8nfdicf);

320
fs/unicode/utf8-selftest.c Normal file
View File

@ -0,0 +1,320 @@
/*
* Kernel module for testing utf-8 support.
*
* Copyright 2017 Collabora Ltd.
*
* This software is licensed under the terms of the GNU General Public
* License version 2, as published by the Free Software Foundation, and
* may be copied, distributed, and modified under those terms.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/unicode.h>
#include <linux/dcache.h>
#include "utf8n.h"
unsigned int failed_tests;
unsigned int total_tests;
/* Tests will be based on this version. */
#define latest_maj 12
#define latest_min 1
#define latest_rev 0
#define _test(cond, func, line, fmt, ...) do { \
total_tests++; \
if (!cond) { \
failed_tests++; \
pr_err("test %s:%d Failed: %s%s", \
func, line, #cond, (fmt?":":".")); \
if (fmt) \
pr_err(fmt, ##__VA_ARGS__); \
} \
} while (0)
#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define test(cond) _test(cond, __func__, __LINE__, "")
const static struct {
/* UTF-8 strings in this vector _must_ be NULL-terminated. */
unsigned char str[10];
unsigned char dec[10];
} nfdi_test_data[] = {
/* Trivial sequence */
{
/* "ABba" decomposes to itself */
.str = "aBba",
.dec = "aBba",
},
/* Simple equivalent sequences */
{
/* 'VULGAR FRACTION ONE QUARTER' cannot decompose to
'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' on
canonical decomposition */
.str = {0xc2, 0xbc, 0x00},
.dec = {0xc2, 0xbc, 0x00},
},
{
/* 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to
'LETTER A' + 'COMBINING DIAERESIS' */
.str = {0xc3, 0xa4, 0x00},
.dec = {0x61, 0xcc, 0x88, 0x00},
},
{
/* 'LATIN SMALL LETTER LJ' can't decompose to
'LETTER L' + 'LETTER J' on canonical decomposition */
.str = {0xC7, 0x89, 0x00},
.dec = {0xC7, 0x89, 0x00},
},
{
/* GREEK ANO TELEIA decomposes to MIDDLE DOT */
.str = {0xCE, 0x87, 0x00},
.dec = {0xC2, 0xB7, 0x00}
},
/* Canonical ordering */
{
/* A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' decomposes
to A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT' */
.str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x0},
.dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x0},
},
{
/* 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK'
decomposes to
'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS' */
.str = {0xc3, 0xa4, 0xCC, 0xA8, 0x00},
.dec = {0x61, 0xCC, 0xA8, 0xcc, 0x88, 0x00},
},
};
const static struct {
/* UTF-8 strings in this vector _must_ be NULL-terminated. */
unsigned char str[30];
unsigned char ncf[30];
} nfdicf_test_data[] = {
/* Trivial sequences */
{
/* "ABba" folds to lowercase */
.str = {0x41, 0x42, 0x62, 0x61, 0x00},
.ncf = {0x61, 0x62, 0x62, 0x61, 0x00},
},
{
/* All ASCII folds to lower-case */
.str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1",
.ncf = "abcdefghijklmnopqrstuvwxyz0.1",
},
{
/* LATIN SMALL LETTER SHARP S folds to
LATIN SMALL LETTER S + LATIN SMALL LETTER S */
.str = {0xc3, 0x9f, 0x00},
.ncf = {0x73, 0x73, 0x00},
},
{
/* LATIN CAPITAL LETTER A WITH RING ABOVE folds to
LATIN SMALL LETTER A + COMBINING RING ABOVE */
.str = {0xC3, 0x85, 0x00},
.ncf = {0x61, 0xcc, 0x8a, 0x00},
},
/* Introduced by UTF-8.0.0. */
/* Cherokee letters are interesting test-cases because they fold
to upper-case. Before 8.0.0, Cherokee lowercase were
undefined, thus, the folding from LC is not stable between
7.0.0 -> 8.0.0, but it is from UC. */
{
/* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */
.str = {0xea, 0xad, 0xb0, 0x00},
.ncf = {0xe1, 0x8e, 0xa0, 0x00},
},
{
/* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */
.str = {0xe1, 0x8f, 0xb8, 0x00},
.ncf = {0xe1, 0x8f, 0xb0, 0x00},
},
{
/* OLD HUNGARIAN CAPITAL LETTER AMB folds to
OLD HUNGARIAN SMALL LETTER AMB */
.str = {0xf0, 0x90, 0xb2, 0x83, 0x00},
.ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00},
},
/* Introduced by UTF-9.0.0. */
{
/* OSAGE CAPITAL LETTER CHA folds to
OSAGE SMALL LETTER CHA */
.str = {0xf0, 0x90, 0x92, 0xb5, 0x00},
.ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00},
},
{
/* LATIN CAPITAL LETTER SMALL CAPITAL I folds to
LATIN LETTER SMALL CAPITAL I */
.str = {0xea, 0x9e, 0xae, 0x00},
.ncf = {0xc9, 0xaa, 0x00},
},
/* Introduced by UTF-11.0.0. */
{
/* GEORGIAN SMALL LETTER AN folds to GEORGIAN MTAVRULI
CAPITAL LETTER AN */
.str = {0xe1, 0xb2, 0x90, 0x00},
.ncf = {0xe1, 0x83, 0x90, 0x00},
}
};
static void check_utf8_nfdi(void)
{
int i;
struct utf8cursor u8c;
const struct utf8data *data;
data = utf8nfdi(UNICODE_AGE(latest_maj, latest_min, latest_rev));
if (!data) {
pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n",
__func__, latest_maj, latest_min, latest_rev);
return;
}
for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
int len = strlen(nfdi_test_data[i].str);
int nlen = strlen(nfdi_test_data[i].dec);
int j = 0;
unsigned char c;
test((utf8len(data, nfdi_test_data[i].str) == nlen));
test((utf8nlen(data, nfdi_test_data[i].str, len) == nlen));
if (utf8cursor(&u8c, data, nfdi_test_data[i].str) < 0)
pr_err("can't create cursor\n");
while ((c = utf8byte(&u8c)) > 0) {
test_f((c == nfdi_test_data[i].dec[j]),
"Unexpected byte 0x%x should be 0x%x\n",
c, nfdi_test_data[i].dec[j]);
j++;
}
test((j == nlen));
}
}
static void check_utf8_nfdicf(void)
{
int i;
struct utf8cursor u8c;
const struct utf8data *data;
data = utf8nfdicf(UNICODE_AGE(latest_maj, latest_min, latest_rev));
if (!data) {
pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n",
__func__, latest_maj, latest_min, latest_rev);
return;
}
for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
int len = strlen(nfdicf_test_data[i].str);
int nlen = strlen(nfdicf_test_data[i].ncf);
int j = 0;
unsigned char c;
test((utf8len(data, nfdicf_test_data[i].str) == nlen));
test((utf8nlen(data, nfdicf_test_data[i].str, len) == nlen));
if (utf8cursor(&u8c, data, nfdicf_test_data[i].str) < 0)
pr_err("can't create cursor\n");
while ((c = utf8byte(&u8c)) > 0) {
test_f((c == nfdicf_test_data[i].ncf[j]),
"Unexpected byte 0x%x should be 0x%x\n",
c, nfdicf_test_data[i].ncf[j]);
j++;
}
test((j == nlen));
}
}
static void check_utf8_comparisons(void)
{
int i;
struct unicode_map *table = utf8_load("12.1.0");
if (IS_ERR(table)) {
pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n",
__func__, latest_maj, latest_min, latest_rev);
return;
}
for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
const struct qstr s1 = {.name = nfdi_test_data[i].str,
.len = sizeof(nfdi_test_data[i].str)};
const struct qstr s2 = {.name = nfdi_test_data[i].dec,
.len = sizeof(nfdi_test_data[i].dec)};
test_f(!utf8_strncmp(table, &s1, &s2),
"%s %s comparison mismatch\n", s1.name, s2.name);
}
for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
const struct qstr s1 = {.name = nfdicf_test_data[i].str,
.len = sizeof(nfdicf_test_data[i].str)};
const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
.len = sizeof(nfdicf_test_data[i].ncf)};
test_f(!utf8_strncasecmp(table, &s1, &s2),
"%s %s comparison mismatch\n", s1.name, s2.name);
}
utf8_unload(table);
}
static void check_supported_versions(void)
{
/* Unicode 7.0.0 should be supported. */
test(utf8version_is_supported(7, 0, 0));
/* Unicode 9.0.0 should be supported. */
test(utf8version_is_supported(9, 0, 0));
/* Unicode 1x.0.0 (the latest version) should be supported. */
test(utf8version_is_supported(latest_maj, latest_min, latest_rev));
/* Next versions don't exist. */
test(!utf8version_is_supported(13, 0, 0));
test(!utf8version_is_supported(0, 0, 0));
test(!utf8version_is_supported(-1, -1, -1));
}
static int __init init_test_ucd(void)
{
failed_tests = 0;
total_tests = 0;
check_supported_versions();
check_utf8_nfdi();
check_utf8_nfdicf();
check_utf8_comparisons();
if (!failed_tests)
pr_info("All %u tests passed\n", total_tests);
else
pr_err("%u out of %u tests failed\n", failed_tests,
total_tests);
return 0;
}
static void __exit exit_test_ucd(void)
{
}
module_init(init_test_ucd);
module_exit(exit_test_ucd);
MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
MODULE_LICENSE("GPL");

File diff suppressed because it is too large Load Diff

117
fs/unicode/utf8n.h Normal file
View File

@ -0,0 +1,117 @@
/*
* Copyright (c) 2014 SGI.
* All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#ifndef UTF8NORM_H
#define UTF8NORM_H
#include <linux/types.h>
#include <linux/export.h>
#include <linux/string.h>
#include <linux/module.h>
/* Encoding a unicode version number as a single unsigned int. */
#define UNICODE_MAJ_SHIFT (16)
#define UNICODE_MIN_SHIFT (8)
#define UNICODE_AGE(MAJ, MIN, REV) \
(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
((unsigned int)(REV)))
/* Highest unicode version supported by the data tables. */
extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
extern int utf8version_latest(void);
/*
* Look for the correct const struct utf8data for a unicode version.
* Returns NULL if the version requested is too new.
*
* Two normalization forms are supported: nfdi and nfdicf.
*
* nfdi:
* - Apply unicode normalization form NFD.
* - Remove any Default_Ignorable_Code_Point.
*
* nfdicf:
* - Apply unicode normalization form NFD.
* - Remove any Default_Ignorable_Code_Point.
* - Apply a full casefold (C + F).
*/
extern const struct utf8data *utf8nfdi(unsigned int maxage);
extern const struct utf8data *utf8nfdicf(unsigned int maxage);
/*
* Determine the maximum age of any unicode character in the string.
* Returns 0 if only unassigned code points are present.
* Returns -1 if the input is not valid UTF-8.
*/
extern int utf8agemax(const struct utf8data *data, const char *s);
extern int utf8nagemax(const struct utf8data *data, const char *s, size_t len);
/*
* Determine the minimum age of any unicode character in the string.
* Returns 0 if any unassigned code points are present.
* Returns -1 if the input is not valid UTF-8.
*/
extern int utf8agemin(const struct utf8data *data, const char *s);
extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len);
/*
* Determine the length of the normalized from of the string,
* excluding any terminating NULL byte.
* Returns 0 if only ignorable code points are present.
* Returns -1 if the input is not valid UTF-8.
*/
extern ssize_t utf8len(const struct utf8data *data, const char *s);
extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);
/* Needed in struct utf8cursor below. */
#define UTF8HANGULLEAF (12)
/*
* Cursor structure used by the normalizer.
*/
struct utf8cursor {
const struct utf8data *data;
const char *s;
const char *p;
const char *ss;
const char *sp;
unsigned int len;
unsigned int slen;
short int ccc;
short int nccc;
unsigned char hangul[UTF8HANGULLEAF];
};
/*
* Initialize a utf8cursor to normalize a string.
* Returns 0 on success.
* Returns -1 on failure.
*/
extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
const char *s);
extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
const char *s, size_t len);
/*
* Get the next byte in the normalization.
* Returns a value > 0 && < 256 on success.
* Returns 0 when the end of the normalization is reached.
* Returns -1 if the string being normalized is not valid UTF-8.
*/
extern int utf8byte(struct utf8cursor *u8c);
#endif /* UTF8NORM_H */

View File

@ -1963,6 +1963,7 @@ struct super_operations {
#define S_DAX 0 /* Make all the DAX code disappear */
#endif
#define S_ENCRYPTED 16384 /* Encrypted file (using fs/crypto/) */
#define S_CASEFOLD 32768 /* Casefolded file */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@ -2003,6 +2004,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
#define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC)
#define IS_DAX(inode) ((inode)->i_flags & S_DAX)
#define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED)
#define IS_CASEFOLDED(inode) ((inode)->i_flags & S_CASEFOLD)
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)

30
include/linux/unicode.h Normal file
View File

@ -0,0 +1,30 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_UNICODE_H
#define _LINUX_UNICODE_H
#include <linux/init.h>
#include <linux/dcache.h>
struct unicode_map {
const char *charset;
int version;
};
int utf8_validate(const struct unicode_map *um, const struct qstr *str);
int utf8_strncmp(const struct unicode_map *um,
const struct qstr *s1, const struct qstr *s2);
int utf8_strncasecmp(const struct unicode_map *um,
const struct qstr *s1, const struct qstr *s2);
int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen);
int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen);
struct unicode_map *utf8_load(const char *version);
void utf8_unload(struct unicode_map *um);
#endif /* _LINUX_UNICODE_H */