From 9dde5e4f3383c3459a67ab63786ca58d645d6b5e Mon Sep 17 00:00:00 2001 From: Gang He Date: Tue, 22 Mar 2016 14:24:20 -0700 Subject: [PATCH 01/81] ocfs2: export ocfs2_kset for online file check When there are errors in the ocfs2 filesystem, they are usually accompanied by the inode number which caused the error. This inode number would be the input to fixing the file. One of these options could be considered: A file in the sysfs filesystem which would accept inode numbers. This could be used to communicate back what has to be fixed or is fixed. You could write: $# echo "<inode>" > /sys/fs/ocfs2/devname/filecheck/check or $# echo "<inode>" > /sys/fs/ocfs2/devname/filecheck/fix Compared with the second version, I re-designed the filecheck sysfs interfaces: there are three sysfs files (check, fix and set) under the filecheck directory (see above), and sysfs will accept only one argument <inode>. Second, I adjusted some code in the ocfs2_filecheck_repair_inode_block() function according to upstream feedback: we cannot just add the VALID_FL flag back as an inode block fix, so we will not fix this field corruption until we have a complete solution. Compared with the first version, I use strncasecmp instead of two strncmp calls. Second, I updated the source file contribution vendor. This patch (of 4): Export the ocfs2_kset object from the ocfs2_stackglue kernel module; the online file check code will then create the related sysfiles under the ocfs2_kset object. We're exporting this because it's built in ocfs2_stackglue.ko. Signed-off-by: Gang He Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stackglue.c | 3 ++- fs/ocfs2/stackglue.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 5d965e83bd43..13219ed73e1d 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -629,7 +629,8 @@ static struct attribute_group ocfs2_attr_group = { .attrs = ocfs2_attrs, }; -static struct kset *ocfs2_kset; +struct kset *ocfs2_kset; +EXPORT_SYMBOL_GPL(ocfs2_kset); static void ocfs2_sysfs_exit(void) { diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index 66334a30cea8..f2dce10fae54 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,4 +298,6 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); +extern struct kset *ocfs2_kset; + #endif /* STACKGLUE_H */ From a860f6eb4c6a8bb0ca6860d9472f424bad9af9cf Mon Sep 17 00:00:00 2001 From: Gang He Date: Tue, 22 Mar 2016 14:24:24 -0700 Subject: [PATCH 02/81] ocfs2: sysfile interfaces for online file check Implement the online file check sysfile interfaces, e.g. how to create the related sysfile according to the device name, and how to display/handle file check requests from the sysfile.
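[Editor's illustration, not part of the patch: a minimal userspace sketch of how these sysfiles are meant to be driven. The volume name "sdb1" and inode number 39502 are made-up examples; the paths follow the layout described above.]

    #include <stdio.h>

    int main(void)
    {
            char line[128];
            FILE *f;

            /* Queue a check request for inode 39502 on volume sdb1. */
            f = fopen("/sys/fs/ocfs2/sdb1/filecheck/check", "w");
            if (!f)
                    return 1;
            fprintf(f, "39502\n");
            fclose(f);

            /* Read back the result history (INO/DONE/ERROR columns). */
            f = fopen("/sys/fs/ocfs2/sdb1/filecheck/check", "r");
            if (!f)
                    return 1;
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);
            return 0;
    }

The same pattern drives the fix and set files.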
Signed-off-by: Gang He Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/Makefile | 3 +- fs/ocfs2/filecheck.c | 606 +++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/filecheck.h | 49 ++++ fs/ocfs2/inode.h | 3 + 4 files changed, 660 insertions(+), 1 deletion(-) create mode 100644 fs/ocfs2/filecheck.c create mode 100644 fs/ocfs2/filecheck.h diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index ce210d4951a1..e27e6527912b 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -41,7 +41,8 @@ ocfs2-objs := \ quota_local.o \ quota_global.o \ xattr.o \ - acl.o + acl.o \ + filecheck.o ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c new file mode 100644 index 000000000000..2cabbcf2f28e --- /dev/null +++ b/fs/ocfs2/filecheck.c @@ -0,0 +1,606 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * filecheck.c + * + * Code which implements online file check. + * + * Copyright (C) 2016 SuSE. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ocfs2.h" +#include "ocfs2_fs.h" +#include "stackglue.h" +#include "inode.h" + +#include "filecheck.h" + + +/* File check error strings, + * must correspond with error number in header file. 
+ */ +static const char * const ocfs2_filecheck_errs[] = { + "SUCCESS", + "FAILED", + "INPROGRESS", + "READONLY", + "INJBD", + "INVALIDINO", + "BLOCKECC", + "BLOCKNO", + "VALIDFLAG", + "GENERATION", + "UNSUPPORTED" +}; + +static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); +static LIST_HEAD(ocfs2_filecheck_sysfs_list); + +struct ocfs2_filecheck { + struct list_head fc_head; /* File check entry list head */ + spinlock_t fc_lock; + unsigned int fc_max; /* Maximum number of entry in list */ + unsigned int fc_size; /* Current entry count in list */ + unsigned int fc_done; /* Finished entry count in list */ +}; + +struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */ + struct list_head fs_list; + atomic_t fs_count; + struct super_block *fs_sb; + struct kset *fs_devicekset; + struct kset *fs_fcheckkset; + struct ocfs2_filecheck *fs_fcheck; +}; + +#define OCFS2_FILECHECK_MAXSIZE 100 +#define OCFS2_FILECHECK_MINSIZE 10 + +/* File check operation type */ +enum { + OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */ + OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */ + OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */ +}; + +struct ocfs2_filecheck_entry { + struct list_head fe_list; + unsigned long fe_ino; + unsigned int fe_type; + unsigned int fe_done:1; + unsigned int fe_status:31; +}; + +struct ocfs2_filecheck_args { + unsigned int fa_type; + union { + unsigned long fa_ino; + unsigned int fa_len; + }; +}; + +static const char * +ocfs2_filecheck_error(int errno) +{ + if (!errno) + return ocfs2_filecheck_errs[errno]; + + BUG_ON(errno < OCFS2_FILECHECK_ERR_START || + errno > OCFS2_FILECHECK_ERR_END); + return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; +} + +static ssize_t ocfs2_filecheck_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf); +static ssize_t ocfs2_filecheck_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count); +static struct kobj_attribute ocfs2_attr_filecheck_chk = + __ATTR(check, S_IRUSR | S_IWUSR, + ocfs2_filecheck_show, + ocfs2_filecheck_store); +static struct kobj_attribute ocfs2_attr_filecheck_fix = + __ATTR(fix, S_IRUSR | S_IWUSR, + ocfs2_filecheck_show, + ocfs2_filecheck_store); +static struct kobj_attribute ocfs2_attr_filecheck_set = + __ATTR(set, S_IRUSR | S_IWUSR, + ocfs2_filecheck_show, + ocfs2_filecheck_store); + +static int ocfs2_filecheck_sysfs_wait(atomic_t *p) +{ + schedule(); + return 0; +} + +static void +ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) +{ + struct ocfs2_filecheck_entry *p; + + if (!atomic_dec_and_test(&entry->fs_count)) + wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait, + TASK_UNINTERRUPTIBLE); + + spin_lock(&entry->fs_fcheck->fc_lock); + while (!list_empty(&entry->fs_fcheck->fc_head)) { + p = list_first_entry(&entry->fs_fcheck->fc_head, + struct ocfs2_filecheck_entry, fe_list); + list_del(&p->fe_list); + BUG_ON(!p->fe_done); /* To free a undone file check entry */ + kfree(p); + } + spin_unlock(&entry->fs_fcheck->fc_lock); + + kset_unregister(entry->fs_fcheckkset); + kset_unregister(entry->fs_devicekset); + kfree(entry->fs_fcheck); + kfree(entry); +} + +static void +ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry) +{ + spin_lock(&ocfs2_filecheck_sysfs_lock); + list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list); + spin_unlock(&ocfs2_filecheck_sysfs_lock); +} + +static int ocfs2_filecheck_sysfs_del(const char *devname) +{ + struct ocfs2_filecheck_sysfs_entry *p; + + 
spin_lock(&ocfs2_filecheck_sysfs_lock); + list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { + if (!strcmp(p->fs_sb->s_id, devname)) { + list_del(&p->fs_list); + spin_unlock(&ocfs2_filecheck_sysfs_lock); + ocfs2_filecheck_sysfs_free(p); + return 0; + } + } + spin_unlock(&ocfs2_filecheck_sysfs_lock); + return 1; +} + +static void +ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry) +{ + if (atomic_dec_and_test(&entry->fs_count)) + wake_up_atomic_t(&entry->fs_count); +} + +static struct ocfs2_filecheck_sysfs_entry * +ocfs2_filecheck_sysfs_get(const char *devname) +{ + struct ocfs2_filecheck_sysfs_entry *p = NULL; + + spin_lock(&ocfs2_filecheck_sysfs_lock); + list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) { + if (!strcmp(p->fs_sb->s_id, devname)) { + atomic_inc(&p->fs_count); + spin_unlock(&ocfs2_filecheck_sysfs_lock); + return p; + } + } + spin_unlock(&ocfs2_filecheck_sysfs_lock); + return NULL; +} + +int ocfs2_filecheck_create_sysfs(struct super_block *sb) +{ + int ret = 0; + struct kset *device_kset = NULL; + struct kset *fcheck_kset = NULL; + struct ocfs2_filecheck *fcheck = NULL; + struct ocfs2_filecheck_sysfs_entry *entry = NULL; + struct attribute **attrs = NULL; + struct attribute_group attrgp; + + if (!ocfs2_kset) + return -ENOMEM; + + attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS); + if (!attrs) { + ret = -ENOMEM; + goto error; + } else { + attrs[0] = &ocfs2_attr_filecheck_chk.attr; + attrs[1] = &ocfs2_attr_filecheck_fix.attr; + attrs[2] = &ocfs2_attr_filecheck_set.attr; + attrs[3] = NULL; + memset(&attrgp, 0, sizeof(attrgp)); + attrgp.attrs = attrs; + } + + fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS); + if (!fcheck) { + ret = -ENOMEM; + goto error; + } else { + INIT_LIST_HEAD(&fcheck->fc_head); + spin_lock_init(&fcheck->fc_lock); + fcheck->fc_max = OCFS2_FILECHECK_MINSIZE; + fcheck->fc_size = 0; + fcheck->fc_done = 0; + } + + if (strlen(sb->s_id) <= 0) { + mlog(ML_ERROR, + "Cannot get device basename when create filecheck sysfs\n"); + ret = -ENODEV; + goto error; + } + + device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj); + if (!device_kset) { + ret = -ENOMEM; + goto error; + } + + fcheck_kset = kset_create_and_add("filecheck", NULL, + &device_kset->kobj); + if (!fcheck_kset) { + ret = -ENOMEM; + goto error; + } + + ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp); + if (ret) + goto error; + + entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS); + if (!entry) { + ret = -ENOMEM; + goto error; + } else { + atomic_set(&entry->fs_count, 1); + entry->fs_sb = sb; + entry->fs_devicekset = device_kset; + entry->fs_fcheckkset = fcheck_kset; + entry->fs_fcheck = fcheck; + ocfs2_filecheck_sysfs_add(entry); + } + + kfree(attrs); + return 0; + +error: + kfree(attrs); + kfree(entry); + kfree(fcheck); + kset_unregister(fcheck_kset); + kset_unregister(device_kset); + return ret; +} + +int ocfs2_filecheck_remove_sysfs(struct super_block *sb) +{ + return ocfs2_filecheck_sysfs_del(sb->s_id); +} + +static int +ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int count); +static int +ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int len) +{ + int ret; + + if ((len < OCFS2_FILECHECK_MINSIZE) || (len > OCFS2_FILECHECK_MAXSIZE)) + return -EINVAL; + + spin_lock(&ent->fs_fcheck->fc_lock); + if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) { + mlog(ML_ERROR, + "Cannot set online file check maximum entry number " + "to %u 
due to too many pending entries(%u)\n", + len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done); + ret = -EBUSY; + } else { + if (len < ent->fs_fcheck->fc_size) + BUG_ON(!ocfs2_filecheck_erase_entries(ent, + ent->fs_fcheck->fc_size - len)); + + ent->fs_fcheck->fc_max = len; + ret = 0; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + + return ret; +} + +#define OCFS2_FILECHECK_ARGS_LEN 24 +static int +ocfs2_filecheck_args_get_long(const char *buf, size_t count, + unsigned long *val) +{ + char buffer[OCFS2_FILECHECK_ARGS_LEN]; + + memcpy(buffer, buf, count); + buffer[count] = '\0'; + + if (kstrtoul(buffer, 0, val)) + return 1; + + return 0; +} + +static int +ocfs2_filecheck_type_parse(const char *name, unsigned int *type) +{ + if (!strncmp(name, "fix", 4)) + *type = OCFS2_FILECHECK_TYPE_FIX; + else if (!strncmp(name, "check", 6)) + *type = OCFS2_FILECHECK_TYPE_CHK; + else if (!strncmp(name, "set", 4)) + *type = OCFS2_FILECHECK_TYPE_SET; + else + return 1; + + return 0; +} + +static int +ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count, + struct ocfs2_filecheck_args *args) +{ + unsigned long val = 0; + unsigned int type; + + /* too short/long args length */ + if ((count < 1) || (count >= OCFS2_FILECHECK_ARGS_LEN)) + return 1; + + if (ocfs2_filecheck_type_parse(name, &type)) + return 1; + if (ocfs2_filecheck_args_get_long(buf, count, &val)) + return 1; + + if (val <= 0) + return 1; + + args->fa_type = type; + if (type == OCFS2_FILECHECK_TYPE_SET) + args->fa_len = (unsigned int)val; + else + args->fa_ino = val; + + return 0; +} + +static ssize_t ocfs2_filecheck_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + + ssize_t ret = 0, total = 0, remain = PAGE_SIZE; + unsigned int type; + struct ocfs2_filecheck_entry *p; + struct ocfs2_filecheck_sysfs_entry *ent; + + if (ocfs2_filecheck_type_parse(attr->attr.name, &type)) + return -EINVAL; + + ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); + if (!ent) { + mlog(ML_ERROR, + "Cannot get the corresponding entry via device basename %s\n", + kobj->name); + return -ENODEV; + } + + if (type == OCFS2_FILECHECK_TYPE_SET) { + spin_lock(&ent->fs_fcheck->fc_lock); + total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max); + spin_unlock(&ent->fs_fcheck->fc_lock); + goto exit; + } + + ret = snprintf(buf, remain, "INO\t\tDONE\tERROR\n"); + total += ret; + remain -= ret; + spin_lock(&ent->fs_fcheck->fc_lock); + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (p->fe_type != type) + continue; + + ret = snprintf(buf + total, remain, "%lu\t\t%u\t%s\n", + p->fe_ino, p->fe_done, + ocfs2_filecheck_error(p->fe_status)); + if (ret < 0) { + total = ret; + break; + } + if (ret == remain) { + /* snprintf() didn't fit */ + total = -E2BIG; + break; + } + total += ret; + remain -= ret; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + +exit: + ocfs2_filecheck_sysfs_put(ent); + return total; +} + +static int +ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent) +{ + struct ocfs2_filecheck_entry *p; + + list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) { + if (p->fe_done) { + list_del(&p->fe_list); + kfree(p); + ent->fs_fcheck->fc_size--; + ent->fs_fcheck->fc_done--; + return 1; + } + } + + return 0; +} + +static int +ocfs2_filecheck_erase_entries(struct ocfs2_filecheck_sysfs_entry *ent, + unsigned int count) +{ + unsigned int i = 0; + unsigned int ret = 0; + + while (i++ < count) { + if (ocfs2_filecheck_erase_entry(ent)) + ret++; + else + break; + } + + return (ret == count ? 
1 : 0); +} + +static void +ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent, + struct ocfs2_filecheck_entry *entry) +{ + entry->fe_done = 1; + spin_lock(&ent->fs_fcheck->fc_lock); + ent->fs_fcheck->fc_done++; + spin_unlock(&ent->fs_fcheck->fc_lock); +} + +static unsigned int +ocfs2_filecheck_handle(struct super_block *sb, + unsigned long ino, unsigned int flags) +{ + unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS; + struct inode *inode = NULL; + int rc; + + inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0); + if (IS_ERR(inode)) { + rc = (int)(-(long)inode); + if (rc >= OCFS2_FILECHECK_ERR_START && + rc < OCFS2_FILECHECK_ERR_END) + ret = rc; + else + ret = OCFS2_FILECHECK_ERR_FAILED; + } else + iput(inode); + + return ret; +} + +static void +ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, + struct ocfs2_filecheck_entry *entry) +{ + if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK) + entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK); + else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX) + entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, + entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX); + else + entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED; + + ocfs2_filecheck_done_entry(ent, entry); +} + +static ssize_t ocfs2_filecheck_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct ocfs2_filecheck_args args; + struct ocfs2_filecheck_entry *entry; + struct ocfs2_filecheck_sysfs_entry *ent; + ssize_t ret = 0; + + if (count == 0) + return count; + + if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) { + mlog(ML_ERROR, "Invalid arguments for online file check\n"); + return -EINVAL; + } + + ent = ocfs2_filecheck_sysfs_get(kobj->parent->name); + if (!ent) { + mlog(ML_ERROR, + "Cannot get the corresponding entry via device basename %s\n", + kobj->parent->name); + return -ENODEV; + } + + if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) { + ret = ocfs2_filecheck_adjust_max(ent, args.fa_len); + goto exit; + } + + entry = kmalloc(sizeof(struct ocfs2_filecheck_entry), GFP_NOFS); + if (!entry) { + ret = -ENOMEM; + goto exit; + } + + spin_lock(&ent->fs_fcheck->fc_lock); + if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + (ent->fs_fcheck->fc_done == 0)) { + mlog(ML_ERROR, + "Cannot do more file check " + "since file check queue(%u) is full now\n", + ent->fs_fcheck->fc_max); + ret = -EBUSY; + kfree(entry); + } else { + if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && + (ent->fs_fcheck->fc_done > 0)) { + /* Delete the oldest entry which was done, + * make sure the entry size in list does + * not exceed maximum value + */ + BUG_ON(!ocfs2_filecheck_erase_entry(ent)); + } + + entry->fe_ino = args.fa_ino; + entry->fe_type = args.fa_type; + entry->fe_done = 0; + entry->fe_status = OCFS2_FILECHECK_ERR_INPROGRESS; + list_add_tail(&entry->fe_list, &ent->fs_fcheck->fc_head); + ent->fs_fcheck->fc_size++; + } + spin_unlock(&ent->fs_fcheck->fc_lock); + + if (!ret) + ocfs2_filecheck_handle_entry(ent, entry); + +exit: + ocfs2_filecheck_sysfs_put(ent); + return (!ret ? count : ret); +} diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h new file mode 100644 index 000000000000..e5cd002a2c09 --- /dev/null +++ b/fs/ocfs2/filecheck.h @@ -0,0 +1,49 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * filecheck.h + * + * Online file check. + * + * Copyright (C) 2016 SuSE. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + + +#ifndef FILECHECK_H +#define FILECHECK_H + +#include +#include + + +/* File check errno */ +enum { + OCFS2_FILECHECK_ERR_SUCCESS = 0, /* Success */ + OCFS2_FILECHECK_ERR_FAILED = 1000, /* Other failure */ + OCFS2_FILECHECK_ERR_INPROGRESS, /* In progress */ + OCFS2_FILECHECK_ERR_READONLY, /* Read only */ + OCFS2_FILECHECK_ERR_INJBD, /* Buffer in jbd */ + OCFS2_FILECHECK_ERR_INVALIDINO, /* Invalid ino */ + OCFS2_FILECHECK_ERR_BLOCKECC, /* Block ecc */ + OCFS2_FILECHECK_ERR_BLOCKNO, /* Block number */ + OCFS2_FILECHECK_ERR_VALIDFLAG, /* Inode valid flag */ + OCFS2_FILECHECK_ERR_GENERATION, /* Inode generation */ + OCFS2_FILECHECK_ERR_UNSUPPORTED /* Unsupported */ +}; + +#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED +#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED + +int ocfs2_filecheck_create_sysfs(struct super_block *sb); +int ocfs2_filecheck_remove_sysfs(struct super_block *sb); + +#endif /* FILECHECK_H */ diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index aac8b86f312e..01635e016b3e 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -139,6 +139,9 @@ int ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ #define OCFS2_FI_FLAG_SYSFILE 0x1 #define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x2 +#define OCFS2_FI_FLAG_FILECHECK_CHK 0x4 +#define OCFS2_FI_FLAG_FILECHECK_FIX 0x8 + struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, int sysfile_type); From a849d46816fe9e11d59aae78ea95c54f640b1904 Mon Sep 17 00:00:00 2001 From: Gang He Date: Tue, 22 Mar 2016 14:24:27 -0700 Subject: [PATCH 03/81] ocfs2: create/remove sysfile for online file check Create the online file check sysfile when ocfs2 mounts; remove the related sysfile when ocfs2 unmounts.
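[Editor's sketch of the lifecycle pairing this patch establishes, simplified with error handling omitted; the *_sketch function names are invented, and only the two ocfs2_filecheck_*_sysfs() calls come from this series.]

    static int fill_super_sketch(struct super_block *sb)
    {
            /* ... mount work, almost sure of being successful ... */
            ocfs2_filecheck_create_sysfs(sb); /* /sys/fs/ocfs2/<devname>/filecheck */
            return 0;
    }

    static void put_super_sketch(struct super_block *sb)
    {
            ocfs2_dismount_volume(sb, 0);
            ocfs2_filecheck_remove_sysfs(sb); /* pairs with the create above */
    }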
Signed-off-by: Gang He Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 302854ee0985..ccc9386c42c5 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -74,6 +74,7 @@ #include "suballoc.h" #include "buffer_head_io.h" +#include "filecheck.h" static struct kmem_cache *ocfs2_inode_cachep; struct kmem_cache *ocfs2_dquot_cachep; @@ -1205,6 +1206,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) /* Start this when the mount is almost sure of being successful */ ocfs2_orphan_scan_start(osb); + /* Create filecheck sysfile /sys/fs/ocfs2//filecheck */ + ocfs2_filecheck_create_sysfs(sb); + return status; read_super_error: @@ -1668,6 +1672,7 @@ static void ocfs2_put_super(struct super_block *sb) ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); + ocfs2_filecheck_remove_sysfs(sb); } static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) From d56a8f32e4c662509ce50a37e78fa66c777977d3 Mon Sep 17 00:00:00 2001 From: Gang He Date: Tue, 22 Mar 2016 14:24:30 -0700 Subject: [PATCH 04/81] ocfs2: check/fix inode block for online file check Implement online check or fix inode block during reading a inode block to memory. Signed-off-by: Gang He Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 225 +++++++++++++++++++++++++++++++++++++++-- fs/ocfs2/ocfs2_trace.h | 2 + 2 files changed, 218 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 36294446d960..ba495beff1c2 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -53,6 +53,7 @@ #include "xattr.h" #include "refcounttree.h" #include "ocfs2_trace.h" +#include "filecheck.h" #include "buffer_head_io.h" @@ -74,6 +75,14 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh); +static int ocfs2_filecheck_read_inode_block_full(struct inode *inode, + struct buffer_head **bh, + int flags, int type); +static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, + struct buffer_head *bh); +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, + struct buffer_head *bh); + void ocfs2_set_inode_flags(struct inode *inode) { unsigned int flags = OCFS2_I(inode)->ip_attr; @@ -127,6 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { + int rc = 0; struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; @@ -161,12 +171,17 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, } trace_ocfs2_iget5_locked(inode->i_state); if (inode->i_state & I_NEW) { - ocfs2_read_locked_inode(inode, &args); + rc = ocfs2_read_locked_inode(inode, &args); unlock_new_inode(inode); } if (is_bad_inode(inode)) { iput(inode); - inode = ERR_PTR(-ESTALE); + if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) || + (flags & OCFS2_FI_FLAG_FILECHECK_FIX)) + /* Return OCFS2_FILECHECK_ERR_XXX related errno */ + inode = ERR_PTR(rc); + else + inode = ERR_PTR(-ESTALE); goto bail; } @@ -410,7 +425,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, struct ocfs2_super *osb; struct ocfs2_dinode *fe; struct buffer_head *bh = NULL; - int status, can_lock; + int status, can_lock, 
lock_level = 0; u32 generation = 0; status = -EINVAL; @@ -478,7 +493,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, mlog_errno(status); return status; } - status = ocfs2_inode_lock(inode, NULL, 0); + status = ocfs2_inode_lock(inode, NULL, lock_level); if (status) { make_bad_inode(inode); mlog_errno(status); @@ -495,16 +510,32 @@ static int ocfs2_read_locked_inode(struct inode *inode, } if (can_lock) { - status = ocfs2_read_inode_block_full(inode, &bh, - OCFS2_BH_IGNORE_CACHE); + if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) + status = ocfs2_filecheck_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE, 0); + else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) + status = ocfs2_filecheck_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE, 1); + else + status = ocfs2_read_inode_block_full(inode, + &bh, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); /* * If buffer is in jbd, then its checksum may not have been * computed as yet. */ - if (!status && !buffer_jbd(bh)) - status = ocfs2_validate_inode_block(osb->sb, bh); + if (!status && !buffer_jbd(bh)) { + if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) + status = ocfs2_filecheck_validate_inode_block( + osb->sb, bh); + else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) + status = ocfs2_filecheck_repair_inode_block( + osb->sb, bh); + else + status = ocfs2_validate_inode_block( + osb->sb, bh); + } } if (status < 0) { mlog_errno(status); @@ -532,11 +563,24 @@ static int ocfs2_read_locked_inode(struct inode *inode, BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); + if (buffer_dirty(bh) && !buffer_jbd(bh)) { + if (can_lock) { + ocfs2_inode_unlock(inode, lock_level); + lock_level = 1; + ocfs2_inode_lock(inode, NULL, lock_level); + } + status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + status = 0; bail: if (can_lock) - ocfs2_inode_unlock(inode, 0); + ocfs2_inode_unlock(inode, lock_level); if (status < 0) make_bad_inode(inode); @@ -1397,6 +1441,169 @@ bail: return rc; } +static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + trace_ocfs2_filecheck_validate_inode_block( + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * Call ocfs2_validate_meta_ecc() first since it has ecc repair + * function, but we should not return error immediately when ecc + * validation fails, because the reason is quite likely the invalid + * inode number inputed. 
+ */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); + if (rc) { + mlog(ML_ERROR, + "Filecheck: checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + rc = -OCFS2_FILECHECK_ERR_BLOCKECC; + } + + if (!OCFS2_IS_VALID_DINODE(di)) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, di->i_signature); + rc = -OCFS2_FILECHECK_ERR_INVALIDINO; + goto bail; + } else if (rc) + goto bail; + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + rc = -OCFS2_FILECHECK_ERR_BLOCKNO; + goto bail; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL " + "not set\n", + (unsigned long long)bh->b_blocknr); + rc = -OCFS2_FILECHECK_ERR_VALIDFLAG; + goto bail; + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + mlog(ML_ERROR, + "Filecheck: invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + rc = -OCFS2_FILECHECK_ERR_GENERATION; + goto bail; + } + +bail: + return rc; +} + +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int changed = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + if (!ocfs2_filecheck_validate_inode_block(sb, bh)) + return 0; + + trace_ocfs2_filecheck_repair_inode_block( + (unsigned long long)bh->b_blocknr); + + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { + mlog(ML_ERROR, + "Filecheck: cannot repair dinode #%llu " + "on readonly filesystem\n", + (unsigned long long)bh->b_blocknr); + return -OCFS2_FILECHECK_ERR_READONLY; + } + + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "Filecheck: cannot repair dinode #%llu, " + "its buffer is in jbd\n", + (unsigned long long)bh->b_blocknr); + return -OCFS2_FILECHECK_ERR_INJBD; + } + + if (!OCFS2_IS_VALID_DINODE(di)) { + /* Cannot fix invalid inode block */ + return -OCFS2_FILECHECK_ERR_INVALIDINO; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + /* Cannot just add VALID_FL flag back as a fix, + * need more things to check here. 
+ */ + return -OCFS2_FILECHECK_ERR_VALIDFLAG; + } + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + di->i_blkno = cpu_to_le64(bh->b_blocknr); + changed = 1; + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + changed = 1; + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: fs_generation to %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + } + + if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) { + ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check); + mark_buffer_dirty(bh); + mlog(ML_ERROR, + "Filecheck: reset dinode #%llu: compute meta ecc\n", + (unsigned long long)bh->b_blocknr); + } + + return 0; +} + +static int +ocfs2_filecheck_read_inode_block_full(struct inode *inode, + struct buffer_head **bh, + int flags, int type) +{ + int rc; + struct buffer_head *tmp = *bh; + + if (!type) /* Check inode block */ + rc = ocfs2_read_blocks(INODE_CACHE(inode), + OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, + ocfs2_filecheck_validate_inode_block); + else /* Repair inode block */ + rc = ocfs2_read_blocks(INODE_CACHE(inode), + OCFS2_I(inode)->ip_blkno, + 1, &tmp, flags, + ocfs2_filecheck_repair_inode_block); + + /* If ocfs2_read_blocks() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, int flags) { diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index a52a2dbc064e..24b7e7f591dc 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1540,6 +1540,8 @@ DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode); DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state); DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block); +DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_validate_inode_block); +DEFINE_OCFS2_ULL_EVENT(ocfs2_filecheck_repair_inode_block); TRACE_EVENT(ocfs2_inode_is_valid_to_delete, TP_PROTO(void *task, void *dc_task, unsigned long long ino, From d750c42ac265c00df3f0963a240a4440fa073603 Mon Sep 17 00:00:00 2001 From: Gang He Date: Tue, 22 Mar 2016 14:24:33 -0700 Subject: [PATCH 05/81] ocfs2: add feature document for online file check This document will describe the OCFS2 online file check feature. OCFS2 is often used in high-availability systems. However, OCFS2 usually converts the filesystem to read-only when it encounters an error. This may not be necessary, since turning the filesystem read-only would affect other running processes as well, decreasing availability. Then, a mount option (errors=continue) is introduced, which would return the -EIO errno to the calling process and terminate further processing so that the filesystem is not corrupted further. The filesystem is not converted to read-only, and the problematic file's inode number is reported in the kernel log. The user can try to check/fix this file via the online filecheck feature.
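[Editor's illustration of the errors=continue behavior described above; the mount point and file name are invented. With the option set, I/O against a corrupted inode fails with EIO while the rest of the filesystem stays usable:]

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[4096];
            int fd = open("/mnt/ocfs2/damaged-file", O_RDONLY);

            if (fd < 0)
                    return 1;
            if (read(fd, buf, sizeof(buf)) < 0 && errno == EIO)
                    fprintf(stderr, "EIO: find the inode number in the "
                            "kernel log, then use filecheck\n");
            close(fd);
            return 0;
    }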
Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../filesystems/ocfs2-online-filecheck.txt | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 Documentation/filesystems/ocfs2-online-filecheck.txt diff --git a/Documentation/filesystems/ocfs2-online-filecheck.txt b/Documentation/filesystems/ocfs2-online-filecheck.txt new file mode 100644 index 000000000000..1ab07860430d --- /dev/null +++ b/Documentation/filesystems/ocfs2-online-filecheck.txt @@ -0,0 +1,94 @@ + OCFS2 online file check + ----------------------- + +This document will describe the OCFS2 online file check feature. + +Introduction +============ +OCFS2 is often used in high-availability systems. However, OCFS2 usually +converts the filesystem to read-only when it encounters an error. This may not be +necessary, since turning the filesystem read-only would affect other running +processes as well, decreasing availability. +Then, a mount option (errors=continue) is introduced, which would return the +-EIO errno to the calling process and terminate further processing so that the +filesystem is not corrupted further. The filesystem is not converted to +read-only, and the problematic file's inode number is reported in the kernel +log. The user can try to check/fix this file via the online filecheck feature. + +Scope +===== +This effort is to check/fix small issues which may hinder day-to-day operations +of a cluster filesystem by turning the filesystem read-only. The scope of +checking/fixing is at the file level, initially for regular files and eventually +for all files (including system files) of the filesystem. + +In case a directory-to-file link is incorrect, the directory inode is +reported as erroneous. + +This feature is not suited for extravagant checks which involve dependencies on +other components of the filesystem, such as, but not limited to, checking if the +bits for file blocks in the allocation have been set. In case of such an error, +the offline fsck should/would be recommended. + +Finally, such an operation/feature should not be automated lest the filesystem +may end up with more damage than before the repair attempt. So, this has to +be performed using user interaction and consent. + +User interface +============== +When there are errors in the OCFS2 filesystem, they are usually accompanied +by the inode number which caused the error. This inode number would be the +input to check/fix the file. + +There is a sysfs directory for each OCFS2 file system mounting: + + /sys/fs/ocfs2/<devname>/filecheck + +Here, <devname> indicates the name of the OCFS2 volume device which has already been +mounted. The file above would accept inode numbers. This could be used to +communicate with kernel space, telling which file (inode number) will be checked or +fixed. Currently, three operations are supported, which include checking an +inode, fixing an inode and setting the size of the result record history. + +1. If you want to know what error exactly happened before fixing, do + + # echo "<inode>" > /sys/fs/ocfs2/<devname>/filecheck/check + # cat /sys/fs/ocfs2/<devname>/filecheck/check + +The output is like this: + INO DONE ERROR +39502 1 GENERATION + +<INO> lists the inode numbers. +<DONE> indicates whether the operation has been finished. +<ERROR> says what kind of error was found. For the detailed error numbers, +please refer to the file linux/fs/ocfs2/filecheck.h. + +2.
If you determine to fix this inode, do + + # echo "<inode>" > /sys/fs/ocfs2/<devname>/filecheck/fix + # cat /sys/fs/ocfs2/<devname>/filecheck/fix + +The output is like this: + INO DONE ERROR +39502 1 SUCCESS + +This time, the <ERROR> column indicates whether this fix is successful or not. + +3. The record cache is used to store the history of check/fix results. Its +default size is 10, and it can be adjusted within the range of 10 ~ 100. You can +adjust the size like this: + + # echo "<size>" > /sys/fs/ocfs2/<devname>/filecheck/set + +Fixing stuff +============ +On receiving the inode, the filesystem would read the inode and the +file metadata. In case of errors, the filesystem would fix the errors +and report the problems it fixed in the kernel log. As a precautionary measure, +the inode must first be checked for errors before performing a final fix. + +The inode and the result history will be maintained temporarily in a +small linked list buffer which would contain the last (N) inodes +fixed/checked; the detailed errors which were fixed/checked are printed in the +kernel log. From 3f2b1a04f44933f2d6fe0a9bf9a9c1c452df23f7 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 22 Mar 2016 14:24:36 -0700 Subject: [PATCH 06/81] zram: revive swap_slot_free_notify Commit b430e9d1c6d4 ("remove compressed copy from zram in-memory") applied the swap_slot_free_notify call in *end_swap_bio_read* to remove memory duplicated between zram and main memory. However, with the introduction of rw_page in zram: 8c7f01025f7b ("zram: implement rw_page operation of zram"), it became void because rw_page doesn't need a bio. Memory footprint is really important in embedded platforms which have small memory (for example, 512M), because LMK or some similar memory management module could start to kill processes if the memory footprint exceeds some threshold. This patch restores the function for rw_page, thereby eliminating this duplication. Signed-off-by: Minchan Kim Cc: Sergey Senozhatsky Cc: karam.lee Cc: Cc: Chan Jeong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 93 ++++++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 43 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index ff74e512f029..18aac7819cc9 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -66,6 +66,54 @@ void end_swap_bio_write(struct bio *bio) bio_put(bio); } +static void swap_slot_free_notify(struct page *page) +{ + struct swap_info_struct *sis; + struct gendisk *disk; + + /* + * There is no guarantee that the page is in swap cache - the software + * suspend code (at least) uses end_swap_bio_read() against a non- + * swapcache page. So we must check PG_swapcache before proceeding with + * this optimization. + */ + if (unlikely(!PageSwapCache(page))) + return; + + sis = page_swap_info(page); + if (!(sis->flags & SWP_BLKDEV)) + return; + + /* + * The swap subsystem performs lazy swap slot freeing, + * expecting that the page will be swapped out again. + * So we can avoid an unnecessary write if the page + * isn't redirtied. + * This is good for real swap storage because we can + * reduce unnecessary I/O and enhance wear-leveling + * if an SSD is used as the swap device. + * But if in-memory swap device (eg zram) is used, + * this causes a duplicated copy between uncompressed + * data in VM-owned memory and compressed data in + * zram-owned memory. So let's free zram-owned memory + * and make the VM-owned decompressed page *dirty*, + * so the page should be swapped out somewhere again if + * we again wish to reclaim it.
+ */ + disk = sis->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) { + swp_entry_t entry; + unsigned long offset; + + entry.val = page_private(page); + offset = swp_offset(entry); + + SetPageDirty(page); + disk->fops->swap_slot_free_notify(sis->bdev, + offset); + } +} + static void end_swap_bio_read(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; @@ -81,49 +129,7 @@ static void end_swap_bio_read(struct bio *bio) } SetPageUptodate(page); - - /* - * There is no guarantee that the page is in swap cache - the software - * suspend code (at least) uses end_swap_bio_read() against a non- - * swapcache page. So we must check PG_swapcache before proceeding with - * this optimization. - */ - if (likely(PageSwapCache(page))) { - struct swap_info_struct *sis; - - sis = page_swap_info(page); - if (sis->flags & SWP_BLKDEV) { - /* - * The swap subsystem performs lazy swap slot freeing, - * expecting that the page will be swapped out again. - * So we can avoid an unnecessary write if the page - * isn't redirtied. - * This is good for real swap storage because we can - * reduce unnecessary I/O and enhance wear-leveling - * if an SSD is used as the as swap device. - * But if in-memory swap device (eg zram) is used, - * this causes a duplicated copy between uncompressed - * data in VM-owned memory and compressed data in - * zram-owned memory. So let's free zram-owned memory - * and make the VM-owned decompressed page *dirty*, - * so the page should be swapped out somewhere again if - * we again wish to reclaim it. - */ - struct gendisk *disk = sis->bdev->bd_disk; - if (disk->fops->swap_slot_free_notify) { - swp_entry_t entry; - unsigned long offset; - - entry.val = page_private(page); - offset = swp_offset(entry); - - SetPageDirty(page); - disk->fops->swap_slot_free_notify(sis->bdev, - offset); - } - } - } - + swap_slot_free_notify(page); out: unlock_page(page); bio_put(bio); @@ -347,6 +353,7 @@ int swap_readpage(struct page *page) ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); if (!ret) { + swap_slot_free_notify(page); count_vm_event(PSWPIN); return 0; } From b4aa14a63cb3194d8eab355fcee194838ab09121 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 22 Mar 2016 14:24:39 -0700 Subject: [PATCH 07/81] kernel/hung_task.c: use timeout diff when timeout is updated When new timeout is written to /proc/sys/kernel/hung_task_timeout_secs, khungtaskd is interrupted and again sleeps for full timeout duration. This means that hang task will not be checked if new timeout is written periodically within old timeout duration and/or checking of hang task will be delayed for up to previous timeout duration. Fix this by remembering last time khungtaskd checked hang task. This change will allow other watchdog tasks (if any) to share khungtaskd by sleeping for minimal timeout diff of all watchdog tasks. Doing more watchdog tasks from khungtaskd will reduce the possibility of printk() collisions by multiple watchdog threads. 
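[Editor's illustration of the arithmetic, as a standalone userspace sketch; HZ is assumed to be 1000 for the example, and the helper mirrors the hung_timeout_jiffies() added in the diff below.]

    #include <stdio.h>

    #define HZ 1000L
    #define MAX_SCHEDULE_TIMEOUT 0x7fffffffL

    /* Remaining sleep, measured from the last completed check. */
    static long remaining(long last_checked, long now, long timeout_secs)
    {
            return timeout_secs ? last_checked - now + timeout_secs * HZ :
                    MAX_SCHEDULE_TIMEOUT;
    }

    int main(void)
    {
            /* 100 of 120 seconds had already elapsed when the sysctl was
             * reread: the watchdog sleeps only the remaining 20 seconds
             * instead of restarting a full 120-second wait. */
            printf("%ld jiffies\n", remaining(0, 100 * HZ, 120)); /* 20000 */
            return 0;
    }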
Signed-off-by: Tetsuo Handa Cc: Oleg Nesterov Cc: Aaron Tomlin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index e0f90c2b57aa..d234022805dc 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -185,10 +185,12 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); } -static unsigned long timeout_jiffies(unsigned long timeout) +static long hung_timeout_jiffies(unsigned long last_checked, + unsigned long timeout) { /* timeout of 0 will disable the watchdog */ - return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT; + return timeout ? last_checked - jiffies + timeout * HZ : + MAX_SCHEDULE_TIMEOUT; } /* @@ -224,18 +226,21 @@ EXPORT_SYMBOL_GPL(reset_hung_task_detector); */ static int watchdog(void *dummy) { + unsigned long hung_last_checked = jiffies; + set_user_nice(current, 0); for ( ; ; ) { unsigned long timeout = sysctl_hung_task_timeout_secs; + long t = hung_timeout_jiffies(hung_last_checked, timeout); - while (schedule_timeout_interruptible(timeout_jiffies(timeout))) - timeout = sysctl_hung_task_timeout_secs; - - if (atomic_xchg(&reset_hung_task, 0)) + if (t <= 0) { + if (!atomic_xchg(&reset_hung_task, 0)) + check_hung_uninterruptible_tasks(timeout); + hung_last_checked = jiffies; continue; - - check_hung_uninterruptible_tasks(timeout); + } + schedule_timeout_interruptible(t); } return 0; From 5180e3e24fd3e8e7ea46fbe21e10f5ea3fb1edaa Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:24:43 -0700 Subject: [PATCH 08/81] compat: add in_compat_syscall to ask whether we're in a compat syscall A lot of code currently abuses is_compat_task to determine this. Signed-off-by: Andy Lutomirski Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Clemens Ladisch Cc: David Airlie Cc: David Herrmann Cc: David Miller Cc: Dmitry Torokhov Cc: Eric Paris Cc: Herbert Xu Cc: Ingo Molnar Acked-by: Jiri Kosina Cc: Matt Fleming Cc: Neil Horman Cc: Oded Gabbay Cc: Oleg Drokin Cc: Oleg Nesterov Cc: Paul Moore Cc: Sam Ravnborg Cc: Steffen Klassert Cc: Thomas Gleixner Cc: Vlad Yasevich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/linux/compat.h b/include/linux/compat.h index fe4ccd0c748a..f964ef79e0ad 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -5,6 +5,8 @@ * syscall compatibility layer. */ +#include + #ifdef CONFIG_COMPAT #include @@ -719,9 +721,22 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32, int, const char __user *); + +/* + * For most but not all architectures, "am I in a compat syscall?" and + * "am I a compat task?" are the same question. For architectures on which + * they aren't the same question, arch code can override in_compat_syscall. 
+ */ + +#ifndef in_compat_syscall +static inline bool in_compat_syscall(void) { return is_compat_task(); } +#endif + #else #define is_compat_task() (0) +static inline bool in_compat_syscall(void) { return false; } #endif /* CONFIG_COMPAT */ + #endif /* _LINUX_COMPAT_H */ From 069923d87e97a26d5f4ee2d53829f2d3cd05294b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:24:46 -0700 Subject: [PATCH 09/81] sparc/compat: provide an accurate in_compat_syscall implementation On sparc64 compat-enabled kernels, any task can make 32-bit and 64-bit syscalls. is_compat_task returns true in 32-bit tasks, which does not necessarily imply that the current syscall is 32-bit. Provide an in_compat_syscall implementation that checks whether the current syscall is compat. As far as I know, sparc is the only architecture on which is_compat_task checks the compat status of the task and on which the compat status of a syscall can differ from the compat status of the task. On x86, is_compat_task checks the syscall type, not the task type. [akpm@linux-foundation.org: add comment, per Sam] [akpm@linux-foundation.org: update comment, per Andy] Signed-off-by: Andy Lutomirski Acked-by: David S. Miller Cc: Sam Ravnborg Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/compat.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index 830502fe62b4..6f251c4d613e 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -307,4 +307,11 @@ static inline int is_compat_task(void) return test_thread_flag(TIF_32BIT); } +static inline bool in_compat_syscall(void) +{ + /* Vector 0x110 is LINUX_32BIT_SYSCALL_TRAP */ + return pt_regs_trap_type(current_pt_regs()) == 0x110; +} +#define in_compat_syscall in_compat_syscall + #endif /* _ASM_SPARC64_COMPAT_H */ From 203f79078fea8525d0b0a13f2e13534b7ff3aa97 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:24:49 -0700 Subject: [PATCH 10/81] sparc/syscall: fix syscall_get_arch Sparc's syscall_get_arch was buggy: it returned the task arch, not the syscall arch. This could confuse seccomp and audit. I don't think this is as bad for seccomp as it looks: sparc's 32-bit and 64-bit syscalls are numbered the same. Signed-off-by: Andy Lutomirski Cc: David S. Miller Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/syscall.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/sparc/include/asm/syscall.h b/arch/sparc/include/asm/syscall.h index 49f71fd5b56e..1757cd6c521b 100644 --- a/arch/sparc/include/asm/syscall.h +++ b/arch/sparc/include/asm/syscall.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -128,7 +129,13 @@ static inline void syscall_set_arguments(struct task_struct *task, static inline int syscall_get_arch(void) { - return is_32bit_task() ? AUDIT_ARCH_SPARC : AUDIT_ARCH_SPARC64; +#if defined(CONFIG_SPARC64) && defined(CONFIG_COMPAT) + return in_compat_syscall() ? 
AUDIT_ARCH_SPARC : AUDIT_ARCH_SPARC64; +#elif defined(CONFIG_SPARC64) + return AUDIT_ARCH_SPARC64; +#else + return AUDIT_ARCH_SPARC; +#endif } #endif /* __ASM_SPARC_SYSCALL_H */ From 5c38065e021bc76f97fc08997f6d7fc7ea3fb7a7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:24:52 -0700 Subject: [PATCH 11/81] seccomp: check in_compat_syscall, not is_compat_task, in strict mode Seccomp wants to know the syscall bitness, not the caller task bitness, when it selects the syscall whitelist. As far as I know, this makes no difference on any architecture, so it's not a security problem. (It generates identical code everywhere except sparc, and, on sparc, the syscall numbering is the same for both ABIs.) Signed-off-by: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/seccomp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 15a1795bbba1..e1e5a354854e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -395,7 +395,7 @@ seccomp_prepare_user_filter(const char __user *user_filter) struct seccomp_filter *filter = ERR_PTR(-EFAULT); #ifdef CONFIG_COMPAT - if (is_compat_task()) { + if (in_compat_syscall()) { struct compat_sock_fprog fprog32; if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) goto out; @@ -529,7 +529,7 @@ static void __secure_computing_strict(int this_syscall) { int *syscall_whitelist = mode1_syscalls; #ifdef CONFIG_COMPAT - if (is_compat_task()) + if (in_compat_syscall()) syscall_whitelist = mode1_syscalls_32; #endif do { From 5c465217a930d4bbb7dd35a56bde1eea5bbd14d6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:24:55 -0700 Subject: [PATCH 12/81] ptrace: in PEEK_SIGINFO, check syscall bitness, not task bitness Users of the 32-bit ptrace() ABI expect the full 32-bit ABI. siginfo translation should check ptrace() ABI, not caller task ABI. This is an ABI change on SPARC. Let's hope that no one relied on the old buggy ABI. Signed-off-by: Andy Lutomirski Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 2341efe7fe02..c79b91d09e35 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -681,7 +681,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, break; #ifdef CONFIG_COMPAT - if (unlikely(is_compat_task())) { + if (unlikely(in_compat_syscall())) { compat_siginfo_t __user *uinfo = compat_ptr(data); if (copy_siginfo_to_user32(uinfo, &info) || From efbc0fbf34927bd4d3d49b50b370990be82809c2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:24:58 -0700 Subject: [PATCH 13/81] auditsc: for seccomp events, log syscall compat state using in_compat_syscall Except on SPARC, this is what the code always did. SPARC compat seccomp was buggy, although the impact of the bug was limited because SPARC 32-bit and 64-bit syscall numbers are the same. 
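[Editor's note with a sketch: the distinction this series relies on. is_compat_task() asks "is current a 32-bit task?", while in_compat_syscall() asks "is the current syscall 32-bit?". On sparc64 a 64-bit task can enter through the 32-bit trap, so the two can disagree; records that describe a syscall, such as audit logs, must use the latter:]

    /* Sketch only: report the ABI of the syscall being audited, not the
     * ABI of the task issuing it. */
    static int audit_syscall_arch_sketch(void)
    {
            return in_compat_syscall() ? AUDIT_ARCH_SPARC : AUDIT_ARCH_SPARC64;
    }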
Signed-off-by: Andy Lutomirski Cc: Paul Moore Cc: Eric Paris Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 195ffaee50b9..7d0e3cf8abe1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2412,8 +2412,8 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", - signr, syscall_get_arch(), syscall, is_compat_task(), - KSTK_EIP(current), code); + signr, syscall_get_arch(), syscall, + in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } From 6d8bedff926b7d31fa68e7f81c2f13fdc29fe157 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:25:01 -0700 Subject: [PATCH 14/81] staging/lustre: switch from is_compat_task to in_compat_syscall AFAICT, lustre is trying to determine syscall bitness. Use the new accessor. Signed-off-by: Andy Lutomirski Cc: Oleg Drokin Cc: Andreas Dilger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/lustre/lustre/llite/llite_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h index 973f5cdec192..3e1572cb457b 100644 --- a/drivers/staging/lustre/lustre/llite/llite_internal.h +++ b/drivers/staging/lustre/lustre/llite/llite_internal.h @@ -657,7 +657,7 @@ static inline int ll_need_32bit_api(struct ll_sb_info *sbi) #if BITS_PER_LONG == 32 return 1; #elif defined(CONFIG_COMPAT) - return unlikely(is_compat_task() || (sbi->ll_flags & LL_SBI_32BIT_API)); + return unlikely(in_compat_syscall() || (sbi->ll_flags & LL_SBI_32BIT_API)); #else return unlikely(sbi->ll_flags & LL_SBI_32BIT_API); #endif From 121cef8f17d80e97838ea2f88715417e675e4403 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:25:04 -0700 Subject: [PATCH 15/81] ext4: in ext4_dir_llseek, check syscall bitness directly ext4 treats directory offsets differently for 32-bit and 64-bit callers. Check the caller type using in_compat_syscall, not is_compat_task. This changes behavior on SPARC slightly. Signed-off-by: Andy Lutomirski Cc: "Theodore Ts'o" Cc: Andreas Dilger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 33f5e2a50cf8..50ba27cbed03 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -285,7 +285,7 @@ errout: static inline int is_32bit_api(void) { #ifdef CONFIG_COMPAT - return is_compat_task(); + return in_compat_syscall(); #else return (BITS_PER_LONG == 32); #endif From 96c0e0a908ecfa61118d49db85c03e162b7f6e20 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:25:07 -0700 Subject: [PATCH 16/81] net/sctp: use in_compat_syscall for sctp_getsockopt_connectx3 SCTP unfortunately has a different ABI for SCTP_SOCKOPT_CONNECTX3 for 32-bit and 64-bit callers. Use in_compat_syscall to correctly distinguish them on all architectures. 
Signed-off-by: Andy Lutomirski Cc: Vlad Yasevich Cc: Neil Horman Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sctp/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 96e08111106f..6f2653d86961 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1389,7 +1389,7 @@ static int sctp_getsockopt_connectx3(struct sock *sk, int len, int err = 0; #ifdef CONFIG_COMPAT - if (is_compat_task()) { + if (in_compat_syscall()) { struct compat_sctp_getaddrs_old param32; if (len < sizeof(param32)) From 2bf8c47626599c54ab072a29ef1c294d3cce6b0a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:25:10 -0700 Subject: [PATCH 17/81] net/xfrm_user: use in_compat_syscall to deny compat syscalls The code wants to prevent compat code from receiving messages. Use in_compat_syscall for this. Signed-off-by: Andy Lutomirski Cc: Steffen Klassert Cc: Herbert Xu Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 805681a7d356..2cc7af858c6f 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2449,7 +2449,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) int type, err; #ifdef CONFIG_COMPAT - if (is_compat_task()) + if (in_compat_syscall()) return -ENOTSUPP; #endif From a25045ff32dfb4d28c650560d5f2a82c1773dd1c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:25:13 -0700 Subject: [PATCH 18/81] firewire: use in_compat_syscall to check ioctl compatness Firewire was using is_compat_task to check whether it was in a compat ioctl or a non-compat ioctl. Use is_compat_syscall instead so it works properly on all architectures. Signed-off-by: Andy Lutomirski Cc: Clemens Ladisch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firewire/core-cdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 36a7c2d89a01..aee149bdf4c0 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -221,7 +221,7 @@ struct inbound_phy_packet_event { #ifdef CONFIG_COMPAT static void __user *u64_to_uptr(u64 value) { - if (is_compat_task()) + if (in_compat_syscall()) return compat_ptr(value); else return (void __user *)(unsigned long)value; @@ -229,7 +229,7 @@ static void __user *u64_to_uptr(u64 value) static u64 uptr_to_u64(void __user *ptr) { - if (is_compat_task()) + if (in_compat_syscall()) return ptr_to_compat(ptr); else return (u64)(unsigned long)ptr; From 4f01ed221e2ef5e7efdb0ac89179b4faf4cc3c86 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:25:16 -0700 Subject: [PATCH 19/81] drivers/firmware/efi/efivars.c: use in_compat_syscall() to check for compat callers This should make no difference on any architecture, as x86's historical is_compat_task behavior really did check whether the calling syscall was a compat syscall. x86's is_compat_task is going away, though. 
Signed-off-by: Andy Lutomirski Reviewed-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/efi/efivars.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c index b23a271c6ae5..096adcbcb5a9 100644 --- a/drivers/firmware/efi/efivars.c +++ b/drivers/firmware/efi/efivars.c @@ -231,7 +231,7 @@ sanity_check(struct efi_variable *var, efi_char16_t *name, efi_guid_t vendor, static inline bool is_compat(void) { - if (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) + if (IS_ENABLED(CONFIG_COMPAT) && in_compat_syscall()) return true; return false; From 10f1685fd59d4ded226123550bdaef176b99f65f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:25:19 -0700 Subject: [PATCH 20/81] drivers/gpu/drm/amd/amdkfd: use in_compat_syscall to check open() caller type amdkfd wants to know syscall type, not task type. Check directly. Unfortunately, amdkfd is making nasty assumptions that a process' bitness is a well-defined constant thing. This isn't the case on x86. I don't know how much this matters, but this patch has no effect on generated code on x86, so amdkfd is equally broken with and without this patch. Signed-off-by: Andy Lutomirski Cc: Oded Gabbay Cc: David Airlie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index d2b49c026cf6..07ac724e3ec9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -107,7 +107,7 @@ static int kfd_open(struct inode *inode, struct file *filep) if (iminor(inode) != 0) return -ENODEV; - is_32bit_user_mode = is_compat_task(); + is_32bit_user_mode = in_compat_syscall(); if (is_32bit_user_mode == true) { dev_warn(kfd_device, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index a902ae037398..ac005796b71c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -311,7 +311,7 @@ static struct kfd_process *create_process(const struct task_struct *thread) goto err_process_pqm_init; /* init process apertures*/ - process->is_32bit_user_mode = is_compat_task(); + process->is_32bit_user_mode = in_compat_syscall(); if (kfd_init_apertures(process) != 0) goto err_init_apretures; From f4056b52845283d1c1eaf546d7d05d74923029b2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:25:21 -0700 Subject: [PATCH 21/81] input: redefine INPUT_COMPAT_TEST as in_compat_syscall() The input compat code should work like all other compat code: for 32-bit syscalls, use the 32-bit ABI and for 64-bit syscalls, use the 64-bit ABI. We have a helper for that (in_compat_syscall()): just use it. Signed-off-by: Andy Lutomirski Cc: Dmitry Torokhov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/input/input-compat.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/input/input-compat.h b/drivers/input/input-compat.h index 148f66fe3205..0f25878d5fa2 100644 --- a/drivers/input/input-compat.h +++ b/drivers/input/input-compat.h @@ -17,17 +17,7 @@ #ifdef CONFIG_COMPAT -/* Note to the author of this code: did it ever occur to - you why the ifdefs are needed? Think about it again. 
-AK */ -#if defined(CONFIG_X86_64) || defined(CONFIG_TILE) -# define INPUT_COMPAT_TEST is_compat_task() -#elif defined(CONFIG_S390) -# define INPUT_COMPAT_TEST test_thread_flag(TIF_31BIT) -#elif defined(CONFIG_MIPS) -# define INPUT_COMPAT_TEST test_thread_flag(TIF_32BIT_ADDR) -#else -# define INPUT_COMPAT_TEST test_thread_flag(TIF_32BIT) -#endif +#define INPUT_COMPAT_TEST in_compat_syscall() struct input_event_compat { struct compat_timeval time; From 7365abbade07f9181cbd013669642f129e9ec635 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:25:24 -0700 Subject: [PATCH 22/81] drivers/hid/uhid.c: check write() bitness using in_compat_syscall uhid changes the format expected in write() depending on bitness. It should check the syscall bitness directly. Signed-off-by: Andy Lutomirski Cc: David Herrmann Acked-by: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hid/uhid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index e094c572b86e..16b6f11a0700 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -384,7 +384,7 @@ struct uhid_create_req_compat { static int uhid_event_from_user(const char __user *buffer, size_t len, struct uhid_event *event) { - if (is_compat_task()) { + if (in_compat_syscall()) { u32 type; if (get_user(type, buffer)) From f970165beeaa4803438e435d68022b3680a1a0b0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 22 Mar 2016 14:25:27 -0700 Subject: [PATCH 23/81] x86/compat: remove is_compat_task() x86's is_compat_task always checked the current syscall type, not the task type. It has no non-arch users any more, so just remove it to avoid confusion. On x86, nothing should really be checking the task ABI. There are legitimate users for the syscall ABI and for the mm ABI. Signed-off-by: Andy Lutomirski Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/compat.h | 3 ++- arch/x86/include/asm/ftrace.h | 2 +- arch/x86/kernel/process_64.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index acdee09228b3..ebb102e1bbc7 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -316,9 +316,10 @@ static inline bool is_x32_task(void) return false; } -static inline bool is_compat_task(void) +static inline bool in_compat_syscall(void) { return is_ia32_task() || is_x32_task(); } +#define in_compat_syscall in_compat_syscall /* override the generic impl */ #endif /* _ASM_X86_COMPAT_H */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 24938852db30..21b66dbf3601 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -58,7 +58,7 @@ int ftrace_int3_handler(struct pt_regs *regs); #define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1 static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) { - if (is_compat_task()) + if (in_compat_syscall()) return true; return false; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 776229e98202..dfa2781610e8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -478,7 +478,7 @@ void set_personality_ia32(bool x32) if (current->mm) current->mm->context.ia32_compat = TIF_X32; current->personality &= ~READ_IMPLIES_EXEC; - /* is_compat_task() uses the presence of the x32 + /* in_compat_syscall() uses the presence of the x32 syscall bit flag to determine compat status */ current_thread_info()->status &= ~TS_COMPAT; } else { From 38739380683795354b3f0f1a1e80614e311b8617 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 22 Mar 2016 14:25:30 -0700 Subject: [PATCH 24/81] fat: add config option to set UTF-8 mount option by default FAT has long supported its own default file name encoding config setting, separate from CONFIG_NLS_DEFAULT. However, if UTF-8 encoded file names are desired FAT character set should not be set to utf8 since this would make file names case sensitive even if case insensitive matching is requested. Instead, "utf8" mount options should be provided to enable UTF-8 file names in FAT file system. Unfortunately, there was no possibility to set the default value of this option so on UTF-8 system "utf8" mount option had to be added manually to most FAT mounts. This patch adds config option to set such default value. Signed-off-by: Maciej S. Szmigiero Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/vfat.txt | 7 ++++--- fs/fat/Kconfig | 18 +++++++++++++++++- fs/fat/inode.c | 4 +++- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index 223c32171dcc..cf51360e3a9f 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt @@ -56,9 +56,10 @@ iocharset= -- Character set to use for converting between the you should consider the following option instead. utf8= -- UTF-8 is the filesystem safe version of Unicode that - is used by the console. It can be enabled for the - filesystem with this option. If 'uni_xlate' gets set, - UTF-8 gets disabled. + is used by the console. It can be enabled or disabled + for the filesystem with this option. + If 'uni_xlate' gets set, UTF-8 gets disabled. 
+ By default, FAT_DEFAULT_UTF8 setting is used. uni_xlate= -- Translate unhandled Unicode characters to special escaped sequences. This would let you backup and diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index 182f9ffe2b51..3ff1772f612e 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -93,8 +93,24 @@ config FAT_DEFAULT_IOCHARSET that most of your FAT filesystems use, and can be overridden with the "iocharset" mount option for FAT filesystems. Note that "utf8" is not recommended for FAT filesystems. - If unsure, you shouldn't set "utf8" here. + If unsure, you shouldn't set "utf8" here - select the next option + instead if you would like to use UTF-8 encoded file names by default. See for more information. Enable any character sets you need in File Systems/Native Language Support. + +config FAT_DEFAULT_UTF8 + bool "Enable FAT UTF-8 option by default" + depends on VFAT_FS + default n + help + Set this if you would like to have the "utf8" mount option set + by default when mounting FAT filesystems. + + Even if you say Y here you can always disable UTF-8 for a + particular mount by adding "utf8=0" to the mount options. + + Say Y if you use UTF-8 encoding for file names, N otherwise. + + See for more information. diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a5599052116c..226281068a46 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1127,7 +1127,7 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, } opts->name_check = 'n'; opts->quiet = opts->showexec = opts->sys_immutable = opts->dotsOK = 0; - opts->utf8 = opts->unicode_xlate = 0; + opts->unicode_xlate = 0; opts->numtail = 1; opts->usefree = opts->nocase = 0; opts->tz_set = 0; @@ -1135,6 +1135,8 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, opts->errors = FAT_ERRORS_RO; *debug = 0; + opts->utf8 = IS_ENABLED(CONFIG_FAT_DEFAULT_UTF8) && is_vfat; + if (!options) goto out; From 1333ab03150478df8d6f5673a91df1e50dc6ab97 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Mar 2016 14:25:33 -0700 Subject: [PATCH 25/81] ptrace: change __ptrace_unlink() to clear ->ptrace under ->siglock This test-case (a simplified version of one generated by syzkaller)

	#include <unistd.h>
	#include <sys/wait.h>
	#include <sys/ptrace.h>

	void test(void)
	{
		for (;;) {
			if (fork()) {
				wait(NULL);
				continue;
			}

			ptrace(PTRACE_SEIZE, getppid(), 0, 0);
			ptrace(PTRACE_INTERRUPT, getppid(), 0, 0);
			_exit(0);
		}
	}

	int main(void)
	{
		int np;

		for (np = 0; np < 8; ++np)
			if (!fork())
				test();

		while (wait(NULL) > 0)
			;
		return 0;
	}

triggers the 2nd WARN_ON_ONCE(!signr) warning in do_jobctl_trap(). The problem is that __ptrace_unlink() clears task->jobctl under siglock but task->ptrace is cleared without this lock held; this fools the "else" branch which assumes that !PT_SEIZED means PT_PTRACED. Note also that most of the other PTRACE_SEIZE checks can race with detach from the exiting tracer too. Say, the callers of ptrace_trap_notify() assume that SEIZED can't go away after it was checked.
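For context, a simplified sketch of the check being fooled, paraphrased from do_jobctl_trap() in kernel/signal.c (not verbatim):

	int signr = current->jobctl & JOBCTL_STOP_SIGMASK;

	if (current->ptrace & PT_SEIZED) {
		/* PTRACE_SEIZE-style trap handling ... */
	} else {
		/*
		 * Legacy-ptrace path: assumes !PT_SEIZED implies PT_PTRACED.
		 * A SEIZED tracee racing with __ptrace_unlink() can observe
		 * ->ptrace == 0, land here with no stop signal set, and
		 * fire the warning below.
		 */
		WARN_ON_ONCE(!signr);
		ptrace_stop(signr, CLD_STOPPED, 0, NULL);
	}

Moving the ->ptrace store under ->siglock, next to the ->jobctl clearing, closes that window.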
Signed-off-by: Oleg Nesterov Reported-by: Dmitry Vyukov Cc: Tejun Heo Cc: syzkaller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c79b91d09e35..d49bfa1e53e6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -73,12 +73,11 @@ void __ptrace_unlink(struct task_struct *child) { BUG_ON(!child->ptrace); - child->ptrace = 0; child->parent = child->real_parent; list_del_init(&child->ptrace_entry); spin_lock(&child->sighand->siglock); - + child->ptrace = 0; /* * Clear all pending traps and TRAPPING. TRAPPING should be * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly. From 378c6520e7d29280f400ef2ceaf155c86f05a71a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 22 Mar 2016 14:25:36 -0700 Subject: [PATCH 26/81] fs/coredump: prevent fsuid=0 dumps into user-controlled directories This commit fixes the following security hole affecting systems where all of the following conditions are fulfilled: - The fs.suid_dumpable sysctl is set to 2. - The kernel.core_pattern sysctl's value starts with "/". (Systems where kernel.core_pattern starts with "|/" are not affected.) - Unprivileged user namespace creation is permitted. (This is true on Linux >=3.8, but some distributions disallow it by default using a distro patch.) Under these conditions, if a program executes under secure exec rules, causing it to run with the SUID_DUMP_ROOT flag, then unshares its user namespace, changes its root directory and crashes, the coredump will be written using fsuid=0 and a path derived from kernel.core_pattern - but this path is interpreted relative to the root directory of the process, allowing the attacker to control where a coredump will be written with root privileges. To fix the security issue, always interpret core_pattern for dumps that are written under SUID_DUMP_ROOT relative to the root directory of init. Signed-off-by: Jann Horn Acked-by: Kees Cook Cc: Al Viro Cc: "Eric W. 
Biederman" Cc: Andy Lutomirski Cc: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/drivers/mconsole_kern.c | 2 +- fs/coredump.c | 30 ++++++++++++++++++++++++++---- fs/fhandle.c | 2 +- fs/open.c | 6 ++---- include/linux/fs.h | 2 +- kernel/sysctl_binary.c | 2 +- 6 files changed, 32 insertions(+), 12 deletions(-) diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index b821b13d343a..8a6b57108ac2 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -133,7 +133,7 @@ void mconsole_proc(struct mc_request *req) ptr += strlen("proc"); ptr = skip_spaces(ptr); - file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY); + file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0); if (IS_ERR(file)) { mconsole_reply(req, "Failed to open file", 1, 0); printk(KERN_ERR "open /proc/%s: %ld\n", ptr, PTR_ERR(file)); diff --git a/fs/coredump.c b/fs/coredump.c index 9ea87e9fdccf..47c32c3bfa1d 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -32,6 +32,9 @@ #include #include #include +#include +#include +#include #include #include @@ -649,6 +652,8 @@ void do_coredump(const siginfo_t *siginfo) } } else { struct inode *inode; + int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW | + O_LARGEFILE | O_EXCL; if (cprm.limit < binfmt->min_coredump) goto fail_unlock; @@ -687,10 +692,27 @@ void do_coredump(const siginfo_t *siginfo) * what matters is that at least one of the two processes * writes its coredump successfully, not which one. */ - cprm.file = filp_open(cn.corename, - O_CREAT | 2 | O_NOFOLLOW | - O_LARGEFILE | O_EXCL, - 0600); + if (need_suid_safe) { + /* + * Using user namespaces, normal user tasks can change + * their current->fs->root to point to arbitrary + * directories. Since the intention of the "only dump + * with a fully qualified path" rule is to control where + * coredumps may be placed using root privileges, + * current->fs->root must not be used. Instead, use the + * root directory of init_task. 
+ */ + struct path root; + + task_lock(&init_task); + get_fs_root(init_task.fs, &root); + task_unlock(&init_task); + cprm.file = file_open_root(root.dentry, root.mnt, + cn.corename, open_flags, 0600); + path_put(&root); + } else { + cprm.file = filp_open(cn.corename, open_flags, 0600); + } if (IS_ERR(cprm.file)) goto fail_unlock; diff --git a/fs/fhandle.c b/fs/fhandle.c index d59712dfa3e7..ca3c3dd01789 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -228,7 +228,7 @@ long do_handle_open(int mountdirfd, path_put(&path); return fd; } - file = file_open_root(path.dentry, path.mnt, "", open_flag); + file = file_open_root(path.dentry, path.mnt, "", open_flag, 0); if (IS_ERR(file)) { put_unused_fd(fd); retval = PTR_ERR(file); diff --git a/fs/open.c b/fs/open.c index 55bdc75e2172..17cb6b1dab75 100644 --- a/fs/open.c +++ b/fs/open.c @@ -992,14 +992,12 @@ struct file *filp_open(const char *filename, int flags, umode_t mode) EXPORT_SYMBOL(filp_open); struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, - const char *filename, int flags) + const char *filename, int flags, umode_t mode) { struct open_flags op; - int err = build_open_flags(flags, 0, &op); + int err = build_open_flags(flags, mode, &op); if (err) return ERR_PTR(err); - if (flags & O_CREAT) - return ERR_PTR(-EINVAL); return do_file_open_root(dentry, mnt, filename, &op); } EXPORT_SYMBOL(file_open_root); diff --git a/include/linux/fs.h b/include/linux/fs.h index 35d99266ca9a..14a97194b34b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2263,7 +2263,7 @@ extern long do_sys_open(int dfd, const char __user *filename, int flags, extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, - const char *, int); + const char *, int, umode_t); extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 7e7746a42a62..10a1d7dc9313 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1321,7 +1321,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, } mnt = task_active_pid_ns(current)->proc_mnt; - file = file_open_root(mnt->mnt_root, mnt, pathname, flags); + file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0); result = PTR_ERR(file); if (IS_ERR(file)) goto out_putname; From 95f27356e4a8982257ecae6bc44c650772d77659 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 22 Mar 2016 14:25:39 -0700 Subject: [PATCH 27/81] cpumask: remove incorrect information from comment Since commit cdfdef75e795 ("cpumask: only allocate nr_cpumask_bits."), this comment above cpumask_size() is no longer relevant. Signed-off-by: Eric Biggers Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index fc14275ff34e..40cee6b77a93 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -607,8 +607,6 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) /** * cpumask_size - size to allocate for a 'struct cpumask' in bytes - * - * This will eventually be a runtime variable, depending on nr_cpu_ids. 
*/ static inline size_t cpumask_size(void) { From 36915976eca58f2eefa040ba8f9939672564df61 Mon Sep 17 00:00:00 2001 From: Aurelien Jacquiot Date: Tue, 22 Mar 2016 14:25:42 -0700 Subject: [PATCH 28/81] rapidio/rionet: fix deadlock on SMP Fix a deadlock during concurrent receive and transmit operations on SMP platforms caused by use of the wrong lock: on transmit, the 'tx_lock' spinlock should be used instead of 'lock', which is used for the receive path. This fix is applicable to kernel versions starting from v2.6.15. Signed-off-by: Aurelien Jacquiot Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Andre van Herk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/rionet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index 01f08a7751f7..e7034c55e796 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -280,7 +280,7 @@ static void rionet_outb_msg_event(struct rio_mport *mport, void *dev_id, int mbo struct net_device *ndev = dev_id; struct rionet_private *rnet = netdev_priv(ndev); - spin_lock(&rnet->lock); + spin_lock(&rnet->tx_lock); if (netif_msg_intr(rnet)) printk(KERN_INFO @@ -299,7 +299,7 @@ static void rionet_outb_msg_event(struct rio_mport *mport, void *dev_id, int mbo if (rnet->tx_cnt < RIONET_TX_RING_SIZE) netif_wake_queue(ndev); - spin_unlock(&rnet->lock); + spin_unlock(&rnet->tx_lock); } static int rionet_open(struct net_device *ndev) From 92444bb366ab6ace213c67e7dfea20fabe14adff Mon Sep 17 00:00:00 2001 From: Aurelien Jacquiot Date: Tue, 22 Mar 2016 14:25:45 -0700 Subject: [PATCH 29/81] rapidio/rionet: add capability to change MTU These patches are the result of extensive collaboration within the RapidIO.org Software Task Group between Texas Instruments, Freescale, Prodrive Technologies, Nokia Networks, BAE and IDT. Additional input was received from other members of RapidIO.org. The objective was to create a character mode driver interface which exposes the capabilities of RapidIO devices directly to applications, in a manner that allows the numerous and varied RapidIO implementations to interoperate. The Software Task Group has also developed fabric management, Remote Memory Access, and sockets applications which make use of these interfaces in user space. Intensive testing with these applications prompted the RapidIO subsystem updates provided within this set of patches. This patch (of 29): Replace the default Ethernet-specific routine with a custom one to allow setting the larger MTU supported by RapidIO messaging (max RIO packet size is 4096 bytes).
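A quick check of the resulting bound, assuming the standard 14-byte Ethernet header (ETH_HLEN):

	/*
	 * RIONET_MAX_MTU = RIONET_MSG_SIZE - ETH_HLEN
	 *                = 4096 - 14
	 *                = 4082 bytes
	 * so the custom ndo_change_mtu accepts 68..4082, versus the
	 * 68..1500 range enforced by the generic eth_change_mtu().
	 */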
Signed-off-by: Aurelien Jacquiot Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/rionet.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index e7034c55e796..d3d6e35a30e0 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -48,6 +48,8 @@ MODULE_LICENSE("GPL"); #define RIONET_TX_RING_SIZE CONFIG_RIONET_TX_SIZE #define RIONET_RX_RING_SIZE CONFIG_RIONET_RX_SIZE #define RIONET_MAX_NETS 8 +#define RIONET_MSG_SIZE RIO_MAX_MSG_SIZE +#define RIONET_MAX_MTU (RIONET_MSG_SIZE - ETH_HLEN) struct rionet_private { struct rio_mport *mport; @@ -443,6 +445,17 @@ static void rionet_set_msglevel(struct net_device *ndev, u32 value) rnet->msg_enable = value; } +static int rionet_change_mtu(struct net_device *ndev, int new_mtu) +{ + if ((new_mtu < 68) || (new_mtu > RIONET_MAX_MTU)) { + printk(KERN_ERR "%s: Invalid MTU size %d\n", + ndev->name, new_mtu); + return -EINVAL; + } + ndev->mtu = new_mtu; + return 0; +} + static const struct ethtool_ops rionet_ethtool_ops = { .get_drvinfo = rionet_get_drvinfo, .get_msglevel = rionet_get_msglevel, @@ -454,7 +467,7 @@ static const struct net_device_ops rionet_netdev_ops = { .ndo_open = rionet_open, .ndo_stop = rionet_close, .ndo_start_xmit = rionet_start_xmit, - .ndo_change_mtu = eth_change_mtu, + .ndo_change_mtu = rionet_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, }; @@ -489,7 +502,7 @@ static int rionet_setup_netdev(struct rio_mport *mport, struct net_device *ndev) ndev->dev_addr[5] = device_id & 0xff; ndev->netdev_ops = &rionet_netdev_ops; - ndev->mtu = RIO_MAX_MSG_SIZE - 14; + ndev->mtu = RIONET_MAX_MTU; ndev->features = NETIF_F_LLTX; SET_NETDEV_DEV(ndev, &mport->dev); ndev->ethtool_ops = &rionet_ethtool_ops; From 174f1a71cc692ddb4bc30ec93edb6d7eb370b382 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:25:48 -0700 Subject: [PATCH 30/81] rapidio/tsi721: fix hardcoded MRRS setting Remove use of hardcoded setting for Maximum Read Request Size (MRRS) value and use one set by PCIe bus driver. Using hardcoded value can cause PCIe bus errors on platforms that have tsi721 device on PCIe path that allows only smaller read request sizes. This fix is applicable to kernel versions starting from v3.2. Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index eeca70ddbf61..f57ee9d69910 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -2426,11 +2426,9 @@ static int tsi721_probe(struct pci_dev *pdev, BUG_ON(!pci_is_pcie(pdev)); - /* Clear "no snoop" and "relaxed ordering" bits, use default MRRS. */ + /* Clear "no snoop" and "relaxed ordering" bits. */ pcie_capability_clear_and_set_word(pdev, PCI_EXP_DEVCTL, - PCI_EXP_DEVCTL_READRQ | PCI_EXP_DEVCTL_RELAX_EN | - PCI_EXP_DEVCTL_NOSNOOP_EN, - PCI_EXP_DEVCTL_READRQ_512B); + PCI_EXP_DEVCTL_RELAX_EN | PCI_EXP_DEVCTL_NOSNOOP_EN, 0); /* Adjust PCIe completion timeout. 
*/ pcie_capability_clear_and_set_word(pdev, PCI_EXP_DEVCTL2, 0xf, 0x2); From ba5d141b55ff0c02127cabd344585622bbaa5d02 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:25:51 -0700 Subject: [PATCH 31/81] rapidio/tsi721: add check for overlapped IB window mappings Add check for attempts to request mapping of inbound RapidIO address space that overlaps with existing active mapping windows. Tsi721 device does not support overlapped inbound windows and SRIO address decoding behavior is not defined in such cases. This patch is applicable to kernel versions starting from v3.7. Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721.c | 72 ++++++++++++++++++++++---------- drivers/rapidio/devices/tsi721.h | 11 +++++ 2 files changed, 62 insertions(+), 21 deletions(-) diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index f57ee9d69910..b96f0b97dc3a 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -885,26 +885,52 @@ static int tsi721_rio_map_inb_mem(struct rio_mport *mport, dma_addr_t lstart, u64 rstart, u32 size, u32 flags) { struct tsi721_device *priv = mport->priv; - int i; + int i, avail = -1; u32 regval; + struct tsi721_ib_win *ib_win; + int ret = -EBUSY; if (!is_power_of_2(size) || size < 0x1000 || ((u64)lstart & (size - 1)) || (rstart & (size - 1))) return -EINVAL; - /* Search for free inbound translation window */ + spin_lock(&priv->win_lock); + /* + * Scan for overlapping with active regions and mark the first available + * IB window at the same time. + */ for (i = 0; i < TSI721_IBWIN_NUM; i++) { - regval = ioread32(priv->regs + TSI721_IBWIN_LB(i)); - if (!(regval & TSI721_IBWIN_LB_WEN)) + ib_win = &priv->ib_win[i]; + if (!ib_win->active) { + if (avail == -1) { + avail = i; + ret = 0; + } + } else if (rstart < (ib_win->rstart + ib_win->size) && + (rstart + size) > ib_win->rstart) { + ret = -EFAULT; break; + } } - if (i >= TSI721_IBWIN_NUM) { - dev_err(&priv->pdev->dev, - "Unable to find free inbound window\n"); - return -EBUSY; + if (ret) + goto err_out; + i = avail; + + /* Sanity check: available IB window must be disabled at this point */ + regval = ioread32(priv->regs + TSI721_IBWIN_LB(i)); + if (WARN_ON(regval & TSI721_IBWIN_LB_WEN)) { + ret = -EIO; + goto err_out; } + ib_win = &priv->ib_win[i]; + ib_win->active = true; + ib_win->rstart = rstart; + ib_win->lstart = lstart; + ib_win->size = size; + spin_unlock(&priv->win_lock); + iowrite32(TSI721_IBWIN_SIZE(size) << 8, priv->regs + TSI721_IBWIN_SZ(i)); @@ -920,6 +946,9 @@ static int tsi721_rio_map_inb_mem(struct rio_mport *mport, dma_addr_t lstart, i, rstart, (unsigned long long)lstart); return 0; +err_out: + spin_unlock(&priv->win_lock); + return ret; } /** @@ -931,25 +960,25 @@ static void tsi721_rio_unmap_inb_mem(struct rio_mport *mport, dma_addr_t lstart) { struct tsi721_device *priv = mport->priv; + struct tsi721_ib_win *ib_win; int i; - u64 addr; - u32 regval; /* Search for matching active inbound translation window */ + spin_lock(&priv->win_lock); for (i = 0; i < TSI721_IBWIN_NUM; i++) { - regval = ioread32(priv->regs + TSI721_IBWIN_LB(i)); - if (regval & TSI721_IBWIN_LB_WEN) { - regval = ioread32(priv->regs + TSI721_IBWIN_TUA(i)); - addr = (u64)regval << 32; - regval = ioread32(priv->regs + TSI721_IBWIN_TLA(i)); - addr |= regval & TSI721_IBWIN_TLA_ADD; - - if (addr == (u64)lstart) { - iowrite32(0, priv->regs + 
TSI721_IBWIN_LB(i)); - break; - } + ib_win = &priv->ib_win[i]; + if (ib_win->active && ib_win->lstart == lstart) { + iowrite32(0, priv->regs + TSI721_IBWIN_LB(i)); + ib_win->active = false; + break; } } + spin_unlock(&priv->win_lock); + + if (i == TSI721_IBWIN_NUM) + dev_err(&priv->pdev->dev, + "IB window mapped to %llx not found\n", + (unsigned long long)lstart); } /** @@ -966,6 +995,7 @@ static void tsi721_init_sr2pc_mapping(struct tsi721_device *priv) /* Disable all SR2PC inbound windows */ for (i = 0; i < TSI721_IBWIN_NUM; i++) iowrite32(0, priv->regs + TSI721_IBWIN_LB(i)); + spin_lock_init(&priv->win_lock); } /** diff --git a/drivers/rapidio/devices/tsi721.h b/drivers/rapidio/devices/tsi721.h index 9d2502543ef6..355f356e5347 100644 --- a/drivers/rapidio/devices/tsi721.h +++ b/drivers/rapidio/devices/tsi721.h @@ -808,6 +808,13 @@ struct msix_irq { }; #endif /* CONFIG_PCI_MSI */ +struct tsi721_ib_win { + u64 rstart; + u32 size; + dma_addr_t lstart; + bool active; +}; + struct tsi721_device { struct pci_dev *pdev; struct rio_mport *mport; @@ -843,6 +850,10 @@ struct tsi721_device { /* Outbound Messaging */ int omsg_init[TSI721_OMSG_CHNUM]; struct tsi721_omsg_ring omsg_ring[TSI721_OMSG_CHNUM]; + + /* Inbound Mapping Windows */ + struct tsi721_ib_win ib_win[TSI721_IBWIN_NUM]; + spinlock_t win_lock; }; #ifdef CONFIG_RAPIDIO_DMA_ENGINE From 9673b883c261b055433527e9249781b43172c103 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:25:54 -0700 Subject: [PATCH 32/81] rapidio/tsi721: add option to configure direct mapping of IB window Add an option to configure mapping of Inbound Window without RIO-to-PCIe address translation. If a local memory buffer is not properly aligned to meet HW requirements for RapidIO address mapping with address translation, the caller can request an inbound window with a matching RapidIO address assigned to it. This implementation selects a RapidIO base address and inbound window size capable of accommodating the local memory buffer.
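To illustrate the direct-mapping window selection with hypothetical numbers: for a buffer at PCIe address lstart = 0x21000 with size = 0x3000, the search below settles on a window that both covers the buffer and satisfies the power-of-two size and alignment rules.

	/*
	 * ibw_size  = roundup_pow_of_two(0x3000)  = 0x4000
	 * ibw_start = 0x21000 & ~(0x4000 - 1)     = 0x20000
	 * end check: 0x21000 + 0x3000 <= 0x20000 + 0x4000, so the
	 * window [0x20000, 0x24000) covers the buffer and the loop
	 * stops; had the buffer crossed 0x24000, ibw_size would double
	 * and ibw_start be recomputed, up to the 2 GB cap.
	 */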
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721.c | 163 +++++++++++++++++++++++++------ drivers/rapidio/devices/tsi721.h | 9 +- 2 files changed, 142 insertions(+), 30 deletions(-) diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index b96f0b97dc3a..d463d2cbd0af 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -888,71 +888,145 @@ static int tsi721_rio_map_inb_mem(struct rio_mport *mport, dma_addr_t lstart, int i, avail = -1; u32 regval; struct tsi721_ib_win *ib_win; + bool direct = (lstart == rstart); + u64 ibw_size; + dma_addr_t loc_start; + u64 ibw_start; + struct tsi721_ib_win_mapping *map = NULL; int ret = -EBUSY; - if (!is_power_of_2(size) || size < 0x1000 || - ((u64)lstart & (size - 1)) || (rstart & (size - 1))) - return -EINVAL; + if (direct) { + dev_dbg(&priv->pdev->dev, + "Direct (RIO_0x%llx -> PCIe_0x%pad), size=0x%x", + rstart, &lstart, size); + + /* Calculate minimal acceptable window size and base address */ + + ibw_size = roundup_pow_of_two(size); + ibw_start = lstart & ~(ibw_size - 1); + + while ((lstart + size) > (ibw_start + ibw_size)) { + ibw_size *= 2; + ibw_start = lstart & ~(ibw_size - 1); + if (ibw_size > 0x80000000) { /* Limit max size to 2GB */ + return -EBUSY; + } + } + + loc_start = ibw_start; + + map = kzalloc(sizeof(struct tsi721_ib_win_mapping), GFP_ATOMIC); + if (map == NULL) + return -ENOMEM; + + } else { + dev_dbg(&priv->pdev->dev, + "Translated (RIO_0x%llx -> PCIe_0x%pad), size=0x%x", + rstart, &lstart, size); + + if (!is_power_of_2(size) || size < 0x1000 || + ((u64)lstart & (size - 1)) || (rstart & (size - 1))) + return -EINVAL; + if (priv->ibwin_cnt == 0) + return -EBUSY; + ibw_start = rstart; + ibw_size = size; + loc_start = lstart; + } - spin_lock(&priv->win_lock); /* * Scan for overlapping with active regions and mark the first available * IB window at the same time. */ for (i = 0; i < TSI721_IBWIN_NUM; i++) { ib_win = &priv->ib_win[i]; + if (!ib_win->active) { if (avail == -1) { avail = i; ret = 0; } - } else if (rstart < (ib_win->rstart + ib_win->size) && - (rstart + size) > ib_win->rstart) { + } else if (ibw_start < (ib_win->rstart + ib_win->size) && + (ibw_start + ibw_size) > ib_win->rstart) { + /* Return error if address translation involved */ + if (direct && ib_win->xlat) { + ret = -EFAULT; + break; + } + + /* + * Direct mappings usually are larger than originally + * requested fragments - check if this new request fits + * into it. 
+ */ + if (rstart >= ib_win->rstart && + (rstart + size) <= (ib_win->rstart + + ib_win->size)) { + /* We are in - no further mapping required */ + map->lstart = lstart; + list_add_tail(&map->node, &ib_win->mappings); + return 0; + } + ret = -EFAULT; break; } } if (ret) - goto err_out; + goto out; i = avail; /* Sanity check: available IB window must be disabled at this point */ regval = ioread32(priv->regs + TSI721_IBWIN_LB(i)); if (WARN_ON(regval & TSI721_IBWIN_LB_WEN)) { ret = -EIO; - goto err_out; + goto out; } ib_win = &priv->ib_win[i]; ib_win->active = true; - ib_win->rstart = rstart; - ib_win->lstart = lstart; - ib_win->size = size; - spin_unlock(&priv->win_lock); + ib_win->rstart = ibw_start; + ib_win->lstart = loc_start; + ib_win->size = ibw_size; + ib_win->xlat = (lstart != rstart); + INIT_LIST_HEAD(&ib_win->mappings); - iowrite32(TSI721_IBWIN_SIZE(size) << 8, + /* + * When using direct IBW mapping and have larger than requested IBW size + * we can have multiple local memory blocks mapped through the same IBW + * To handle this situation we maintain list of "clients" for such IBWs. + */ + if (direct) { + map->lstart = lstart; + list_add_tail(&map->node, &ib_win->mappings); + } + + iowrite32(TSI721_IBWIN_SIZE(ibw_size) << 8, priv->regs + TSI721_IBWIN_SZ(i)); - iowrite32(((u64)lstart >> 32), priv->regs + TSI721_IBWIN_TUA(i)); - iowrite32(((u64)lstart & TSI721_IBWIN_TLA_ADD), + iowrite32(((u64)loc_start >> 32), priv->regs + TSI721_IBWIN_TUA(i)); + iowrite32(((u64)loc_start & TSI721_IBWIN_TLA_ADD), priv->regs + TSI721_IBWIN_TLA(i)); - iowrite32(rstart >> 32, priv->regs + TSI721_IBWIN_UB(i)); - iowrite32((rstart & TSI721_IBWIN_LB_BA) | TSI721_IBWIN_LB_WEN, + iowrite32(ibw_start >> 32, priv->regs + TSI721_IBWIN_UB(i)); + iowrite32((ibw_start & TSI721_IBWIN_LB_BA) | TSI721_IBWIN_LB_WEN, priv->regs + TSI721_IBWIN_LB(i)); + + priv->ibwin_cnt--; + dev_dbg(&priv->pdev->dev, - "Configured IBWIN%d mapping (RIO_0x%llx -> PCIe_0x%llx)\n", - i, rstart, (unsigned long long)lstart); + "Configured IBWIN%d (RIO_0x%llx -> PCIe_0x%llx), size=0x%llx\n", + i, ibw_start, (unsigned long long)loc_start, ibw_size); return 0; -err_out: - spin_unlock(&priv->win_lock); +out: + kfree(map); return ret; } /** - * fsl_rio_unmap_inb_mem -- Unmapping inbound memory region. + * tsi721_rio_unmap_inb_mem -- Unmapping inbound memory region. * @mport: RapidIO master port * @lstart: Local memory space start address. 
*/ @@ -963,22 +1037,53 @@ static void tsi721_rio_unmap_inb_mem(struct rio_mport *mport, struct tsi721_ib_win *ib_win; int i; + dev_dbg(&priv->pdev->dev, + "Unmap IBW mapped to PCIe_0x%pad", &lstart); + /* Search for matching active inbound translation window */ - spin_lock(&priv->win_lock); for (i = 0; i < TSI721_IBWIN_NUM; i++) { ib_win = &priv->ib_win[i]; + + /* Address translating IBWs must be an exact match */ + if (!ib_win->active || + (ib_win->xlat && lstart != ib_win->lstart)) + continue; + + if (lstart >= ib_win->lstart && + lstart < (ib_win->lstart + ib_win->size)) { + + if (!ib_win->xlat) { + struct tsi721_ib_win_mapping *map; + int found = 0; + + list_for_each_entry(map, + &ib_win->mappings, node) { + if (map->lstart == lstart) { + list_del(&map->node); + kfree(map); + found = 1; + break; + } + } + + if (!found) + continue; + + if (!list_empty(&ib_win->mappings)) + break; + } + + dev_dbg(&priv->pdev->dev, "Disable IBWIN_%d", i); iowrite32(0, priv->regs + TSI721_IBWIN_LB(i)); ib_win->active = false; + priv->ibwin_cnt++; break; } } - spin_unlock(&priv->win_lock); if (i == TSI721_IBWIN_NUM) - dev_err(&priv->pdev->dev, - "IB window mapped to %llx not found\n", - (unsigned long long)lstart); + dev_dbg(&priv->pdev->dev, + "IB window mapped to %pad not found", &lstart); } /** @@ -995,7 +1100,7 @@ static void tsi721_init_sr2pc_mapping(struct tsi721_device *priv) /* Disable all SR2PC inbound windows */ for (i = 0; i < TSI721_IBWIN_NUM; i++) iowrite32(0, priv->regs + TSI721_IBWIN_LB(i)); - spin_lock_init(&priv->win_lock); + priv->ibwin_cnt = TSI721_IBWIN_NUM; } /** diff --git a/drivers/rapidio/devices/tsi721.h b/drivers/rapidio/devices/tsi721.h index 355f356e5347..f81d01149b39 100644 --- a/drivers/rapidio/devices/tsi721.h +++ b/drivers/rapidio/devices/tsi721.h @@ -808,11 +808,18 @@ struct msix_irq { }; #endif /* CONFIG_PCI_MSI */ +struct tsi721_ib_win_mapping { + struct list_head node; + dma_addr_t lstart; +}; + struct tsi721_ib_win { u64 rstart; u32 size; dma_addr_t lstart; bool active; + bool xlat; + struct list_head mappings; }; struct tsi721_device { @@ -853,7 +860,7 @@ struct tsi721_device { /* Inbound Mapping Windows */ struct tsi721_ib_win ib_win[TSI721_IBWIN_NUM]; - spinlock_t win_lock; + int ibwin_cnt; }; #ifdef CONFIG_RAPIDIO_DMA_ENGINE From d2a321f37ed49de86058b5daaf50a11d3ee2d61f Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:25:57 -0700 Subject: [PATCH 33/81] rapidio/tsi721_dma: fix pending transaction queue handling Fix pending DMA request queue handling to avoid broken ordering during concurrent request submissions.
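The fix replaces the per-channel active_list with a single active_tx pointer, which makes the dispatch rule easy to state; a condensed sketch of the invariant (the full version is in the diff below):

	/*
	 * At most one descriptor is owned by the hardware at any time
	 * (bdma_chan->active_tx); all others wait on ->queue in
	 * submission order. A new descriptor is fetched only when the
	 * channel is idle and nothing is currently active:
	 */
	if (desc == NULL && bdma_chan->active_tx == NULL &&
	    !list_empty(&bdma_chan->queue)) {
		desc = list_first_entry(&bdma_chan->queue,
					struct tsi721_tx_desc, desc_node);
		list_del_init(&desc->desc_node);
		bdma_chan->active_tx = desc;
	}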
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721.h | 2 +- drivers/rapidio/devices/tsi721_dma.c | 62 ++++++++++++++-------------- 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/drivers/rapidio/devices/tsi721.h b/drivers/rapidio/devices/tsi721.h index f81d01149b39..d675a44ffc17 100644 --- a/drivers/rapidio/devices/tsi721.h +++ b/drivers/rapidio/devices/tsi721.h @@ -674,7 +674,7 @@ struct tsi721_bdma_chan { struct dma_chan dchan; struct tsi721_tx_desc *tx_desc; spinlock_t lock; - struct list_head active_list; + struct tsi721_tx_desc *active_tx; struct list_head queue; struct list_head free_list; struct tasklet_struct tasklet; diff --git a/drivers/rapidio/devices/tsi721_dma.c b/drivers/rapidio/devices/tsi721_dma.c index 47295940a868..500e1e044c36 100644 --- a/drivers/rapidio/devices/tsi721_dma.c +++ b/drivers/rapidio/devices/tsi721_dma.c @@ -63,14 +63,6 @@ struct tsi721_tx_desc *to_tsi721_desc(struct dma_async_tx_descriptor *txd) return container_of(txd, struct tsi721_tx_desc, txd); } -static inline -struct tsi721_tx_desc *tsi721_dma_first_active( - struct tsi721_bdma_chan *bdma_chan) -{ - return list_first_entry(&bdma_chan->active_list, - struct tsi721_tx_desc, desc_node); -} - static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num) { struct tsi721_dma_desc *bd_ptr; @@ -534,23 +526,30 @@ entry_done: return err; } -static void tsi721_advance_work(struct tsi721_bdma_chan *bdma_chan) +static void tsi721_advance_work(struct tsi721_bdma_chan *bdma_chan, + struct tsi721_tx_desc *desc) { - struct tsi721_tx_desc *desc; int err; dev_dbg(bdma_chan->dchan.device->dev, "%s: Enter\n", __func__); - /* - * If there are any new transactions in the queue add them - * into the processing list - */ - if (!list_empty(&bdma_chan->queue)) - list_splice_init(&bdma_chan->queue, &bdma_chan->active_list); + if (!tsi721_dma_is_idle(bdma_chan)) + return; - /* Start new transaction (if available) */ - if (!list_empty(&bdma_chan->active_list)) { - desc = tsi721_dma_first_active(bdma_chan); + /* + * If there is no data transfer in progress, fetch new descriptor from + * the pending queue. 
+ */ + + if (desc == NULL && bdma_chan->active_tx == NULL && + !list_empty(&bdma_chan->queue)) { + desc = list_first_entry(&bdma_chan->queue, + struct tsi721_tx_desc, desc_node); + list_del_init((&desc->desc_node)); + bdma_chan->active_tx = desc; + } + + if (desc) { err = tsi721_submit_sg(desc); if (!err) tsi721_start_dma(bdma_chan); @@ -581,6 +580,10 @@ static void tsi721_dma_tasklet(unsigned long data) dev_err(bdma_chan->dchan.device->dev, "%s: DMA ERROR - DMAC%d_STS = 0x%x\n", __func__, bdma_chan->id, dmac_sts); + + spin_lock(&bdma_chan->lock); + bdma_chan->active_tx = NULL; + spin_unlock(&bdma_chan->lock); } if (dmac_int & TSI721_DMAC_INT_STFULL) { @@ -594,7 +597,7 @@ static void tsi721_dma_tasklet(unsigned long data) tsi721_clr_stat(bdma_chan); spin_lock(&bdma_chan->lock); - desc = tsi721_dma_first_active(bdma_chan); + desc = bdma_chan->active_tx; if (desc->sg_len == 0) { dma_async_tx_callback callback = NULL; @@ -606,14 +609,15 @@ static void tsi721_dma_tasklet(unsigned long data) callback = desc->txd.callback; param = desc->txd.callback_param; } - list_move(&desc->desc_node, &bdma_chan->free_list); + list_add(&desc->desc_node, &bdma_chan->free_list); + bdma_chan->active_tx = NULL; spin_unlock(&bdma_chan->lock); if (callback) callback(param); spin_lock(&bdma_chan->lock); } - tsi721_advance_work(bdma_chan); + tsi721_advance_work(bdma_chan, bdma_chan->active_tx); spin_unlock(&bdma_chan->lock); } @@ -720,9 +724,6 @@ static void tsi721_free_chan_resources(struct dma_chan *dchan) if (bdma_chan->bd_base == NULL) return; - BUG_ON(!list_empty(&bdma_chan->active_list)); - BUG_ON(!list_empty(&bdma_chan->queue)); - tsi721_bdma_interrupt_enable(bdma_chan, 0); bdma_chan->active = false; tsi721_sync_dma_irq(bdma_chan); @@ -745,11 +746,11 @@ static void tsi721_issue_pending(struct dma_chan *dchan) dev_dbg(dchan->device->dev, "%s: Enter\n", __func__); + spin_lock_bh(&bdma_chan->lock); if (tsi721_dma_is_idle(bdma_chan) && bdma_chan->active) { - spin_lock_bh(&bdma_chan->lock); - tsi721_advance_work(bdma_chan); - spin_unlock_bh(&bdma_chan->lock); + tsi721_advance_work(bdma_chan, NULL); } + spin_unlock_bh(&bdma_chan->lock); } static @@ -839,7 +840,8 @@ static int tsi721_terminate_all(struct dma_chan *dchan) } while ((dmac_int & TSI721_DMAC_INT_SUSP) == 0); } - list_splice_init(&bdma_chan->active_list, &list); + if (bdma_chan->active_tx) + list_add(&bdma_chan->active_tx->desc_node, &list); list_splice_init(&bdma_chan->queue, &list); list_for_each_entry_safe(desc, _d, &list, desc_node) @@ -875,7 +877,7 @@ int tsi721_register_dma(struct tsi721_device *priv) spin_lock_init(&bdma_chan->lock); - INIT_LIST_HEAD(&bdma_chan->active_list); + bdma_chan->active_tx = NULL; INIT_LIST_HEAD(&bdma_chan->queue); INIT_LIST_HEAD(&bdma_chan->free_list); From 8b189fdbc5f68f3f43e67004de25f75c1a5b4e51 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:00 -0700 Subject: [PATCH 34/81] rapidio: add query_mport operation Add mport query operation to report master port RapidIO capabilities and run time configuration to upper level drivers. 
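A minimal usage sketch from an upper-level driver's point of view (variable names hypothetical, error handling abbreviated; the API itself is defined in the diff below):

	struct rio_mport_attr attr;
	int ret;

	ret = rio_query_mport(mport, &attr);	/* -ENODATA if the mport
						 * driver lacks the callback */
	if (!ret && attr.link_speed != RIO_LINK_DOWN &&
	    (attr.flags & RIO_MPORT_DMA))
		pr_info("rio: link speed code %d, max DMA transfer %d bytes\n",
			attr.link_speed, attr.dma_max_size);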
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio.c | 17 ++++++++++++++ include/linux/rio.h | 49 ++++++++++++++++++++++++++++++++++++++++ include/linux/rio_regs.h | 3 +++ 3 files changed, 69 insertions(+) diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index e220edc85c68..c72f4da8065e 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -67,6 +67,23 @@ u16 rio_local_get_device_id(struct rio_mport *port) return (RIO_GET_DID(port->sys_size, result)); } +/** + * rio_query_mport - Query mport device attributes + * @port: mport device to query + * @mport_attr: mport attributes data structure + * + * Returns attributes of specified mport through the + * pointer to attributes data structure. + */ +int rio_query_mport(struct rio_mport *port, + struct rio_mport_attr *mport_attr) +{ + if (!port->ops->query_mport) + return -ENODATA; + return port->ops->query_mport(port, mport_attr); +} +EXPORT_SYMBOL(rio_query_mport); + /** * rio_add_device- Adds a RIO device to the device model * @rdev: RIO device diff --git a/include/linux/rio.h b/include/linux/rio.h index cde976e86b48..8996a629a321 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -314,6 +314,50 @@ struct rio_net { struct rio_id_table destid_table; /* destID allocation table */ }; +enum rio_link_speed { + RIO_LINK_DOWN = 0, /* SRIO Link not initialized */ + RIO_LINK_125 = 1, /* 1.25 GBaud */ + RIO_LINK_250 = 2, /* 2.5 GBaud */ + RIO_LINK_312 = 3, /* 3.125 GBaud */ + RIO_LINK_500 = 4, /* 5.0 GBaud */ + RIO_LINK_625 = 5 /* 6.25 GBaud */ +}; + +enum rio_link_width { + RIO_LINK_1X = 0, + RIO_LINK_1XR = 1, + RIO_LINK_2X = 3, + RIO_LINK_4X = 2, + RIO_LINK_8X = 4, + RIO_LINK_16X = 5 +}; + +enum rio_mport_flags { + RIO_MPORT_DMA = (1 << 0), /* supports DMA data transfers */ + RIO_MPORT_DMA_SG = (1 << 1), /* DMA supports HW SG mode */ + RIO_MPORT_IBSG = (1 << 2), /* inbound mapping supports SG */ +}; + +/** + * struct rio_mport_attr - RIO mport device attributes + * @flags: mport device capability flags + * @link_speed: SRIO link speed value (as defined by RapidIO specification) + * @link_width: SRIO link width value (as defined by RapidIO specification) + * @dma_max_sge: number of SG list entries that can be handled by DMA channel(s) + * @dma_max_size: max number of bytes in single DMA transfer (SG entry) + * @dma_align: alignment shift for DMA operations (as for other DMA operations) + */ +struct rio_mport_attr { + int flags; + int link_speed; + int link_width; + + /* DMA capability info: valid only if RIO_MPORT_DMA flag is set */ + int dma_max_sge; + int dma_max_size; + int dma_align; +}; + /* Low-level architecture-dependent routines */ /** @@ -333,6 +377,7 @@ struct rio_net { * @get_inb_message: Callback to get a message from an inbound mailbox queue. * @map_inb: Callback to map RapidIO address region into local memory space. * @unmap_inb: Callback to unmap RapidIO address region mapped with map_inb(). + * @query_mport: Callback to query mport device attributes. 
*/ struct rio_ops { int (*lcread) (struct rio_mport *mport, int index, u32 offset, int len, @@ -358,6 +403,8 @@ struct rio_ops { int (*map_inb)(struct rio_mport *mport, dma_addr_t lstart, u64 rstart, u32 size, u32 flags); void (*unmap_inb)(struct rio_mport *mport, dma_addr_t lstart); + int (*query_mport)(struct rio_mport *mport, + struct rio_mport_attr *attr); }; #define RIO_RESOURCE_MEM 0x00000100 @@ -481,5 +528,7 @@ extern int rio_open_inb_mbox(struct rio_mport *, void *, int, int); extern void rio_close_inb_mbox(struct rio_mport *, int); extern int rio_open_outb_mbox(struct rio_mport *, void *, int, int); extern void rio_close_outb_mbox(struct rio_mport *, int); +extern int rio_query_mport(struct rio_mport *port, + struct rio_mport_attr *mport_attr); #endif /* LINUX_RIO_H */ diff --git a/include/linux/rio_regs.h b/include/linux/rio_regs.h index 218168a2b5e9..1063ae382bc2 100644 --- a/include/linux/rio_regs.h +++ b/include/linux/rio_regs.h @@ -238,6 +238,8 @@ #define RIO_PORT_N_ACK_INBOUND 0x3f000000 #define RIO_PORT_N_ACK_OUTSTAND 0x00003f00 #define RIO_PORT_N_ACK_OUTBOUND 0x0000003f +#define RIO_PORT_N_CTL2_CSR(x) (0x0054 + x*0x20) +#define RIO_PORT_N_CTL2_SEL_BAUD 0xf0000000 #define RIO_PORT_N_ERR_STS_CSR(x) (0x0058 + x*0x20) #define RIO_PORT_N_ERR_STS_PW_OUT_ES 0x00010000 /* Output Error-stopped */ #define RIO_PORT_N_ERR_STS_PW_INP_ES 0x00000100 /* Input Error-stopped */ @@ -249,6 +251,7 @@ #define RIO_PORT_N_CTL_PWIDTH 0xc0000000 #define RIO_PORT_N_CTL_PWIDTH_1 0x00000000 #define RIO_PORT_N_CTL_PWIDTH_4 0x40000000 +#define RIO_PORT_N_CTL_IPW 0x38000000 /* Initialized Port Width */ #define RIO_PORT_N_CTL_P_TYP_SER 0x00000001 #define RIO_PORT_N_CTL_LOCKOUT 0x00000002 #define RIO_PORT_N_CTL_EN_RX_SER 0x00200000 From dbe74afe6a66d1606f5cb39f2ad05c39497542e6 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:02 -0700 Subject: [PATCH 35/81] rapidio/tsi721: add query_mport callback Add a device-specific implementation of the query_mport callback function. Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721.c | 34 ++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index d463d2cbd0af..cd40f0f9fbfe 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -2287,6 +2287,39 @@ static int tsi721_messages_init(struct tsi721_device *priv) return 0; } +/** + * tsi721_query_mport - Fetch RapidIO master port attributes + * @mport: RapidIO master port to query + * @attr: mport attributes data structure to fill in + * + * Returns 0 on success.
+ */ +static int tsi721_query_mport(struct rio_mport *mport, + struct rio_mport_attr *attr) +{ + struct tsi721_device *priv = mport->priv; + u32 rval; + + rval = ioread32(priv->regs + (0x100 + RIO_PORT_N_ERR_STS_CSR(0))); + if (rval & RIO_PORT_N_ERR_STS_PORT_OK) { + rval = ioread32(priv->regs + (0x100 + RIO_PORT_N_CTL2_CSR(0))); + attr->link_speed = (rval & RIO_PORT_N_CTL2_SEL_BAUD) >> 28; + rval = ioread32(priv->regs + (0x100 + RIO_PORT_N_CTL_CSR(0))); + attr->link_width = (rval & RIO_PORT_N_CTL_IPW) >> 27; + } else + attr->link_speed = RIO_LINK_DOWN; + +#ifdef CONFIG_RAPIDIO_DMA_ENGINE + attr->flags = RIO_MPORT_DMA | RIO_MPORT_DMA_SG; + attr->dma_max_sge = 0; + attr->dma_max_size = TSI721_BDMA_MAX_BCOUNT; + attr->dma_align = 0; +#else + attr->flags = 0; +#endif + return 0; +} + /** * tsi721_disable_ints - disables all device interrupts * @priv: pointer to tsi721 private data @@ -2372,6 +2405,7 @@ static int tsi721_setup_mport(struct tsi721_device *priv) ops->get_inb_message = tsi721_get_inb_message; ops->map_inb = tsi721_rio_map_inb_mem; ops->unmap_inb = tsi721_rio_unmap_inb_mem; + ops->query_mport = tsi721_query_mport; mport = kzalloc(sizeof(struct rio_mport), GFP_KERNEL); if (!mport) { From 83dc2cbc1136108068637d5c98636647f7b82461 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:05 -0700 Subject: [PATCH 36/81] rapidio: add shutdown notification for RapidIO devices Add bus-specific callback to stop RapidIO devices during a system shutdown. Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio-driver.c | 12 ++++++++++++ include/linux/rio.h | 2 ++ 2 files changed, 14 insertions(+) diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c index f301f059bb85..128350f4d17a 100644 --- a/drivers/rapidio/rio-driver.c +++ b/drivers/rapidio/rio-driver.c @@ -131,6 +131,17 @@ static int rio_device_remove(struct device *dev) return 0; } +static void rio_device_shutdown(struct device *dev) +{ + struct rio_dev *rdev = to_rio_dev(dev); + struct rio_driver *rdrv = rdev->driver; + + dev_dbg(dev, "RIO: %s\n", __func__); + + if (rdrv && rdrv->shutdown) + rdrv->shutdown(rdev); +} + /** * rio_register_driver - register a new RIO driver * @rdrv: the RIO driver structure to register @@ -229,6 +240,7 @@ struct bus_type rio_bus_type = { .bus_groups = rio_bus_groups, .probe = rio_device_probe, .remove = rio_device_remove, + .shutdown = rio_device_shutdown, .uevent = rio_uevent, }; diff --git a/include/linux/rio.h b/include/linux/rio.h index 8996a629a321..c64a0baf37c8 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -423,6 +423,7 @@ struct rio_ops { * @id_table: RIO device ids to be associated with this driver * @probe: RIO device inserted * @remove: RIO device removed + * @shutdown: shutdown notification callback * @suspend: RIO device suspended * @resume: RIO device awakened * @enable_wake: RIO device enable wake event @@ -437,6 +438,7 @@ struct rio_driver { const struct rio_device_id *id_table; int (*probe) (struct rio_dev * dev, const struct rio_device_id * id); void (*remove) (struct rio_dev * dev); + void (*shutdown)(struct rio_dev *dev); int (*suspend) (struct rio_dev * dev, u32 state); int (*resume) (struct rio_dev * dev); int (*enable_wake) (struct rio_dev * dev, u32 state, int enable); From e3dd8cd4778ebc75c654f73d8673281078c81ed9 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:08 -0700 Subject: [PATCH 
37/81] rapidio/tsi721: add shutdown notification callback Add device driver specific shutdown notification callback. Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721.c | 15 ++++++++++++++ drivers/rapidio/devices/tsi721.h | 3 +++ drivers/rapidio/devices/tsi721_dma.c | 30 ++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index cd40f0f9fbfe..1fc9663ae3b5 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -2638,6 +2638,8 @@ static int tsi721_probe(struct pci_dev *pdev, if (err) goto err_free_consistent; + pci_set_drvdata(pdev, priv); + return 0; err_free_consistent: @@ -2660,6 +2662,18 @@ err_exit: return err; } +static void tsi721_shutdown(struct pci_dev *pdev) +{ + struct tsi721_device *priv = pci_get_drvdata(pdev); + + dev_dbg(&pdev->dev, "RIO: %s\n", __func__); + + tsi721_disable_ints(priv); + tsi721_dma_stop_all(priv); + pci_clear_master(pdev); + pci_disable_device(pdev); +} + static const struct pci_device_id tsi721_pci_tbl[] = { { PCI_DEVICE(PCI_VENDOR_ID_IDT, PCI_DEVICE_ID_TSI721) }, { 0, } /* terminate list */ @@ -2671,6 +2685,7 @@ static struct pci_driver tsi721_driver = { .name = "tsi721", .id_table = tsi721_pci_tbl, .probe = tsi721_probe, + .shutdown = tsi721_shutdown, }; static int __init tsi721_init(void) diff --git a/drivers/rapidio/devices/tsi721.h b/drivers/rapidio/devices/tsi721.h index d675a44ffc17..ce2fb1193869 100644 --- a/drivers/rapidio/devices/tsi721.h +++ b/drivers/rapidio/devices/tsi721.h @@ -866,6 +866,9 @@ struct tsi721_device { #ifdef CONFIG_RAPIDIO_DMA_ENGINE extern void tsi721_bdma_handler(struct tsi721_bdma_chan *bdma_chan); extern int tsi721_register_dma(struct tsi721_device *priv); +extern void tsi721_dma_stop_all(struct tsi721_device *priv); +#else +#define tsi721_dma_stop_all(priv) do {} while (0) #endif #endif diff --git a/drivers/rapidio/devices/tsi721_dma.c b/drivers/rapidio/devices/tsi721_dma.c index 500e1e044c36..b490ec36f018 100644 --- a/drivers/rapidio/devices/tsi721_dma.c +++ b/drivers/rapidio/devices/tsi721_dma.c @@ -852,6 +852,36 @@ static int tsi721_terminate_all(struct dma_chan *dchan) return 0; } +static void tsi721_dma_stop(struct tsi721_bdma_chan *bdma_chan) +{ + if (!bdma_chan->active) + return; + spin_lock_bh(&bdma_chan->lock); + if (!tsi721_dma_is_idle(bdma_chan)) { + int timeout = 100000; + + /* stop the transfer in progress */ + iowrite32(TSI721_DMAC_CTL_SUSP, + bdma_chan->regs + TSI721_DMAC_CTL); + + /* Wait until DMA channel stops */ + while (!tsi721_dma_is_idle(bdma_chan) && --timeout) + udelay(1); + } + + spin_unlock_bh(&bdma_chan->lock); +} + +void tsi721_dma_stop_all(struct tsi721_device *priv) +{ + int i; + + for (i = 0; i < TSI721_DMA_MAXCH; i++) { + if (i != TSI721_DMACH_MAINT) + tsi721_dma_stop(&priv->bdma[i]); + } +} + int tsi721_register_dma(struct tsi721_device *priv) { int i; From f41e2472ba115275438161fd72d40b2448963658 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:11 -0700 Subject: [PATCH 38/81] rapidio/rionet: add shutdown event handling Add shutdown notification handler which terminates active connections with remote RapidIO nodes. This prevents remote nodes from sending packets to the powered off node and eliminates hardware error events on remote nodes. 
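The handler reuses the driver's existing RIONET_DOORBELL_LEAVE protocol, so remote nodes drop the departing node from their active[] tables before the link disappears; the per-peer step reduces to (condensed from the diff below):

	list_for_each_entry_safe(peer, tmp, &nets[i].peers, node) {
		if (nets[i].active[peer->rdev->destid]) {
			rio_send_doorbell(peer->rdev, RIONET_DOORBELL_LEAVE);
			nets[i].active[peer->rdev->destid] = NULL;
		}
	}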
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/rionet.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index d3d6e35a30e0..f994fa1fde2f 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -24,6 +24,7 @@ #include #include #include +#include #define DRV_NAME "rionet" #define DRV_VERSION "0.3" @@ -599,6 +600,30 @@ out: return rc; } +static int rionet_shutdown(struct notifier_block *nb, unsigned long code, + void *unused) +{ + struct rionet_peer *peer, *tmp; + int i; + + pr_debug("%s: %s\n", DRV_NAME, __func__); + + for (i = 0; i < RIONET_MAX_NETS; i++) { + if (!nets[i].ndev) + continue; + + list_for_each_entry_safe(peer, tmp, &nets[i].peers, node) { + if (nets[i].active[peer->rdev->destid]) { + rio_send_doorbell(peer->rdev, + RIONET_DOORBELL_LEAVE); + nets[i].active[peer->rdev->destid] = NULL; + } + } + } + + return NOTIFY_DONE; +} + #ifdef MODULE static struct rio_device_id rionet_id_table[] = { {RIO_DEVICE(RIO_ANY_ID, RIO_ANY_ID)}, @@ -615,8 +640,20 @@ static struct subsys_interface rionet_interface = { .remove_dev = rionet_remove_dev, }; +static struct notifier_block rionet_notifier = { + .notifier_call = rionet_shutdown, +}; + static int __init rionet_init(void) { + int ret; + + ret = register_reboot_notifier(&rionet_notifier); + if (ret) { + pr_err("%s: failed to register reboot notifier (err=%d)\n", + DRV_NAME, ret); + return ret; + } return subsys_interface_register(&rionet_interface); } @@ -648,6 +685,7 @@ static void __exit rionet_exit(void) } } + unregister_reboot_notifier(&rionet_notifier); subsys_interface_unregister(&rionet_interface); } From b74ec56e8ae8759d83448528bde587d00908b4a9 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:14 -0700 Subject: [PATCH 39/81] rapidio: rework common RIO device add/delete routines This patch moves per-net device list handling from rio-scan to common RapidIO core and adds a matching device deletion routine. This makes device object creation/removal available to other implementations of enumeration/discovery process. 
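After this rework an enumerator no longer manipulates net->devices or net->switches by hand; it points rdev->net at the right network and lets the core helpers keep the global and per-net lists consistent under rio_global_list_lock. A rough sketch of the resulting calling pattern, with the my_enum_* names purely illustrative:

        /* Sketch: enumerator-side add/remove using the reworked core API. */
        static int my_enum_add(struct rio_net *net, struct rio_dev *rdev)
        {
                rdev->net = net;        /* rio_add_device() links the device
                                         * into net->devices (and a switch
                                         * into net->switches as well) */
                return rio_add_device(rdev);
        }

        static void my_enum_del(struct rio_dev *rdev)
        {
                rio_del_device(rdev);   /* unlinks the lists, removes sysfs
                                         * files, unregisters the device */
        }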
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio-scan.c | 13 ++----------- drivers/rapidio/rio.c | 33 ++++++++++++++++++++++++++++++++- drivers/rapidio/rio.h | 2 ++ 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index d6a126c17c03..c46763185849 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -449,9 +449,6 @@ static struct rio_dev *rio_setup_device(struct rio_net *net, if (do_enum) rio_route_clr_table(rdev, RIO_GLOBAL_TABLE, 0); - - list_add_tail(&rswitch->node, &net->switches); - } else { if (do_enum) /*Enable Input Output Port (transmitter reviever)*/ @@ -463,11 +460,7 @@ static struct rio_dev *rio_setup_device(struct rio_net *net, rdev->dev.parent = &port->dev; rio_attach_device(rdev); - - device_initialize(&rdev->dev); rdev->dev.release = rio_release_dev; - rio_dev_get(rdev); - rdev->dma_mask = DMA_BIT_MASK(32); rdev->dev.dma_mask = &rdev->dma_mask; rdev->dev.coherent_dma_mask = DMA_BIT_MASK(32); @@ -480,6 +473,8 @@ static struct rio_dev *rio_setup_device(struct rio_net *net, if (ret) goto cleanup; + rio_dev_get(rdev); + return rdev; cleanup: @@ -621,8 +616,6 @@ static int rio_enum_peer(struct rio_net *net, struct rio_mport *port, rdev = rio_setup_device(net, port, RIO_ANY_DESTID(port->sys_size), hopcount, 1); if (rdev) { - /* Add device to the global and bus/net specific list. */ - list_add_tail(&rdev->net_list, &net->devices); rdev->prev = prev; if (prev && rio_is_switch(prev)) prev->rswitch->nextdev[prev_port] = rdev; @@ -778,8 +771,6 @@ rio_disc_peer(struct rio_net *net, struct rio_mport *port, u16 destid, /* Setup new RIO device */ if ((rdev = rio_setup_device(net, port, destid, hopcount, 0))) { - /* Add device to the global and bus/net specific list. */ - list_add_tail(&rdev->net_list, &net->devices); rdev->prev = prev; if (prev && rio_is_switch(prev)) prev->rswitch->nextdev[prev_port] = rdev; diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index c72f4da8065e..0be86f4eea5d 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -96,12 +96,18 @@ int rio_add_device(struct rio_dev *rdev) { int err; - err = device_add(&rdev->dev); + err = device_register(&rdev->dev); if (err) return err; spin_lock(&rio_global_list_lock); list_add_tail(&rdev->global_list, &rio_devices); + if (rdev->net) { + list_add_tail(&rdev->net_list, &rdev->net->devices); + if (rdev->pef & RIO_PEF_SWITCH) + list_add_tail(&rdev->rswitch->node, + &rdev->net->switches); + } spin_unlock(&rio_global_list_lock); rio_create_sysfs_dev_files(rdev); @@ -110,6 +116,31 @@ int rio_add_device(struct rio_dev *rdev) } EXPORT_SYMBOL_GPL(rio_add_device); +/* + * rio_del_device - removes a RIO device from the device model + * @rdev: RIO device + * + * Removes the RIO device to the kernel device list and subsystem's device list. + * Clears sysfs entries for the removed device. 
+ */ +void rio_del_device(struct rio_dev *rdev) +{ + pr_debug("RIO: %s: removing %s\n", __func__, rio_name(rdev)); + spin_lock(&rio_global_list_lock); + list_del(&rdev->global_list); + if (rdev->net) { + list_del(&rdev->net_list); + if (rdev->pef & RIO_PEF_SWITCH) { + list_del(&rdev->rswitch->node); + kfree(rdev->rswitch->route_table); + } + } + spin_unlock(&rio_global_list_lock); + rio_remove_sysfs_dev_files(rdev); + device_unregister(&rdev->dev); +} +EXPORT_SYMBOL_GPL(rio_del_device); + /** * rio_request_inb_mbox - request inbound mailbox service * @mport: RIO master port from which to allocate the mailbox resource diff --git a/drivers/rapidio/rio.h b/drivers/rapidio/rio.h index 2d0550e08ea2..da0f604a834e 100644 --- a/drivers/rapidio/rio.h +++ b/drivers/rapidio/rio.h @@ -28,6 +28,7 @@ extern u32 rio_mport_get_efb(struct rio_mport *port, int local, u16 destid, extern int rio_mport_chk_dev_access(struct rio_mport *mport, u16 destid, u8 hopcount); extern int rio_create_sysfs_dev_files(struct rio_dev *rdev); +extern void rio_remove_sysfs_dev_files(struct rio_dev *rdev); extern int rio_lock_device(struct rio_mport *port, u16 destid, u8 hopcount, int wait_ms); extern int rio_unlock_device(struct rio_mport *port, u16 destid, u8 hopcount); @@ -39,6 +40,7 @@ extern int rio_route_clr_table(struct rio_dev *rdev, u16 table, int lock); extern int rio_set_port_lockout(struct rio_dev *rdev, u32 pnum, int lock); extern struct rio_dev *rio_get_comptag(u32 comp_tag, struct rio_dev *from); extern int rio_add_device(struct rio_dev *rdev); +extern void rio_del_device(struct rio_dev *rdev); extern int rio_enable_rx_tx_port(struct rio_mport *port, int local, u16 destid, u8 hopcount, u8 port_num); extern int rio_register_scan(int mport_id, struct rio_scan *scan_ops); From e6b585ca6e81badeb3d42db3cc408174f2826034 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:17 -0700 Subject: [PATCH 40/81] rapidio: move net allocation into core code Make net allocation/release routines available to all components of RapidIO subsystem by moving code from rio-scan enumerator. Make destination ID allocation method private to existing enumerator because other enumeration methods can use their own algorithm. Setup net device object as a parent of all RapidIO devices residing in it and register net as a child of active mport device. 
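The division of labour after this change: the core owns the generic rio_net object and its registration, while each enumerator hangs its private state off net->enum_data and supplies a release callback so that rio_free_net() can dispose of it. The intended setup sequence, reconstructed from the rio-scan conversion in the diff below (the my_* names are placeholders):

        /* Sketch of enumerator-side net setup with the new core helpers. */
        struct rio_net *net = rio_alloc_net(mport);

        if (net) {
                net->enum_data = my_private_state;   /* enumerator-specific */
                net->release = my_release_enum_data; /* called by rio_free_net() */
                net->id = mport->id;
                net->hport = mport;
                dev_set_name(&net->dev, "rnet_%d", net->id);
                net->dev.parent = &mport->dev;     /* net is a child of the mport */
                net->dev.release = my_release_dev; /* must free the rio_net itself */
                rio_add_net(net);                  /* device_register + global list */
        }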
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio-scan.c | 108 +++++++++++++++++++++++-------------- drivers/rapidio/rio.c | 53 ++++++++++++++++++ drivers/rapidio/rio.h | 3 ++ include/linux/rio.h | 18 +++---- 4 files changed, 133 insertions(+), 49 deletions(-) diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index c46763185849..764dc5fd30b2 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -39,6 +39,13 @@ static void rio_init_em(struct rio_dev *rdev); +struct rio_id_table { + u16 start; /* logical minimal id */ + u32 max; /* max number of IDs in table */ + spinlock_t lock; + unsigned long table[0]; +}; + static int next_destid = 0; static int next_comptag = 1; @@ -62,7 +69,7 @@ static int rio_mport_phys_table[] = { static u16 rio_destid_alloc(struct rio_net *net) { int destid; - struct rio_id_table *idtab = &net->destid_table; + struct rio_id_table *idtab = (struct rio_id_table *)net->enum_data; spin_lock(&idtab->lock); destid = find_first_zero_bit(idtab->table, idtab->max); @@ -88,7 +95,7 @@ static u16 rio_destid_alloc(struct rio_net *net) static int rio_destid_reserve(struct rio_net *net, u16 destid) { int oldbit; - struct rio_id_table *idtab = &net->destid_table; + struct rio_id_table *idtab = (struct rio_id_table *)net->enum_data; destid -= idtab->start; spin_lock(&idtab->lock); @@ -106,7 +113,7 @@ static int rio_destid_reserve(struct rio_net *net, u16 destid) */ static void rio_destid_free(struct rio_net *net, u16 destid) { - struct rio_id_table *idtab = &net->destid_table; + struct rio_id_table *idtab = (struct rio_id_table *)net->enum_data; destid -= idtab->start; spin_lock(&idtab->lock); @@ -121,7 +128,7 @@ static void rio_destid_free(struct rio_net *net, u16 destid) static u16 rio_destid_first(struct rio_net *net) { int destid; - struct rio_id_table *idtab = &net->destid_table; + struct rio_id_table *idtab = (struct rio_id_table *)net->enum_data; spin_lock(&idtab->lock); destid = find_first_bit(idtab->table, idtab->max); @@ -141,7 +148,7 @@ static u16 rio_destid_first(struct rio_net *net) static u16 rio_destid_next(struct rio_net *net, u16 from) { int destid; - struct rio_id_table *idtab = &net->destid_table; + struct rio_id_table *idtab = (struct rio_id_table *)net->enum_data; spin_lock(&idtab->lock); destid = find_next_bit(idtab->table, idtab->max, from); @@ -458,7 +465,7 @@ static struct rio_dev *rio_setup_device(struct rio_net *net, rdev->comp_tag & RIO_CTAG_UDEVID); } - rdev->dev.parent = &port->dev; + rdev->dev.parent = &net->dev; rio_attach_device(rdev); rdev->dev.release = rio_release_dev; rdev->dma_mask = DMA_BIT_MASK(32); @@ -855,50 +862,71 @@ static int rio_mport_is_active(struct rio_mport *port) return result & RIO_PORT_N_ERR_STS_PORT_OK; } -/** - * rio_alloc_net- Allocate and configure a new RIO network - * @port: Master port associated with the RIO network - * @do_enum: Enumeration/Discovery mode flag - * @start: logical minimal start id for new net - * - * Allocates a RIO network structure, initializes per-network - * list heads, and adds the associated master port to the - * network list of associated master ports. Returns a - * RIO network pointer on success or %NULL on failure. 
- */ -static struct rio_net *rio_alloc_net(struct rio_mport *port, - int do_enum, u16 start) +static void rio_scan_release_net(struct rio_net *net) +{ + pr_debug("RIO-SCAN: %s: net_%d\n", __func__, net->id); + kfree(net->enum_data); +} + +static void rio_scan_release_dev(struct device *dev) { struct rio_net *net; - net = kzalloc(sizeof(struct rio_net), GFP_KERNEL); - if (net && do_enum) { - net->destid_table.table = kcalloc( - BITS_TO_LONGS(RIO_MAX_ROUTE_ENTRIES(port->sys_size)), - sizeof(long), - GFP_KERNEL); + net = to_rio_net(dev); + pr_debug("RIO-SCAN: %s: net_%d\n", __func__, net->id); + kfree(net); +} - if (net->destid_table.table == NULL) { +/* + * rio_scan_alloc_net - Allocate and configure a new RIO network + * @mport: Master port associated with the RIO network + * @do_enum: Enumeration/Discovery mode flag + * @start: logical minimal start id for new net + * + * Allocates a new RIO network structure and initializes enumerator-specific + * part of it (if required). + * Returns a RIO network pointer on success or %NULL on failure. + */ +static struct rio_net *rio_scan_alloc_net(struct rio_mport *mport, + int do_enum, u16 start) +{ + struct rio_net *net; + + net = rio_alloc_net(mport); + + if (net && do_enum) { + struct rio_id_table *idtab; + size_t size; + + size = sizeof(struct rio_id_table) + + BITS_TO_LONGS( + RIO_MAX_ROUTE_ENTRIES(mport->sys_size) + ) * sizeof(long); + + idtab = kzalloc(size, GFP_KERNEL); + + if (idtab == NULL) { pr_err("RIO: failed to allocate destID table\n"); - kfree(net); + rio_free_net(net); net = NULL; } else { - net->destid_table.start = start; - net->destid_table.max = - RIO_MAX_ROUTE_ENTRIES(port->sys_size); - spin_lock_init(&net->destid_table.lock); + net->enum_data = idtab; + net->release = rio_scan_release_net; + idtab->start = start; + idtab->max = RIO_MAX_ROUTE_ENTRIES(mport->sys_size); + spin_lock_init(&idtab->lock); } } if (net) { - INIT_LIST_HEAD(&net->node); - INIT_LIST_HEAD(&net->devices); - INIT_LIST_HEAD(&net->switches); - INIT_LIST_HEAD(&net->mports); - list_add_tail(&port->nnode, &net->mports); - net->hport = port; - net->id = port->id; + net->id = mport->id; + net->hport = mport; + dev_set_name(&net->dev, "rnet_%d", net->id); + net->dev.parent = &mport->dev; + net->dev.release = rio_scan_release_dev; + rio_add_net(net); } + return net; } @@ -1007,7 +1035,7 @@ static int rio_enum_mport(struct rio_mport *mport, u32 flags) /* If master port has an active link, allocate net and enum peers */ if (rio_mport_is_active(mport)) { - net = rio_alloc_net(mport, 1, 0); + net = rio_scan_alloc_net(mport, 1, 0); if (!net) { printk(KERN_ERR "RIO: failed to allocate new net\n"); rc = -ENOMEM; @@ -1124,7 +1152,7 @@ static int rio_disc_mport(struct rio_mport *mport, u32 flags) enum_done: pr_debug("RIO: ... 
enumeration done\n"); - net = rio_alloc_net(mport, 0, 0); + net = rio_scan_alloc_net(mport, 0, 0); if (!net) { printk(KERN_ERR "RIO: Failed to allocate new net\n"); goto bail; diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 0be86f4eea5d..5fd68cb4b610 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -42,6 +42,7 @@ MODULE_PARM_DESC(hdid, "Destination ID assignment to local RapidIO controllers"); static LIST_HEAD(rio_devices); +static LIST_HEAD(rio_nets); static DEFINE_SPINLOCK(rio_global_list_lock); static LIST_HEAD(rio_mports); @@ -84,6 +85,58 @@ int rio_query_mport(struct rio_mport *port, } EXPORT_SYMBOL(rio_query_mport); +/** + * rio_alloc_net- Allocate and initialize a new RIO network data structure + * @mport: Master port associated with the RIO network + * + * Allocates a RIO network structure, initializes per-network + * list heads, and adds the associated master port to the + * network list of associated master ports. Returns a + * RIO network pointer on success or %NULL on failure. + */ +struct rio_net *rio_alloc_net(struct rio_mport *mport) +{ + struct rio_net *net; + + net = kzalloc(sizeof(struct rio_net), GFP_KERNEL); + if (net) { + INIT_LIST_HEAD(&net->node); + INIT_LIST_HEAD(&net->devices); + INIT_LIST_HEAD(&net->switches); + INIT_LIST_HEAD(&net->mports); + mport->net = net; + } + return net; +} +EXPORT_SYMBOL_GPL(rio_alloc_net); + +int rio_add_net(struct rio_net *net) +{ + int err; + + err = device_register(&net->dev); + if (err) + return err; + spin_lock(&rio_global_list_lock); + list_add_tail(&net->node, &rio_nets); + spin_unlock(&rio_global_list_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(rio_add_net); + +void rio_free_net(struct rio_net *net) +{ + spin_lock(&rio_global_list_lock); + if (!list_empty(&net->node)) + list_del(&net->node); + spin_unlock(&rio_global_list_lock); + if (net->release) + net->release(net); + device_unregister(&net->dev); +} +EXPORT_SYMBOL_GPL(rio_free_net); + /** * rio_add_device- Adds a RIO device to the device model * @rdev: RIO device diff --git a/drivers/rapidio/rio.h b/drivers/rapidio/rio.h index da0f604a834e..68e7852dd1d8 100644 --- a/drivers/rapidio/rio.h +++ b/drivers/rapidio/rio.h @@ -39,6 +39,9 @@ extern int rio_route_get_entry(struct rio_dev *rdev, u16 table, extern int rio_route_clr_table(struct rio_dev *rdev, u16 table, int lock); extern int rio_set_port_lockout(struct rio_dev *rdev, u32 pnum, int lock); extern struct rio_dev *rio_get_comptag(u32 comp_tag, struct rio_dev *from); +extern struct rio_net *rio_alloc_net(struct rio_mport *mport); +extern int rio_add_net(struct rio_net *net); +extern void rio_free_net(struct rio_net *net); extern int rio_add_device(struct rio_dev *rdev); extern void rio_del_device(struct rio_dev *rdev); extern int rio_enable_rx_tx_port(struct rio_mport *port, int local, u16 destid, diff --git a/include/linux/rio.h b/include/linux/rio.h index c64a0baf37c8..36086bf7e6f9 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -202,6 +202,7 @@ struct rio_dev { #define to_rio_dev(n) container_of(n, struct rio_dev, dev) #define sw_to_rio_dev(n) container_of(n, struct rio_dev, rswitch[0]) #define to_rio_mport(n) container_of(n, struct rio_mport, dev) +#define to_rio_net(n) container_of(n, struct rio_net, dev) /** * struct rio_msg - RIO message event @@ -237,6 +238,7 @@ enum rio_phy_type { * @dbells: List of doorbell events * @node: Node in global list of master ports * @nnode: Node in network list of master ports + * @net: RIO net this mport is attached to * @iores: I/O mem 
resource that this master port interface owns * @riores: RIO resources that this master port interfaces owns * @inb_msg: RIO inbound message event descriptors @@ -258,6 +260,7 @@ struct rio_mport { struct list_head dbells; /* list of doorbell events */ struct list_head node; /* node in global list of ports */ struct list_head nnode; /* node in net list of ports */ + struct rio_net *net; /* RIO net this mport is attached to */ struct resource iores; struct resource riores[RIO_MAX_MPORT_RESOURCES]; struct rio_msg inb_msg[RIO_MAX_MBOX]; @@ -287,13 +290,6 @@ struct rio_mport { */ #define RIO_SCAN_ENUM_NO_WAIT 0x00000001 /* Do not wait for enum completed */ -struct rio_id_table { - u16 start; /* logical minimal id */ - u32 max; /* max number of IDs in table */ - spinlock_t lock; - unsigned long *table; -}; - /** * struct rio_net - RIO network info * @node: Node in global list of RIO networks @@ -302,7 +298,9 @@ struct rio_id_table { * @mports: List of master ports accessing this network * @hport: Default port for accessing this network * @id: RIO network ID - * @destid_table: destID allocation table + * @dev: Device object + * @enum_data: private data specific to a network enumerator + * @release: enumerator-specific release callback */ struct rio_net { struct list_head node; /* node in list of networks */ @@ -311,7 +309,9 @@ struct rio_net { struct list_head mports; /* list of ports accessing net */ struct rio_mport *hport; /* primary port for accessing net */ unsigned char id; /* RIO network ID */ - struct rio_id_table destid_table; /* destID allocation table */ + struct device dev; + void *enum_data; /* private data for enumerator of the network */ + void (*release)(struct rio_net *net); }; enum rio_link_speed { From b77a2030dface6ea6b0d900bd8496ef41a9f3323 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:20 -0700 Subject: [PATCH 41/81] rapidio: add core mport removal support Add common mport removal support functions into the RapidIO subsystem core. Changes to the existing mport registration process have been made to avoid race conditions with active subsystem interfaces immediately after mport device registration: part of initialization code from rio_register_mport() have been moved into separate function rio_mport_initialize() to allow to perform mport registration as the final step of setup process. Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio.c | 106 +++++++++++++++++++++++++++++++++--------- drivers/rapidio/rio.h | 2 +- include/linux/rio.h | 18 +++++++ 3 files changed, 102 insertions(+), 24 deletions(-) diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 5fd68cb4b610..03b8c5af72bb 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -149,6 +149,7 @@ int rio_add_device(struct rio_dev *rdev) { int err; + atomic_set(&rdev->state, RIO_DEVICE_RUNNING); err = device_register(&rdev->dev); if (err) return err; @@ -172,13 +173,15 @@ EXPORT_SYMBOL_GPL(rio_add_device); /* * rio_del_device - removes a RIO device from the device model * @rdev: RIO device + * @state: device state to set during removal process * * Removes the RIO device to the kernel device list and subsystem's device list. * Clears sysfs entries for the removed device. 
*/ -void rio_del_device(struct rio_dev *rdev) +void rio_del_device(struct rio_dev *rdev, enum rio_device_state state) { pr_debug("RIO: %s: removing %s\n", __func__, rio_name(rdev)); + atomic_set(&rdev->state, state); spin_lock(&rio_global_list_lock); list_del(&rdev->global_list); if (rdev->net) { @@ -2010,32 +2013,28 @@ static int rio_get_hdid(int index) return hdid[index]; } +int rio_mport_initialize(struct rio_mport *mport) +{ + if (next_portid >= RIO_MAX_MPORTS) { + pr_err("RIO: reached specified max number of mports\n"); + return -ENODEV; + } + + atomic_set(&mport->state, RIO_DEVICE_INITIALIZING); + mport->id = next_portid++; + mport->host_deviceid = rio_get_hdid(mport->id); + mport->nscan = NULL; + + return 0; +} +EXPORT_SYMBOL_GPL(rio_mport_initialize); + int rio_register_mport(struct rio_mport *port) { struct rio_scan_node *scan = NULL; int res = 0; - if (next_portid >= RIO_MAX_MPORTS) { - pr_err("RIO: reached specified max number of mports\n"); - return 1; - } - - port->id = next_portid++; - port->host_deviceid = rio_get_hdid(port->id); - port->nscan = NULL; - - dev_set_name(&port->dev, "rapidio%d", port->id); - port->dev.class = &rio_mport_class; - - res = device_register(&port->dev); - if (res) - dev_err(&port->dev, "RIO: mport%d registration failed ERR=%d\n", - port->id, res); - else - dev_dbg(&port->dev, "RIO: mport%d registered\n", port->id); - mutex_lock(&rio_mport_list_lock); - list_add_tail(&port->node, &rio_mports); /* * Check if there are any registered enumeration/discovery operations @@ -2049,13 +2048,74 @@ int rio_register_mport(struct rio_mport *port) break; } } + + list_add_tail(&port->node, &rio_mports); mutex_unlock(&rio_mport_list_lock); - pr_debug("RIO: %s %s id=%d\n", __func__, port->name, port->id); - return 0; + dev_set_name(&port->dev, "rapidio%d", port->id); + port->dev.class = &rio_mport_class; + atomic_set(&port->state, RIO_DEVICE_RUNNING); + + res = device_register(&port->dev); + if (res) + dev_err(&port->dev, "RIO: mport%d registration failed ERR=%d\n", + port->id, res); + else + dev_dbg(&port->dev, "RIO: registered mport%d\n", port->id); + + return res; } EXPORT_SYMBOL_GPL(rio_register_mport); +static int rio_mport_cleanup_callback(struct device *dev, void *data) +{ + struct rio_dev *rdev = to_rio_dev(dev); + + if (dev->bus == &rio_bus_type) + rio_del_device(rdev, RIO_DEVICE_SHUTDOWN); + return 0; +} + +static int rio_net_remove_children(struct rio_net *net) +{ + /* + * Unregister all RapidIO devices residing on this net (this will + * invoke notification of registered subsystem interfaces as well). + */ + device_for_each_child(&net->dev, NULL, rio_mport_cleanup_callback); + return 0; +} + +int rio_unregister_mport(struct rio_mport *port) +{ + pr_debug("RIO: %s %s id=%d\n", __func__, port->name, port->id); + + /* Transition mport to the SHUTDOWN state */ + if (atomic_cmpxchg(&port->state, + RIO_DEVICE_RUNNING, + RIO_DEVICE_SHUTDOWN) != RIO_DEVICE_RUNNING) { + pr_err("RIO: %s unexpected state transition for mport %s\n", + __func__, port->name); + } + + if (port->net && port->net->hport == port) { + rio_net_remove_children(port->net); + rio_free_net(port->net); + } + + /* + * Unregister all RapidIO devices attached to this mport (this will + * invoke notification of registered subsystem interfaces as well). 
+ */ + mutex_lock(&rio_mport_list_lock); + list_del(&port->node); + mutex_unlock(&rio_mport_list_lock); + device_unregister(&port->dev); + + return 0; +} +EXPORT_SYMBOL_GPL(rio_unregister_mport); + EXPORT_SYMBOL_GPL(rio_local_get_device_id); EXPORT_SYMBOL_GPL(rio_get_device); EXPORT_SYMBOL_GPL(rio_get_asm); diff --git a/drivers/rapidio/rio.h b/drivers/rapidio/rio.h index 68e7852dd1d8..625d09add001 100644 --- a/drivers/rapidio/rio.h +++ b/drivers/rapidio/rio.h @@ -43,7 +43,7 @@ extern struct rio_net *rio_alloc_net(struct rio_mport *mport); extern int rio_add_net(struct rio_net *net); extern void rio_free_net(struct rio_net *net); extern int rio_add_device(struct rio_dev *rdev); -extern void rio_del_device(struct rio_dev *rdev); +extern void rio_del_device(struct rio_dev *rdev, enum rio_device_state state); extern int rio_enable_rx_tx_port(struct rio_mport *port, int local, u16 destid, u8 hopcount, u8 port_num); extern int rio_register_scan(int mport_id, struct rio_scan *scan_ops); diff --git a/include/linux/rio.h b/include/linux/rio.h index 36086bf7e6f9..f833773cdc68 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -137,6 +137,13 @@ struct rio_switch_ops { int (*em_handle) (struct rio_dev *dev, u8 swport); }; +enum rio_device_state { + RIO_DEVICE_INITIALIZING, + RIO_DEVICE_RUNNING, + RIO_DEVICE_GONE, + RIO_DEVICE_SHUTDOWN, +}; + /** * struct rio_dev - RIO device info * @global_list: Node in list of all RIO devices @@ -165,6 +172,7 @@ struct rio_switch_ops { * @destid: Network destination ID (or associated destid for switch) * @hopcount: Hopcount to this device * @prev: Previous RIO device connected to the current one + * @state: device state * @rswitch: struct rio_switch (if valid for this device) */ struct rio_dev { @@ -194,6 +202,7 @@ struct rio_dev { u16 destid; u8 hopcount; struct rio_dev *prev; + atomic_t state; struct rio_switch rswitch[0]; /* RIO switch info */ }; @@ -255,6 +264,7 @@ enum rio_phy_type { * @priv: Master port private data * @dma: DMA device associated with mport * @nscan: RapidIO network enumeration/discovery operations + * @state: mport device state */ struct rio_mport { struct list_head dbells; /* list of doorbell events */ @@ -283,8 +293,14 @@ struct rio_mport { struct dma_device dma; #endif struct rio_scan *nscan; + atomic_t state; }; +static inline int rio_mport_is_running(struct rio_mport *mport) +{ + return atomic_read(&mport->state) == RIO_DEVICE_RUNNING; +} + /* * Enumeration/discovery control flags */ @@ -525,7 +541,9 @@ struct rio_scan_node { }; /* Architecture and hardware-specific functions */ +extern int rio_mport_initialize(struct rio_mport *); extern int rio_register_mport(struct rio_mport *); +extern int rio_unregister_mport(struct rio_mport *); extern int rio_open_inb_mbox(struct rio_mport *, void *, int, int); extern void rio_close_inb_mbox(struct rio_mport *, int); extern int rio_open_outb_mbox(struct rio_mport *, void *, int, int); From 748353cc2d03d0514a3bf6d0752244ce657f197c Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:23 -0700 Subject: [PATCH 42/81] rapidio/tsi721: add HW specific mport removal Add hardware-specific device removal support for Tsi721 PCIe-to-RapidIO bridge. To avoid excessive data type conversions, parameters passed to some internal functions have been revised. Dynamic memory allocations of rio_mport and rio_ops have been replaced to reduce references between data structures. 
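Two structural simplifications drive most of the churn in this patch: the rio_ops table becomes a single static structure instead of a per-device kzalloc(), and the rio_mport is embedded in tsi721_device rather than allocated separately, so the two structures can reach each other without extra bookkeeping. In outline:

        struct tsi721_device {
                struct pci_dev *pdev;
                struct rio_mport mport;         /* embedded, was a pointer */
                /* ... */
        };

        /* private data to mport: plain field access */
        struct rio_mport *mport = &priv->mport;

        /* mport back to private data: still via mport->priv,
         * set up once in tsi721_setup_mport() */
        struct tsi721_device *priv = mport->priv;

Because the mport's struct device now lives inside tsi721_device, it needs its own release callback (tsi721_mport_release below) and must be unregistered before tsi721_remove() finally does kfree(priv).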
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721.c | 302 ++++++++++++++++----------- drivers/rapidio/devices/tsi721.h | 4 +- drivers/rapidio/devices/tsi721_dma.c | 28 ++- 3 files changed, 212 insertions(+), 122 deletions(-) diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index 1fc9663ae3b5..db95d71ba4e9 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -236,16 +236,15 @@ static int tsi721_cwrite_dma(struct rio_mport *mport, int index, u16 destid, /** * tsi721_pw_handler - Tsi721 inbound port-write interrupt handler - * @mport: RapidIO master port structure + * @priv: tsi721 device private structure * * Handles inbound port-write interrupts. Copies PW message from an internal * buffer into PW message FIFO and schedules deferred routine to process * queued messages. */ static int -tsi721_pw_handler(struct rio_mport *mport) +tsi721_pw_handler(struct tsi721_device *priv) { - struct tsi721_device *priv = mport->priv; u32 pw_stat; u32 pw_buf[TSI721_RIO_PW_MSG_SIZE/sizeof(u32)]; @@ -363,16 +362,15 @@ static int tsi721_dsend(struct rio_mport *mport, int index, /** * tsi721_dbell_handler - Tsi721 doorbell interrupt handler - * @mport: RapidIO master port structure + * @priv: tsi721 device-specific data structure * * Handles inbound doorbell interrupts. Copies doorbell entry from an internal * buffer into DB message FIFO and schedules deferred routine to process * queued DBs. */ static int -tsi721_dbell_handler(struct rio_mport *mport) +tsi721_dbell_handler(struct tsi721_device *priv) { - struct tsi721_device *priv = mport->priv; u32 regval; /* Disable IDB interrupts */ @@ -404,7 +402,7 @@ static void tsi721_db_dpc(struct work_struct *work) /* * Process queued inbound doorbells */ - mport = priv->mport; + mport = &priv->mport; wr_ptr = ioread32(priv->regs + TSI721_IDQ_WP(IDB_QUEUE)) % IDB_QSIZE; rd_ptr = ioread32(priv->regs + TSI721_IDQ_RP(IDB_QUEUE)) % IDB_QSIZE; @@ -457,15 +455,14 @@ static void tsi721_db_dpc(struct work_struct *work) /** * tsi721_irqhandler - Tsi721 interrupt handler * @irq: Linux interrupt number - * @ptr: Pointer to interrupt-specific data (mport structure) + * @ptr: Pointer to interrupt-specific data (tsi721_device structure) * * Handles Tsi721 interrupts signaled using MSI and INTA. Checks reported * interrupt events and calls an event-specific handler(s). 
*/ static irqreturn_t tsi721_irqhandler(int irq, void *ptr) { - struct rio_mport *mport = (struct rio_mport *)ptr; - struct tsi721_device *priv = mport->priv; + struct tsi721_device *priv = (struct tsi721_device *)ptr; u32 dev_int; u32 dev_ch_int; u32 intval; @@ -488,7 +485,7 @@ static irqreturn_t tsi721_irqhandler(int irq, void *ptr) intval = ioread32(priv->regs + TSI721_SR_CHINT(IDB_QUEUE)); if (intval & TSI721_SR_CHINT_IDBQRCV) - tsi721_dbell_handler(mport); + tsi721_dbell_handler(priv); else dev_info(&priv->pdev->dev, "Unsupported SR_CH_INT %x\n", intval); @@ -545,7 +542,7 @@ static irqreturn_t tsi721_irqhandler(int irq, void *ptr) /* Service SRIO MAC interrupts */ intval = ioread32(priv->regs + TSI721_RIO_EM_INT_STAT); if (intval & TSI721_RIO_EM_INT_STAT_PW_RX) - tsi721_pw_handler(mport); + tsi721_pw_handler(priv); } #ifdef CONFIG_RAPIDIO_DMA_ENGINE @@ -613,13 +610,13 @@ static void tsi721_interrupts_init(struct tsi721_device *priv) /** * tsi721_omsg_msix - MSI-X interrupt handler for outbound messaging * @irq: Linux interrupt number - * @ptr: Pointer to interrupt-specific data (mport structure) + * @ptr: Pointer to interrupt-specific data (tsi721_device structure) * * Handles outbound messaging interrupts signaled using MSI-X. */ static irqreturn_t tsi721_omsg_msix(int irq, void *ptr) { - struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv; + struct tsi721_device *priv = (struct tsi721_device *)ptr; int mbox; mbox = (irq - priv->msix[TSI721_VECT_OMB0_DONE].vector) % RIO_MAX_MBOX; @@ -630,13 +627,13 @@ static irqreturn_t tsi721_omsg_msix(int irq, void *ptr) /** * tsi721_imsg_msix - MSI-X interrupt handler for inbound messaging * @irq: Linux interrupt number - * @ptr: Pointer to interrupt-specific data (mport structure) + * @ptr: Pointer to interrupt-specific data (tsi721_device structure) * * Handles inbound messaging interrupts signaled using MSI-X. */ static irqreturn_t tsi721_imsg_msix(int irq, void *ptr) { - struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv; + struct tsi721_device *priv = (struct tsi721_device *)ptr; int mbox; mbox = (irq - priv->msix[TSI721_VECT_IMB0_RCV].vector) % RIO_MAX_MBOX; @@ -647,19 +644,19 @@ static irqreturn_t tsi721_imsg_msix(int irq, void *ptr) /** * tsi721_srio_msix - Tsi721 MSI-X SRIO MAC interrupt handler * @irq: Linux interrupt number - * @ptr: Pointer to interrupt-specific data (mport structure) + * @ptr: Pointer to interrupt-specific data (tsi721_device structure) * * Handles Tsi721 interrupts from SRIO MAC. */ static irqreturn_t tsi721_srio_msix(int irq, void *ptr) { - struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv; + struct tsi721_device *priv = (struct tsi721_device *)ptr; u32 srio_int; /* Service SRIO MAC interrupts */ srio_int = ioread32(priv->regs + TSI721_RIO_EM_INT_STAT); if (srio_int & TSI721_RIO_EM_INT_STAT_PW_RX) - tsi721_pw_handler((struct rio_mport *)ptr); + tsi721_pw_handler(priv); return IRQ_HANDLED; } @@ -667,7 +664,7 @@ static irqreturn_t tsi721_srio_msix(int irq, void *ptr) /** * tsi721_sr2pc_ch_msix - Tsi721 MSI-X SR2PC Channel interrupt handler * @irq: Linux interrupt number - * @ptr: Pointer to interrupt-specific data (mport structure) + * @ptr: Pointer to interrupt-specific data (tsi721_device structure) * * Handles Tsi721 interrupts from SR2PC Channel. 
* NOTE: At this moment services only one SR2PC channel associated with inbound @@ -675,13 +672,13 @@ static irqreturn_t tsi721_srio_msix(int irq, void *ptr) */ static irqreturn_t tsi721_sr2pc_ch_msix(int irq, void *ptr) { - struct tsi721_device *priv = ((struct rio_mport *)ptr)->priv; + struct tsi721_device *priv = (struct tsi721_device *)ptr; u32 sr_ch_int; /* Service Inbound DB interrupt from SR2PC channel */ sr_ch_int = ioread32(priv->regs + TSI721_SR_CHINT(IDB_QUEUE)); if (sr_ch_int & TSI721_SR_CHINT_IDBQRCV) - tsi721_dbell_handler((struct rio_mport *)ptr); + tsi721_dbell_handler(priv); /* Clear interrupts */ iowrite32(sr_ch_int, priv->regs + TSI721_SR_CHINT(IDB_QUEUE)); @@ -693,32 +690,31 @@ static irqreturn_t tsi721_sr2pc_ch_msix(int irq, void *ptr) /** * tsi721_request_msix - register interrupt service for MSI-X mode. - * @mport: RapidIO master port structure + * @priv: tsi721 device-specific data structure * * Registers MSI-X interrupt service routines for interrupts that are active * immediately after mport initialization. Messaging interrupt service routines * should be registered during corresponding open requests. */ -static int tsi721_request_msix(struct rio_mport *mport) +static int tsi721_request_msix(struct tsi721_device *priv) { - struct tsi721_device *priv = mport->priv; int err = 0; err = request_irq(priv->msix[TSI721_VECT_IDB].vector, tsi721_sr2pc_ch_msix, 0, - priv->msix[TSI721_VECT_IDB].irq_name, (void *)mport); + priv->msix[TSI721_VECT_IDB].irq_name, (void *)priv); if (err) - goto out; + return err; err = request_irq(priv->msix[TSI721_VECT_PWRX].vector, tsi721_srio_msix, 0, - priv->msix[TSI721_VECT_PWRX].irq_name, (void *)mport); - if (err) - free_irq( - priv->msix[TSI721_VECT_IDB].vector, - (void *)mport); -out: - return err; + priv->msix[TSI721_VECT_PWRX].irq_name, (void *)priv); + if (err) { + free_irq(priv->msix[TSI721_VECT_IDB].vector, (void *)priv); + return err; + } + + return 0; } /** @@ -831,19 +827,18 @@ static int tsi721_enable_msix(struct tsi721_device *priv) } #endif /* CONFIG_PCI_MSI */ -static int tsi721_request_irq(struct rio_mport *mport) +static int tsi721_request_irq(struct tsi721_device *priv) { - struct tsi721_device *priv = mport->priv; int err; #ifdef CONFIG_PCI_MSI if (priv->flags & TSI721_USING_MSIX) - err = tsi721_request_msix(mport); + err = tsi721_request_msix(priv); else #endif err = request_irq(priv->pdev->irq, tsi721_irqhandler, (priv->flags & TSI721_USING_MSI) ? 0 : IRQF_SHARED, - DRV_NAME, (void *)mport); + DRV_NAME, (void *)priv); if (err) dev_err(&priv->pdev->dev, @@ -852,6 +847,17 @@ static int tsi721_request_irq(struct rio_mport *mport) return err; } +static void tsi721_free_irq(struct tsi721_device *priv) +{ +#ifdef CONFIG_PCI_MSI + if (priv->flags & TSI721_USING_MSIX) { + free_irq(priv->msix[TSI721_VECT_IDB].vector, (void *)priv); + free_irq(priv->msix[TSI721_VECT_PWRX].vector, (void *)priv); + } else +#endif + free_irq(priv->pdev->irq, (void *)priv); +} + /** * tsi721_init_pc2sr_mapping - initializes outbound (PCIe->SRIO) * translation regions. @@ -1103,6 +1109,26 @@ static void tsi721_init_sr2pc_mapping(struct tsi721_device *priv) priv->ibwin_cnt = TSI721_IBWIN_NUM; } +/* + * tsi721_close_sr2pc_mapping - closes all active inbound (SRIO->PCIe) + * translation regions. 
+ * @priv: pointer to tsi721 device private data + */ +static void tsi721_close_sr2pc_mapping(struct tsi721_device *priv) +{ + struct tsi721_ib_win *ib_win; + int i; + + /* Disable all active SR2PC inbound windows */ + for (i = 0; i < TSI721_IBWIN_NUM; i++) { + ib_win = &priv->ib_win[i]; + if (ib_win->active) { + iowrite32(0, priv->regs + TSI721_IBWIN_LB(i)); + ib_win->active = false; + } + } +} + /** * tsi721_port_write_init - Inbound port write interface init * @priv: pointer to tsi721 private data @@ -1126,6 +1152,11 @@ static int tsi721_port_write_init(struct tsi721_device *priv) return 0; } +static void tsi721_port_write_free(struct tsi721_device *priv) +{ + kfifo_free(&priv->pw_fifo); +} + static int tsi721_doorbell_init(struct tsi721_device *priv) { /* Outbound Doorbells do not require any setup. @@ -1496,6 +1527,7 @@ tsi721_add_outb_message(struct rio_mport *mport, struct rio_dev *rdev, int mbox, static void tsi721_omsg_handler(struct tsi721_device *priv, int ch) { u32 omsg_int; + struct rio_mport *mport = &priv->mport; spin_lock(&priv->omsg_ring[ch].lock); @@ -1537,7 +1569,7 @@ static void tsi721_omsg_handler(struct tsi721_device *priv, int ch) priv->omsg_ring[ch].sts_rdptr = srd_ptr; iowrite32(srd_ptr, priv->regs + TSI721_OBDMAC_DSRP(ch)); - if (!priv->mport->outb_msg[ch].mcback) + if (!mport->outb_msg[ch].mcback) goto no_sts_update; /* Inform upper layer about transfer completion */ @@ -1564,7 +1596,7 @@ static void tsi721_omsg_handler(struct tsi721_device *priv, int ch) if (tx_slot == priv->omsg_ring[ch].size) tx_slot = 0; BUG_ON(tx_slot >= priv->omsg_ring[ch].size); - priv->mport->outb_msg[ch].mcback(priv->mport, + mport->outb_msg[ch].mcback(mport, priv->omsg_ring[ch].dev_id, ch, tx_slot); } @@ -1587,8 +1619,8 @@ no_sts_update: ioread32(priv->regs + TSI721_OBDMAC_CTL(ch)); /* Inform upper level to clear all pending tx slots */ - if (priv->mport->outb_msg[ch].mcback) - priv->mport->outb_msg[ch].mcback(priv->mport, + if (mport->outb_msg[ch].mcback) + mport->outb_msg[ch].mcback(mport, priv->omsg_ring[ch].dev_id, ch, priv->omsg_ring[ch].tx_slot); /* Synch tx_slot tracking */ @@ -1710,12 +1742,11 @@ static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id, #ifdef CONFIG_PCI_MSI if (priv->flags & TSI721_USING_MSIX) { + int idx = TSI721_VECT_OMB0_DONE + mbox; + /* Request interrupt service if we are in MSI-X mode */ - rc = request_irq( - priv->msix[TSI721_VECT_OMB0_DONE + mbox].vector, - tsi721_omsg_msix, 0, - priv->msix[TSI721_VECT_OMB0_DONE + mbox].irq_name, - (void *)mport); + rc = request_irq(priv->msix[idx].vector, tsi721_omsg_msix, 0, + priv->msix[idx].irq_name, (void *)priv); if (rc) { dev_dbg(&priv->pdev->dev, @@ -1724,18 +1755,16 @@ static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id, goto out_stat; } - rc = request_irq(priv->msix[TSI721_VECT_OMB0_INT + mbox].vector, - tsi721_omsg_msix, 0, - priv->msix[TSI721_VECT_OMB0_INT + mbox].irq_name, - (void *)mport); + idx = TSI721_VECT_OMB0_INT + mbox; + rc = request_irq(priv->msix[idx].vector, tsi721_omsg_msix, 0, + priv->msix[idx].irq_name, (void *)priv); if (rc) { dev_dbg(&priv->pdev->dev, "Unable to allocate MSI-X interrupt for " "MBOX%d-INT\n", mbox); - free_irq( - priv->msix[TSI721_VECT_OMB0_DONE + mbox].vector, - (void *)mport); + idx = TSI721_VECT_OMB0_DONE + mbox; + free_irq(priv->msix[idx].vector, (void *)priv); goto out_stat; } } @@ -1819,9 +1848,9 @@ static void tsi721_close_outb_mbox(struct rio_mport *mport, int mbox) #ifdef CONFIG_PCI_MSI if (priv->flags & TSI721_USING_MSIX) { 
free_irq(priv->msix[TSI721_VECT_OMB0_DONE + mbox].vector, - (void *)mport); + (void *)priv); free_irq(priv->msix[TSI721_VECT_OMB0_INT + mbox].vector, - (void *)mport); + (void *)priv); } #endif /* CONFIG_PCI_MSI */ @@ -1866,6 +1895,7 @@ static void tsi721_imsg_handler(struct tsi721_device *priv, int ch) { u32 mbox = ch - 4; u32 imsg_int; + struct rio_mport *mport = &priv->mport; spin_lock(&priv->imsg_ring[mbox].lock); @@ -1888,8 +1918,8 @@ static void tsi721_imsg_handler(struct tsi721_device *priv, int ch) /* If an IB Msg is received notify the upper layer */ if (imsg_int & TSI721_IBDMAC_INT_DQ_RCV && - priv->mport->inb_msg[mbox].mcback) - priv->mport->inb_msg[mbox].mcback(priv->mport, + mport->inb_msg[mbox].mcback) + mport->inb_msg[mbox].mcback(mport, priv->imsg_ring[mbox].dev_id, mbox, -1); if (!(priv->flags & TSI721_USING_MSIX)) { @@ -1994,7 +2024,7 @@ static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id, * once when first inbound mailbox is requested. */ if (!(priv->flags & TSI721_IMSGID_SET)) { - iowrite32((u32)priv->mport->host_deviceid, + iowrite32((u32)priv->mport.host_deviceid, priv->regs + TSI721_IB_DEVID); priv->flags |= TSI721_IMSGID_SET; } @@ -2025,11 +2055,11 @@ static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id, #ifdef CONFIG_PCI_MSI if (priv->flags & TSI721_USING_MSIX) { + int idx = TSI721_VECT_IMB0_RCV + mbox; + /* Request interrupt service if we are in MSI-X mode */ - rc = request_irq(priv->msix[TSI721_VECT_IMB0_RCV + mbox].vector, - tsi721_imsg_msix, 0, - priv->msix[TSI721_VECT_IMB0_RCV + mbox].irq_name, - (void *)mport); + rc = request_irq(priv->msix[idx].vector, tsi721_imsg_msix, 0, + priv->msix[idx].irq_name, (void *)priv); if (rc) { dev_dbg(&priv->pdev->dev, @@ -2038,10 +2068,9 @@ static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id, goto out_desc; } - rc = request_irq(priv->msix[TSI721_VECT_IMB0_INT + mbox].vector, - tsi721_imsg_msix, 0, - priv->msix[TSI721_VECT_IMB0_INT + mbox].irq_name, - (void *)mport); + idx = TSI721_VECT_IMB0_INT + mbox; + rc = request_irq(priv->msix[idx].vector, tsi721_imsg_msix, 0, + priv->msix[idx].irq_name, (void *)priv); if (rc) { dev_dbg(&priv->pdev->dev, @@ -2049,7 +2078,7 @@ static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id, "IBOX%d-INT\n", mbox); free_irq( priv->msix[TSI721_VECT_IMB0_RCV + mbox].vector, - (void *)mport); + (void *)priv); goto out_desc; } } @@ -2120,9 +2149,9 @@ static void tsi721_close_inb_mbox(struct rio_mport *mport, int mbox) #ifdef CONFIG_PCI_MSI if (priv->flags & TSI721_USING_MSIX) { free_irq(priv->msix[TSI721_VECT_IMB0_RCV + mbox].vector, - (void *)mport); + (void *)priv); free_irq(priv->msix[TSI721_VECT_IMB0_INT + mbox].vector, - (void *)mport); + (void *)priv); } #endif /* CONFIG_PCI_MSI */ @@ -2371,6 +2400,32 @@ static void tsi721_disable_ints(struct tsi721_device *priv) iowrite32(0, priv->regs + TSI721_RIO_EM_DEV_INT_EN); } +static struct rio_ops tsi721_rio_ops = { + .lcread = tsi721_lcread, + .lcwrite = tsi721_lcwrite, + .cread = tsi721_cread_dma, + .cwrite = tsi721_cwrite_dma, + .dsend = tsi721_dsend, + .open_inb_mbox = tsi721_open_inb_mbox, + .close_inb_mbox = tsi721_close_inb_mbox, + .open_outb_mbox = tsi721_open_outb_mbox, + .close_outb_mbox = tsi721_close_outb_mbox, + .add_outb_message = tsi721_add_outb_message, + .add_inb_buffer = tsi721_add_inb_buffer, + .get_inb_message = tsi721_get_inb_message, + .map_inb = tsi721_rio_map_inb_mem, + .unmap_inb = tsi721_rio_unmap_inb_mem, + .pwenable = tsi721_pw_enable, + .query_mport = 
tsi721_query_mport, +}; + +static void tsi721_mport_release(struct device *dev) +{ + struct rio_mport *mport = to_rio_mport(dev); + + dev_dbg(dev, "RIO: %s %s id=%d\n", __func__, mport->name, mport->id); +} + /** * tsi721_setup_mport - Setup Tsi721 as RapidIO subsystem master port * @priv: pointer to tsi721 private data @@ -2381,47 +2436,20 @@ static int tsi721_setup_mport(struct tsi721_device *priv) { struct pci_dev *pdev = priv->pdev; int err = 0; - struct rio_ops *ops; + struct rio_mport *mport = &priv->mport; - struct rio_mport *mport; + err = rio_mport_initialize(mport); + if (err) + return err; - ops = kzalloc(sizeof(struct rio_ops), GFP_KERNEL); - if (!ops) { - dev_dbg(&pdev->dev, "Unable to allocate memory for rio_ops\n"); - return -ENOMEM; - } - - ops->lcread = tsi721_lcread; - ops->lcwrite = tsi721_lcwrite; - ops->cread = tsi721_cread_dma; - ops->cwrite = tsi721_cwrite_dma; - ops->dsend = tsi721_dsend; - ops->open_inb_mbox = tsi721_open_inb_mbox; - ops->close_inb_mbox = tsi721_close_inb_mbox; - ops->open_outb_mbox = tsi721_open_outb_mbox; - ops->close_outb_mbox = tsi721_close_outb_mbox; - ops->add_outb_message = tsi721_add_outb_message; - ops->add_inb_buffer = tsi721_add_inb_buffer; - ops->get_inb_message = tsi721_get_inb_message; - ops->map_inb = tsi721_rio_map_inb_mem; - ops->unmap_inb = tsi721_rio_unmap_inb_mem; - ops->query_mport = tsi721_query_mport; - - mport = kzalloc(sizeof(struct rio_mport), GFP_KERNEL); - if (!mport) { - kfree(ops); - dev_dbg(&pdev->dev, "Unable to allocate memory for mport\n"); - return -ENOMEM; - } - - mport->ops = ops; + mport->ops = &tsi721_rio_ops; mport->index = 0; mport->sys_size = 0; /* small system */ mport->phy_type = RIO_PHY_SERIAL; mport->priv = (void *)priv; mport->phys_efptr = 0x100; mport->dev.parent = &pdev->dev; - priv->mport = mport; + mport->dev.release = tsi721_mport_release; INIT_LIST_HEAD(&mport->dbells); @@ -2443,27 +2471,24 @@ static int tsi721_setup_mport(struct tsi721_device *priv) "MSI/MSI-X is not available. 
Using legacy INTx.\n"); #endif /* CONFIG_PCI_MSI */ - err = tsi721_request_irq(mport); + err = tsi721_request_irq(priv); - if (!err) { - tsi721_interrupts_init(priv); - ops->pwenable = tsi721_pw_enable; - } else { + if (err) { dev_err(&pdev->dev, "Unable to get assigned PCI IRQ " "vector %02X err=0x%x\n", pdev->irq, err); - goto err_exit; + return err; } #ifdef CONFIG_RAPIDIO_DMA_ENGINE - tsi721_register_dma(priv); + err = tsi721_register_dma(priv); + if (err) + goto err_exit; #endif /* Enable SRIO link */ iowrite32(ioread32(priv->regs + TSI721_DEVCTL) | TSI721_DEVCTL_SRBOOT_CMPL, priv->regs + TSI721_DEVCTL); - rio_register_mport(mport); - if (mport->host_deviceid >= 0) iowrite32(RIO_PORT_GEN_HOST | RIO_PORT_GEN_MASTER | RIO_PORT_GEN_DISCOVERED, @@ -2471,11 +2496,16 @@ static int tsi721_setup_mport(struct tsi721_device *priv) else iowrite32(0, priv->regs + (0x100 + RIO_PORT_GEN_CTL_CSR)); + err = rio_register_mport(mport); + if (err) { + tsi721_unregister_dma(priv); + goto err_exit; + } + return 0; err_exit: - kfree(mport); - kfree(ops); + tsi721_free_irq(priv); return err; } @@ -2639,10 +2669,12 @@ static int tsi721_probe(struct pci_dev *pdev, goto err_free_consistent; pci_set_drvdata(pdev, priv); + tsi721_interrupts_init(priv); return 0; err_free_consistent: + tsi721_port_write_free(priv); tsi721_doorbell_free(priv); err_free_bdma: tsi721_bdma_maint_free(priv); @@ -2662,6 +2694,40 @@ err_exit: return err; } +static void tsi721_remove(struct pci_dev *pdev) +{ + struct tsi721_device *priv = pci_get_drvdata(pdev); + + dev_dbg(&pdev->dev, "%s enter\n", __func__); + + tsi721_disable_ints(priv); + tsi721_free_irq(priv); + rio_unregister_mport(&priv->mport); + + tsi721_unregister_dma(priv); + tsi721_bdma_maint_free(priv); + tsi721_doorbell_free(priv); + tsi721_port_write_free(priv); + tsi721_close_sr2pc_mapping(priv); + + if (priv->regs) + iounmap(priv->regs); + if (priv->odb_base) + iounmap(priv->odb_base); +#ifdef CONFIG_PCI_MSI + if (priv->flags & TSI721_USING_MSIX) + pci_disable_msix(priv->pdev); + else if (priv->flags & TSI721_USING_MSI) + pci_disable_msi(priv->pdev); +#endif + pci_release_regions(pdev); + pci_clear_master(pdev); + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + kfree(priv); + dev_dbg(&pdev->dev, "%s exit\n", __func__); +} + static void tsi721_shutdown(struct pci_dev *pdev) { struct tsi721_device *priv = pci_get_drvdata(pdev); @@ -2685,15 +2751,11 @@ static struct pci_driver tsi721_driver = { .name = "tsi721", .id_table = tsi721_pci_tbl, .probe = tsi721_probe, + .remove = tsi721_remove, .shutdown = tsi721_shutdown, }; -static int __init tsi721_init(void) -{ - return pci_register_driver(&tsi721_driver); -} - -device_initcall(tsi721_init); +module_pci_driver(tsi721_driver); MODULE_DESCRIPTION("IDT Tsi721 PCIExpress-to-SRIO bridge driver"); MODULE_AUTHOR("Integrated Device Technology, Inc."); diff --git a/drivers/rapidio/devices/tsi721.h b/drivers/rapidio/devices/tsi721.h index ce2fb1193869..58637295a34c 100644 --- a/drivers/rapidio/devices/tsi721.h +++ b/drivers/rapidio/devices/tsi721.h @@ -824,7 +824,7 @@ struct tsi721_ib_win { struct tsi721_device { struct pci_dev *pdev; - struct rio_mport *mport; + struct rio_mport mport; u32 flags; void __iomem *regs; #ifdef CONFIG_PCI_MSI @@ -866,9 +866,11 @@ struct tsi721_device { #ifdef CONFIG_RAPIDIO_DMA_ENGINE extern void tsi721_bdma_handler(struct tsi721_bdma_chan *bdma_chan); extern int tsi721_register_dma(struct tsi721_device *priv); +extern void tsi721_unregister_dma(struct tsi721_device *priv); extern void 
tsi721_dma_stop_all(struct tsi721_device *priv); #else #define tsi721_dma_stop_all(priv) do {} while (0) +#define tsi721_unregister_dma(priv) do {} while (0) #endif #endif diff --git a/drivers/rapidio/devices/tsi721_dma.c b/drivers/rapidio/devices/tsi721_dma.c index b490ec36f018..31bb61b43c3f 100644 --- a/drivers/rapidio/devices/tsi721_dma.c +++ b/drivers/rapidio/devices/tsi721_dma.c @@ -887,7 +887,7 @@ int tsi721_register_dma(struct tsi721_device *priv) int i; int nr_channels = 0; int err; - struct rio_mport *mport = priv->mport; + struct rio_mport *mport = &priv->mport; INIT_LIST_HEAD(&mport->dma.channels); @@ -937,3 +937,29 @@ int tsi721_register_dma(struct tsi721_device *priv) return err; } + +void tsi721_unregister_dma(struct tsi721_device *priv) +{ + struct rio_mport *mport = &priv->mport; + struct dma_chan *chan, *_c; + struct tsi721_bdma_chan *bdma_chan; + + tsi721_dma_stop_all(priv); + dma_async_device_unregister(&mport->dma); + + list_for_each_entry_safe(chan, _c, &mport->dma.channels, + device_node) { + bdma_chan = to_tsi721_chan(chan); + if (bdma_chan->active) { + tsi721_bdma_interrupt_enable(bdma_chan, 0); + bdma_chan->active = false; + tsi721_sync_dma_irq(bdma_chan); + tasklet_kill(&bdma_chan->tasklet); + INIT_LIST_HEAD(&bdma_chan->free_list); + kfree(bdma_chan->tx_desc); + tsi721_bdma_ch_free(bdma_chan); + } + + list_del(&chan->device_node); + } +} From dd64f4fe6fe5e3924b36ec4bb4d4202af944a452 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:26 -0700 Subject: [PATCH 43/81] powerpc/fsl_rio: changes to mport registration Change mport object initialization/registration sequence to match the reworked version of rio_register_mport() in the core code. Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Benjamin Herrenschmidt Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/sysdev/fsl_rio.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index c1cd3698f534..385371acc0d0 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -606,6 +606,12 @@ int fsl_rio_setup(struct platform_device *dev) if (!port) continue; + rc = rio_mport_initialize(port); + if (rc) { + kfree(port); + continue; + } + i = *port_index - 1; port->index = (unsigned char)i; @@ -682,12 +688,6 @@ int fsl_rio_setup(struct platform_device *dev) dev_info(&dev->dev, "RapidIO Common Transport System size: %d\n", port->sys_size ? 65536 : 256); - if (rio_register_mport(port)) { - release_resource(&port->iores); - kfree(priv); - kfree(port); - continue; - } if (port->host_deviceid >= 0) out_be32(priv->regs_win + RIO_GCCSR, RIO_PORT_GEN_HOST | RIO_PORT_GEN_MASTER | RIO_PORT_GEN_DISCOVERED); @@ -727,6 +727,12 @@ int fsl_rio_setup(struct platform_device *dev) dbell->mport[i] = port; + if (rio_register_mport(port)) { + release_resource(&port->iores); + kfree(priv); + kfree(port); + continue; + } active_ports++; } From 34ed2ebb65b0aa6a82206686870aaaddc12b2271 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:29 -0700 Subject: [PATCH 44/81] rapidio/rionet: add locking into add/remove device Add spinlock protection when handling the list of connected peers and the ability to handle new peer device addition after the RIONET device has been opened. Before this update RIONET was sending JOIN requests only when it had been opened; peer devices added later were missing from this process.
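The locking rule this patch introduces is simple: nets[netid].peers and the active[] table are only touched under the new per-net spinlock, and paths that can race with the doorbell handler use the irqsave variants. The recurring pattern, schematically:

        unsigned long flags;
        struct rionet_peer *peer;

        spin_lock_irqsave(&nets[netid].lock, flags);
        list_for_each_entry(peer, &nets[netid].peers, node) {
                /* inspect or update peer and nets[netid].active[]
                 * only while the lock is held */
        }
        spin_unlock_irqrestore(&nets[netid].lock, flags);

The patch also moves outbound doorbell allocation from rionet_open() into rionet_add_dev(), so a peer discovered while the interface is already up (rnet->open) is sent a JOIN doorbell immediately instead of being left out.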
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/rionet.c | 152 +++++++++++++++++++++++++++++-------------- 1 file changed, 102 insertions(+), 50 deletions(-) diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index f994fa1fde2f..c15d9581dd56 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -63,6 +63,7 @@ struct rionet_private { spinlock_t lock; spinlock_t tx_lock; u32 msg_enable; + bool open; }; struct rionet_peer { @@ -74,6 +75,7 @@ struct rionet_peer { struct rionet_net { struct net_device *ndev; struct list_head peers; + spinlock_t lock; /* net info access lock */ struct rio_dev **active; int nact; /* number of active peers */ }; @@ -235,26 +237,32 @@ static void rionet_dbell_event(struct rio_mport *mport, void *dev_id, u16 sid, u struct net_device *ndev = dev_id; struct rionet_private *rnet = netdev_priv(ndev); struct rionet_peer *peer; + unsigned char netid = rnet->mport->id; if (netif_msg_intr(rnet)) printk(KERN_INFO "%s: doorbell sid %4.4x tid %4.4x info %4.4x", DRV_NAME, sid, tid, info); if (info == RIONET_DOORBELL_JOIN) { - if (!nets[rnet->mport->id].active[sid]) { - list_for_each_entry(peer, - &nets[rnet->mport->id].peers, node) { + if (!nets[netid].active[sid]) { + spin_lock(&nets[netid].lock); + list_for_each_entry(peer, &nets[netid].peers, node) { if (peer->rdev->destid == sid) { - nets[rnet->mport->id].active[sid] = - peer->rdev; - nets[rnet->mport->id].nact++; + nets[netid].active[sid] = peer->rdev; + nets[netid].nact++; } } + spin_unlock(&nets[netid].lock); + rio_mport_send_doorbell(mport, sid, RIONET_DOORBELL_JOIN); } } else if (info == RIONET_DOORBELL_LEAVE) { - nets[rnet->mport->id].active[sid] = NULL; - nets[rnet->mport->id].nact--; + spin_lock(&nets[netid].lock); + if (nets[netid].active[sid]) { + nets[netid].active[sid] = NULL; + nets[netid].nact--; + } + spin_unlock(&nets[netid].lock); } else { if (netif_msg_intr(rnet)) printk(KERN_WARNING "%s: unhandled doorbell\n", @@ -308,8 +316,10 @@ static void rionet_outb_msg_event(struct rio_mport *mport, void *dev_id, int mbo static int rionet_open(struct net_device *ndev) { int i, rc = 0; - struct rionet_peer *peer, *tmp; + struct rionet_peer *peer; struct rionet_private *rnet = netdev_priv(ndev); + unsigned char netid = rnet->mport->id; + unsigned long flags; if (netif_msg_ifup(rnet)) printk(KERN_INFO "%s: open\n", DRV_NAME); @@ -348,20 +358,13 @@ static int rionet_open(struct net_device *ndev) netif_carrier_on(ndev); netif_start_queue(ndev); - list_for_each_entry_safe(peer, tmp, - &nets[rnet->mport->id].peers, node) { - if (!(peer->res = rio_request_outb_dbell(peer->rdev, - RIONET_DOORBELL_JOIN, - RIONET_DOORBELL_LEAVE))) - { - printk(KERN_ERR "%s: error requesting doorbells\n", - DRV_NAME); - continue; - } - + spin_lock_irqsave(&nets[netid].lock, flags); + list_for_each_entry(peer, &nets[netid].peers, node) { /* Send a join message */ rio_send_doorbell(peer->rdev, RIONET_DOORBELL_JOIN); } + spin_unlock_irqrestore(&nets[netid].lock, flags); + rnet->open = true; out: return rc; @@ -370,7 +373,9 @@ static int rionet_open(struct net_device *ndev) static int rionet_close(struct net_device *ndev) { struct rionet_private *rnet = netdev_priv(ndev); - struct rionet_peer *peer, *tmp; + struct rionet_peer *peer; + unsigned char netid = rnet->mport->id; + unsigned long flags; int i; if (netif_msg_ifup(rnet)) @@ -378,18 +383,21 @@ static int rionet_close(struct net_device *ndev) 
netif_stop_queue(ndev); netif_carrier_off(ndev); + rnet->open = false; for (i = 0; i < RIONET_RX_RING_SIZE; i++) kfree_skb(rnet->rx_skb[i]); - list_for_each_entry_safe(peer, tmp, - &nets[rnet->mport->id].peers, node) { - if (nets[rnet->mport->id].active[peer->rdev->destid]) { + spin_lock_irqsave(&nets[netid].lock, flags); + list_for_each_entry(peer, &nets[netid].peers, node) { + if (nets[netid].active[peer->rdev->destid]) { rio_send_doorbell(peer->rdev, RIONET_DOORBELL_LEAVE); - nets[rnet->mport->id].active[peer->rdev->destid] = NULL; + nets[netid].active[peer->rdev->destid] = NULL; } - rio_release_outb_dbell(peer->rdev, peer->res); + if (peer->res) + rio_release_outb_dbell(peer->rdev, peer->res); } + spin_unlock_irqrestore(&nets[netid].lock, flags); rio_release_inb_dbell(rnet->mport, RIONET_DOORBELL_JOIN, RIONET_DOORBELL_LEAVE); @@ -403,22 +411,38 @@ static void rionet_remove_dev(struct device *dev, struct subsys_interface *sif) { struct rio_dev *rdev = to_rio_dev(dev); unsigned char netid = rdev->net->hport->id; - struct rionet_peer *peer, *tmp; + struct rionet_peer *peer; + int state, found = 0; + unsigned long flags; - if (dev_rionet_capable(rdev)) { - list_for_each_entry_safe(peer, tmp, &nets[netid].peers, node) { - if (peer->rdev == rdev) { - if (nets[netid].active[rdev->destid]) { - nets[netid].active[rdev->destid] = NULL; - nets[netid].nact--; + if (!dev_rionet_capable(rdev)) + return; + + spin_lock_irqsave(&nets[netid].lock, flags); + list_for_each_entry(peer, &nets[netid].peers, node) { + if (peer->rdev == rdev) { + list_del(&peer->node); + if (nets[netid].active[rdev->destid]) { + state = atomic_read(&rdev->state); + if (state != RIO_DEVICE_GONE && + state != RIO_DEVICE_INITIALIZING) { + rio_send_doorbell(rdev, + RIONET_DOORBELL_LEAVE); } - - list_del(&peer->node); - kfree(peer); - break; + nets[netid].active[rdev->destid] = NULL; + nets[netid].nact--; } + found = 1; + break; } } + spin_unlock_irqrestore(&nets[netid].lock, flags); + + if (found) { + if (peer->res) + rio_release_outb_dbell(rdev, peer->res); + kfree(peer); + } } static void rionet_get_drvinfo(struct net_device *ndev, @@ -492,6 +516,7 @@ static int rionet_setup_netdev(struct rio_mport *mport, struct net_device *ndev) /* Set up private area */ rnet = netdev_priv(ndev); rnet->mport = mport; + rnet->open = false; /* Set the default MAC address */ device_id = rio_local_get_device_id(mport); @@ -514,8 +539,11 @@ static int rionet_setup_netdev(struct rio_mport *mport, struct net_device *ndev) rnet->msg_enable = RIONET_DEFAULT_MSGLEVEL; rc = register_netdev(ndev); - if (rc != 0) + if (rc != 0) { + free_pages((unsigned long)nets[mport->id].active, + get_order(rionet_active_bytes)); goto out; + } printk(KERN_INFO "%s: %s %s Version %s, MAC %pM, %s\n", ndev->name, @@ -529,8 +557,6 @@ static int rionet_setup_netdev(struct rio_mport *mport, struct net_device *ndev) return rc; } -static unsigned long net_table[RIONET_MAX_NETS/sizeof(unsigned long) + 1]; - static int rionet_add_dev(struct device *dev, struct subsys_interface *sif) { int rc = -ENODEV; @@ -539,19 +565,16 @@ static int rionet_add_dev(struct device *dev, struct subsys_interface *sif) struct net_device *ndev = NULL; struct rio_dev *rdev = to_rio_dev(dev); unsigned char netid = rdev->net->hport->id; - int oldnet; if (netid >= RIONET_MAX_NETS) return rc; - oldnet = test_and_set_bit(netid, net_table); - /* * If first time through this net, make sure local device is rionet * capable and setup netdev (this step will be skipped in later probes * on the same net). 
*/ - if (!oldnet) { + if (!nets[netid].ndev) { rio_local_read_config_32(rdev->net->hport, RIO_SRC_OPS_CAR, &lsrc_ops); rio_local_read_config_32(rdev->net->hport, RIO_DST_OPS_CAR, @@ -569,30 +592,56 @@ static int rionet_add_dev(struct device *dev, struct subsys_interface *sif) rc = -ENOMEM; goto out; } - nets[netid].ndev = ndev; + rc = rionet_setup_netdev(rdev->net->hport, ndev); if (rc) { printk(KERN_ERR "%s: failed to setup netdev (rc=%d)\n", DRV_NAME, rc); + free_netdev(ndev); goto out; } INIT_LIST_HEAD(&nets[netid].peers); + spin_lock_init(&nets[netid].lock); nets[netid].nact = 0; - } else if (nets[netid].ndev == NULL) - goto out; + nets[netid].ndev = ndev; + } /* * If the remote device has mailbox/doorbell capabilities, * add it to the peer list. */ if (dev_rionet_capable(rdev)) { - if (!(peer = kmalloc(sizeof(struct rionet_peer), GFP_KERNEL))) { + struct rionet_private *rnet; + unsigned long flags; + + rnet = netdev_priv(nets[netid].ndev); + + peer = kzalloc(sizeof(*peer), GFP_KERNEL); + if (!peer) { rc = -ENOMEM; goto out; } peer->rdev = rdev; + peer->res = rio_request_outb_dbell(peer->rdev, + RIONET_DOORBELL_JOIN, + RIONET_DOORBELL_LEAVE); + if (!peer->res) { + pr_err("%s: error requesting doorbells\n", DRV_NAME); + kfree(peer); + rc = -ENOMEM; + goto out; + } + + spin_lock_irqsave(&nets[netid].lock, flags); list_add_tail(&peer->node, &nets[netid].peers); + spin_unlock_irqrestore(&nets[netid].lock, flags); + pr_debug("%s: %s add peer %s\n", + DRV_NAME, __func__, rio_name(rdev)); + + /* If netdev is already opened, send join request to new peer */ + if (rnet->open) + rio_send_doorbell(peer->rdev, RIONET_DOORBELL_JOIN); } return 0; @@ -603,7 +652,8 @@ out: static int rionet_shutdown(struct notifier_block *nb, unsigned long code, void *unused) { - struct rionet_peer *peer, *tmp; + struct rionet_peer *peer; + unsigned long flags; int i; pr_debug("%s: %s\n", DRV_NAME, __func__); @@ -612,13 +662,15 @@ static int rionet_shutdown(struct notifier_block *nb, unsigned long code, if (!nets[i].ndev) continue; - list_for_each_entry_safe(peer, tmp, &nets[i].peers, node) { + spin_lock_irqsave(&nets[i].lock, flags); + list_for_each_entry(peer, &nets[i].peers, node) { if (nets[i].active[peer->rdev->destid]) { rio_send_doorbell(peer->rdev, RIONET_DOORBELL_LEAVE); nets[i].active[peer->rdev->destid] = NULL; } } + spin_unlock_irqrestore(&nets[i].lock, flags); } return NOTIFY_DONE; From b7dfca8bd446721cbc23f4a7cf3c407eb42175dc Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:32 -0700 Subject: [PATCH 45/81] rapidio/rionet: add mport removal handling Add handling of a local mport device removal. The RIONET driver registers itself as a class interface that supports only the removal notification; an 'add_dev' callback is not provided because the RIONET network device can be initialized only after enumeration is completed, and the existing method (using remote peer addition) satisfies this condition.
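For illustration (an editor's sketch, not part of the patch; the 'foo_' names are invented): a subscriber that wants only removal events registers a class interface with .add_dev left NULL, which is exactly the shape used by the hunk below.

    #include <linux/rio.h>

    /* Sketch: receive only local mport removal notifications. */
    static void foo_remove_mport(struct device *dev,
                                 struct class_interface *class_intf)
    {
            struct rio_mport *mport = to_rio_mport(dev);

            pr_debug("mport_%d (%s) is going away\n", mport->id, mport->name);
            /* release per-mport resources held by the driver here */
    }

    static struct class_interface foo_mport_interface __refdata = {
            .class      = &rio_mport_class,
            .add_dev    = NULL,             /* not needed, see above */
            .remove_dev = foo_remove_mport,
    };

    /* in module init: err = class_interface_register(&foo_mport_interface); */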
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/rionet.c | 70 ++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index c15d9581dd56..9cfe6aeac84e 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -676,6 +676,34 @@ static int rionet_shutdown(struct notifier_block *nb, unsigned long code, return NOTIFY_DONE; } +static void rionet_remove_mport(struct device *dev, + struct class_interface *class_intf) +{ + struct rio_mport *mport = to_rio_mport(dev); + struct net_device *ndev; + int id = mport->id; + + pr_debug("%s %s\n", __func__, mport->name); + + WARN(nets[id].nact, "%s called when connected to %d peers\n", + __func__, nets[id].nact); + WARN(!nets[id].ndev, "%s called for mport without NDEV\n", + __func__); + + if (nets[id].ndev) { + ndev = nets[id].ndev; + netif_stop_queue(ndev); + unregister_netdev(ndev); + + free_pages((unsigned long)nets[id].active, + get_order(sizeof(void *) * + RIO_MAX_ROUTE_ENTRIES(mport->sys_size))); + nets[id].active = NULL; + free_netdev(ndev); + nets[id].ndev = NULL; + } +} + #ifdef MODULE static struct rio_device_id rionet_id_table[] = { {RIO_DEVICE(RIO_ANY_ID, RIO_ANY_ID)}, @@ -696,6 +724,13 @@ static struct notifier_block rionet_notifier = { .notifier_call = rionet_shutdown, }; +/* the rio_mport_interface is used to handle local mport devices */ +static struct class_interface rio_mport_interface __refdata = { + .class = &rio_mport_class, + .add_dev = NULL, + .remove_dev = rionet_remove_mport, +}; + static int __init rionet_init(void) { int ret; @@ -706,39 +741,22 @@ static int __init rionet_init(void) DRV_NAME, ret); return ret; } + + ret = class_interface_register(&rio_mport_interface); + if (ret) { + pr_err("%s: class_interface_register error: %d\n", + DRV_NAME, ret); + return ret; + } + return subsys_interface_register(&rionet_interface); } static void __exit rionet_exit(void) { - struct rionet_private *rnet; - struct net_device *ndev; - struct rionet_peer *peer, *tmp; - int i; - - for (i = 0; i < RIONET_MAX_NETS; i++) { - if (nets[i].ndev != NULL) { - ndev = nets[i].ndev; - rnet = netdev_priv(ndev); - unregister_netdev(ndev); - - list_for_each_entry_safe(peer, - tmp, &nets[i].peers, node) { - list_del(&peer->node); - kfree(peer); - } - - free_pages((unsigned long)nets[i].active, - get_order(sizeof(void *) * - RIO_MAX_ROUTE_ENTRIES(rnet->mport->sys_size))); - nets[i].active = NULL; - - free_netdev(ndev); - } - } - unregister_reboot_notifier(&rionet_notifier); subsys_interface_unregister(&rionet_interface); + class_interface_unregister(&rio_mport_interface); } late_initcall(rionet_init); From a7b4c636d83034f0e89d58651ef2e9b96564489a Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:35 -0700 Subject: [PATCH 46/81] rapidio: add lock protection for doorbell list Add lock protection around doorbell list handling to prevent list corruption on SMP platforms. 
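The rule the hunks below enforce can be compressed into one pattern (a sketch using the names from rio_release_inb_dbell() and the new mport->lock mutex): every walk of mport->dbells runs under the mutex, and a node is unlinked in the same critical section in which it is found, so no other CPU can observe a half-removed entry.

    mutex_lock(&mport->lock);
    list_for_each_entry(dbell, &mport->dbells, node) {
            if (dbell->res->start == start && dbell->res->end == end) {
                    list_del(&dbell->node); /* unlink while still locked */
                    found = 1;
                    break;
            }
    }
    mutex_unlock(&mport->lock);
    /* 'dbell' is now unreachable from the list; its resources can be
     * released without holding the lock */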
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio.c | 9 ++++++--- include/linux/rio.h | 2 ++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 03b8c5af72bb..e42f97e9e62a 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -362,7 +362,9 @@ rio_setup_inb_dbell(struct rio_mport *mport, void *dev_id, struct resource *res, dbell->dinb = dinb; dbell->dev_id = dev_id; + mutex_lock(&mport->lock); list_add_tail(&dbell->node, &mport->dbells); + mutex_unlock(&mport->lock); out: return rc; @@ -426,12 +428,15 @@ int rio_release_inb_dbell(struct rio_mport *mport, u16 start, u16 end) int rc = 0, found = 0; struct rio_dbell *dbell; + mutex_lock(&mport->lock); list_for_each_entry(dbell, &mport->dbells, node) { if ((dbell->res->start == start) && (dbell->res->end == end)) { + list_del(&dbell->node); found = 1; break; } } + mutex_unlock(&mport->lock); /* If we can't find an exact match, fail */ if (!found) { @@ -439,9 +444,6 @@ int rio_release_inb_dbell(struct rio_mport *mport, u16 start, u16 end) goto out; } - /* Delete from list */ - list_del(&dbell->node); - /* Release the doorbell resource */ rc = release_resource(dbell->res); @@ -2024,6 +2026,7 @@ int rio_mport_initialize(struct rio_mport *mport) mport->id = next_portid++; mport->host_deviceid = rio_get_hdid(mport->id); mport->nscan = NULL; + mutex_init(&mport->lock); return 0; } diff --git a/include/linux/rio.h b/include/linux/rio.h index f833773cdc68..948f60550ae5 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -248,6 +248,7 @@ enum rio_phy_type { * @node: Node in global list of master ports * @nnode: Node in network list of master ports * @net: RIO net this mport is attached to + * @lock: lock to synchronize lists manipulations * @iores: I/O mem resource that this master port interface owns * @riores: RIO resources that this master port interfaces owns * @inb_msg: RIO inbound message event descriptors @@ -271,6 +272,7 @@ struct rio_mport { struct list_head node; /* node in global list of ports */ struct list_head nnode; /* node in net list of ports */ struct rio_net *net; /* RIO net this mport is attached to */ + struct mutex lock; struct resource iores; struct resource riores[RIO_MAX_MPORT_RESOURCES]; struct rio_msg inb_msg[RIO_MAX_MBOX]; From 5024622f583eb242ed8040d0b9d1e0d2458d1db8 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:38 -0700 Subject: [PATCH 47/81] rapidio: move rio_local_set_device_id function to the common core Make function rio_local_set_device_id() common for all components of RapidIO subsystem. 
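A usage sketch (hypothetical caller; as the moved code shows, the helper simply writes the base/extended device ID into RIO_DID_CSR):

    #include <linux/rio_drv.h>

    /* Program the local destination ID on an mport and read it back. */
    static void foo_assign_local_did(struct rio_mport *port, u16 did)
    {
            rio_local_set_device_id(port, did);
            WARN_ON(rio_local_get_device_id(port) != did);
    }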
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio-scan.c | 13 ------------- drivers/rapidio/rio.c | 14 ++++++++++++++ include/linux/rio_drv.h | 1 + 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index 764dc5fd30b2..f730914f6ff6 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -193,19 +193,6 @@ static void rio_set_device_id(struct rio_mport *port, u16 destid, u8 hopcount, u RIO_SET_DID(port->sys_size, did)); } -/** - * rio_local_set_device_id - Set the base/extended device id for a port - * @port: RIO master port - * @did: Device ID value to be written - * - * Writes the base/extended device id from a device. - */ -static void rio_local_set_device_id(struct rio_mport *port, u16 did) -{ - rio_local_write_config_32(port, RIO_DID_CSR, RIO_SET_DID(port->sys_size, - did)); -} - /** * rio_clear_locks- Release all host locks and signal enumeration complete * @net: RIO network to run on diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index e42f97e9e62a..095801c4d239 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -137,6 +137,20 @@ void rio_free_net(struct rio_net *net) } EXPORT_SYMBOL_GPL(rio_free_net); +/** + * rio_local_set_device_id - Set the base/extended device id for a port + * @port: RIO master port + * @did: Device ID value to be written + * + * Writes the base/extended device id from a device. + */ +void rio_local_set_device_id(struct rio_mport *port, u16 did) +{ + rio_local_write_config_32(port, RIO_DID_CSR, + RIO_SET_DID(port->sys_size, did)); +} +EXPORT_SYMBOL_GPL(rio_local_set_device_id); + /** * rio_add_device- Adds a RIO device to the device model * @rdev: RIO device diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index 9fc2f213e74f..341b3bf78333 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -435,6 +435,7 @@ static inline void rio_set_drvdata(struct rio_dev *rdev, void *data) /* Misc driver helpers */ extern u16 rio_local_get_device_id(struct rio_mport *port); +extern void rio_local_set_device_id(struct rio_mport *port, u16 did); extern struct rio_dev *rio_get_device(u16 vid, u16 did, struct rio_dev *from); extern struct rio_dev *rio_get_asm(u16 vid, u16 did, u16 asm_vid, u16 asm_did, struct rio_dev *from); From b6cb95e8eb97e51a1a1b5609b59df859cc6dc2f2 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:41 -0700 Subject: [PATCH 48/81] rapidio: move rio_pw_enable into core code Make rio_pw_enable() routine available to other RapidIO drivers. 
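As the implementation in the next hunk shows, enable/disable requests are reference counted through mport->pwe_refcnt, so the hardware hook ops->pwenable() runs only on the 0->1 and 1->0 transitions. Call-pattern sketch (illustrative only):

    rio_pw_enable(mport, 1); /* pwe_refcnt 0 -> 1: ops->pwenable(mport, 1) */
    rio_pw_enable(mport, 1); /* pwe_refcnt 1 -> 2: no hardware access */
    rio_pw_enable(mport, 0); /* pwe_refcnt 2 -> 1: no hardware access */
    rio_pw_enable(mport, 0); /* pwe_refcnt 1 -> 0: ops->pwenable(mport, 0) */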
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio-scan.c | 11 ----------- drivers/rapidio/rio.c | 19 +++++++++++++++++++ include/linux/rio.h | 2 ++ include/linux/rio_drv.h | 1 + 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index f730914f6ff6..a63a380809d1 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -973,17 +973,6 @@ static void rio_init_em(struct rio_dev *rdev) } } -/** - * rio_pw_enable - Enables/disables port-write handling by a master port - * @port: Master port associated with port-write handling - * @enable: 1=enable, 0=disable - */ -static void rio_pw_enable(struct rio_mport *port, int enable) -{ - if (port->ops->pwenable) - port->ops->pwenable(port, enable); -} - /** * rio_enum_mport- Start enumeration through a master port * @mport: Master port to send transactions diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 095801c4d239..673774bf6dd6 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -559,6 +559,24 @@ int rio_release_inb_pwrite(struct rio_dev *rdev) } EXPORT_SYMBOL_GPL(rio_release_inb_pwrite); +/** + * rio_pw_enable - Enables/disables port-write handling by a master port + * @mport: Master port associated with port-write handling + * @enable: 1=enable, 0=disable + */ +void rio_pw_enable(struct rio_mport *mport, int enable) +{ + if (mport->ops->pwenable) { + mutex_lock(&mport->lock); + + if ((enable && ++mport->pwe_refcnt == 1) || + (!enable && mport->pwe_refcnt && --mport->pwe_refcnt == 0)) + mport->ops->pwenable(mport, enable); + mutex_unlock(&mport->lock); + } +} +EXPORT_SYMBOL_GPL(rio_pw_enable); + /** * rio_map_inb_region -- Map inbound memory region. * @mport: Master port. 
@@ -2041,6 +2059,7 @@ int rio_mport_initialize(struct rio_mport *mport) mport->host_deviceid = rio_get_hdid(mport->id); mport->nscan = NULL; mutex_init(&mport->lock); + mport->pwe_refcnt = 0; return 0; } diff --git a/include/linux/rio.h b/include/linux/rio.h index 948f60550ae5..cb3c47543bb0 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -266,6 +266,7 @@ enum rio_phy_type { * @dma: DMA device associated with mport * @nscan: RapidIO network enumeration/discovery operations * @state: mport device state + * @pwe_refcnt: port-write enable ref counter to track enable/disable requests */ struct rio_mport { struct list_head dbells; /* list of doorbell events */ @@ -296,6 +297,7 @@ struct rio_mport { #endif struct rio_scan *nscan; atomic_t state; + unsigned int pwe_refcnt; }; static inline int rio_mport_is_running(struct rio_mport *mport) diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index 341b3bf78333..9fb2bcd0a987 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -375,6 +375,7 @@ extern int rio_request_inb_pwrite(struct rio_dev *, int (*)(struct rio_dev *, union rio_pw_msg*, int)); extern int rio_release_inb_pwrite(struct rio_dev *); extern int rio_inb_pwrite_handler(union rio_pw_msg *pw_msg); +extern void rio_pw_enable(struct rio_mport *mport, int enable); /* LDM support */ int rio_register_driver(struct rio_driver *); From 9a0b062742e7e039273c0c2ba4b96ad9ec7e7d8f Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:44 -0700 Subject: [PATCH 49/81] rapidio: add global inbound port write interfaces Add new Port Write handler registration interfaces that attach PW handlers to local mport device objects. This differs from the old interface, which attaches a PW callback to an individual RapidIO device. The new interfaces are intended for common event handling (e.g. hot-plug notifications), while the old interface remains available to individual device drivers. This patch is based on a patch proposed by Andre van Herk but preserves the existing per-device interface and adds lock protection for list handling.
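A consumer-side sketch of the new interfaces (the 'foo_' names are invented; the callback signature matches the prototypes added below):

    static int foo_pw_handler(struct rio_mport *mport, void *context,
                              union rio_pw_msg *msg, int step)
    {
            pr_info("PW on mport_%d, comptag 0x%08x\n",
                    mport->id, msg->em.comptag);
            return 0;
    }

    /* attach: err = rio_add_mport_pw_handler(mport, ctx, foo_pw_handler);
     * detach later with the same (context, callback) pair:
     * rio_del_mport_pw_handler(mport, ctx, foo_pw_handler);
     */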
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/sysdev/fsl_rio.c | 1 + arch/powerpc/sysdev/fsl_rio.h | 1 + arch/powerpc/sysdev/fsl_rmu.c | 16 ++-- drivers/rapidio/devices/tsi721.c | 24 +----- drivers/rapidio/rio.c | 142 +++++++++++++++++++++++++------ include/linux/rio.h | 2 + include/linux/rio_drv.h | 9 +- 7 files changed, 144 insertions(+), 51 deletions(-) diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 385371acc0d0..f5bf38b94595 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -726,6 +726,7 @@ int fsl_rio_setup(struct platform_device *dev) fsl_rio_inbound_mem_init(priv); dbell->mport[i] = port; + pw->mport[i] = port; if (rio_register_mport(port)) { release_resource(&port->iores); diff --git a/arch/powerpc/sysdev/fsl_rio.h b/arch/powerpc/sysdev/fsl_rio.h index d53407a34f32..12dd18fd4795 100644 --- a/arch/powerpc/sysdev/fsl_rio.h +++ b/arch/powerpc/sysdev/fsl_rio.h @@ -97,6 +97,7 @@ struct fsl_rio_dbell { }; struct fsl_rio_pw { + struct rio_mport *mport[MAX_PORT_NUM]; struct device *dev; struct rio_pw_regs __iomem *pw_regs; struct rio_port_write_msg port_write_msg; diff --git a/arch/powerpc/sysdev/fsl_rmu.c b/arch/powerpc/sysdev/fsl_rmu.c index ffe0ee832768..c1826de4e749 100644 --- a/arch/powerpc/sysdev/fsl_rmu.c +++ b/arch/powerpc/sysdev/fsl_rmu.c @@ -481,14 +481,14 @@ pw_done: static void fsl_pw_dpc(struct work_struct *work) { struct fsl_rio_pw *pw = container_of(work, struct fsl_rio_pw, pw_work); - u32 msg_buffer[RIO_PW_MSG_SIZE/sizeof(u32)]; + union rio_pw_msg msg_buffer; + int i; /* * Process port-write messages */ - while (kfifo_out_spinlocked(&pw->pw_fifo, (unsigned char *)msg_buffer, + while (kfifo_out_spinlocked(&pw->pw_fifo, (unsigned char *)&msg_buffer, RIO_PW_MSG_SIZE, &pw->pw_fifo_lock)) { - /* Process one message */ #ifdef DEBUG_PW { u32 i; @@ -496,15 +496,19 @@ static void fsl_pw_dpc(struct work_struct *work) for (i = 0; i < RIO_PW_MSG_SIZE/sizeof(u32); i++) { if ((i%4) == 0) pr_debug("\n0x%02x: 0x%08x", i*4, - msg_buffer[i]); + msg_buffer.raw[i]); else - pr_debug(" 0x%08x", msg_buffer[i]); + pr_debug(" 0x%08x", msg_buffer.raw[i]); } pr_debug("\n"); } #endif /* Pass the port-write message to RIO core for processing */ - rio_inb_pwrite_handler((union rio_pw_msg *)msg_buffer); + for (i = 0; i < MAX_PORT_NUM; i++) { + if (pw->mport[i]) + rio_inb_pwrite_handler(pw->mport[i], + &msg_buffer); + } } } diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index db95d71ba4e9..5e1d52674e17 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -36,8 +36,6 @@ #include "tsi721.h" -#define DEBUG_PW /* Inbound Port-Write debugging */ - static void tsi721_omsg_handler(struct tsi721_device *priv, int ch); static void tsi721_imsg_handler(struct tsi721_device *priv, int ch); @@ -282,30 +280,15 @@ static void tsi721_pw_dpc(struct work_struct *work) { struct tsi721_device *priv = container_of(work, struct tsi721_device, pw_work); - u32 msg_buffer[RIO_PW_MSG_SIZE/sizeof(u32)]; /* Use full size PW message - buffer for RIO layer */ + union rio_pw_msg pwmsg; /* * Process port-write messages */ - while (kfifo_out_spinlocked(&priv->pw_fifo, (unsigned char *)msg_buffer, + while (kfifo_out_spinlocked(&priv->pw_fifo, (unsigned char *)&pwmsg, TSI721_RIO_PW_MSG_SIZE, &priv->pw_fifo_lock)) { - /* Process one message */ -#ifdef DEBUG_PW - { - u32 
i; - pr_debug("%s : Port-Write Message:", __func__); - for (i = 0; i < RIO_PW_MSG_SIZE/sizeof(u32); ) { - pr_debug("0x%02x: %08x %08x %08x %08x", i*4, - msg_buffer[i], msg_buffer[i + 1], - msg_buffer[i + 2], msg_buffer[i + 3]); - i += 4; - } - pr_debug("\n"); - } -#endif /* Pass the port-write message to RIO core for processing */ - rio_inb_pwrite_handler((union rio_pw_msg *)msg_buffer); + rio_inb_pwrite_handler(&priv->mport, &pwmsg); } } @@ -2702,6 +2685,7 @@ static void tsi721_remove(struct pci_dev *pdev) tsi721_disable_ints(priv); tsi721_free_irq(priv); + flush_scheduled_work(); rio_unregister_mport(&priv->mport); tsi721_unregister_dma(priv); diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 673774bf6dd6..17973d3caa88 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -30,6 +30,20 @@ #include "rio.h" +/* + * struct rio_pwrite - RIO portwrite event + * @node: Node in list of doorbell events + * @pwcback: Doorbell event callback + * @context: Handler specific context to pass on event + */ +struct rio_pwrite { + struct list_head node; + + int (*pwcback)(struct rio_mport *mport, void *context, + union rio_pw_msg *msg, int step); + void *context; +}; + MODULE_DESCRIPTION("RapidIO Subsystem Core"); MODULE_AUTHOR("Matt Porter "); MODULE_AUTHOR("Alexandre Bounine "); @@ -514,7 +528,71 @@ int rio_release_outb_dbell(struct rio_dev *rdev, struct resource *res) } /** - * rio_request_inb_pwrite - request inbound port-write message service + * rio_add_mport_pw_handler - add port-write message handler into the list + * of mport specific pw handlers + * @mport: RIO master port to bind the portwrite callback + * @context: Handler specific context to pass on event + * @pwcback: Callback to execute when portwrite is received + * + * Returns 0 if the request has been satisfied. + */ +int rio_add_mport_pw_handler(struct rio_mport *mport, void *context, + int (*pwcback)(struct rio_mport *mport, + void *context, union rio_pw_msg *msg, int step)) +{ + int rc = 0; + struct rio_pwrite *pwrite; + + pwrite = kzalloc(sizeof(struct rio_pwrite), GFP_KERNEL); + if (!pwrite) { + rc = -ENOMEM; + goto out; + } + + pwrite->pwcback = pwcback; + pwrite->context = context; + mutex_lock(&mport->lock); + list_add_tail(&pwrite->node, &mport->pwrites); + mutex_unlock(&mport->lock); +out: + return rc; +} +EXPORT_SYMBOL_GPL(rio_add_mport_pw_handler); + +/** + * rio_del_mport_pw_handler - remove port-write message handler from the list + * of mport specific pw handlers + * @mport: RIO master port to bind the portwrite callback + * @context: Registered handler specific context to pass on event + * @pwcback: Registered callback function + * + * Returns 0 if the request has been satisfied. 
+ */ +int rio_del_mport_pw_handler(struct rio_mport *mport, void *context, + int (*pwcback)(struct rio_mport *mport, + void *context, union rio_pw_msg *msg, int step)) +{ + int rc = -EINVAL; + struct rio_pwrite *pwrite; + + mutex_lock(&mport->lock); + list_for_each_entry(pwrite, &mport->pwrites, node) { + if (pwrite->pwcback == pwcback && pwrite->context == context) { + list_del(&pwrite->node); + kfree(pwrite); + rc = 0; + break; + } + } + mutex_unlock(&mport->lock); + + return rc; +} +EXPORT_SYMBOL_GPL(rio_del_mport_pw_handler); + +/** + * rio_request_inb_pwrite - request inbound port-write message service for + * specific RapidIO device * @rdev: RIO device to which register inbound port-write callback routine * @pwcback: Callback routine to execute when port-write is received * @@ -539,6 +617,7 @@ EXPORT_SYMBOL_GPL(rio_request_inb_pwrite); /** * rio_release_inb_pwrite - release inbound port-write message service + * associated with specific RapidIO device * @rdev: RIO device which registered for inbound port-write callback * * Removes callback from the rio_dev structure. Returns 0 if the request @@ -1002,52 +1081,66 @@ rd_err: } /** - * rio_inb_pwrite_handler - process inbound port-write message + * rio_inb_pwrite_handler - inbound port-write message handler + * @mport: mport device associated with port-write * @pw_msg: pointer to inbound port-write message * * Processes an inbound port-write message. Returns 0 if the request * has been satisfied. */ -int rio_inb_pwrite_handler(union rio_pw_msg *pw_msg) +int rio_inb_pwrite_handler(struct rio_mport *mport, union rio_pw_msg *pw_msg) { struct rio_dev *rdev; u32 err_status, em_perrdet, em_ltlerrdet; int rc, portnum; - - rdev = rio_get_comptag((pw_msg->em.comptag & RIO_CTAG_UDEVID), NULL); - if (rdev == NULL) { - /* Device removed or enumeration error */ - pr_debug("RIO: %s No matching device for CTag 0x%08x\n", - __func__, pw_msg->em.comptag); - return -EIO; - } - - pr_debug("RIO: Port-Write message from %s\n", rio_name(rdev)); + struct rio_pwrite *pwrite; #ifdef DEBUG_PW { - u32 i; - for (i = 0; i < RIO_PW_MSG_SIZE/sizeof(u32);) { + u32 i; + + pr_debug("%s: PW to mport_%d:\n", __func__, mport->id); + for (i = 0; i < RIO_PW_MSG_SIZE / sizeof(u32); i = i + 4) { pr_debug("0x%02x: %08x %08x %08x %08x\n", - i*4, pw_msg->raw[i], pw_msg->raw[i + 1], - pw_msg->raw[i + 2], pw_msg->raw[i + 3]); - i += 4; - } + i * 4, pw_msg->raw[i], pw_msg->raw[i + 1], + pw_msg->raw[i + 2], pw_msg->raw[i + 3]); + } } #endif - /* Call an external service function (if such is registered - * for this device). This may be the service for endpoints that send - * device-specific port-write messages. End-point messages expected - * to be handled completely by EP specific device driver. + rdev = rio_get_comptag((pw_msg->em.comptag & RIO_CTAG_UDEVID), NULL); + if (rdev) { + pr_debug("RIO: Port-Write message from %s\n", rio_name(rdev)); + } else { + pr_debug("RIO: %s No matching device for CTag 0x%08x\n", + __func__, pw_msg->em.comptag); + } + + /* Call a device-specific handler (if it is registered for the device). + * This may be the service for endpoints that send device-specific + * port-write messages. End-point messages expected to be handled + * completely by EP specific device driver. * For switches rc==0 signals that no standard processing required. 
*/ - if (rdev->pwcback != NULL) { + if (rdev && rdev->pwcback) { rc = rdev->pwcback(rdev, pw_msg, 0); if (rc == 0) return 0; } + mutex_lock(&mport->lock); + list_for_each_entry(pwrite, &mport->pwrites, node) + pwrite->pwcback(mport, pwrite->context, pw_msg, 0); + mutex_unlock(&mport->lock); + + if (!rdev) + return 0; + + /* + * FIXME: The code below stays as it was before for now until we decide + * how to do default PW handling in combination with per-mport callbacks + */ + portnum = pw_msg->em.is_port & 0xFF; /* Check if device and route to it are functional: @@ -2060,6 +2153,7 @@ int rio_mport_initialize(struct rio_mport *mport) mport->nscan = NULL; mutex_init(&mport->lock); mport->pwe_refcnt = 0; + INIT_LIST_HEAD(&mport->pwrites); return 0; } diff --git a/include/linux/rio.h b/include/linux/rio.h index cb3c47543bb0..44f3da512c15 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -245,6 +245,7 @@ enum rio_phy_type { /** * struct rio_mport - RIO master port info * @dbells: List of doorbell events + * @pwrites: List of portwrite events * @node: Node in global list of master ports * @nnode: Node in network list of master ports * @net: RIO net this mport is attached to @@ -270,6 +271,7 @@ enum rio_phy_type { */ struct rio_mport { struct list_head dbells; /* list of doorbell events */ + struct list_head pwrites; /* list of portwrite events */ struct list_head node; /* node in global list of ports */ struct list_head nnode; /* node in net list of ports */ struct rio_net *net; /* RIO net this mport is attached to */ diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index 9fb2bcd0a987..5dff9a4cb675 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -374,7 +374,14 @@ extern void rio_unmap_inb_region(struct rio_mport *mport, dma_addr_t lstart); extern int rio_request_inb_pwrite(struct rio_dev *, int (*)(struct rio_dev *, union rio_pw_msg*, int)); extern int rio_release_inb_pwrite(struct rio_dev *); -extern int rio_inb_pwrite_handler(union rio_pw_msg *pw_msg); +extern int rio_add_mport_pw_handler(struct rio_mport *mport, void *dev_id, + int (*pwcback)(struct rio_mport *mport, void *dev_id, + union rio_pw_msg *msg, int step)); +extern int rio_del_mport_pw_handler(struct rio_mport *mport, void *dev_id, + int (*pwcback)(struct rio_mport *mport, void *dev_id, + union rio_pw_msg *msg, int step)); +extern int rio_inb_pwrite_handler(struct rio_mport *mport, + union rio_pw_msg *pw_msg); extern void rio_pw_enable(struct rio_mport *mport, int enable); /* LDM support */ From 2ece1caf668f183d0606ea9fc85a1083d42b5b3a Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:47 -0700 Subject: [PATCH 50/81] rapidio/tsi721: fix locking in OB_MSG processing - Add spinlock protection into outbound message queuing routine. - Change outbound message interrupt handler to avoid deadlock when calling registered callback routine. - Allow infinite retries for outbound messages to avoid retry threshold error signaling in systems with nodes that have slow message receive queue processing. 
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721.c | 42 +++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index 5e1d52674e17..822fd4bebf75 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -1453,11 +1453,14 @@ tsi721_add_outb_message(struct rio_mport *mport, struct rio_dev *rdev, int mbox, struct tsi721_device *priv = mport->priv; struct tsi721_omsg_desc *desc; u32 tx_slot; + unsigned long flags; if (!priv->omsg_init[mbox] || len > TSI721_MSG_MAX_SIZE || len < 8) return -EINVAL; + spin_lock_irqsave(&priv->omsg_ring[mbox].lock, flags); + tx_slot = priv->omsg_ring[mbox].tx_slot; /* Copy copy message into transfer buffer */ @@ -1469,9 +1472,11 @@ tsi721_add_outb_message(struct rio_mport *mport, struct rio_dev *rdev, int mbox, /* Build descriptor associated with buffer */ desc = priv->omsg_ring[mbox].omd_base; desc[tx_slot].type_id = cpu_to_le32((DTYPE4 << 29) | rdev->destid); +#ifdef TSI721_OMSG_DESC_INT + /* Request IOF_DONE interrupt generation for each N-th frame in queue */ if (tx_slot % 4 == 0) desc[tx_slot].type_id |= cpu_to_le32(TSI721_OMD_IOF); - +#endif desc[tx_slot].msg_info = cpu_to_le32((mport->sys_size << 26) | (mbox << 22) | (0xe << 12) | (len & 0xff8)); @@ -1497,6 +1502,8 @@ tsi721_add_outb_message(struct rio_mport *mport, struct rio_dev *rdev, int mbox, priv->regs + TSI721_OBDMAC_DWRCNT(mbox)); ioread32(priv->regs + TSI721_OBDMAC_DWRCNT(mbox)); + spin_unlock_irqrestore(&priv->omsg_ring[mbox].lock, flags); + return 0; } @@ -1511,6 +1518,9 @@ static void tsi721_omsg_handler(struct tsi721_device *priv, int ch) { u32 omsg_int; struct rio_mport *mport = &priv->mport; + void *dev_id = NULL; + u32 tx_slot = 0xffffffff; + int do_callback = 0; spin_lock(&priv->omsg_ring[ch].lock); @@ -1524,7 +1534,6 @@ static void tsi721_omsg_handler(struct tsi721_device *priv, int ch) u32 srd_ptr; u64 *sts_ptr, last_ptr = 0, prev_ptr = 0; int i, j; - u32 tx_slot; /* * Find last successfully processed descriptor @@ -1574,14 +1583,19 @@ static void tsi721_omsg_handler(struct tsi721_device *priv, int ch) goto no_sts_update; } + if (tx_slot >= priv->omsg_ring[ch].size) + dev_dbg(&priv->pdev->dev, + "OB_MSG tx_slot=%x > size=%x", + tx_slot, priv->omsg_ring[ch].size); + WARN_ON(tx_slot >= priv->omsg_ring[ch].size); + /* Move slot index to the next message to be sent */ ++tx_slot; if (tx_slot == priv->omsg_ring[ch].size) tx_slot = 0; - BUG_ON(tx_slot >= priv->omsg_ring[ch].size); - mport->outb_msg[ch].mcback(mport, - priv->omsg_ring[ch].dev_id, ch, - tx_slot); + + dev_id = priv->omsg_ring[ch].dev_id; + do_callback = 1; } no_sts_update: @@ -1597,15 +1611,15 @@ no_sts_update: iowrite32(TSI721_OBDMAC_INT_ERROR, priv->regs + TSI721_OBDMAC_INT(ch)); - iowrite32(TSI721_OBDMAC_CTL_INIT, + iowrite32(TSI721_OBDMAC_CTL_RETRY_THR | TSI721_OBDMAC_CTL_INIT, priv->regs + TSI721_OBDMAC_CTL(ch)); ioread32(priv->regs + TSI721_OBDMAC_CTL(ch)); /* Inform upper level to clear all pending tx slots */ - if (mport->outb_msg[ch].mcback) - mport->outb_msg[ch].mcback(mport, - priv->omsg_ring[ch].dev_id, ch, - priv->omsg_ring[ch].tx_slot); + dev_id = priv->omsg_ring[ch].dev_id; + tx_slot = priv->omsg_ring[ch].tx_slot; + do_callback = 1; + /* Synch tx_slot tracking */ iowrite32(priv->omsg_ring[ch].tx_slot, priv->regs + TSI721_OBDMAC_DRDCNT(ch)); @@ 
-1627,6 +1641,9 @@ no_sts_update: } spin_unlock(&priv->omsg_ring[ch].lock); + + if (mport->outb_msg[ch].mcback && do_callback) + mport->outb_msg[ch].mcback(mport, dev_id, ch, tx_slot); } /** @@ -1768,7 +1785,8 @@ static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id, mb(); /* Initialize Outbound Message engine */ - iowrite32(TSI721_OBDMAC_CTL_INIT, priv->regs + TSI721_OBDMAC_CTL(mbox)); + iowrite32(TSI721_OBDMAC_CTL_RETRY_THR | TSI721_OBDMAC_CTL_INIT, + priv->regs + TSI721_OBDMAC_CTL(mbox)); ioread32(priv->regs + TSI721_OBDMAC_DWRCNT(mbox)); udelay(10); From 93bdaca5018c02ba838f8fe2178fab261e2c1e68 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:50 -0700 Subject: [PATCH 51/81] rapidio: add outbound window support Add RapidIO controller (mport) outbound window configuration operations. This patch is part of the original patch submitted by Li Yang: https://lists.ozlabs.org/pipermail/linuxppc-dev/2009-April/071210.html For some reason the original patch was not applied to the mainline code tree. The inbound window mapping part was applied later, during the tsi721 mport driver submission. This is the second part, with the corresponding HW support. Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Li Yang Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio.c | 50 +++++++++++++++++++++++++++++++++++++++++ include/linux/rio.h | 5 +++++ include/linux/rio_drv.h | 4 ++++ 3 files changed, 59 insertions(+) diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 17973d3caa88..0dcaa660cba1 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -699,6 +699,56 @@ void rio_unmap_inb_region(struct rio_mport *mport, dma_addr_t lstart) } EXPORT_SYMBOL_GPL(rio_unmap_inb_region); +/** + * rio_map_outb_region -- Map outbound memory region. + * @mport: Master port. + * @destid: destination id window points to + * @rbase: RIO base address window translates to + * @size: Size of the memory region + * @rflags: Flags for mapping. + * @local: physical address of memory region mapped + * + * Return: 0 -- Success. + * + * This function will create the mapping from RIO space to local memory. + */ +int rio_map_outb_region(struct rio_mport *mport, u16 destid, u64 rbase, + u32 size, u32 rflags, dma_addr_t *local) +{ + int rc = 0; + unsigned long flags; + + if (!mport->ops->map_outb) + return -ENODEV; + + spin_lock_irqsave(&rio_mmap_lock, flags); + rc = mport->ops->map_outb(mport, destid, rbase, size, + rflags, local); + spin_unlock_irqrestore(&rio_mmap_lock, flags); + + return rc; +} +EXPORT_SYMBOL_GPL(rio_map_outb_region); + +/** + * rio_unmap_outb_region -- Unmap the outbound memory region + * @mport: Master port + * @destid: destination id mapping points to + * @rstart: RIO base address window translates to + */ +void rio_unmap_outb_region(struct rio_mport *mport, u16 destid, u64 rstart) +{ + unsigned long flags; + + if (!mport->ops->unmap_outb) + return; + + spin_lock_irqsave(&rio_mmap_lock, flags); + mport->ops->unmap_outb(mport, destid, rstart); + spin_unlock_irqrestore(&rio_mmap_lock, flags); +} +EXPORT_SYMBOL_GPL(rio_unmap_outb_region); + /** * rio_mport_get_physefb - Helper function that returns register offset * for Physical Layer Extended Features Block.
diff --git a/include/linux/rio.h b/include/linux/rio.h index 44f3da512c15..aa2323893e8d 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -400,6 +400,8 @@ struct rio_mport_attr { * @map_inb: Callback to map RapidIO address region into local memory space. * @unmap_inb: Callback to unmap RapidIO address region mapped with map_inb(). * @query_mport: Callback to query mport device attributes. + * @map_outb: Callback to map outbound address region into local memory space. + * @unmap_outb: Callback to unmap outbound RapidIO address region. */ struct rio_ops { int (*lcread) (struct rio_mport *mport, int index, u32 offset, int len, @@ -427,6 +429,9 @@ struct rio_ops { void (*unmap_inb)(struct rio_mport *mport, dma_addr_t lstart); int (*query_mport)(struct rio_mport *mport, struct rio_mport_attr *attr); + int (*map_outb)(struct rio_mport *mport, u16 destid, u64 rstart, + u32 size, u32 flags, dma_addr_t *laddr); + void (*unmap_outb)(struct rio_mport *mport, u16 destid, u64 rstart); }; #define RIO_RESOURCE_MEM 0x00000100 diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index 5dff9a4cb675..0834264fb7f2 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -369,6 +369,10 @@ void rio_release_region(struct rio_dev *, int); extern int rio_map_inb_region(struct rio_mport *mport, dma_addr_t local, u64 rbase, u32 size, u32 rflags); extern void rio_unmap_inb_region(struct rio_mport *mport, dma_addr_t lstart); +extern int rio_map_outb_region(struct rio_mport *mport, u16 destid, u64 rbase, + u32 size, u32 rflags, dma_addr_t *local); +extern void rio_unmap_outb_region(struct rio_mport *mport, + u16 destid, u64 rstart); /* Port-Write management */ extern int rio_request_inb_pwrite(struct rio_dev *, From 1679e8dabf753edbf4ed15afa85bf8c4e81f249e Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:53 -0700 Subject: [PATCH 52/81] rapidio/tsi721: add outbound windows mapping support Add device-specific callback functions to support outbound windows mapping and release. 
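Combined with the generic API from the previous patch, this lets a driver reach a remote endpoint's memory directly. Usage sketch (hypothetical destid and addresses; per the checks in tsi721_map_outb_win() below, size must be a power of two, at least 0x8000, and rstart must be size-aligned):

    dma_addr_t obw_base;
    void __iomem *va;

    /* map 64 KB of RIO space at address 0x10000 on destID 5 */
    if (!rio_map_outb_region(mport, 5, 0x10000, 0x10000, 0, &obw_base)) {
            va = ioremap(obw_base, 0x10000); /* obw_base: local PCI address */
            if (va) {
                    iowrite32(0x12345678, va); /* hits RIO address 0x10000 */
                    iounmap(va);
            }
            rio_unmap_outb_region(mport, 5, 0x10000);
    }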
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721.c | 222 ++++++++++++++++++++++++++++++- drivers/rapidio/devices/tsi721.h | 20 +++ 2 files changed, 235 insertions(+), 7 deletions(-) diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index 822fd4bebf75..74bc1fbb784e 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -841,6 +841,169 @@ static void tsi721_free_irq(struct tsi721_device *priv) free_irq(priv->pdev->irq, (void *)priv); } +static int +tsi721_obw_alloc(struct tsi721_device *priv, struct tsi721_obw_bar *pbar, + u32 size, int *win_id) +{ + u64 win_base; + u64 bar_base; + u64 bar_end; + u32 align; + struct tsi721_ob_win *win; + struct tsi721_ob_win *new_win = NULL; + int new_win_idx = -1; + int i = 0; + + bar_base = pbar->base; + bar_end = bar_base + pbar->size; + win_base = bar_base; + align = size/TSI721_PC2SR_ZONES; + + while (i < TSI721_IBWIN_NUM) { + for (i = 0; i < TSI721_IBWIN_NUM; i++) { + if (!priv->ob_win[i].active) { + if (new_win == NULL) { + new_win = &priv->ob_win[i]; + new_win_idx = i; + } + continue; + } + + /* + * If this window belongs to the current BAR check it + * for overlap + */ + win = &priv->ob_win[i]; + + if (win->base >= bar_base && win->base < bar_end) { + if (win_base < (win->base + win->size) && + (win_base + size) > win->base) { + /* Overlap detected */ + win_base = win->base + win->size; + win_base = ALIGN(win_base, align); + break; + } + } + } + } + + if (win_base + size > bar_end) + return -ENOMEM; + + if (!new_win) { + dev_err(&priv->pdev->dev, "ERR: OBW count tracking failed\n"); + return -EIO; + } + + new_win->active = true; + new_win->base = win_base; + new_win->size = size; + new_win->pbar = pbar; + priv->obwin_cnt--; + pbar->free -= size; + *win_id = new_win_idx; + return 0; +} + +static int tsi721_map_outb_win(struct rio_mport *mport, u16 destid, u64 rstart, + u32 size, u32 flags, dma_addr_t *laddr) +{ + struct tsi721_device *priv = mport->priv; + int i; + struct tsi721_obw_bar *pbar; + struct tsi721_ob_win *ob_win; + int obw = -1; + u32 rval; + u64 rio_addr; + u32 zsize; + int ret = -ENOMEM; + + if (!is_power_of_2(size) || (size < 0x8000) || (rstart & (size - 1))) + return -EINVAL; + + if (priv->obwin_cnt == 0) + return -EBUSY; + + for (i = 0; i < 2; i++) { + if (priv->p2r_bar[i].free >= size) { + pbar = &priv->p2r_bar[i]; + ret = tsi721_obw_alloc(priv, pbar, size, &obw); + if (!ret) + break; + } + } + + if (ret) + return ret; + + WARN_ON(obw == -1); + ob_win = &priv->ob_win[obw]; + ob_win->destid = destid; + ob_win->rstart = rstart; + + /* + * Configure Outbound Window + */ + + zsize = size/TSI721_PC2SR_ZONES; + rio_addr = rstart; + + /* + * Program Address Translation Zones: + * This implementation uses all 8 zones associated wit window. 
+ */ + for (i = 0; i < TSI721_PC2SR_ZONES; i++) { + + while (ioread32(priv->regs + TSI721_ZONE_SEL) & + TSI721_ZONE_SEL_GO) { + udelay(1); + } + + rval = (u32)(rio_addr & TSI721_LUT_DATA0_ADD) | + TSI721_LUT_DATA0_NREAD | TSI721_LUT_DATA0_NWR; + iowrite32(rval, priv->regs + TSI721_LUT_DATA0); + rval = (u32)(rio_addr >> 32); + iowrite32(rval, priv->regs + TSI721_LUT_DATA1); + rval = destid; + iowrite32(rval, priv->regs + TSI721_LUT_DATA2); + + rval = TSI721_ZONE_SEL_GO | (obw << 3) | i; + iowrite32(rval, priv->regs + TSI721_ZONE_SEL); + + rio_addr += zsize; + } + + iowrite32(TSI721_OBWIN_SIZE(size) << 8, + priv->regs + TSI721_OBWINSZ(obw)); + iowrite32((u32)(ob_win->base >> 32), priv->regs + TSI721_OBWINUB(obw)); + iowrite32((u32)(ob_win->base & TSI721_OBWINLB_BA) | TSI721_OBWINLB_WEN, + priv->regs + TSI721_OBWINLB(obw)); + + *laddr = ob_win->base; + return 0; +} + +static void tsi721_unmap_outb_win(struct rio_mport *mport, + u16 destid, u64 rstart) +{ + struct tsi721_device *priv = mport->priv; + struct tsi721_ob_win *ob_win; + int i; + + for (i = 0; i < TSI721_OBWIN_NUM; i++) { + ob_win = &priv->ob_win[i]; + + if (ob_win->active && + ob_win->destid == destid && ob_win->rstart == rstart) { + ob_win->active = false; + iowrite32(0, priv->regs + TSI721_OBWINLB(i)); + ob_win->pbar->free += ob_win->size; + priv->obwin_cnt++; + break; + } + } +} + /** * tsi721_init_pc2sr_mapping - initializes outbound (PCIe->SRIO) * translation regions. @@ -850,11 +1013,41 @@ static void tsi721_free_irq(struct tsi721_device *priv) */ static void tsi721_init_pc2sr_mapping(struct tsi721_device *priv) { - int i; + int i, z; + u32 rval; /* Disable all PC2SR translation windows */ for (i = 0; i < TSI721_OBWIN_NUM; i++) iowrite32(0, priv->regs + TSI721_OBWINLB(i)); + + /* Initialize zone lookup tables to avoid ECC errors on reads */ + iowrite32(0, priv->regs + TSI721_LUT_DATA0); + iowrite32(0, priv->regs + TSI721_LUT_DATA1); + iowrite32(0, priv->regs + TSI721_LUT_DATA2); + + for (i = 0; i < TSI721_OBWIN_NUM; i++) { + for (z = 0; z < TSI721_PC2SR_ZONES; z++) { + while (ioread32(priv->regs + TSI721_ZONE_SEL) & + TSI721_ZONE_SEL_GO) { + udelay(1); + } + rval = TSI721_ZONE_SEL_GO | (i << 3) | z; + iowrite32(rval, priv->regs + TSI721_ZONE_SEL); + } + } + + if (priv->p2r_bar[0].size == 0 && priv->p2r_bar[1].size == 0) { + priv->obwin_cnt = 0; + return; + } + + priv->p2r_bar[0].free = priv->p2r_bar[0].size; + priv->p2r_bar[1].free = priv->p2r_bar[1].size; + + for (i = 0; i < TSI721_OBWIN_NUM; i++) + priv->ob_win[i].active = false; + + priv->obwin_cnt = TSI721_OBWIN_NUM; } /** @@ -2418,6 +2611,8 @@ static struct rio_ops tsi721_rio_ops = { .unmap_inb = tsi721_rio_unmap_inb_mem, .pwenable = tsi721_pw_enable, .query_mport = tsi721_query_mport, + .map_outb = tsi721_map_outb_win, + .unmap_outb = tsi721_unmap_outb_win, }; static void tsi721_mport_release(struct device *dev) @@ -2573,14 +2768,27 @@ static int tsi721_probe(struct pci_dev *pdev, * It may be a good idea to keep them disabled using HW configuration * to save PCI memory space. 
- if ((pci_resource_flags(pdev, BAR_2) & IORESOURCE_MEM) && - (pci_resource_flags(pdev, BAR_2) & IORESOURCE_MEM_64)) { - dev_info(&pdev->dev, "Outbound BAR2 is not used but enabled.\n"); + + priv->p2r_bar[0].size = priv->p2r_bar[1].size = 0; + + if (pci_resource_flags(pdev, BAR_2) & IORESOURCE_MEM_64) { + if (pci_resource_flags(pdev, BAR_2) & IORESOURCE_PREFETCH) + dev_info(&pdev->dev, + "Prefetchable OBW BAR2 will not be used\n"); + else { + priv->p2r_bar[0].base = pci_resource_start(pdev, BAR_2); + priv->p2r_bar[0].size = pci_resource_len(pdev, BAR_2); + } } - if ((pci_resource_flags(pdev, BAR_4) & IORESOURCE_MEM) && - (pci_resource_flags(pdev, BAR_4) & IORESOURCE_MEM_64)) { - dev_info(&pdev->dev, "Outbound BAR4 is not used but enabled.\n"); + if (pci_resource_flags(pdev, BAR_4) & IORESOURCE_MEM_64) { + if (pci_resource_flags(pdev, BAR_4) & IORESOURCE_PREFETCH) + dev_info(&pdev->dev, + "Prefetchable OBW BAR4 will not be used\n"); + else { + priv->p2r_bar[1].base = pci_resource_start(pdev, BAR_4); + priv->p2r_bar[1].size = pci_resource_len(pdev, BAR_4); + } } err = pci_request_regions(pdev, DRV_NAME); diff --git a/drivers/rapidio/devices/tsi721.h b/drivers/rapidio/devices/tsi721.h index 58637295a34c..57b46d025630 100644 --- a/drivers/rapidio/devices/tsi721.h +++ b/drivers/rapidio/devices/tsi721.h @@ -822,6 +822,21 @@ struct tsi721_ib_win { struct list_head mappings; }; +struct tsi721_obw_bar { + u64 base; + u64 size; + u64 free; +}; + +struct tsi721_ob_win { + u64 base; + u32 size; + u16 destid; + u64 rstart; + bool active; + struct tsi721_obw_bar *pbar; +}; + struct tsi721_device { struct pci_dev *pdev; struct rio_mport mport; @@ -861,6 +876,11 @@ struct tsi721_device { /* Inbound Mapping Windows */ struct tsi721_ib_win ib_win[TSI721_IBWIN_NUM]; int ibwin_cnt; + + /* Outbound Mapping Windows */ + struct tsi721_obw_bar p2r_bar[2]; + struct tsi721_ob_win ob_win[TSI721_OBWIN_NUM]; + int obwin_cnt; }; #ifdef CONFIG_RAPIDIO_DMA_ENGINE From 72d8a0d23083ba89fb00a7ad9b07419e34ebe47c Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:56 -0700 Subject: [PATCH 53/81] rapidio/tsi721: add filtered debug output Replace "all-or-nothing" debug output with controlled debug output using functional block masks. This allows run-time control of debug messages through the 'dbg_level' module parameter. Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rapidio/tsi721.txt | 9 + drivers/rapidio/devices/tsi721.c | 242 ++++++++++++++------------- drivers/rapidio/devices/tsi721.h | 40 +++++ drivers/rapidio/devices/tsi721_dma.c | 147 ++++++++-------- 4 files changed, 249 insertions(+), 189 deletions(-) diff --git a/Documentation/rapidio/tsi721.txt b/Documentation/rapidio/tsi721.txt index 626052f403bb..7c1c7bf48ec0 100644 --- a/Documentation/rapidio/tsi721.txt +++ b/Documentation/rapidio/tsi721.txt @@ -16,6 +16,15 @@ For inbound messages this driver uses destination ID matching to forward message into the corresponding message queue. Messaging callbacks are implemented to be fully compatible with RIONET driver (Ethernet over RapidIO messaging services). +1. Module parameters: +- 'dbg_level' - This parameter controls the amount of debug information + generated by this device driver. This parameter is formed by a set of + bit masks that correspond to the specific + functional block.
+ For mask definitions see 'drivers/rapidio/devices/tsi721.h' + This parameter can be changed dynamically. + Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level. + II. Known problems None. diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index 74bc1fbb784e..b5b455614f8a 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -36,6 +36,12 @@ #include "tsi721.h" +#ifdef DEBUG +u32 dbg_level = DBG_INIT | DBG_EXIT; +module_param(dbg_level, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(dbg_level, "Debugging output level (default 0 = none)"); +#endif + static void tsi721_omsg_handler(struct tsi721_device *priv, int ch); static void tsi721_imsg_handler(struct tsi721_device *priv, int ch); @@ -141,9 +147,9 @@ static int tsi721_maint_dma(struct tsi721_device *priv, u32 sys_size, & TSI721_DMAC_STS_RUN) { udelay(1); if (++i >= 5000000) { - dev_dbg(&priv->pdev->dev, - "%s : DMA[%d] read timeout ch_status=%x\n", - __func__, priv->mdma.ch_id, ch_stat); + tsi_debug(MAINT, &priv->pdev->dev, + "DMA[%d] read timeout ch_status=%x", + priv->mdma.ch_id, ch_stat); if (!do_wr) *data = 0xffffffff; err = -EIO; @@ -155,10 +161,12 @@ static int tsi721_maint_dma(struct tsi721_device *priv, u32 sys_size, /* If DMA operation aborted due to error, * reinitialize DMA channel */ - dev_dbg(&priv->pdev->dev, "%s : DMA ABORT ch_stat=%x\n", - __func__, ch_stat); - dev_dbg(&priv->pdev->dev, "OP=%d : destid=%x hc=%x off=%x\n", - do_wr ? MAINT_WR : MAINT_RD, destid, hopcount, offset); + tsi_debug(MAINT, &priv->pdev->dev, "DMA ABORT ch_stat=%x", + ch_stat); + tsi_debug(MAINT, &priv->pdev->dev, + "OP=%d : destid=%x hc=%x off=%x", + do_wr ? MAINT_WR : MAINT_RD, + destid, hopcount, offset); iowrite32(TSI721_DMAC_INT_ALL, regs + TSI721_DMAC_INT); iowrite32(TSI721_DMAC_CTL_INIT, regs + TSI721_DMAC_CTL); udelay(10); @@ -336,8 +344,8 @@ static int tsi721_dsend(struct rio_mport *mport, int index, offset = (((mport->sys_size) ? 
RIO_TT_CODE_16 : RIO_TT_CODE_8) << 18) | (destid << 2); - dev_dbg(&priv->pdev->dev, - "Send Doorbell 0x%04x to destID 0x%x\n", data, destid); + tsi_debug(DBELL, &priv->pdev->dev, + "Send Doorbell 0x%04x to destID 0x%x", data, destid); iowrite16be(data, priv->odb_base + offset); return 0; @@ -411,10 +419,10 @@ static void tsi721_db_dpc(struct work_struct *work) dbell->dinb(mport, dbell->dev_id, DBELL_SID(idb.bytes), DBELL_TID(idb.bytes), DBELL_INF(idb.bytes)); } else { - dev_dbg(&priv->pdev->dev, - "spurious inb doorbell, sid %2.2x tid %2.2x" - " info %4.4x\n", DBELL_SID(idb.bytes), - DBELL_TID(idb.bytes), DBELL_INF(idb.bytes)); + tsi_debug(DBELL, &priv->pdev->dev, + "spurious IDB sid %2.2x tid %2.2x info %4.4x", + DBELL_SID(idb.bytes), DBELL_TID(idb.bytes), + DBELL_INF(idb.bytes)); } wr_ptr = ioread32(priv->regs + @@ -470,8 +478,8 @@ static irqreturn_t tsi721_irqhandler(int irq, void *ptr) if (intval & TSI721_SR_CHINT_IDBQRCV) tsi721_dbell_handler(priv); else - dev_info(&priv->pdev->dev, - "Unsupported SR_CH_INT %x\n", intval); + tsi_info(&priv->pdev->dev, + "Unsupported SR_CH_INT %x", intval); /* Clear interrupts */ iowrite32(intval, @@ -533,8 +541,8 @@ static irqreturn_t tsi721_irqhandler(int irq, void *ptr) int ch; if (dev_ch_int & TSI721_INT_BDMA_CHAN_M) { - dev_dbg(&priv->pdev->dev, - "IRQ from DMA channel 0x%08x\n", dev_ch_int); + tsi_debug(DMA, &priv->pdev->dev, + "IRQ from DMA channel 0x%08x", dev_ch_int); for (ch = 0; ch < TSI721_DMA_MAXCH; ch++) { if (!(dev_ch_int & TSI721_INT_BDMA_CHAN(ch))) @@ -749,8 +757,8 @@ static int tsi721_enable_msix(struct tsi721_device *priv) err = pci_enable_msix_exact(priv->pdev, entries, ARRAY_SIZE(entries)); if (err) { - dev_err(&priv->pdev->dev, - "Failed to enable MSI-X (err=%d)\n", err); + tsi_err(&priv->pdev->dev, + "Failed to enable MSI-X (err=%d)", err); return err; } @@ -824,8 +832,8 @@ static int tsi721_request_irq(struct tsi721_device *priv) DRV_NAME, (void *)priv); if (err) - dev_err(&priv->pdev->dev, - "Unable to allocate interrupt, Error: %d\n", err); + tsi_err(&priv->pdev->dev, + "Unable to allocate interrupt, err=%d", err); return err; } @@ -891,7 +899,7 @@ tsi721_obw_alloc(struct tsi721_device *priv, struct tsi721_obw_bar *pbar, return -ENOMEM; if (!new_win) { - dev_err(&priv->pdev->dev, "ERR: OBW count tracking failed\n"); + tsi_err(&priv->pdev->dev, "OBW count tracking failed"); return -EIO; } @@ -918,6 +926,9 @@ static int tsi721_map_outb_win(struct rio_mport *mport, u16 destid, u64 rstart, u32 zsize; int ret = -ENOMEM; + tsi_debug(OBW, &priv->pdev->dev, + "did=%d ra=0x%llx sz=0x%x", destid, rstart, size); + if (!is_power_of_2(size) || (size < 0x8000) || (rstart & (size - 1))) return -EINVAL; @@ -940,6 +951,8 @@ static int tsi721_map_outb_win(struct rio_mport *mport, u16 destid, u64 rstart, ob_win = &priv->ob_win[obw]; ob_win->destid = destid; ob_win->rstart = rstart; + tsi_debug(OBW, &priv->pdev->dev, + "allocated OBW%d @%llx", obw, ob_win->base); /* * Configure Outbound Window @@ -990,11 +1003,15 @@ static void tsi721_unmap_outb_win(struct rio_mport *mport, struct tsi721_ob_win *ob_win; int i; + tsi_debug(OBW, &priv->pdev->dev, "did=%d ra=0x%llx", destid, rstart); + for (i = 0; i < TSI721_OBWIN_NUM; i++) { ob_win = &priv->ob_win[i]; if (ob_win->active && ob_win->destid == destid && ob_win->rstart == rstart) { + tsi_debug(OBW, &priv->pdev->dev, + "free OBW%d @%llx", i, ob_win->base); ob_win->active = false; iowrite32(0, priv->regs + TSI721_OBWINLB(i)); ob_win->pbar->free += ob_win->size; @@ -1078,15 +1095,15 @@ static int 
tsi721_rio_map_inb_mem(struct rio_mport *mport, dma_addr_t lstart, int ret = -EBUSY; if (direct) { - dev_dbg(&priv->pdev->dev, - "Direct (RIO_0x%llx -> PCIe_0x%pad), size=0x%x", - rstart, &lstart, size); - /* Calculate minimal acceptable window size and base address */ ibw_size = roundup_pow_of_two(size); ibw_start = lstart & ~(ibw_size - 1); + tsi_debug(IBW, &priv->pdev->dev, + "Direct (RIO_0x%llx -> PCIe_0x%pad), size=0x%x, ibw_start = 0x%llx", + rstart, &lstart, size, ibw_start); + while ((lstart + size) > (ibw_start + ibw_size)) { ibw_size *= 2; ibw_start = lstart & ~(ibw_size - 1); @@ -1102,7 +1119,7 @@ static int tsi721_rio_map_inb_mem(struct rio_mport *mport, dma_addr_t lstart, return -ENOMEM; } else { - dev_dbg(&priv->pdev->dev, + tsi_debug(IBW, &priv->pdev->dev, "Translated (RIO_0x%llx -> PCIe_0x%pad), size=0x%x", rstart, &lstart, size); @@ -1197,9 +1214,9 @@ static int tsi721_rio_map_inb_mem(struct rio_mport *mport, dma_addr_t lstart, priv->ibwin_cnt--; - dev_dbg(&priv->pdev->dev, - "Configured IBWIN%d (RIO_0x%llx -> PCIe_0x%llx), size=0x%llx\n", - i, ibw_start, (unsigned long long)loc_start, ibw_size); + tsi_debug(IBW, &priv->pdev->dev, + "Configured IBWIN%d (RIO_0x%llx -> PCIe_0x%pad), size=0x%llx", + i, ibw_start, &loc_start, ibw_size); return 0; out: @@ -1219,7 +1236,7 @@ static void tsi721_rio_unmap_inb_mem(struct rio_mport *mport, struct tsi721_ib_win *ib_win; int i; - dev_dbg(&priv->pdev->dev, + tsi_debug(IBW, &priv->pdev->dev, "Unmap IBW mapped to PCIe_0x%pad", &lstart); /* Search for matching active inbound translation window */ @@ -1255,7 +1272,7 @@ static void tsi721_rio_unmap_inb_mem(struct rio_mport *mport, break; } - dev_dbg(&priv->pdev->dev, "Disable IBWIN_%d", i); + tsi_debug(IBW, &priv->pdev->dev, "Disable IBWIN_%d", i); iowrite32(0, priv->regs + TSI721_IBWIN_LB(i)); ib_win->active = false; priv->ibwin_cnt++; @@ -1264,7 +1281,7 @@ static void tsi721_rio_unmap_inb_mem(struct rio_mport *mport, } if (i == TSI721_IBWIN_NUM) - dev_dbg(&priv->pdev->dev, + tsi_debug(IBW, &priv->pdev->dev, "IB window mapped to %pad not found", &lstart); } @@ -1319,7 +1336,7 @@ static int tsi721_port_write_init(struct tsi721_device *priv) spin_lock_init(&priv->pw_fifo_lock); if (kfifo_alloc(&priv->pw_fifo, TSI721_RIO_PW_MSG_SIZE * 32, GFP_KERNEL)) { - dev_err(&priv->pdev->dev, "PW FIFO allocation failed\n"); + tsi_err(&priv->pdev->dev, "PW FIFO allocation failed"); return -ENOMEM; } @@ -1351,8 +1368,9 @@ static int tsi721_doorbell_init(struct tsi721_device *priv) if (!priv->idb_base) return -ENOMEM; - dev_dbg(&priv->pdev->dev, "Allocated IDB buffer @ %p (phys = %llx)\n", - priv->idb_base, (unsigned long long)priv->idb_dma); + tsi_debug(DBELL, &priv->pdev->dev, + "Allocated IDB buffer @ %p (phys = %pad)", + priv->idb_base, &priv->idb_dma); iowrite32(TSI721_IDQ_SIZE_VAL(IDB_QSIZE), priv->regs + TSI721_IDQ_SIZE(IDB_QUEUE)); @@ -1398,9 +1416,8 @@ static int tsi721_bdma_maint_init(struct tsi721_device *priv) int bd_num = 2; void __iomem *regs; - dev_dbg(&priv->pdev->dev, - "Init Block DMA Engine for Maintenance requests, CH%d\n", - TSI721_DMACH_MAINT); + tsi_debug(MAINT, &priv->pdev->dev, + "Init BDMA_%d Maintenance requests", TSI721_DMACH_MAINT); /* * Initialize DMA channel for maintenance requests @@ -1420,8 +1437,8 @@ static int tsi721_bdma_maint_init(struct tsi721_device *priv) priv->mdma.bd_phys = bd_phys; priv->mdma.bd_base = bd_ptr; - dev_dbg(&priv->pdev->dev, "DMA descriptors @ %p (phys = %llx)\n", - bd_ptr, (unsigned long long)bd_phys); + tsi_debug(MAINT, &priv->pdev->dev, "DMA 
descriptors @ %p (phys = %pad)", + bd_ptr, &bd_phys); /* Allocate space for descriptor status FIFO */ sts_size = (bd_num >= TSI721_DMA_MINSTSSZ) ? @@ -1443,9 +1460,9 @@ static int tsi721_bdma_maint_init(struct tsi721_device *priv) priv->mdma.sts_base = sts_ptr; priv->mdma.sts_size = sts_size; - dev_dbg(&priv->pdev->dev, - "desc status FIFO @ %p (phys = %llx) size=0x%x\n", - sts_ptr, (unsigned long long)sts_phys, sts_size); + tsi_debug(MAINT, &priv->pdev->dev, + "desc status FIFO @ %p (phys = %pad) size=0x%x", + sts_ptr, &sts_phys, sts_size); /* Initialize DMA descriptors ring */ bd_ptr[bd_num - 1].type_id = cpu_to_le32(DTYPE3 << 29); @@ -1720,8 +1737,8 @@ static void tsi721_omsg_handler(struct tsi721_device *priv, int ch) omsg_int = ioread32(priv->regs + TSI721_OBDMAC_INT(ch)); if (omsg_int & TSI721_OBDMAC_INT_ST_FULL) - dev_info(&priv->pdev->dev, - "OB MBOX%d: Status FIFO is full\n", ch); + tsi_info(&priv->pdev->dev, + "OB MBOX%d: Status FIFO is full", ch); if (omsg_int & (TSI721_OBDMAC_INT_DONE | TSI721_OBDMAC_INT_IOF_DONE)) { u32 srd_ptr; @@ -1777,7 +1794,7 @@ static void tsi721_omsg_handler(struct tsi721_device *priv, int ch) } if (tx_slot >= priv->omsg_ring[ch].size) - dev_dbg(&priv->pdev->dev, + tsi_debug(OMSG, &priv->pdev->dev, "OB_MSG tx_slot=%x > size=%x", tx_slot, priv->omsg_ring[ch].size); WARN_ON(tx_slot >= priv->omsg_ring[ch].size); @@ -1799,8 +1816,8 @@ no_sts_update: * reinitialize OB MSG channel */ - dev_dbg(&priv->pdev->dev, "OB MSG ABORT ch_stat=%x\n", - ioread32(priv->regs + TSI721_OBDMAC_STS(ch))); + tsi_debug(OMSG, &priv->pdev->dev, "OB MSG ABORT ch_stat=%x", + ioread32(priv->regs + TSI721_OBDMAC_STS(ch))); iowrite32(TSI721_OBDMAC_INT_ERROR, priv->regs + TSI721_OBDMAC_INT(ch)); @@ -1874,9 +1891,8 @@ static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id, &priv->omsg_ring[mbox].omq_phys[i], GFP_KERNEL); if (priv->omsg_ring[mbox].omq_base[i] == NULL) { - dev_dbg(&priv->pdev->dev, - "Unable to allocate OB MSG data buffer for" - " MBOX%d\n", mbox); + tsi_debug(OMSG, &priv->pdev->dev, + "ENOMEM for OB_MSG_%d data buffer", mbox); rc = -ENOMEM; goto out_buf; } @@ -1888,9 +1904,8 @@ static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id, (entries + 1) * sizeof(struct tsi721_omsg_desc), &priv->omsg_ring[mbox].omd_phys, GFP_KERNEL); if (priv->omsg_ring[mbox].omd_base == NULL) { - dev_dbg(&priv->pdev->dev, - "Unable to allocate OB MSG descriptor memory " - "for MBOX%d\n", mbox); + tsi_debug(OMSG, &priv->pdev->dev, + "ENOMEM for OB_MSG_%d descriptor memory", mbox); rc = -ENOMEM; goto out_buf; } @@ -1904,9 +1919,8 @@ static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id, sizeof(struct tsi721_dma_sts), &priv->omsg_ring[mbox].sts_phys, GFP_KERNEL); if (priv->omsg_ring[mbox].sts_base == NULL) { - dev_dbg(&priv->pdev->dev, - "Unable to allocate OB MSG descriptor status FIFO " - "for MBOX%d\n", mbox); + tsi_debug(OMSG, &priv->pdev->dev, + "ENOMEM for OB_MSG_%d status FIFO", mbox); rc = -ENOMEM; goto out_desc; } @@ -1942,9 +1956,9 @@ static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id, priv->msix[idx].irq_name, (void *)priv); if (rc) { - dev_dbg(&priv->pdev->dev, - "Unable to allocate MSI-X interrupt for " - "OBOX%d-DONE\n", mbox); + tsi_debug(OMSG, &priv->pdev->dev, + "Unable to get MSI-X IRQ for OBOX%d-DONE", + mbox); goto out_stat; } @@ -1953,9 +1967,8 @@ static int tsi721_open_outb_mbox(struct rio_mport *mport, void *dev_id, priv->msix[idx].irq_name, (void *)priv); if (rc) { - dev_dbg(&priv->pdev->dev, - "Unable to 
allocate MSI-X interrupt for " - "MBOX%d-INT\n", mbox); + tsi_debug(OMSG, &priv->pdev->dev, + "Unable to get MSI-X IRQ for MBOX%d-INT", mbox); idx = TSI721_VECT_OMB0_DONE + mbox; free_irq(priv->msix[idx].vector, (void *)priv); goto out_stat; @@ -2096,16 +2109,13 @@ static void tsi721_imsg_handler(struct tsi721_device *priv, int ch) imsg_int = ioread32(priv->regs + TSI721_IBDMAC_INT(ch)); if (imsg_int & TSI721_IBDMAC_INT_SRTO) - dev_info(&priv->pdev->dev, "IB MBOX%d SRIO timeout\n", - mbox); + tsi_info(&priv->pdev->dev, "IB MBOX%d SRIO timeout", mbox); if (imsg_int & TSI721_IBDMAC_INT_PC_ERROR) - dev_info(&priv->pdev->dev, "IB MBOX%d PCIe error\n", - mbox); + tsi_info(&priv->pdev->dev, "IB MBOX%d PCIe error", mbox); if (imsg_int & TSI721_IBDMAC_INT_FQ_LOW) - dev_info(&priv->pdev->dev, - "IB MBOX%d IB free queue low\n", mbox); + tsi_info(&priv->pdev->dev, "IB MBOX%d IB free queue low", mbox); /* Clear IB channel interrupts */ iowrite32(imsg_int, priv->regs + TSI721_IBDMAC_INT(ch)); @@ -2169,8 +2179,8 @@ static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id, GFP_KERNEL); if (priv->imsg_ring[mbox].buf_base == NULL) { - dev_err(&priv->pdev->dev, - "Failed to allocate buffers for IB MBOX%d\n", mbox); + tsi_err(&priv->pdev->dev, + "Failed to allocate buffers for IB MBOX%d", mbox); rc = -ENOMEM; goto out; } @@ -2183,8 +2193,8 @@ static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id, GFP_KERNEL); if (priv->imsg_ring[mbox].imfq_base == NULL) { - dev_err(&priv->pdev->dev, - "Failed to allocate free queue for IB MBOX%d\n", mbox); + tsi_err(&priv->pdev->dev, + "Failed to allocate free queue for IB MBOX%d", mbox); rc = -ENOMEM; goto out_buf; } @@ -2196,8 +2206,8 @@ static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id, &priv->imsg_ring[mbox].imd_phys, GFP_KERNEL); if (priv->imsg_ring[mbox].imd_base == NULL) { - dev_err(&priv->pdev->dev, - "Failed to allocate descriptor memory for IB MBOX%d\n", + tsi_err(&priv->pdev->dev, + "Failed to allocate descriptor memory for IB MBOX%d", mbox); rc = -ENOMEM; goto out_dma; @@ -2256,9 +2266,9 @@ static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id, priv->msix[idx].irq_name, (void *)priv); if (rc) { - dev_dbg(&priv->pdev->dev, - "Unable to allocate MSI-X interrupt for " - "IBOX%d-DONE\n", mbox); + tsi_debug(IMSG, &priv->pdev->dev, + "Unable to get MSI-X IRQ for IBOX%d-DONE", + mbox); goto out_desc; } @@ -2267,9 +2277,8 @@ static int tsi721_open_inb_mbox(struct rio_mport *mport, void *dev_id, priv->msix[idx].irq_name, (void *)priv); if (rc) { - dev_dbg(&priv->pdev->dev, - "Unable to allocate MSI-X interrupt for " - "IBOX%d-INT\n", mbox); + tsi_debug(IMSG, &priv->pdev->dev, + "Unable to get MSI-X IRQ for IBOX%d-INT", mbox); free_irq( priv->msix[TSI721_VECT_IMB0_RCV + mbox].vector, (void *)priv); @@ -2392,8 +2401,8 @@ static int tsi721_add_inb_buffer(struct rio_mport *mport, int mbox, void *buf) rx_slot = priv->imsg_ring[mbox].rx_slot; if (priv->imsg_ring[mbox].imq_base[rx_slot]) { - dev_err(&priv->pdev->dev, - "Error adding inbound buffer %d, buffer exists\n", + tsi_err(&priv->pdev->dev, + "Error adding inbound buffer %d, buffer exists", rx_slot); rc = -EINVAL; goto out; @@ -2619,7 +2628,7 @@ static void tsi721_mport_release(struct device *dev) { struct rio_mport *mport = to_rio_mport(dev); - dev_dbg(dev, "RIO: %s %s id=%d\n", __func__, mport->name, mport->id); + tsi_debug(EXIT, dev, "%s id=%d", mport->name, mport->id); } /** @@ -2663,15 +2672,15 @@ static int tsi721_setup_mport(struct tsi721_device *priv) 
else if (!pci_enable_msi(pdev)) priv->flags |= TSI721_USING_MSI; else - dev_info(&pdev->dev, - "MSI/MSI-X is not available. Using legacy INTx.\n"); + tsi_debug(MPORT, &pdev->dev, + "MSI/MSI-X is not available. Using legacy INTx."); #endif /* CONFIG_PCI_MSI */ err = tsi721_request_irq(priv); if (err) { - dev_err(&pdev->dev, "Unable to get assigned PCI IRQ " - "vector %02X err=0x%x\n", pdev->irq, err); + tsi_err(&pdev->dev, "Unable to get PCI IRQ %02X (err=0x%x)", + pdev->irq, err); return err; } @@ -2712,15 +2721,14 @@ static int tsi721_probe(struct pci_dev *pdev, int err; priv = kzalloc(sizeof(struct tsi721_device), GFP_KERNEL); - if (priv == NULL) { - dev_err(&pdev->dev, "Failed to allocate memory for device\n"); + if (!priv) { err = -ENOMEM; goto err_exit; } err = pci_enable_device(pdev); if (err) { - dev_err(&pdev->dev, "Failed to enable PCI device\n"); + tsi_err(&pdev->dev, "Failed to enable PCI device"); goto err_clean; } @@ -2728,13 +2736,12 @@ static int tsi721_probe(struct pci_dev *pdev, #ifdef DEBUG { - int i; - for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { - dev_dbg(&pdev->dev, "res[%d] @ 0x%llx (0x%lx, 0x%lx)\n", - i, (unsigned long long)pci_resource_start(pdev, i), - (unsigned long)pci_resource_len(pdev, i), - pci_resource_flags(pdev, i)); - } + int i; + + for (i = 0; i <= PCI_STD_RESOURCE_END; i++) { + tsi_debug(INIT, &pdev->dev, "res%d %pR", + i, &pdev->resource[i]); + } } #endif /* @@ -2745,8 +2752,7 @@ static int tsi721_probe(struct pci_dev *pdev, if (!(pci_resource_flags(pdev, BAR_0) & IORESOURCE_MEM) || pci_resource_flags(pdev, BAR_0) & IORESOURCE_MEM_64 || pci_resource_len(pdev, BAR_0) < TSI721_REG_SPACE_SIZE) { - dev_err(&pdev->dev, - "Missing or misconfigured CSR BAR0, aborting.\n"); + tsi_err(&pdev->dev, "Missing or misconfigured CSR BAR0"); err = -ENODEV; goto err_disable_pdev; } @@ -2755,8 +2761,7 @@ static int tsi721_probe(struct pci_dev *pdev, if (!(pci_resource_flags(pdev, BAR_1) & IORESOURCE_MEM) || pci_resource_flags(pdev, BAR_1) & IORESOURCE_MEM_64 || pci_resource_len(pdev, BAR_1) < TSI721_DB_WIN_SIZE) { - dev_err(&pdev->dev, - "Missing or misconfigured Doorbell BAR1, aborting.\n"); + tsi_err(&pdev->dev, "Missing or misconfigured Doorbell BAR1"); err = -ENODEV; goto err_disable_pdev; } @@ -2773,8 +2778,8 @@ static int tsi721_probe(struct pci_dev *pdev, if (pci_resource_flags(pdev, BAR_2) & IORESOURCE_MEM_64) { if (pci_resource_flags(pdev, BAR_2) & IORESOURCE_PREFETCH) - dev_info(&pdev->dev, - "Prefetchable OBW BAR2 will not be used\n"); + tsi_debug(INIT, &pdev->dev, + "Prefetchable OBW BAR2 will not be used"); else { priv->p2r_bar[0].base = pci_resource_start(pdev, BAR_2); priv->p2r_bar[0].size = pci_resource_len(pdev, BAR_2); @@ -2783,8 +2788,8 @@ static int tsi721_probe(struct pci_dev *pdev, if (pci_resource_flags(pdev, BAR_4) & IORESOURCE_MEM_64) { if (pci_resource_flags(pdev, BAR_4) & IORESOURCE_PREFETCH) - dev_info(&pdev->dev, - "Prefetchable OBW BAR4 will not be used\n"); + tsi_debug(INIT, &pdev->dev, + "Prefetchable OBW BAR4 will not be used"); else { priv->p2r_bar[1].base = pci_resource_start(pdev, BAR_4); priv->p2r_bar[1].size = pci_resource_len(pdev, BAR_4); @@ -2793,8 +2798,7 @@ static int tsi721_probe(struct pci_dev *pdev, err = pci_request_regions(pdev, DRV_NAME); if (err) { - dev_err(&pdev->dev, "Cannot obtain PCI resources, " - "aborting.\n"); + tsi_err(&pdev->dev, "Unable to obtain PCI resources"); goto err_disable_pdev; } @@ -2802,16 +2806,14 @@ static int tsi721_probe(struct pci_dev *pdev, priv->regs = pci_ioremap_bar(pdev, BAR_0); if 
(!priv->regs) { - dev_err(&pdev->dev, - "Unable to map device registers space, aborting\n"); + tsi_err(&pdev->dev, "Unable to map device registers space"); err = -ENOMEM; goto err_free_res; } priv->odb_base = pci_ioremap_bar(pdev, BAR_1); if (!priv->odb_base) { - dev_err(&pdev->dev, - "Unable to map outbound doorbells space, aborting\n"); + tsi_err(&pdev->dev, "Unable to map outbound doorbells space"); err = -ENOMEM; goto err_unmap_bars; } @@ -2820,16 +2822,16 @@ static int tsi721_probe(struct pci_dev *pdev, if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { - dev_info(&pdev->dev, "Unable to set DMA mask\n"); + tsi_err(&pdev->dev, "Unable to set DMA mask"); goto err_unmap_bars; } if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32))) - dev_info(&pdev->dev, "Unable to set consistent DMA mask\n"); + tsi_info(&pdev->dev, "Unable to set consistent DMA mask"); } else { err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); if (err) - dev_info(&pdev->dev, "Unable to set consistent DMA mask\n"); + tsi_info(&pdev->dev, "Unable to set consistent DMA mask"); } BUG_ON(!pci_is_pcie(pdev)); @@ -2858,7 +2860,7 @@ static int tsi721_probe(struct pci_dev *pdev, tsi721_init_sr2pc_mapping(priv); if (tsi721_bdma_maint_init(priv)) { - dev_err(&pdev->dev, "BDMA initialization failed, aborting\n"); + tsi_err(&pdev->dev, "BDMA initialization failed"); err = -ENOMEM; goto err_unmap_bars; } @@ -2907,7 +2909,7 @@ static void tsi721_remove(struct pci_dev *pdev) { struct tsi721_device *priv = pci_get_drvdata(pdev); - dev_dbg(&pdev->dev, "%s enter\n", __func__); + tsi_debug(EXIT, &pdev->dev, "enter"); tsi721_disable_ints(priv); tsi721_free_irq(priv); @@ -2935,14 +2937,14 @@ static void tsi721_remove(struct pci_dev *pdev) pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); kfree(priv); - dev_dbg(&pdev->dev, "%s exit\n", __func__); + tsi_debug(EXIT, &pdev->dev, "exit"); } static void tsi721_shutdown(struct pci_dev *pdev) { struct tsi721_device *priv = pci_get_drvdata(pdev); - dev_dbg(&pdev->dev, "RIO: %s\n", __func__); + tsi_debug(EXIT, &pdev->dev, "enter"); tsi721_disable_ints(priv); tsi721_dma_stop_all(priv); diff --git a/drivers/rapidio/devices/tsi721.h b/drivers/rapidio/devices/tsi721.h index 57b46d025630..5456dbddc929 100644 --- a/drivers/rapidio/devices/tsi721.h +++ b/drivers/rapidio/devices/tsi721.h @@ -21,6 +21,46 @@ #ifndef __TSI721_H #define __TSI721_H +/* Debug output filtering masks */ +enum { + DBG_NONE = 0, + DBG_INIT = BIT(0), /* driver init */ + DBG_EXIT = BIT(1), /* driver exit */ + DBG_MPORT = BIT(2), /* mport add/remove */ + DBG_MAINT = BIT(3), /* maintenance ops messages */ + DBG_DMA = BIT(4), /* DMA transfer messages */ + DBG_DMAV = BIT(5), /* verbose DMA transfer messages */ + DBG_IBW = BIT(6), /* inbound window */ + DBG_EVENT = BIT(7), /* event handling messages */ + DBG_OBW = BIT(8), /* outbound window messages */ + DBG_DBELL = BIT(9), /* doorbell messages */ + DBG_OMSG = BIT(10), /* outbound messaging */ + DBG_IMSG = BIT(11), /* inbound messaging */ + DBG_ALL = ~0, +}; + +#ifdef DEBUG +extern u32 dbg_level; + +#define tsi_debug(level, dev, fmt, arg...) \ + do { \ + if (DBG_##level & dbg_level) \ + dev_dbg(dev, "%s: " fmt "\n", __func__, ##arg); \ + } while (0) +#else +#define tsi_debug(level, dev, fmt, arg...) \ + no_printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##arg) +#endif + +#define tsi_info(dev, fmt, arg...) \ + dev_info(dev, "%s: " fmt "\n", __func__, ##arg) + +#define tsi_warn(dev, fmt, arg...)
\ + dev_warn(dev, "%s: WARNING " fmt "\n", __func__, ##arg) + +#define tsi_err(dev, fmt, arg...) \ + dev_err(dev, "%s: ERROR " fmt "\n", __func__, ##arg) + #define DRV_NAME "tsi721" #define DEFAULT_HOPCOUNT 0xff diff --git a/drivers/rapidio/devices/tsi721_dma.c b/drivers/rapidio/devices/tsi721_dma.c index 31bb61b43c3f..494482e4ab0a 100644 --- a/drivers/rapidio/devices/tsi721_dma.c +++ b/drivers/rapidio/devices/tsi721_dma.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "../../dma/dmaengine.h" @@ -75,7 +76,7 @@ static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num) struct tsi721_device *priv = to_tsi721(bdma_chan->dchan.device); #endif - dev_dbg(dev, "Init Block DMA Engine, CH%d\n", bdma_chan->id); + tsi_debug(DMA, &bdma_chan->dchan.dev->device, "DMAC%d", bdma_chan->id); /* * Allocate space for DMA descriptors @@ -91,8 +92,9 @@ static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num) bdma_chan->bd_phys = bd_phys; bdma_chan->bd_base = bd_ptr; - dev_dbg(dev, "DMA descriptors @ %p (phys = %llx)\n", - bd_ptr, (unsigned long long)bd_phys); + tsi_debug(DMA, &bdma_chan->dchan.dev->device, + "DMAC%d descriptors @ %p (phys = %pad)", + bdma_chan->id, bd_ptr, &bd_phys); /* Allocate space for descriptor status FIFO */ sts_size = ((bd_num + 1) >= TSI721_DMA_MINSTSSZ) ? @@ -114,9 +116,9 @@ static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num) bdma_chan->sts_base = sts_ptr; bdma_chan->sts_size = sts_size; - dev_dbg(dev, - "desc status FIFO @ %p (phys = %llx) size=0x%x\n", - sts_ptr, (unsigned long long)sts_phys, sts_size); + tsi_debug(DMA, &bdma_chan->dchan.dev->device, + "DMAC%d desc status FIFO @ %p (phys = %pad) size=0x%x", + bdma_chan->id, sts_ptr, &sts_phys, sts_size); /* Initialize DMA descriptors ring using added link descriptor */ bd_ptr[bd_num].type_id = cpu_to_le32(DTYPE3 << 29); @@ -155,8 +157,9 @@ static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num) priv->msix[idx].irq_name, (void *)bdma_chan); if (rc) { - dev_dbg(dev, "Unable to get MSI-X for BDMA%d-DONE\n", - bdma_chan->id); + tsi_debug(DMA, &bdma_chan->dchan.dev->device, + "Unable to get MSI-X for DMAC%d-DONE", + bdma_chan->id); goto err_out; } @@ -166,8 +169,9 @@ static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num) priv->msix[idx].irq_name, (void *)bdma_chan); if (rc) { - dev_dbg(dev, "Unable to get MSI-X for BDMA%d-INT\n", - bdma_chan->id); + tsi_debug(DMA, &bdma_chan->dchan.dev->device, + "Unable to get MSI-X for DMAC%d-INT", + bdma_chan->id); free_irq( priv->msix[TSI721_VECT_DMA0_DONE + bdma_chan->id].vector, @@ -302,20 +306,22 @@ static irqreturn_t tsi721_bdma_msix(int irq, void *ptr) static void tsi721_start_dma(struct tsi721_bdma_chan *bdma_chan) { if (!tsi721_dma_is_idle(bdma_chan)) { - dev_err(bdma_chan->dchan.device->dev, - "BUG: Attempt to start non-idle channel\n"); + tsi_err(&bdma_chan->dchan.dev->device, + "DMAC%d Attempt to start non-idle channel", + bdma_chan->id); return; } if (bdma_chan->wr_count == bdma_chan->wr_count_next) { - dev_err(bdma_chan->dchan.device->dev, - "BUG: Attempt to start DMA with no BDs ready\n"); + tsi_err(&bdma_chan->dchan.dev->device, + "DMAC%d Attempt to start DMA with no BDs ready %d", + bdma_chan->id, task_pid_nr(current)); return; } - dev_dbg(bdma_chan->dchan.device->dev, - "%s: chan_%d (wrc=%d)\n", __func__, bdma_chan->id, - bdma_chan->wr_count_next); + tsi_debug(DMA, &bdma_chan->dchan.dev->device, "DMAC%d (wrc=%d) %d", + bdma_chan->id, 
bdma_chan->wr_count_next, + task_pid_nr(current)); iowrite32(bdma_chan->wr_count_next, bdma_chan->regs + TSI721_DMAC_DWRCNT); @@ -417,10 +423,11 @@ static int tsi721_submit_sg(struct tsi721_tx_desc *desc) struct tsi721_dma_desc *bd_ptr = NULL; u32 idx, rd_idx; u32 add_count = 0; + struct device *ch_dev = &dchan->dev->device; if (!tsi721_dma_is_idle(bdma_chan)) { - dev_err(bdma_chan->dchan.device->dev, - "BUG: Attempt to use non-idle channel\n"); + tsi_err(ch_dev, "DMAC%d ERR: Attempt to use non-idle channel", + bdma_chan->id); return -EIO; } @@ -431,7 +438,7 @@ static int tsi721_submit_sg(struct tsi721_tx_desc *desc) rio_addr = desc->rio_addr; next_addr = -1; bcount = 0; - sys_size = dma_to_mport(bdma_chan->dchan.device)->sys_size; + sys_size = dma_to_mport(dchan->device)->sys_size; rd_idx = ioread32(bdma_chan->regs + TSI721_DMAC_DRDCNT); rd_idx %= (bdma_chan->bd_num + 1); @@ -443,18 +450,18 @@ static int tsi721_submit_sg(struct tsi721_tx_desc *desc) add_count++; } - dev_dbg(dchan->device->dev, "%s: BD ring status: rdi=%d wri=%d\n", - __func__, rd_idx, idx); + tsi_debug(DMA, ch_dev, "DMAC%d BD ring status: rdi=%d wri=%d", + bdma_chan->id, rd_idx, idx); for_each_sg(desc->sg, sg, desc->sg_len, i) { - dev_dbg(dchan->device->dev, "sg%d/%d addr: 0x%llx len: %d\n", - i, desc->sg_len, + tsi_debug(DMAV, ch_dev, "DMAC%d sg%d/%d addr: 0x%llx len: %d", + bdma_chan->id, i, desc->sg_len, (unsigned long long)sg_dma_address(sg), sg_dma_len(sg)); if (sg_dma_len(sg) > TSI721_BDMA_MAX_BCOUNT) { - dev_err(dchan->device->dev, - "%s: SG entry %d is too large\n", __func__, i); + tsi_err(ch_dev, "DMAC%d SG entry %d is too large", + bdma_chan->id, i); err = -EINVAL; break; } @@ -471,17 +478,16 @@ static int tsi721_submit_sg(struct tsi721_tx_desc *desc) } else if (next_addr != -1) { /* Finalize descriptor using total byte count value */ tsi721_desc_fill_end(bd_ptr, bcount, 0); - dev_dbg(dchan->device->dev, - "%s: prev desc final len: %d\n", - __func__, bcount); + tsi_debug(DMAV, ch_dev, "DMAC%d prev desc final len: %d", + bdma_chan->id, bcount); } desc->rio_addr = rio_addr; if (i && idx == rd_idx) { - dev_dbg(dchan->device->dev, - "%s: HW descriptor ring is full @ %d\n", - __func__, i); + tsi_debug(DMAV, ch_dev, + "DMAC%d HW descriptor ring is full @ %d", + bdma_chan->id, i); desc->sg = sg; desc->sg_len -= i; break; @@ -490,13 +496,12 @@ static int tsi721_submit_sg(struct tsi721_tx_desc *desc) bd_ptr = &((struct tsi721_dma_desc *)bdma_chan->bd_base)[idx]; err = tsi721_desc_fill_init(desc, bd_ptr, sg, sys_size); if (err) { - dev_err(dchan->device->dev, - "Failed to build desc: err=%d\n", err); + tsi_err(ch_dev, "Failed to build desc: err=%d", err); break; } - dev_dbg(dchan->device->dev, "bd_ptr = %p did=%d raddr=0x%llx\n", - bd_ptr, desc->destid, desc->rio_addr); + tsi_debug(DMAV, ch_dev, "DMAC%d bd_ptr = %p did=%d raddr=0x%llx", + bdma_chan->id, bd_ptr, desc->destid, desc->rio_addr); next_addr = sg_dma_address(sg); bcount = sg_dma_len(sg); @@ -511,8 +516,9 @@ static int tsi721_submit_sg(struct tsi721_tx_desc *desc) entry_done: if (sg_is_last(sg)) { tsi721_desc_fill_end(bd_ptr, bcount, 0); - dev_dbg(dchan->device->dev, "%s: last desc final len: %d\n", - __func__, bcount); + tsi_debug(DMAV, ch_dev, + "DMAC%d last desc final len: %d", + bdma_chan->id, bcount); desc->sg_len = 0; } else { rio_addr += sg_dma_len(sg); @@ -531,7 +537,7 @@ static void tsi721_advance_work(struct tsi721_bdma_chan *bdma_chan, { int err; - dev_dbg(bdma_chan->dchan.device->dev, "%s: Enter\n", __func__); + tsi_debug(DMA, 
&bdma_chan->dchan.dev->device, "DMAC%d", bdma_chan->id); if (!tsi721_dma_is_idle(bdma_chan)) return; @@ -555,13 +561,14 @@ static void tsi721_advance_work(struct tsi721_bdma_chan *bdma_chan, tsi721_start_dma(bdma_chan); else { tsi721_dma_tx_err(bdma_chan, desc); - dev_dbg(bdma_chan->dchan.device->dev, - "ERR: tsi721_submit_sg failed with err=%d\n", - err); + tsi_debug(DMA, &bdma_chan->dchan.dev->device, + "DMAC%d ERR: tsi721_submit_sg failed with err=%d", + bdma_chan->id, err); } } - dev_dbg(bdma_chan->dchan.device->dev, "%s: Exit\n", __func__); + tsi_debug(DMA, &bdma_chan->dchan.dev->device, "DMAC%d Exit", + bdma_chan->id); } static void tsi721_dma_tasklet(unsigned long data) @@ -570,16 +577,16 @@ static void tsi721_dma_tasklet(unsigned long data) u32 dmac_int, dmac_sts; dmac_int = ioread32(bdma_chan->regs + TSI721_DMAC_INT); - dev_dbg(bdma_chan->dchan.device->dev, "%s: DMAC%d_INT = 0x%x\n", - __func__, bdma_chan->id, dmac_int); + tsi_debug(DMA, &bdma_chan->dchan.dev->device, "DMAC%d_INT = 0x%x", + bdma_chan->id, dmac_int); /* Clear channel interrupts */ iowrite32(dmac_int, bdma_chan->regs + TSI721_DMAC_INT); if (dmac_int & TSI721_DMAC_INT_ERR) { dmac_sts = ioread32(bdma_chan->regs + TSI721_DMAC_STS); - dev_err(bdma_chan->dchan.device->dev, - "%s: DMA ERROR - DMAC%d_STS = 0x%x\n", - __func__, bdma_chan->id, dmac_sts); + tsi_err(&bdma_chan->dchan.dev->device, + "ERR - DMAC%d_STS = 0x%x", + bdma_chan->id, dmac_sts); spin_lock(&bdma_chan->lock); bdma_chan->active_tx = NULL; @@ -587,9 +594,9 @@ static void tsi721_dma_tasklet(unsigned long data) } if (dmac_int & TSI721_DMAC_INT_STFULL) { - dev_err(bdma_chan->dchan.device->dev, - "%s: DMAC%d descriptor status FIFO is full\n", - __func__, bdma_chan->id); + tsi_err(&bdma_chan->dchan.dev->device, + "DMAC%d descriptor status FIFO is full", + bdma_chan->id); } if (dmac_int & (TSI721_DMAC_INT_DONE | TSI721_DMAC_INT_IOFDONE)) { @@ -633,8 +640,9 @@ static dma_cookie_t tsi721_tx_submit(struct dma_async_tx_descriptor *txd) /* Check if the descriptor is detached from any lists */ if (!list_empty(&desc->desc_node)) { - dev_err(bdma_chan->dchan.device->dev, - "%s: wrong state of descriptor %p\n", __func__, txd); + tsi_err(&bdma_chan->dchan.dev->device, + "DMAC%d wrong state of descriptor %p", + bdma_chan->id, txd); return -EIO; } @@ -659,16 +667,15 @@ static int tsi721_alloc_chan_resources(struct dma_chan *dchan) struct tsi721_tx_desc *desc = NULL; int i; - dev_dbg(dchan->device->dev, "%s: for channel %d\n", - __func__, bdma_chan->id); + tsi_debug(DMA, &dchan->dev->device, "DMAC%d", bdma_chan->id); if (bdma_chan->bd_base) return TSI721_DMA_TX_QUEUE_SZ; /* Initialize BDMA channel */ if (tsi721_bdma_ch_init(bdma_chan, dma_desc_per_channel)) { - dev_err(dchan->device->dev, "Unable to initialize data DMA" - " channel %d, aborting\n", bdma_chan->id); + tsi_err(&dchan->dev->device, "Unable to initialize DMAC%d", + bdma_chan->id); return -ENODEV; } @@ -676,8 +683,9 @@ static int tsi721_alloc_chan_resources(struct dma_chan *dchan) desc = kcalloc(TSI721_DMA_TX_QUEUE_SZ, sizeof(struct tsi721_tx_desc), GFP_KERNEL); if (!desc) { - dev_err(dchan->device->dev, - "Failed to allocate logical descriptors\n"); + tsi_err(&dchan->dev->device, + "DMAC%d Failed to allocate logical descriptors", + bdma_chan->id); tsi721_bdma_ch_free(bdma_chan); return -ENOMEM; } @@ -718,8 +726,7 @@ static void tsi721_free_chan_resources(struct dma_chan *dchan) { struct tsi721_bdma_chan *bdma_chan = to_tsi721_chan(dchan); - dev_dbg(dchan->device->dev, "%s: for channel %d\n", - __func__, 
bdma_chan->id); + tsi_debug(DMA, &dchan->dev->device, "DMAC%d", bdma_chan->id); if (bdma_chan->bd_base == NULL) return; @@ -744,7 +751,7 @@ static void tsi721_issue_pending(struct dma_chan *dchan) { struct tsi721_bdma_chan *bdma_chan = to_tsi721_chan(dchan); - dev_dbg(dchan->device->dev, "%s: Enter\n", __func__); + tsi_debug(DMA, &dchan->dev->device, "DMAC%d", bdma_chan->id); spin_lock_bh(&bdma_chan->lock); if (tsi721_dma_is_idle(bdma_chan) && bdma_chan->active) { @@ -766,12 +773,13 @@ struct dma_async_tx_descriptor *tsi721_prep_rio_sg(struct dma_chan *dchan, struct dma_async_tx_descriptor *txd = NULL; if (!sgl || !sg_len) { - dev_err(dchan->device->dev, "%s: No SG list\n", __func__); + tsi_err(&dchan->dev->device, "DMAC%d No SG list", + bdma_chan->id); return NULL; } - dev_dbg(dchan->device->dev, "%s: %s\n", __func__, - (dir == DMA_DEV_TO_MEM)?"READ":"WRITE"); + tsi_debug(DMA, &dchan->dev->device, "DMAC%d %s", bdma_chan->id, + (dir == DMA_DEV_TO_MEM)?"READ":"WRITE"); if (dir == DMA_DEV_TO_MEM) rtype = NREAD; @@ -789,8 +797,9 @@ struct dma_async_tx_descriptor *tsi721_prep_rio_sg(struct dma_chan *dchan, break; } } else { - dev_err(dchan->device->dev, - "%s: Unsupported DMA direction option\n", __func__); + tsi_err(&dchan->dev->device, + "DMAC%d Unsupported DMA direction option", + bdma_chan->id); return NULL; } @@ -823,7 +832,7 @@ static int tsi721_terminate_all(struct dma_chan *dchan) u32 dmac_int; LIST_HEAD(list); - dev_dbg(dchan->device->dev, "%s: Entry\n", __func__); + tsi_debug(DMA, &dchan->dev->device, "DMAC%d", bdma_chan->id); spin_lock_bh(&bdma_chan->lock); @@ -933,7 +942,7 @@ int tsi721_register_dma(struct tsi721_device *priv) err = dma_async_device_register(&mport->dma); if (err) - dev_err(&priv->pdev->dev, "Failed to register DMA device\n"); + tsi_err(&priv->pdev->dev, "Failed to register DMA device"); return err; } From 83472457505c454592d91807754135d0ad34b125 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:26:59 -0700 Subject: [PATCH 54/81] rapidio/tsi721_dma: update error reporting from prep_sg callback Switch to returning an error-valued pointer instead of a plain NULL pointer. This makes it possible to properly identify the situation when the request queue is full, and gives the upper layer an option to retry the operation later.
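For illustration, with this change an upper layer submitting through the dmaengine API can tell a transient full-queue condition apart from a hard failure. The helper below is a minimal sketch, not part of this patch: submit_rio_xfer() and its retry policy are hypothetical, while rio_dma_prep_xfer(), dmaengine_submit() and dma_async_issue_pending() are the existing kernel interfaces used elsewhere in this series.

#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/rio.h>
#include <linux/rio_drv.h>

/* Hypothetical caller: map ERR_PTR(-EBUSY) from the prep callback to a
 * retryable error code and treat any other error pointer as a hard failure.
 */
static int submit_rio_xfer(struct dma_chan *chan, u16 destid,
			   struct rio_dma_data *data,
			   enum dma_transfer_direction dir)
{
	struct dma_async_tx_descriptor *tx;

	tx = rio_dma_prep_xfer(chan, destid, data, dir,
			       DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
	if (PTR_ERR(tx) == -EBUSY)
		return -EAGAIN;	/* request queue full, caller may retry */
	if (IS_ERR_OR_NULL(tx))
		return tx ? PTR_ERR(tx) : -EIO;	/* hard failure */

	dmaengine_submit(tx);
	dma_async_issue_pending(chan);
	return 0;
}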
Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721_dma.c | 37 ++++++++++++++++------------ 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/drivers/rapidio/devices/tsi721_dma.c b/drivers/rapidio/devices/tsi721_dma.c index 494482e4ab0a..5bc9071ebcae 100644 --- a/drivers/rapidio/devices/tsi721_dma.c +++ b/drivers/rapidio/devices/tsi721_dma.c @@ -767,7 +767,7 @@ struct dma_async_tx_descriptor *tsi721_prep_rio_sg(struct dma_chan *dchan, void *tinfo) { struct tsi721_bdma_chan *bdma_chan = to_tsi721_chan(dchan); - struct tsi721_tx_desc *desc, *_d; + struct tsi721_tx_desc *desc; struct rio_dma_ext *rext = tinfo; enum dma_rtype rtype; struct dma_async_tx_descriptor *txd = NULL; @@ -775,7 +775,7 @@ struct dma_async_tx_descriptor *tsi721_prep_rio_sg(struct dma_chan *dchan, if (!sgl || !sg_len) { tsi_err(&dchan->dev->device, "DMAC%d No SG list", bdma_chan->id); - return NULL; + return ERR_PTR(-EINVAL); } tsi_debug(DMA, &dchan->dev->device, "DMAC%d %s", bdma_chan->id, @@ -800,28 +800,33 @@ struct dma_async_tx_descriptor *tsi721_prep_rio_sg(struct dma_chan *dchan, tsi_err(&dchan->dev->device, "DMAC%d Unsupported DMA direction option", bdma_chan->id); - return NULL; + return ERR_PTR(-EINVAL); } spin_lock_bh(&bdma_chan->lock); - list_for_each_entry_safe(desc, _d, &bdma_chan->free_list, desc_node) { - if (async_tx_test_ack(&desc->txd)) { - list_del_init(&desc->desc_node); - desc->destid = rext->destid; - desc->rio_addr = rext->rio_addr; - desc->rio_addr_u = 0; - desc->rtype = rtype; - desc->sg_len = sg_len; - desc->sg = sgl; - txd = &desc->txd; - txd->flags = flags; - break; - } + if (!list_empty(&bdma_chan->free_list)) { + desc = list_first_entry(&bdma_chan->free_list, + struct tsi721_tx_desc, desc_node); + list_del_init(&desc->desc_node); + desc->destid = rext->destid; + desc->rio_addr = rext->rio_addr; + desc->rio_addr_u = 0; + desc->rtype = rtype; + desc->sg_len = sg_len; + desc->sg = sgl; + txd = &desc->txd; + txd->flags = flags; } spin_unlock_bh(&bdma_chan->lock); + if (!txd) { + tsi_debug(DMA, &dchan->dev->device, + "DMAC%d free TXD is not available", bdma_chan->id); + return ERR_PTR(-EBUSY); + } + return txd; } From e680b672a2b47748ee548229242b543a23b902c1 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:27:02 -0700 Subject: [PATCH 55/81] rapidio/tsi721_dma: fix synchronization issues Fix synchronization issues found during testing using multiple DMA transfer requests to the same channel: - lost MSI-X interrupt notifications - non-synchronized attempts to start DMA channel HW resulting in error message from the driver - cookie tracking/update race conditions resulting in incorrect DMA transfer status report Signed-off-by: Alexandre Bounine Reported-by: Barry Wood Tested-by: Barry Wood Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Cc: Barry Wood Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721_dma.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/rapidio/devices/tsi721_dma.c b/drivers/rapidio/devices/tsi721_dma.c index 5bc9071ebcae..03637330c482 100644 --- a/drivers/rapidio/devices/tsi721_dma.c +++ b/drivers/rapidio/devices/tsi721_dma.c @@ -84,7 +84,7 @@ static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num) */ bd_ptr = dma_zalloc_coherent(dev, (bd_num + 1) * sizeof(struct tsi721_dma_desc), - 
&bd_phys, GFP_KERNEL); + &bd_phys, GFP_ATOMIC); if (!bd_ptr) return -ENOMEM; @@ -102,7 +102,7 @@ static int tsi721_bdma_ch_init(struct tsi721_bdma_chan *bdma_chan, int bd_num) sts_size = roundup_pow_of_two(sts_size); sts_ptr = dma_zalloc_coherent(dev, sts_size * sizeof(struct tsi721_dma_sts), - &sts_phys, GFP_KERNEL); + &sts_phys, GFP_ATOMIC); if (!sts_ptr) { /* Free space allocated for DMA descriptors */ dma_free_coherent(dev, @@ -297,7 +297,8 @@ static irqreturn_t tsi721_bdma_msix(int irq, void *ptr) { struct tsi721_bdma_chan *bdma_chan = ptr; - tsi721_bdma_handler(bdma_chan); + if (bdma_chan->active) + tasklet_schedule(&bdma_chan->tasklet); return IRQ_HANDLED; } #endif /* CONFIG_PCI_MSI */ @@ -618,14 +619,14 @@ static void tsi721_dma_tasklet(unsigned long data) } list_add(&desc->desc_node, &bdma_chan->free_list); bdma_chan->active_tx = NULL; + tsi721_advance_work(bdma_chan, NULL); spin_unlock(&bdma_chan->lock); if (callback) callback(param); - spin_lock(&bdma_chan->lock); + } else { + tsi721_advance_work(bdma_chan, bdma_chan->active_tx); + spin_unlock(&bdma_chan->lock); } - - tsi721_advance_work(bdma_chan, bdma_chan->active_tx); - spin_unlock(&bdma_chan->lock); } /* Re-Enable BDMA channel interrupts */ @@ -681,7 +682,7 @@ static int tsi721_alloc_chan_resources(struct dma_chan *dchan) /* Allocate queue of transaction descriptors */ desc = kcalloc(TSI721_DMA_TX_QUEUE_SZ, sizeof(struct tsi721_tx_desc), - GFP_KERNEL); + GFP_ATOMIC); if (!desc) { tsi_err(&dchan->dev->device, "DMAC%d Failed to allocate logical descriptors", @@ -744,7 +745,13 @@ static enum dma_status tsi721_tx_status(struct dma_chan *dchan, dma_cookie_t cookie, struct dma_tx_state *txstate) { - return dma_cookie_status(dchan, cookie, txstate); + struct tsi721_bdma_chan *bdma_chan = to_tsi721_chan(dchan); + enum dma_status status; + + spin_lock_bh(&bdma_chan->lock); + status = dma_cookie_status(dchan, cookie, txstate); + spin_unlock_bh(&bdma_chan->lock); + return status; } static void tsi721_issue_pending(struct dma_chan *dchan) From 458bdf6e399d7d6f779b97a24645a7a38d1ae07b Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:27:05 -0700 Subject: [PATCH 56/81] rapidio/tsi721_dma: fix hardware error handling Add DMA channel re-initialization after an error to avoid termination of all pending transfer requests. 
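As an illustration of the recovery sequence in the diff below: re-initialization is attempted only when the channel reports the ABORT state, and the driver uses a bounded busy-wait for that state to clear after issuing the INIT command. Factored out, the pattern looks like this sketch (tsi721_dmac_reinit() is a hypothetical helper name; the register and bit macros are the ones used in the patch, defined in the driver's tsi721.h):

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/io.h>
#include "tsi721.h"

/* Put the channel into init state, then poll (bounded, roughly 10 ms total)
 * until the hardware clears the ABORT status bit.
 */
static int tsi721_dmac_reinit(void __iomem *ch_regs)
{
	u32 sts;
	int i = 10000;

	iowrite32(TSI721_DMAC_CTL_INIT, ch_regs + TSI721_DMAC_CTL);
	do {
		udelay(1);
		sts = ioread32(ch_regs + TSI721_DMAC_STS);
	} while ((sts & TSI721_DMAC_STS_ABORT) && --i);

	return (sts & TSI721_DMAC_STS_ABORT) ? -EIO : 0;
}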
Signed-off-by: Alexandre Bounine Reported-by: Barry Wood Tested-by: Barry Wood Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Cc: Barry Wood Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721_dma.c | 82 +++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 9 deletions(-) diff --git a/drivers/rapidio/devices/tsi721_dma.c b/drivers/rapidio/devices/tsi721_dma.c index 03637330c482..155cae1e62de 100644 --- a/drivers/rapidio/devices/tsi721_dma.c +++ b/drivers/rapidio/devices/tsi721_dma.c @@ -282,7 +282,7 @@ void tsi721_bdma_handler(struct tsi721_bdma_chan *bdma_chan) /* Disable BDMA channel interrupts */ iowrite32(0, bdma_chan->regs + TSI721_DMAC_INTE); if (bdma_chan->active) - tasklet_schedule(&bdma_chan->tasklet); + tasklet_hi_schedule(&bdma_chan->tasklet); } #ifdef CONFIG_PCI_MSI @@ -298,7 +298,7 @@ static irqreturn_t tsi721_bdma_msix(int irq, void *ptr) struct tsi721_bdma_chan *bdma_chan = ptr; if (bdma_chan->active) - tasklet_schedule(&bdma_chan->tasklet); + tasklet_hi_schedule(&bdma_chan->tasklet); return IRQ_HANDLED; } #endif /* CONFIG_PCI_MSI */ @@ -584,13 +584,71 @@ static void tsi721_dma_tasklet(unsigned long data) iowrite32(dmac_int, bdma_chan->regs + TSI721_DMAC_INT); if (dmac_int & TSI721_DMAC_INT_ERR) { + int i = 10000; + struct tsi721_tx_desc *desc; + + desc = bdma_chan->active_tx; dmac_sts = ioread32(bdma_chan->regs + TSI721_DMAC_STS); tsi_err(&bdma_chan->dchan.dev->device, - "ERR - DMAC%d_STS = 0x%x", - bdma_chan->id, dmac_sts); + "DMAC%d_STS = 0x%x did=%d raddr=0x%llx", + bdma_chan->id, dmac_sts, desc->destid, desc->rio_addr); + + /* Re-initialize DMA channel if possible */ + + if ((dmac_sts & TSI721_DMAC_STS_ABORT) == 0) + goto err_out; + + tsi721_clr_stat(bdma_chan); spin_lock(&bdma_chan->lock); + + /* Put DMA channel into init state */ + iowrite32(TSI721_DMAC_CTL_INIT, + bdma_chan->regs + TSI721_DMAC_CTL); + do { + udelay(1); + dmac_sts = ioread32(bdma_chan->regs + TSI721_DMAC_STS); + i--; + } while ((dmac_sts & TSI721_DMAC_STS_ABORT) && i); + + if (dmac_sts & TSI721_DMAC_STS_ABORT) { + tsi_err(&bdma_chan->dchan.dev->device, + "Failed to re-initiate DMAC%d", bdma_chan->id); + spin_unlock(&bdma_chan->lock); + goto err_out; + } + + /* Setup DMA descriptor pointers */ + iowrite32(((u64)bdma_chan->bd_phys >> 32), + bdma_chan->regs + TSI721_DMAC_DPTRH); + iowrite32(((u64)bdma_chan->bd_phys & TSI721_DMAC_DPTRL_MASK), + bdma_chan->regs + TSI721_DMAC_DPTRL); + + /* Setup descriptor status FIFO */ + iowrite32(((u64)bdma_chan->sts_phys >> 32), + bdma_chan->regs + TSI721_DMAC_DSBH); + iowrite32(((u64)bdma_chan->sts_phys & TSI721_DMAC_DSBL_MASK), + bdma_chan->regs + TSI721_DMAC_DSBL); + iowrite32(TSI721_DMAC_DSSZ_SIZE(bdma_chan->sts_size), + bdma_chan->regs + TSI721_DMAC_DSSZ); + + /* Clear interrupt bits */ + iowrite32(TSI721_DMAC_INT_ALL, + bdma_chan->regs + TSI721_DMAC_INT); + + ioread32(bdma_chan->regs + TSI721_DMAC_INT); + + bdma_chan->wr_count = bdma_chan->wr_count_next = 0; + bdma_chan->sts_rdptr = 0; + udelay(10); + + desc = bdma_chan->active_tx; + desc->status = DMA_ERROR; + dma_cookie_complete(&desc->txd); + list_add(&desc->desc_node, &bdma_chan->free_list); bdma_chan->active_tx = NULL; + if (bdma_chan->active) + tsi721_advance_work(bdma_chan, NULL); spin_unlock(&bdma_chan->lock); } @@ -619,16 +677,19 @@ static void tsi721_dma_tasklet(unsigned long data) } list_add(&desc->desc_node, &bdma_chan->free_list); bdma_chan->active_tx = NULL; - tsi721_advance_work(bdma_chan, NULL); + if (bdma_chan->active) + 
tsi721_advance_work(bdma_chan, NULL); spin_unlock(&bdma_chan->lock); if (callback) callback(param); } else { - tsi721_advance_work(bdma_chan, bdma_chan->active_tx); + if (bdma_chan->active) + tsi721_advance_work(bdma_chan, + bdma_chan->active_tx); spin_unlock(&bdma_chan->lock); } } - +err_out: /* Re-Enable BDMA channel interrupts */ iowrite32(TSI721_DMAC_INT_ALL, bdma_chan->regs + TSI721_DMAC_INTE); } @@ -841,7 +902,6 @@ static int tsi721_terminate_all(struct dma_chan *dchan) { struct tsi721_bdma_chan *bdma_chan = to_tsi721_chan(dchan); struct tsi721_tx_desc *desc, *_d; - u32 dmac_int; LIST_HEAD(list); tsi_debug(DMA, &dchan->dev->device, "DMAC%d", bdma_chan->id); @@ -850,7 +910,10 @@ static int tsi721_terminate_all(struct dma_chan *dchan) bdma_chan->active = false; - if (!tsi721_dma_is_idle(bdma_chan)) { + while (!tsi721_dma_is_idle(bdma_chan)) { + + udelay(5); +#if (0) /* make sure to stop the transfer */ iowrite32(TSI721_DMAC_CTL_SUSP, bdma_chan->regs + TSI721_DMAC_CTL); @@ -859,6 +922,7 @@ static int tsi721_terminate_all(struct dma_chan *dchan) do { dmac_int = ioread32(bdma_chan->regs + TSI721_DMAC_INT); } while ((dmac_int & TSI721_DMAC_INT_SUSP) == 0); +#endif } if (bdma_chan->active_tx) From e8de370188d098bb49483c287b44925957c3c9b6 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Tue, 22 Mar 2016 14:27:08 -0700 Subject: [PATCH 57/81] rapidio: add mport char device driver Add mport character device driver to provide user space interface to basic RapidIO subsystem operations. See included Documentation/rapidio/mport_cdev.txt for more details. [akpm@linux-foundation.org: fix printk warning on i386] [dan.carpenter@oracle.com: mport_cdev: fix some error codes] Signed-off-by: Alexandre Bounine Signed-off-by: Dan Carpenter Tested-by: Barry Wood Cc: Matt Porter Cc: Aurelien Jacquiot Cc: Andre van Herk Cc: Barry Wood Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/rapidio/mport_cdev.txt | 104 + drivers/rapidio/Kconfig | 8 + drivers/rapidio/devices/Makefile | 1 + drivers/rapidio/devices/rio_mport_cdev.c | 2720 ++++++++++++++++++++++ include/linux/rio_mport_cdev.h | 271 +++ include/uapi/linux/Kbuild | 1 + 6 files changed, 3105 insertions(+) create mode 100644 Documentation/rapidio/mport_cdev.txt create mode 100644 drivers/rapidio/devices/rio_mport_cdev.c create mode 100644 include/linux/rio_mport_cdev.h diff --git a/Documentation/rapidio/mport_cdev.txt b/Documentation/rapidio/mport_cdev.txt new file mode 100644 index 000000000000..20c120d4b3b8 --- /dev/null +++ b/Documentation/rapidio/mport_cdev.txt @@ -0,0 +1,104 @@ +RapidIO subsystem mport character device driver (rio_mport_cdev.c) +================================================================== + +Version History: +---------------- + 1.0.0 - Initial driver release. + +================================================================== + +I. Overview + +This device driver is the result of collaboration within the RapidIO.org +Software Task Group (STG) between Texas Instruments, Freescale, +Prodrive Technologies, Nokia Networks, BAE and IDT. Additional input was +received from other members of RapidIO.org. The objective was to create a +character mode driver interface which exposes the capabilities of RapidIO +devices directly to applications, in a manner that allows the numerous and +varied RapidIO implementations to interoperate. + +This driver (MPORT_CDEV) provides access to basic RapidIO subsystem operations +for user-space applications. 
Most RapidIO operations are supported through +'ioctl' system calls. + +When loaded, this device driver creates filesystem nodes named rio_mportX in the /dev +directory for each registered RapidIO mport device. 'X' in the node name matches +the unique port ID assigned to each local mport device. + +Using the available set of ioctl commands, user-space applications can perform the +following RapidIO bus and subsystem operations: + +- Reads and writes from/to configuration registers of mport devices + (RIO_MPORT_MAINT_READ_LOCAL/RIO_MPORT_MAINT_WRITE_LOCAL) +- Reads and writes from/to configuration registers of remote RapidIO devices. + These operations are defined as RapidIO Maintenance reads/writes in the RIO spec. + (RIO_MPORT_MAINT_READ_REMOTE/RIO_MPORT_MAINT_WRITE_REMOTE) +- Set RapidIO Destination ID for mport devices (RIO_MPORT_MAINT_HDID_SET) +- Set RapidIO Component Tag for mport devices (RIO_MPORT_MAINT_COMPTAG_SET) +- Query logical index of mport devices (RIO_MPORT_MAINT_PORT_IDX_GET) +- Query capabilities and RapidIO link configuration of mport devices + (RIO_MPORT_GET_PROPERTIES) +- Enable/Disable reporting of RapidIO doorbell events to user-space applications + (RIO_ENABLE_DOORBELL_RANGE/RIO_DISABLE_DOORBELL_RANGE) +- Enable/Disable reporting of RIO port-write events to user-space applications + (RIO_ENABLE_PORTWRITE_RANGE/RIO_DISABLE_PORTWRITE_RANGE) +- Query/Control type of events reported through this driver: doorbells, + port-writes or both (RIO_SET_EVENT_MASK/RIO_GET_EVENT_MASK) +- Configure/Map mport's outbound requests window(s) for specific size, + RapidIO destination ID, hopcount and request type + (RIO_MAP_OUTBOUND/RIO_UNMAP_OUTBOUND) +- Configure/Map mport's inbound requests window(s) for specific size, + RapidIO base address and local memory base address + (RIO_MAP_INBOUND/RIO_UNMAP_INBOUND) +- Allocate/Free contiguous DMA coherent memory buffer for DMA data transfers + to/from remote RapidIO devices (RIO_ALLOC_DMA/RIO_FREE_DMA) +- Initiate DMA data transfers to/from remote RapidIO devices (RIO_TRANSFER). + Supports blocking, asynchronous and posted (a.k.a. 'fire-and-forget') data + transfer modes. +- Check/Wait for completion of an asynchronous DMA data transfer + (RIO_WAIT_FOR_ASYNC) +- Manage device objects supported by the RapidIO subsystem (RIO_DEV_ADD/RIO_DEV_DEL). + This allows implementation of various RapidIO fabric enumeration algorithms + as user-space applications while using the remaining functionality provided by + the kernel RapidIO subsystem. + +II. Hardware Compatibility + +This device driver uses the standard interfaces defined by the kernel RapidIO subsystem +and can therefore be used with any mport device driver registered with the RapidIO +subsystem, within the limitations set by the available mport implementation. + +At this moment the most common limitation is the availability of a RapidIO-specific +DMA engine framework for a given mport device. Users should verify the available +functionality of their platform when planning to use this driver: + +- The IDT Tsi721 PCIe-to-RapidIO bridge device and its mport device driver are fully + compatible with this driver. +- The Freescale SoC 'fsl_rio' mport driver does not implement RapidIO-specific + DMA engine support, and therefore DMA data transfers through the mport_cdev driver + are not available. + +III. Module parameters + +- 'dbg_level' - This parameter controls the amount of debug information + generated by this device driver. It is formed as a set of + bit masks that correspond to specific + functional blocks.
+ For mask definitions see 'drivers/rapidio/devices/rio_mport_cdev.c'. + This parameter can be changed dynamically. + Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level. + +IV. Known problems + + None. + +V. User-space Applications and API + +API library and applications that use this device driver are available from +RapidIO.org. + +VI. TODO List + +- Add support for sending/receiving "raw" RapidIO messaging packets. +- Add memory-mapped DMA data transfers as an option when RapidIO-specific DMA + is not available. diff --git a/drivers/rapidio/Kconfig b/drivers/rapidio/Kconfig index 3e3be57e9a1a..b5a10d3c92c7 100644 --- a/drivers/rapidio/Kconfig +++ b/drivers/rapidio/Kconfig @@ -67,6 +67,14 @@ config RAPIDIO_ENUM_BASIC endchoice +config RAPIDIO_MPORT_CDEV + tristate "RapidIO /dev mport device driver" + depends on RAPIDIO + help + This option includes a generic RapidIO mport device driver which + allows user-space applications to perform RapidIO-specific + operations through a selected RapidIO mport. + menu "RapidIO Switch drivers" depends on RAPIDIO diff --git a/drivers/rapidio/devices/Makefile b/drivers/rapidio/devices/Makefile index 9432c494cf57..927dbf89592b 100644 --- a/drivers/rapidio/devices/Makefile +++ b/drivers/rapidio/devices/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_RAPIDIO_TSI721) += tsi721_mport.o tsi721_mport-y := tsi721.o tsi721_mport-$(CONFIG_RAPIDIO_DMA_ENGINE) += tsi721_dma.o +obj-$(CONFIG_RAPIDIO_MPORT_CDEV) += rio_mport_cdev.o diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c new file mode 100644 index 000000000000..9607bc826460 --- /dev/null +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -0,0 +1,2720 @@ +/* + * RapidIO mport character device + * + * Copyright 2014-2015 Integrated Device Technology, Inc. + * Alexandre Bounine + * Copyright 2014-2015 Prodrive Technologies + * Andre van Herk + * Jerry Jacobs + * Copyright (C) 2014 Texas Instruments Incorporated + * Aurelien Jacquiot + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#ifdef CONFIG_RAPIDIO_DMA_ENGINE +#include +#endif + +#include +#include +#include +#include + +#include "../rio.h" + +#define DRV_NAME "rio_mport" +#define DRV_PREFIX DRV_NAME ": " +#define DEV_NAME "rio_mport" +#define DRV_VERSION "1.0.0" + +/* Debug output filtering masks */ +enum { + DBG_NONE = 0, + DBG_INIT = BIT(0), /* driver init */ + DBG_EXIT = BIT(1), /* driver exit */ + DBG_MPORT = BIT(2), /* mport add/remove */ + DBG_RDEV = BIT(3), /* RapidIO device add/remove */ + DBG_DMA = BIT(4), /* DMA transfer messages */ + DBG_MMAP = BIT(5), /* mapping messages */ + DBG_IBW = BIT(6), /* inbound window */ + DBG_EVENT = BIT(7), /* event handling messages */ + DBG_OBW = BIT(8), /* outbound window messages */ + DBG_DBELL = BIT(9), /* doorbell messages */ + DBG_ALL = ~0, +}; + +#ifdef DEBUG +#define rmcd_debug(level, fmt, arg...) \ + do { \ + if (DBG_##level & dbg_level) \ + pr_debug(DRV_PREFIX "%s: " fmt "\n", __func__, ##arg); \ + } while (0) +#else +#define rmcd_debug(level, fmt, arg...) \ + no_printk(KERN_DEBUG pr_fmt(DRV_PREFIX fmt "\n"), ##arg) +#endif + +#define rmcd_warn(fmt, arg...)
\ + pr_warn(DRV_PREFIX "%s WARNING " fmt "\n", __func__, ##arg) + +#define rmcd_error(fmt, arg...) \ + pr_err(DRV_PREFIX "%s ERROR " fmt "\n", __func__, ##arg) + +MODULE_AUTHOR("Jerry Jacobs "); +MODULE_AUTHOR("Aurelien Jacquiot "); +MODULE_AUTHOR("Alexandre Bounine "); +MODULE_AUTHOR("Andre van Herk "); +MODULE_DESCRIPTION("RapidIO mport character device driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_VERSION); + +static int dma_timeout = 3000; /* DMA transfer timeout in msec */ +module_param(dma_timeout, int, S_IRUGO); +MODULE_PARM_DESC(dma_timeout, "DMA Transfer Timeout in msec (default: 3000)"); + +#ifdef DEBUG +static u32 dbg_level = DBG_NONE; +module_param(dbg_level, uint, S_IWUSR | S_IWGRP | S_IRUGO); +MODULE_PARM_DESC(dbg_level, "Debugging output level (default 0 = none)"); +#endif + +/* + * An internal DMA coherent buffer + */ +struct mport_dma_buf { + void *ib_base; + dma_addr_t ib_phys; + u32 ib_size; + u64 ib_rio_base; + bool ib_map; + struct file *filp; +}; + +/* + * Internal memory mapping structure + */ +enum rio_mport_map_dir { + MAP_INBOUND, + MAP_OUTBOUND, + MAP_DMA, +}; + +struct rio_mport_mapping { + struct list_head node; + struct mport_dev *md; + enum rio_mport_map_dir dir; + u32 rioid; + u64 rio_addr; + dma_addr_t phys_addr; /* for mmap */ + void *virt_addr; /* kernel address, for dma_free_coherent */ + u64 size; + struct kref ref; /* refcount of vmas sharing the mapping */ + struct file *filp; +}; + +struct rio_mport_dma_map { + int valid; + uint64_t length; + void *vaddr; + dma_addr_t paddr; +}; + +#define MPORT_MAX_DMA_BUFS 16 +#define MPORT_EVENT_DEPTH 10 + +/* + * mport_dev driver-specific structure that represents mport device + * @active mport device status flag + * @node list node to maintain list of registered mports + * @cdev character device + * @dev associated device object + * @mport associated subsystem's master port device object + * @buf_mutex lock for buffer handling + * @file_mutex - lock for open files list + * @file_list - list of open files on given mport + * @properties properties of this mport + * @portwrites queue of inbound portwrites + * @pw_lock lock for port write queue + * @mappings queue for memory mappings + * @dma_chan DMA channels associated with this device + * @dma_ref: + * @comp: + */ +struct mport_dev { + atomic_t active; + struct list_head node; + struct cdev cdev; + struct device dev; + struct rio_mport *mport; + struct mutex buf_mutex; + struct mutex file_mutex; + struct list_head file_list; + struct rio_mport_properties properties; + struct list_head doorbells; + spinlock_t db_lock; + struct list_head portwrites; + spinlock_t pw_lock; + struct list_head mappings; +#ifdef CONFIG_RAPIDIO_DMA_ENGINE + struct dma_chan *dma_chan; + struct kref dma_ref; + struct completion comp; +#endif +}; + +/* + * mport_cdev_priv - data structure specific to individual file object + * associated with an open device + * @md master port character device object + * @async_queue - asynchronous notification queue + * @list - file objects tracking list + * @db_filters inbound doorbell filters for this descriptor + * @pw_filters portwrite filters for this descriptor + * @event_fifo event fifo for this descriptor + * @event_rx_wait wait queue for this descriptor + * @fifo_lock lock for event_fifo + * @event_mask event mask for this descriptor + * @dmach DMA engine channel allocated for specific file object + */ +struct mport_cdev_priv { + struct mport_dev *md; + struct fasync_struct *async_queue; + struct list_head list; + struct list_head 
db_filters; + struct list_head pw_filters; + struct kfifo event_fifo; + wait_queue_head_t event_rx_wait; + spinlock_t fifo_lock; + unsigned int event_mask; /* RIO_DOORBELL, RIO_PORTWRITE */ +#ifdef CONFIG_RAPIDIO_DMA_ENGINE + struct dma_chan *dmach; + struct list_head async_list; + struct list_head pend_list; + spinlock_t req_lock; + struct mutex dma_lock; + struct kref dma_ref; + struct completion comp; +#endif +}; + +/* + * rio_mport_pw_filter - structure to describe a portwrite filter + * md_node node in mport device's list + * priv_node node in private file object's list + * priv reference to private data + * filter actual portwrite filter + */ +struct rio_mport_pw_filter { + struct list_head md_node; + struct list_head priv_node; + struct mport_cdev_priv *priv; + struct rio_pw_filter filter; +}; + +/* + * rio_mport_db_filter - structure to describe a doorbell filter + * @data_node reference to device node + * @priv_node node in private data + * @priv reference to private data + * @filter actual doorbell filter + */ +struct rio_mport_db_filter { + struct list_head data_node; + struct list_head priv_node; + struct mport_cdev_priv *priv; + struct rio_doorbell_filter filter; +}; + +static LIST_HEAD(mport_devs); +static DEFINE_MUTEX(mport_devs_lock); + +#if (0) /* used by commented out portion of poll function : FIXME */ +static DECLARE_WAIT_QUEUE_HEAD(mport_cdev_wait); +#endif + +static struct class *dev_class; +static dev_t dev_number; + +static struct workqueue_struct *dma_wq; + +static void mport_release_mapping(struct kref *ref); + +static int rio_mport_maint_rd(struct mport_cdev_priv *priv, void __user *arg, + int local) +{ + struct rio_mport *mport = priv->md->mport; + struct rio_mport_maint_io maint_io; + u32 *buffer; + u32 offset; + size_t length; + int ret, i; + + if (unlikely(copy_from_user(&maint_io, arg, sizeof(maint_io)))) + return -EFAULT; + + if ((maint_io.offset % 4) || + (maint_io.length == 0) || (maint_io.length % 4)) + return -EINVAL; + + buffer = vmalloc(maint_io.length); + if (buffer == NULL) + return -ENOMEM; + length = maint_io.length/sizeof(u32); + offset = maint_io.offset; + + for (i = 0; i < length; i++) { + if (local) + ret = __rio_local_read_config_32(mport, + offset, &buffer[i]); + else + ret = rio_mport_read_config_32(mport, maint_io.rioid, + maint_io.hopcount, offset, &buffer[i]); + if (ret) + goto out; + + offset += 4; + } + + if (unlikely(copy_to_user(maint_io.buffer, buffer, maint_io.length))) + ret = -EFAULT; +out: + vfree(buffer); + return ret; +} + +static int rio_mport_maint_wr(struct mport_cdev_priv *priv, void __user *arg, + int local) +{ + struct rio_mport *mport = priv->md->mport; + struct rio_mport_maint_io maint_io; + u32 *buffer; + u32 offset; + size_t length; + int ret = -EINVAL, i; + + if (unlikely(copy_from_user(&maint_io, arg, sizeof(maint_io)))) + return -EFAULT; + + if ((maint_io.offset % 4) || + (maint_io.length == 0) || (maint_io.length % 4)) + return -EINVAL; + + buffer = vmalloc(maint_io.length); + if (buffer == NULL) + return -ENOMEM; + length = maint_io.length; + + if (unlikely(copy_from_user(buffer, maint_io.buffer, length))) { + ret = -EFAULT; + goto out; + } + + offset = maint_io.offset; + length /= sizeof(u32); + + for (i = 0; i < length; i++) { + if (local) + ret = __rio_local_write_config_32(mport, + offset, buffer[i]); + else + ret = rio_mport_write_config_32(mport, maint_io.rioid, + maint_io.hopcount, + offset, buffer[i]); + if (ret) + goto out; + + offset += 4; + } + +out: + vfree(buffer); + return ret; +} + + +/* + * 
Inbound/outbound memory mapping functions + */ +static int +rio_mport_create_outbound_mapping(struct mport_dev *md, struct file *filp, + u32 rioid, u64 raddr, u32 size, + dma_addr_t *paddr) +{ + struct rio_mport *mport = md->mport; + struct rio_mport_mapping *map; + int ret; + + rmcd_debug(OBW, "did=%d ra=0x%llx sz=0x%x", rioid, raddr, size); + + map = kzalloc(sizeof(struct rio_mport_mapping), GFP_KERNEL); + if (map == NULL) + return -ENOMEM; + + ret = rio_map_outb_region(mport, rioid, raddr, size, 0, paddr); + if (ret < 0) + goto err_map_outb; + + map->dir = MAP_OUTBOUND; + map->rioid = rioid; + map->rio_addr = raddr; + map->size = size; + map->phys_addr = *paddr; + map->filp = filp; + map->md = md; + kref_init(&map->ref); + list_add_tail(&map->node, &md->mappings); + return 0; +err_map_outb: + kfree(map); + return ret; +} + +static int +rio_mport_get_outbound_mapping(struct mport_dev *md, struct file *filp, + u32 rioid, u64 raddr, u32 size, + dma_addr_t *paddr) +{ + struct rio_mport_mapping *map; + int err = -ENOMEM; + + mutex_lock(&md->buf_mutex); + list_for_each_entry(map, &md->mappings, node) { + if (map->dir != MAP_OUTBOUND) + continue; + if (rioid == map->rioid && + raddr == map->rio_addr && size == map->size) { + *paddr = map->phys_addr; + err = 0; + break; + } else if (rioid == map->rioid && + raddr < (map->rio_addr + map->size - 1) && + (raddr + size) > map->rio_addr) { + err = -EBUSY; + break; + } + } + + /* If not found, create new */ + if (err == -ENOMEM) + err = rio_mport_create_outbound_mapping(md, filp, rioid, raddr, + size, paddr); + mutex_unlock(&md->buf_mutex); + return err; +} + +static int rio_mport_obw_map(struct file *filp, void __user *arg) +{ + struct mport_cdev_priv *priv = filp->private_data; + struct mport_dev *data = priv->md; + struct rio_mmap map; + dma_addr_t paddr; + int ret; + + if (unlikely(copy_from_user(&map, arg, sizeof(struct rio_mmap)))) + return -EFAULT; + + rmcd_debug(OBW, "did=%d ra=0x%llx sz=0x%llx", + map.rioid, map.rio_addr, map.length); + + ret = rio_mport_get_outbound_mapping(data, filp, map.rioid, + map.rio_addr, map.length, &paddr); + if (ret < 0) { + rmcd_error("Failed to set OBW err= %d", ret); + return ret; + } + + map.handle = paddr; + + if (unlikely(copy_to_user(arg, &map, sizeof(struct rio_mmap)))) + return -EFAULT; + return 0; +} + +/* + * rio_mport_obw_free() - unmap an OutBound Window from RapidIO address space + * + * @priv: driver private data + * @arg: buffer handle returned by allocation routine + */ +static int rio_mport_obw_free(struct file *filp, void __user *arg) +{ + struct mport_cdev_priv *priv = filp->private_data; + struct mport_dev *md = priv->md; + u64 handle; + struct rio_mport_mapping *map, *_map; + + if (!md->mport->ops->unmap_outb) + return -EPROTONOSUPPORT; + + if (copy_from_user(&handle, arg, sizeof(u64))) + return -EFAULT; + + rmcd_debug(OBW, "h=0x%llx", handle); + + mutex_lock(&md->buf_mutex); + list_for_each_entry_safe(map, _map, &md->mappings, node) { + if (map->dir == MAP_OUTBOUND && map->phys_addr == handle) { + if (map->filp == filp) { + rmcd_debug(OBW, "kref_put h=0x%llx", handle); + map->filp = NULL; + kref_put(&map->ref, mport_release_mapping); + } + break; + } + } + mutex_unlock(&md->buf_mutex); + + return 0; +} + +/* + * maint_hdid_set() - Set the host Device ID + * @priv: driver private data + * @arg: Device Id + */ +static int maint_hdid_set(struct mport_cdev_priv *priv, void __user *arg) +{ + struct mport_dev *md = priv->md; + uint16_t hdid; + + if (copy_from_user(&hdid, arg, sizeof(uint16_t))) + 
return -EFAULT; + + md->mport->host_deviceid = hdid; + md->properties.hdid = hdid; + rio_local_set_device_id(md->mport, hdid); + + rmcd_debug(MPORT, "Set host device Id to %d", hdid); + + return 0; +} + +/* + * maint_comptag_set() - Set the host Component Tag + * @priv: driver private data + * @arg: Component Tag + */ +static int maint_comptag_set(struct mport_cdev_priv *priv, void __user *arg) +{ + struct mport_dev *md = priv->md; + uint32_t comptag; + + if (copy_from_user(&comptag, arg, sizeof(uint32_t))) + return -EFAULT; + + rio_local_write_config_32(md->mport, RIO_COMPONENT_TAG_CSR, comptag); + + rmcd_debug(MPORT, "Set host Component Tag to %d", comptag); + + return 0; +} + +#ifdef CONFIG_RAPIDIO_DMA_ENGINE + +struct mport_dma_req { + struct list_head node; + struct file *filp; + struct mport_cdev_priv *priv; + enum rio_transfer_sync sync; + struct sg_table sgt; + struct page **page_list; + unsigned int nr_pages; + struct rio_mport_mapping *map; + struct dma_chan *dmach; + enum dma_data_direction dir; + dma_cookie_t cookie; + enum dma_status status; + struct completion req_comp; +}; + +struct mport_faf_work { + struct work_struct work; + struct mport_dma_req *req; +}; + +static void mport_release_def_dma(struct kref *dma_ref) +{ + struct mport_dev *md = + container_of(dma_ref, struct mport_dev, dma_ref); + + rmcd_debug(EXIT, "DMA_%d", md->dma_chan->chan_id); + rio_release_dma(md->dma_chan); + md->dma_chan = NULL; +} + +static void mport_release_dma(struct kref *dma_ref) +{ + struct mport_cdev_priv *priv = + container_of(dma_ref, struct mport_cdev_priv, dma_ref); + + rmcd_debug(EXIT, "DMA_%d", priv->dmach->chan_id); + complete(&priv->comp); +} + +static void dma_req_free(struct mport_dma_req *req) +{ + struct mport_cdev_priv *priv = req->priv; + unsigned int i; + + dma_unmap_sg(req->dmach->device->dev, + req->sgt.sgl, req->sgt.nents, req->dir); + sg_free_table(&req->sgt); + if (req->page_list) { + for (i = 0; i < req->nr_pages; i++) + put_page(req->page_list[i]); + kfree(req->page_list); + } + + if (req->map) { + mutex_lock(&req->map->md->buf_mutex); + kref_put(&req->map->ref, mport_release_mapping); + mutex_unlock(&req->map->md->buf_mutex); + } + + kref_put(&priv->dma_ref, mport_release_dma); + + kfree(req); +} + +static void dma_xfer_callback(void *param) +{ + struct mport_dma_req *req = (struct mport_dma_req *)param; + struct mport_cdev_priv *priv = req->priv; + + req->status = dma_async_is_tx_complete(priv->dmach, req->cookie, + NULL, NULL); + complete(&req->req_comp); +} + +static void dma_faf_cleanup(struct work_struct *_work) +{ + struct mport_faf_work *work = container_of(_work, + struct mport_faf_work, work); + struct mport_dma_req *req = work->req; + + dma_req_free(req); + kfree(work); +} + +static void dma_faf_callback(void *param) +{ + struct mport_dma_req *req = (struct mport_dma_req *)param; + struct mport_faf_work *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return; + + INIT_WORK(&work->work, dma_faf_cleanup); + work->req = req; + queue_work(dma_wq, &work->work); +} + +/* + * prep_dma_xfer() - Configure and send request to DMAengine to prepare DMA + * transfer object. + * Returns pointer to DMA transaction descriptor allocated by DMA driver on + * success or ERR_PTR (and/or NULL) if failed. Caller must check returned + * non-NULL pointer using IS_ERR macro. 
+ */ +static struct dma_async_tx_descriptor +*prep_dma_xfer(struct dma_chan *chan, struct rio_transfer_io *transfer, + struct sg_table *sgt, int nents, enum dma_transfer_direction dir, + enum dma_ctrl_flags flags) +{ + struct rio_dma_data tx_data; + + tx_data.sg = sgt->sgl; + tx_data.sg_len = nents; + tx_data.rio_addr_u = 0; + tx_data.rio_addr = transfer->rio_addr; + if (dir == DMA_MEM_TO_DEV) { + switch (transfer->method) { + case RIO_EXCHANGE_NWRITE: + tx_data.wr_type = RDW_ALL_NWRITE; + break; + case RIO_EXCHANGE_NWRITE_R_ALL: + tx_data.wr_type = RDW_ALL_NWRITE_R; + break; + case RIO_EXCHANGE_NWRITE_R: + tx_data.wr_type = RDW_LAST_NWRITE_R; + break; + case RIO_EXCHANGE_DEFAULT: + tx_data.wr_type = RDW_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + } + + return rio_dma_prep_xfer(chan, transfer->rioid, &tx_data, dir, flags); +} + +/* Request DMA channel associated with this mport device. + * Try to request DMA channel for every new process that opened given + * mport. If a new DMA channel is not available use default channel + * which is the first DMA channel opened on mport device. + */ +static int get_dma_channel(struct mport_cdev_priv *priv) +{ + mutex_lock(&priv->dma_lock); + if (!priv->dmach) { + priv->dmach = rio_request_mport_dma(priv->md->mport); + if (!priv->dmach) { + /* Use default DMA channel if available */ + if (priv->md->dma_chan) { + priv->dmach = priv->md->dma_chan; + kref_get(&priv->md->dma_ref); + } else { + rmcd_error("Failed to get DMA channel"); + mutex_unlock(&priv->dma_lock); + return -ENODEV; + } + } else if (!priv->md->dma_chan) { + /* Register default DMA channel if we do not have one */ + priv->md->dma_chan = priv->dmach; + kref_init(&priv->md->dma_ref); + rmcd_debug(DMA, "Register DMA_chan %d as default", + priv->dmach->chan_id); + } + + kref_init(&priv->dma_ref); + init_completion(&priv->comp); + } + + kref_get(&priv->dma_ref); + mutex_unlock(&priv->dma_lock); + return 0; +} + +static void put_dma_channel(struct mport_cdev_priv *priv) +{ + kref_put(&priv->dma_ref, mport_release_dma); +} + +/* + * DMA transfer functions + */ +static int do_dma_request(struct mport_dma_req *req, + struct rio_transfer_io *xfer, + enum rio_transfer_sync sync, int nents) +{ + struct mport_cdev_priv *priv; + struct sg_table *sgt; + struct dma_chan *chan; + struct dma_async_tx_descriptor *tx; + dma_cookie_t cookie; + unsigned long tmo = msecs_to_jiffies(dma_timeout); + enum dma_transfer_direction dir; + long wret; + int ret = 0; + + priv = req->priv; + sgt = &req->sgt; + + chan = priv->dmach; + dir = (req->dir == DMA_FROM_DEVICE) ? 
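+		/* translate the mm-level dma_data_direction into the
+		 * dmaengine-level dma_transfer_direction */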
DMA_DEV_TO_MEM : DMA_MEM_TO_DEV; + + rmcd_debug(DMA, "%s(%d) uses %s for DMA_%s", + current->comm, task_pid_nr(current), + dev_name(&chan->dev->device), + (dir == DMA_DEV_TO_MEM)?"READ":"WRITE"); + + /* Initialize DMA transaction request */ + tx = prep_dma_xfer(chan, xfer, sgt, nents, dir, + DMA_CTRL_ACK | DMA_PREP_INTERRUPT); + + if (!tx) { + rmcd_debug(DMA, "prep error for %s A:0x%llx L:0x%llx", + (dir == DMA_DEV_TO_MEM)?"READ":"WRITE", + xfer->rio_addr, xfer->length); + ret = -EIO; + goto err_out; + } else if (IS_ERR(tx)) { + ret = PTR_ERR(tx); + rmcd_debug(DMA, "prep error %d for %s A:0x%llx L:0x%llx", ret, + (dir == DMA_DEV_TO_MEM)?"READ":"WRITE", + xfer->rio_addr, xfer->length); + goto err_out; + } + + if (sync == RIO_TRANSFER_FAF) + tx->callback = dma_faf_callback; + else + tx->callback = dma_xfer_callback; + tx->callback_param = req; + + req->dmach = chan; + req->sync = sync; + req->status = DMA_IN_PROGRESS; + init_completion(&req->req_comp); + + cookie = dmaengine_submit(tx); + req->cookie = cookie; + + rmcd_debug(DMA, "pid=%d DMA_%s tx_cookie = %d", task_pid_nr(current), + (dir == DMA_DEV_TO_MEM)?"READ":"WRITE", cookie); + + if (dma_submit_error(cookie)) { + rmcd_error("submit err=%d (addr:0x%llx len:0x%llx)", + cookie, xfer->rio_addr, xfer->length); + ret = -EIO; + goto err_out; + } + + dma_async_issue_pending(chan); + + if (sync == RIO_TRANSFER_ASYNC) { + spin_lock(&priv->req_lock); + list_add_tail(&req->node, &priv->async_list); + spin_unlock(&priv->req_lock); + return cookie; + } else if (sync == RIO_TRANSFER_FAF) + return 0; + + wret = wait_for_completion_interruptible_timeout(&req->req_comp, tmo); + + if (wret == 0) { + /* Timeout on wait occurred */ + rmcd_error("%s(%d) timed out waiting for DMA_%s %d", + current->comm, task_pid_nr(current), + (dir == DMA_DEV_TO_MEM)?"READ":"WRITE", cookie); + return -ETIMEDOUT; + } else if (wret == -ERESTARTSYS) { + /* Wait_for_completion was interrupted by a signal but DMA may + * be in progress + */ + rmcd_error("%s(%d) wait for DMA_%s %d was interrupted", + current->comm, task_pid_nr(current), + (dir == DMA_DEV_TO_MEM)?"READ":"WRITE", cookie); + return -EINTR; + } + + if (req->status != DMA_COMPLETE) { + /* DMA transaction completion was signaled with error */ + rmcd_error("%s(%d) DMA_%s %d completed with status %d (ret=%d)", + current->comm, task_pid_nr(current), + (dir == DMA_DEV_TO_MEM)?"READ":"WRITE", + cookie, req->status, ret); + ret = -EIO; + } + +err_out: + return ret; +} + +/* + * rio_dma_transfer() - Perform RapidIO DMA data transfer to/from + * the remote RapidIO device + * @filp: file pointer associated with the call + * @transfer_mode: DMA transfer mode + * @sync: synchronization mode + * @dir: DMA transfer direction (DMA_MEM_TO_DEV = write OR + * DMA_DEV_TO_MEM = read) + * @xfer: data transfer descriptor structure + */ +static int +rio_dma_transfer(struct file *filp, uint32_t transfer_mode, + enum rio_transfer_sync sync, enum dma_data_direction dir, + struct rio_transfer_io *xfer) +{ + struct mport_cdev_priv *priv = filp->private_data; + unsigned long nr_pages = 0; + struct page **page_list = NULL; + struct mport_dma_req *req; + struct mport_dev *md = priv->md; + struct dma_chan *chan; + int i, ret; + int nents; + + if (xfer->length == 0) + return -EINVAL; + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + ret = get_dma_channel(priv); + if (ret) { + kfree(req); + return ret; + } + + /* + * If parameter loc_addr != NULL, we are transferring data from/to + * data buffer allocated in user-space: 
lock in memory user-space + * buffer pages and build an SG table for DMA transfer request + * + * Otherwise (loc_addr == NULL) contiguous kernel-space buffer is + * used for DMA data transfers: build single entry SG table using + * offset within the internal buffer specified by handle parameter. + */ + if (xfer->loc_addr) { + unsigned long offset; + long pinned; + + offset = (unsigned long)xfer->loc_addr & ~PAGE_MASK; + nr_pages = PAGE_ALIGN(xfer->length + offset) >> PAGE_SHIFT; + + page_list = kmalloc_array(nr_pages, + sizeof(*page_list), GFP_KERNEL); + if (page_list == NULL) { + ret = -ENOMEM; + goto err_req; + } + + down_read(¤t->mm->mmap_sem); + pinned = get_user_pages(current, current->mm, + (unsigned long)xfer->loc_addr & PAGE_MASK, + nr_pages, dir == DMA_FROM_DEVICE, 0, + page_list, NULL); + up_read(¤t->mm->mmap_sem); + + if (pinned != nr_pages) { + if (pinned < 0) { + rmcd_error("get_user_pages err=%ld", pinned); + nr_pages = 0; + } else + rmcd_error("pinned %ld out of %ld pages", + pinned, nr_pages); + ret = -EFAULT; + goto err_pg; + } + + ret = sg_alloc_table_from_pages(&req->sgt, page_list, nr_pages, + offset, xfer->length, GFP_KERNEL); + if (ret) { + rmcd_error("sg_alloc_table failed with err=%d", ret); + goto err_pg; + } + + req->page_list = page_list; + req->nr_pages = nr_pages; + } else { + dma_addr_t baddr; + struct rio_mport_mapping *map; + + baddr = (dma_addr_t)xfer->handle; + + mutex_lock(&md->buf_mutex); + list_for_each_entry(map, &md->mappings, node) { + if (baddr >= map->phys_addr && + baddr < (map->phys_addr + map->size)) { + kref_get(&map->ref); + req->map = map; + break; + } + } + mutex_unlock(&md->buf_mutex); + + if (req->map == NULL) { + ret = -ENOMEM; + goto err_req; + } + + if (xfer->length + xfer->offset > map->size) { + ret = -EINVAL; + goto err_req; + } + + ret = sg_alloc_table(&req->sgt, 1, GFP_KERNEL); + if (unlikely(ret)) { + rmcd_error("sg_alloc_table failed for internal buf"); + goto err_req; + } + + sg_set_buf(req->sgt.sgl, + map->virt_addr + (baddr - map->phys_addr) + + xfer->offset, xfer->length); + } + + req->dir = dir; + req->filp = filp; + req->priv = priv; + chan = priv->dmach; + + nents = dma_map_sg(chan->device->dev, + req->sgt.sgl, req->sgt.nents, dir); + if (nents == -EFAULT) { + rmcd_error("Failed to map SG list"); + return -EFAULT; + } + + ret = do_dma_request(req, xfer, sync, nents); + + if (ret >= 0) { + if (sync == RIO_TRANSFER_SYNC) + goto sync_out; + return ret; /* return ASYNC cookie */ + } + + if (ret == -ETIMEDOUT || ret == -EINTR) { + /* + * This can happen only in case of SYNC transfer. + * Do not free unfinished request structure immediately. 
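+		 * The DMA engine may still be accessing the request's
+		 * pinned pages or mapped buffer.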
+ * Place it into pending list and deal with it later + */ + spin_lock(&priv->req_lock); + list_add_tail(&req->node, &priv->pend_list); + spin_unlock(&priv->req_lock); + return ret; + } + + + rmcd_debug(DMA, "do_dma_request failed with err=%d", ret); +sync_out: + dma_unmap_sg(chan->device->dev, req->sgt.sgl, req->sgt.nents, dir); + sg_free_table(&req->sgt); +err_pg: + if (page_list) { + for (i = 0; i < nr_pages; i++) + put_page(page_list[i]); + kfree(page_list); + } +err_req: + if (req->map) { + mutex_lock(&md->buf_mutex); + kref_put(&req->map->ref, mport_release_mapping); + mutex_unlock(&md->buf_mutex); + } + put_dma_channel(priv); + kfree(req); + return ret; +} + +static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg) +{ + struct mport_cdev_priv *priv = filp->private_data; + struct rio_transaction transaction; + struct rio_transfer_io *transfer; + enum dma_data_direction dir; + int i, ret = 0; + + if (unlikely(copy_from_user(&transaction, arg, sizeof(transaction)))) + return -EFAULT; + + if (transaction.count != 1) + return -EINVAL; + + if ((transaction.transfer_mode & + priv->md->properties.transfer_mode) == 0) + return -ENODEV; + + transfer = vmalloc(transaction.count * sizeof(struct rio_transfer_io)); + if (!transfer) + return -ENOMEM; + + if (unlikely(copy_from_user(transfer, transaction.block, + transaction.count * sizeof(struct rio_transfer_io)))) { + ret = -EFAULT; + goto out_free; + } + + dir = (transaction.dir == RIO_TRANSFER_DIR_READ) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE; + for (i = 0; i < transaction.count && ret == 0; i++) + ret = rio_dma_transfer(filp, transaction.transfer_mode, + transaction.sync, dir, &transfer[i]); + + if (unlikely(copy_to_user(transaction.block, transfer, + transaction.count * sizeof(struct rio_transfer_io)))) + ret = -EFAULT; + +out_free: + vfree(transfer); + + return ret; +} + +static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg) +{ + struct mport_cdev_priv *priv; + struct mport_dev *md; + struct rio_async_tx_wait w_param; + struct mport_dma_req *req; + dma_cookie_t cookie; + unsigned long tmo; + long wret; + int found = 0; + int ret; + + priv = (struct mport_cdev_priv *)filp->private_data; + md = priv->md; + + if (unlikely(copy_from_user(&w_param, arg, sizeof(w_param)))) + return -EFAULT; + + cookie = w_param.token; + if (w_param.timeout) + tmo = msecs_to_jiffies(w_param.timeout); + else /* Use default DMA timeout */ + tmo = msecs_to_jiffies(dma_timeout); + + spin_lock(&priv->req_lock); + list_for_each_entry(req, &priv->async_list, node) { + if (req->cookie == cookie) { + list_del(&req->node); + found = 1; + break; + } + } + spin_unlock(&priv->req_lock); + + if (!found) + return -EAGAIN; + + wret = wait_for_completion_interruptible_timeout(&req->req_comp, tmo); + + if (wret == 0) { + /* Timeout on wait occurred */ + rmcd_error("%s(%d) timed out waiting for ASYNC DMA_%s", + current->comm, task_pid_nr(current), + (req->dir == DMA_FROM_DEVICE)?"READ":"WRITE"); + ret = -ETIMEDOUT; + goto err_tmo; + } else if (wret == -ERESTARTSYS) { + /* Wait_for_completion was interrupted by a signal but DMA may + * be still in progress + */ + rmcd_error("%s(%d) wait for ASYNC DMA_%s was interrupted", + current->comm, task_pid_nr(current), + (req->dir == DMA_FROM_DEVICE)?"READ":"WRITE"); + ret = -EINTR; + goto err_tmo; + } + + if (req->status != DMA_COMPLETE) { + /* DMA transaction completion signaled with transfer error */ + rmcd_error("%s(%d) ASYNC DMA_%s completion with status %d", + current->comm, task_pid_nr(current), + 
(req->dir == DMA_FROM_DEVICE)?"READ":"WRITE", + req->status); + ret = -EIO; + } else + ret = 0; + + if (req->status != DMA_IN_PROGRESS && req->status != DMA_PAUSED) + dma_req_free(req); + + return ret; + +err_tmo: + /* Return request back into async queue */ + spin_lock(&priv->req_lock); + list_add_tail(&req->node, &priv->async_list); + spin_unlock(&priv->req_lock); + return ret; +} + +static int rio_mport_create_dma_mapping(struct mport_dev *md, struct file *filp, + uint64_t size, struct rio_mport_mapping **mapping) +{ + struct rio_mport_mapping *map; + + map = kzalloc(sizeof(struct rio_mport_mapping), GFP_KERNEL); + if (map == NULL) + return -ENOMEM; + + map->virt_addr = dma_alloc_coherent(md->mport->dev.parent, size, + &map->phys_addr, GFP_KERNEL); + if (map->virt_addr == NULL) { + kfree(map); + return -ENOMEM; + } + + map->dir = MAP_DMA; + map->size = size; + map->filp = filp; + map->md = md; + kref_init(&map->ref); + mutex_lock(&md->buf_mutex); + list_add_tail(&map->node, &md->mappings); + mutex_unlock(&md->buf_mutex); + *mapping = map; + + return 0; +} + +static int rio_mport_alloc_dma(struct file *filp, void __user *arg) +{ + struct mport_cdev_priv *priv = filp->private_data; + struct mport_dev *md = priv->md; + struct rio_dma_mem map; + struct rio_mport_mapping *mapping = NULL; + int ret; + + if (unlikely(copy_from_user(&map, arg, sizeof(struct rio_dma_mem)))) + return -EFAULT; + + ret = rio_mport_create_dma_mapping(md, filp, map.length, &mapping); + if (ret) + return ret; + + map.dma_handle = mapping->phys_addr; + + if (unlikely(copy_to_user(arg, &map, sizeof(struct rio_dma_mem)))) { + mutex_lock(&md->buf_mutex); + kref_put(&mapping->ref, mport_release_mapping); + mutex_unlock(&md->buf_mutex); + return -EFAULT; + } + + return 0; +} + +static int rio_mport_free_dma(struct file *filp, void __user *arg) +{ + struct mport_cdev_priv *priv = filp->private_data; + struct mport_dev *md = priv->md; + u64 handle; + int ret = -EFAULT; + struct rio_mport_mapping *map, *_map; + + if (copy_from_user(&handle, arg, sizeof(u64))) + return -EFAULT; + rmcd_debug(EXIT, "filp=%p", filp); + + mutex_lock(&md->buf_mutex); + list_for_each_entry_safe(map, _map, &md->mappings, node) { + if (map->dir == MAP_DMA && map->phys_addr == handle && + map->filp == filp) { + kref_put(&map->ref, mport_release_mapping); + ret = 0; + break; + } + } + mutex_unlock(&md->buf_mutex); + + if (ret == -EFAULT) { + rmcd_debug(DMA, "ERR no matching mapping"); + return ret; + } + + return 0; +} +#else +static int rio_mport_transfer_ioctl(struct file *filp, void *arg) +{ + return -ENODEV; +} + +static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg) +{ + return -ENODEV; +} + +static int rio_mport_alloc_dma(struct file *filp, void __user *arg) +{ + return -ENODEV; +} + +static int rio_mport_free_dma(struct file *filp, void __user *arg) +{ + return -ENODEV; +} +#endif /* CONFIG_RAPIDIO_DMA_ENGINE */ + +/* + * Inbound/outbound memory mapping functions + */ + +static int +rio_mport_create_inbound_mapping(struct mport_dev *md, struct file *filp, + u64 raddr, u32 size, + struct rio_mport_mapping **mapping) +{ + struct rio_mport *mport = md->mport; + struct rio_mport_mapping *map; + int ret; + + map = kzalloc(sizeof(struct rio_mport_mapping), GFP_KERNEL); + if (map == NULL) + return -ENOMEM; + + map->virt_addr = dma_alloc_coherent(mport->dev.parent, size, + &map->phys_addr, GFP_KERNEL); + if (map->virt_addr == NULL) { + ret = -ENOMEM; + goto err_dma_alloc; + } + + if (raddr == RIO_MAP_ANY_ADDR) + raddr = 
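+			/* direct one-to-one mapping: expose the buffer
+			 * at its local physical address */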
map->phys_addr; + ret = rio_map_inb_region(mport, map->phys_addr, raddr, size, 0); + if (ret < 0) + goto err_map_inb; + + map->dir = MAP_INBOUND; + map->rio_addr = raddr; + map->size = size; + map->filp = filp; + map->md = md; + kref_init(&map->ref); + mutex_lock(&md->buf_mutex); + list_add_tail(&map->node, &md->mappings); + mutex_unlock(&md->buf_mutex); + *mapping = map; + return 0; + +err_map_inb: + dma_free_coherent(mport->dev.parent, size, + map->virt_addr, map->phys_addr); +err_dma_alloc: + kfree(map); + return ret; +} + +static int +rio_mport_get_inbound_mapping(struct mport_dev *md, struct file *filp, + u64 raddr, u32 size, + struct rio_mport_mapping **mapping) +{ + struct rio_mport_mapping *map; + int err = -ENOMEM; + + if (raddr == RIO_MAP_ANY_ADDR) + goto get_new; + + mutex_lock(&md->buf_mutex); + list_for_each_entry(map, &md->mappings, node) { + if (map->dir != MAP_INBOUND) + continue; + if (raddr == map->rio_addr && size == map->size) { + /* allow exact match only */ + *mapping = map; + err = 0; + break; + } else if (raddr < (map->rio_addr + map->size - 1) && + (raddr + size) > map->rio_addr) { + err = -EBUSY; + break; + } + } + mutex_unlock(&md->buf_mutex); + + if (err != -ENOMEM) + return err; +get_new: + /* not found, create new */ + return rio_mport_create_inbound_mapping(md, filp, raddr, size, mapping); +} + +static int rio_mport_map_inbound(struct file *filp, void __user *arg) +{ + struct mport_cdev_priv *priv = filp->private_data; + struct mport_dev *md = priv->md; + struct rio_mmap map; + struct rio_mport_mapping *mapping = NULL; + int ret; + + if (!md->mport->ops->map_inb) + return -EPROTONOSUPPORT; + if (unlikely(copy_from_user(&map, arg, sizeof(struct rio_mmap)))) + return -EFAULT; + + rmcd_debug(IBW, "%s filp=%p", dev_name(&priv->md->dev), filp); + + ret = rio_mport_get_inbound_mapping(md, filp, map.rio_addr, + map.length, &mapping); + if (ret) + return ret; + + map.handle = mapping->phys_addr; + map.rio_addr = mapping->rio_addr; + + if (unlikely(copy_to_user(arg, &map, sizeof(struct rio_mmap)))) { + /* Delete mapping if it was created by this request */ + if (ret == 0 && mapping->filp == filp) { + mutex_lock(&md->buf_mutex); + kref_put(&mapping->ref, mport_release_mapping); + mutex_unlock(&md->buf_mutex); + } + return -EFAULT; + } + + return 0; +} + +/* + * rio_mport_inbound_free() - unmap from RapidIO address space and free + * previously allocated inbound DMA coherent buffer + * @priv: driver private data + * @arg: buffer handle returned by allocation routine + */ +static int rio_mport_inbound_free(struct file *filp, void __user *arg) +{ + struct mport_cdev_priv *priv = filp->private_data; + struct mport_dev *md = priv->md; + u64 handle; + struct rio_mport_mapping *map, *_map; + + rmcd_debug(IBW, "%s filp=%p", dev_name(&priv->md->dev), filp); + + if (!md->mport->ops->unmap_inb) + return -EPROTONOSUPPORT; + + if (copy_from_user(&handle, arg, sizeof(u64))) + return -EFAULT; + + mutex_lock(&md->buf_mutex); + list_for_each_entry_safe(map, _map, &md->mappings, node) { + if (map->dir == MAP_INBOUND && map->phys_addr == handle) { + if (map->filp == filp) { + map->filp = NULL; + kref_put(&map->ref, mport_release_mapping); + } + break; + } + } + mutex_unlock(&md->buf_mutex); + + return 0; +} + +/* + * maint_port_idx_get() - Get the port index of the mport instance + * @priv: driver private data + * @arg: port index + */ +static int maint_port_idx_get(struct mport_cdev_priv *priv, void __user *arg) +{ + struct mport_dev *md = priv->md; + uint32_t port_idx = 
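+			/* physical port index of this mport instance */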
md->mport->index; + + rmcd_debug(MPORT, "port_index=%d", port_idx); + + if (copy_to_user(arg, &port_idx, sizeof(port_idx))) + return -EFAULT; + + return 0; +} + +static int rio_mport_add_event(struct mport_cdev_priv *priv, + struct rio_event *event) +{ + int overflow; + + if (!(priv->event_mask & event->header)) + return -EACCES; + + spin_lock(&priv->fifo_lock); + overflow = kfifo_avail(&priv->event_fifo) < sizeof(*event) + || kfifo_in(&priv->event_fifo, (unsigned char *)event, + sizeof(*event)) != sizeof(*event); + spin_unlock(&priv->fifo_lock); + + wake_up_interruptible(&priv->event_rx_wait); + + if (overflow) { + dev_warn(&priv->md->dev, DRV_NAME ": event fifo overflow\n"); + return -EBUSY; + } + + return 0; +} + +static void rio_mport_doorbell_handler(struct rio_mport *mport, void *dev_id, + u16 src, u16 dst, u16 info) +{ + struct mport_dev *data = dev_id; + struct mport_cdev_priv *priv; + struct rio_mport_db_filter *db_filter; + struct rio_event event; + int handled; + + event.header = RIO_DOORBELL; + event.u.doorbell.rioid = src; + event.u.doorbell.payload = info; + + handled = 0; + spin_lock(&data->db_lock); + list_for_each_entry(db_filter, &data->doorbells, data_node) { + if (((db_filter->filter.rioid == 0xffffffff || + db_filter->filter.rioid == src)) && + info >= db_filter->filter.low && + info <= db_filter->filter.high) { + priv = db_filter->priv; + rio_mport_add_event(priv, &event); + handled = 1; + } + } + spin_unlock(&data->db_lock); + + if (!handled) + dev_warn(&data->dev, + "%s: spurious DB received from 0x%x, info=0x%04x\n", + __func__, src, info); +} + +static int rio_mport_add_db_filter(struct mport_cdev_priv *priv, + void __user *arg) +{ + struct mport_dev *md = priv->md; + struct rio_mport_db_filter *db_filter; + struct rio_doorbell_filter filter; + unsigned long flags; + int ret; + + if (copy_from_user(&filter, arg, sizeof(filter))) + return -EFAULT; + + if (filter.low > filter.high) + return -EINVAL; + + ret = rio_request_inb_dbell(md->mport, md, filter.low, filter.high, + rio_mport_doorbell_handler); + if (ret) { + rmcd_error("%s failed to register IBDB, err=%d", + dev_name(&md->dev), ret); + return ret; + } + + db_filter = kzalloc(sizeof(*db_filter), GFP_KERNEL); + if (db_filter == NULL) { + rio_release_inb_dbell(md->mport, filter.low, filter.high); + return -ENOMEM; + } + + db_filter->filter = filter; + db_filter->priv = priv; + spin_lock_irqsave(&md->db_lock, flags); + list_add_tail(&db_filter->priv_node, &priv->db_filters); + list_add_tail(&db_filter->data_node, &md->doorbells); + spin_unlock_irqrestore(&md->db_lock, flags); + + return 0; +} + +static void rio_mport_delete_db_filter(struct rio_mport_db_filter *db_filter) +{ + list_del(&db_filter->data_node); + list_del(&db_filter->priv_node); + kfree(db_filter); +} + +static int rio_mport_remove_db_filter(struct mport_cdev_priv *priv, + void __user *arg) +{ + struct rio_mport_db_filter *db_filter; + struct rio_doorbell_filter filter; + unsigned long flags; + int ret = -EINVAL; + + if (copy_from_user(&filter, arg, sizeof(filter))) + return -EFAULT; + + spin_lock_irqsave(&priv->md->db_lock, flags); + list_for_each_entry(db_filter, &priv->db_filters, priv_node) { + if (db_filter->filter.rioid == filter.rioid && + db_filter->filter.low == filter.low && + db_filter->filter.high == filter.high) { + rio_mport_delete_db_filter(db_filter); + ret = 0; + break; + } + } + spin_unlock_irqrestore(&priv->md->db_lock, flags); + + if (!ret) + rio_release_inb_dbell(priv->md->mport, filter.low, filter.high); + + return ret; +} + 
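+/*
+ * Illustrative sketch (userspace, not part of this driver): how a client
+ * might arm a doorbell range with the ioctls above and then consume the
+ * queued events via read().  The device node name "/dev/rio_mport0" and
+ * the helper handle_db() are assumptions made only for this example, and
+ * error handling is omitted for brevity.
+ *
+ *	int fd = open("/dev/rio_mport0", O_RDWR);
+ *	struct rio_doorbell_filter filter = {
+ *		.rioid = 0xffffffff,		(match any sender)
+ *		.low   = 0,
+ *		.high  = 0xffff,
+ *	};
+ *	struct rio_event ev;
+ *
+ *	ioctl(fd, RIO_SET_EVENT_MASK, RIO_DOORBELL);	(mask passed by value)
+ *	ioctl(fd, RIO_ENABLE_DOORBELL_RANGE, &filter);
+ *	while (read(fd, &ev, sizeof(ev)) == sizeof(ev))
+ *		if (ev.header == RIO_DOORBELL)
+ *			handle_db(ev.u.doorbell.rioid, ev.u.doorbell.payload);
+ */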
+static int rio_mport_match_pw(union rio_pw_msg *msg, + struct rio_pw_filter *filter) +{ + if ((msg->em.comptag & filter->mask) < filter->low || + (msg->em.comptag & filter->mask) > filter->high) + return 0; + return 1; +} + +static int rio_mport_pw_handler(struct rio_mport *mport, void *context, + union rio_pw_msg *msg, int step) +{ + struct mport_dev *md = context; + struct mport_cdev_priv *priv; + struct rio_mport_pw_filter *pw_filter; + struct rio_event event; + int handled; + + event.header = RIO_PORTWRITE; + memcpy(event.u.portwrite.payload, msg->raw, RIO_PW_MSG_SIZE); + + handled = 0; + spin_lock(&md->pw_lock); + list_for_each_entry(pw_filter, &md->portwrites, md_node) { + if (rio_mport_match_pw(msg, &pw_filter->filter)) { + priv = pw_filter->priv; + rio_mport_add_event(priv, &event); + handled = 1; + } + } + spin_unlock(&md->pw_lock); + + if (!handled) { + printk_ratelimited(KERN_WARNING DRV_NAME + ": mport%d received spurious PW from 0x%08x\n", + mport->id, msg->em.comptag); + } + + return 0; +} + +static int rio_mport_add_pw_filter(struct mport_cdev_priv *priv, + void __user *arg) +{ + struct mport_dev *md = priv->md; + struct rio_mport_pw_filter *pw_filter; + struct rio_pw_filter filter; + unsigned long flags; + int hadd = 0; + + if (copy_from_user(&filter, arg, sizeof(filter))) + return -EFAULT; + + pw_filter = kzalloc(sizeof(*pw_filter), GFP_KERNEL); + if (pw_filter == NULL) + return -ENOMEM; + + pw_filter->filter = filter; + pw_filter->priv = priv; + spin_lock_irqsave(&md->pw_lock, flags); + if (list_empty(&md->portwrites)) + hadd = 1; + list_add_tail(&pw_filter->priv_node, &priv->pw_filters); + list_add_tail(&pw_filter->md_node, &md->portwrites); + spin_unlock_irqrestore(&md->pw_lock, flags); + + if (hadd) { + int ret; + + ret = rio_add_mport_pw_handler(md->mport, md, + rio_mport_pw_handler); + if (ret) { + dev_err(&md->dev, + "%s: failed to add IB_PW handler, err=%d\n", + __func__, ret); + return ret; + } + rio_pw_enable(md->mport, 1); + } + + return 0; +} + +static void rio_mport_delete_pw_filter(struct rio_mport_pw_filter *pw_filter) +{ + list_del(&pw_filter->md_node); + list_del(&pw_filter->priv_node); + kfree(pw_filter); +} + +static int rio_mport_match_pw_filter(struct rio_pw_filter *a, + struct rio_pw_filter *b) +{ + if ((a->mask == b->mask) && (a->low == b->low) && (a->high == b->high)) + return 1; + return 0; +} + +static int rio_mport_remove_pw_filter(struct mport_cdev_priv *priv, + void __user *arg) +{ + struct mport_dev *md = priv->md; + struct rio_mport_pw_filter *pw_filter; + struct rio_pw_filter filter; + unsigned long flags; + int ret = -EINVAL; + int hdel = 0; + + if (copy_from_user(&filter, arg, sizeof(filter))) + return -EFAULT; + + spin_lock_irqsave(&md->pw_lock, flags); + list_for_each_entry(pw_filter, &priv->pw_filters, priv_node) { + if (rio_mport_match_pw_filter(&pw_filter->filter, &filter)) { + rio_mport_delete_pw_filter(pw_filter); + ret = 0; + break; + } + } + + if (list_empty(&md->portwrites)) + hdel = 1; + spin_unlock_irqrestore(&md->pw_lock, flags); + + if (hdel) { + rio_del_mport_pw_handler(md->mport, priv->md, + rio_mport_pw_handler); + rio_pw_enable(md->mport, 0); + } + + return ret; +} + +/* + * rio_release_dev - release routine for kernel RIO device object + * @dev: kernel device object associated with a RIO device structure + * + * Frees a RIO device struct associated a RIO device struct. + * The RIO device struct is freed. 
+ */ +static void rio_release_dev(struct device *dev) +{ + struct rio_dev *rdev; + + rdev = to_rio_dev(dev); + pr_info(DRV_PREFIX "%s: %s\n", __func__, rio_name(rdev)); + kfree(rdev); +} + + +static void rio_release_net(struct device *dev) +{ + struct rio_net *net; + + net = to_rio_net(dev); + rmcd_debug(RDEV, "net_%d", net->id); + kfree(net); +} + + +/* + * rio_mport_add_riodev - creates a kernel RIO device object + * + * Allocates a RIO device data structure and initializes required fields based + * on device's configuration space contents. + * If the device has switch capabilities, then a switch specific portion is + * allocated and configured. + */ +static int rio_mport_add_riodev(struct mport_cdev_priv *priv, + void __user *arg) +{ + struct mport_dev *md = priv->md; + struct rio_rdev_info dev_info; + struct rio_dev *rdev; + struct rio_switch *rswitch = NULL; + struct rio_mport *mport; + size_t size; + u32 rval; + u32 swpinfo = 0; + u16 destid; + u8 hopcount; + int err; + + if (copy_from_user(&dev_info, arg, sizeof(dev_info))) + return -EFAULT; + + rmcd_debug(RDEV, "name:%s ct:0x%x did:0x%x hc:0x%x", dev_info.name, + dev_info.comptag, dev_info.destid, dev_info.hopcount); + + if (bus_find_device_by_name(&rio_bus_type, NULL, dev_info.name)) { + rmcd_debug(RDEV, "device %s already exists", dev_info.name); + return -EEXIST; + } + + size = sizeof(struct rio_dev); + mport = md->mport; + destid = (u16)dev_info.destid; + hopcount = (u8)dev_info.hopcount; + + if (rio_mport_read_config_32(mport, destid, hopcount, + RIO_PEF_CAR, &rval)) + return -EIO; + + if (rval & RIO_PEF_SWITCH) { + rio_mport_read_config_32(mport, destid, hopcount, + RIO_SWP_INFO_CAR, &swpinfo); + size += (RIO_GET_TOTAL_PORTS(swpinfo) * + sizeof(rswitch->nextdev[0])) + sizeof(*rswitch); + } + + rdev = kzalloc(size, GFP_KERNEL); + if (rdev == NULL) + return -ENOMEM; + + if (mport->net == NULL) { + struct rio_net *net; + + net = rio_alloc_net(mport); + if (!net) { + err = -ENOMEM; + rmcd_debug(RDEV, "failed to allocate net object"); + goto cleanup; + } + + net->id = mport->id; + net->hport = mport; + dev_set_name(&net->dev, "rnet_%d", net->id); + net->dev.parent = &mport->dev; + net->dev.release = rio_release_net; + err = rio_add_net(net); + if (err) { + rmcd_debug(RDEV, "failed to register net, err=%d", err); + kfree(net); + goto cleanup; + } + } + + rdev->net = mport->net; + rdev->pef = rval; + rdev->swpinfo = swpinfo; + rio_mport_read_config_32(mport, destid, hopcount, + RIO_DEV_ID_CAR, &rval); + rdev->did = rval >> 16; + rdev->vid = rval & 0xffff; + rio_mport_read_config_32(mport, destid, hopcount, RIO_DEV_INFO_CAR, + &rdev->device_rev); + rio_mport_read_config_32(mport, destid, hopcount, RIO_ASM_ID_CAR, + &rval); + rdev->asm_did = rval >> 16; + rdev->asm_vid = rval & 0xffff; + rio_mport_read_config_32(mport, destid, hopcount, RIO_ASM_INFO_CAR, + &rval); + rdev->asm_rev = rval >> 16; + + if (rdev->pef & RIO_PEF_EXT_FEATURES) { + rdev->efptr = rval & 0xffff; + rdev->phys_efptr = rio_mport_get_physefb(mport, 0, destid, + hopcount); + + rdev->em_efptr = rio_mport_get_feature(mport, 0, destid, + hopcount, RIO_EFB_ERR_MGMNT); + } + + rio_mport_read_config_32(mport, destid, hopcount, RIO_SRC_OPS_CAR, + &rdev->src_ops); + rio_mport_read_config_32(mport, destid, hopcount, RIO_DST_OPS_CAR, + &rdev->dst_ops); + + rdev->comp_tag = dev_info.comptag; + rdev->destid = destid; + /* hopcount is stored as specified by a caller, regardles of EP or SW */ + rdev->hopcount = hopcount; + + if (rdev->pef & RIO_PEF_SWITCH) { + rswitch = 
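+		/* the switch-specific portion occupies the tail of the
+		 * rio_dev allocation sized above */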
rdev->rswitch; + rswitch->route_table = NULL; + } + + if (strlen(dev_info.name)) + dev_set_name(&rdev->dev, "%s", dev_info.name); + else if (rdev->pef & RIO_PEF_SWITCH) + dev_set_name(&rdev->dev, "%02x:s:%04x", mport->id, + rdev->comp_tag & RIO_CTAG_UDEVID); + else + dev_set_name(&rdev->dev, "%02x:e:%04x", mport->id, + rdev->comp_tag & RIO_CTAG_UDEVID); + + INIT_LIST_HEAD(&rdev->net_list); + rdev->dev.parent = &mport->net->dev; + rio_attach_device(rdev); + rdev->dev.release = rio_release_dev; + + if (rdev->dst_ops & RIO_DST_OPS_DOORBELL) + rio_init_dbell_res(&rdev->riores[RIO_DOORBELL_RESOURCE], + 0, 0xffff); + err = rio_add_device(rdev); + if (err) + goto cleanup; + rio_dev_get(rdev); + + return 0; +cleanup: + kfree(rdev); + return err; +} + +static int rio_mport_del_riodev(struct mport_cdev_priv *priv, void __user *arg) +{ + struct rio_rdev_info dev_info; + struct rio_dev *rdev = NULL; + struct device *dev; + struct rio_mport *mport; + struct rio_net *net; + + if (copy_from_user(&dev_info, arg, sizeof(dev_info))) + return -EFAULT; + + mport = priv->md->mport; + + /* If device name is specified, removal by name has priority */ + if (strlen(dev_info.name)) { + dev = bus_find_device_by_name(&rio_bus_type, NULL, + dev_info.name); + if (dev) + rdev = to_rio_dev(dev); + } else { + do { + rdev = rio_get_comptag(dev_info.comptag, rdev); + if (rdev && rdev->dev.parent == &mport->net->dev && + rdev->destid == (u16)dev_info.destid && + rdev->hopcount == (u8)dev_info.hopcount) + break; + } while (rdev); + } + + if (!rdev) { + rmcd_debug(RDEV, + "device name:%s ct:0x%x did:0x%x hc:0x%x not found", + dev_info.name, dev_info.comptag, dev_info.destid, + dev_info.hopcount); + return -ENODEV; + } + + net = rdev->net; + rio_dev_put(rdev); + rio_del_device(rdev, RIO_DEVICE_SHUTDOWN); + + if (list_empty(&net->devices)) { + rio_free_net(net); + mport->net = NULL; + } + + return 0; +} + +/* + * Mport cdev management + */ + +/* + * mport_cdev_open() - Open character device (mport) + */ +static int mport_cdev_open(struct inode *inode, struct file *filp) +{ + int ret; + int minor = iminor(inode); + struct mport_dev *chdev; + struct mport_cdev_priv *priv; + + /* Test for valid device */ + if (minor >= RIO_MAX_MPORTS) { + rmcd_error("Invalid minor device number"); + return -EINVAL; + } + + chdev = container_of(inode->i_cdev, struct mport_dev, cdev); + + rmcd_debug(INIT, "%s filp=%p", dev_name(&chdev->dev), filp); + + if (atomic_read(&chdev->active) == 0) + return -ENODEV; + + get_device(&chdev->dev); + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + put_device(&chdev->dev); + return -ENOMEM; + } + + priv->md = chdev; + + mutex_lock(&chdev->file_mutex); + list_add_tail(&priv->list, &chdev->file_list); + mutex_unlock(&chdev->file_mutex); + + INIT_LIST_HEAD(&priv->db_filters); + INIT_LIST_HEAD(&priv->pw_filters); + spin_lock_init(&priv->fifo_lock); + init_waitqueue_head(&priv->event_rx_wait); + ret = kfifo_alloc(&priv->event_fifo, + sizeof(struct rio_event) * MPORT_EVENT_DEPTH, + GFP_KERNEL); + if (ret < 0) { + dev_err(&chdev->dev, DRV_NAME ": kfifo_alloc failed\n"); + ret = -ENOMEM; + goto err_fifo; + } + +#ifdef CONFIG_RAPIDIO_DMA_ENGINE + INIT_LIST_HEAD(&priv->async_list); + INIT_LIST_HEAD(&priv->pend_list); + spin_lock_init(&priv->req_lock); + mutex_init(&priv->dma_lock); +#endif + + filp->private_data = priv; + goto out; +err_fifo: + kfree(priv); +out: + return ret; +} + +static int mport_cdev_fasync(int fd, struct file *filp, int mode) +{ + struct mport_cdev_priv *priv = filp->private_data; + + 
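+	/* register or unregister this file for SIGIO notification;
+	 * see mport_cdev_kill_fasync() for the sending side */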
return fasync_helper(fd, filp, mode, &priv->async_queue); +} + +#ifdef CONFIG_RAPIDIO_DMA_ENGINE +static void mport_cdev_release_dma(struct file *filp) +{ + struct mport_cdev_priv *priv = filp->private_data; + struct mport_dev *md; + struct mport_dma_req *req, *req_next; + unsigned long tmo = msecs_to_jiffies(dma_timeout); + long wret; + LIST_HEAD(list); + + rmcd_debug(EXIT, "from filp=%p %s(%d)", + filp, current->comm, task_pid_nr(current)); + + if (!priv->dmach) { + rmcd_debug(EXIT, "No DMA channel for filp=%p", filp); + return; + } + + md = priv->md; + + flush_workqueue(dma_wq); + + spin_lock(&priv->req_lock); + if (!list_empty(&priv->async_list)) { + rmcd_debug(EXIT, "async list not empty filp=%p %s(%d)", + filp, current->comm, task_pid_nr(current)); + list_splice_init(&priv->async_list, &list); + } + spin_unlock(&priv->req_lock); + + if (!list_empty(&list)) { + rmcd_debug(EXIT, "temp list not empty"); + list_for_each_entry_safe(req, req_next, &list, node) { + rmcd_debug(EXIT, "free req->filp=%p cookie=%d compl=%s", + req->filp, req->cookie, + completion_done(&req->req_comp)?"yes":"no"); + list_del(&req->node); + dma_req_free(req); + } + } + + if (!list_empty(&priv->pend_list)) { + rmcd_debug(EXIT, "Free pending DMA requests for filp=%p %s(%d)", + filp, current->comm, task_pid_nr(current)); + list_for_each_entry_safe(req, + req_next, &priv->pend_list, node) { + rmcd_debug(EXIT, "free req->filp=%p cookie=%d compl=%s", + req->filp, req->cookie, + completion_done(&req->req_comp)?"yes":"no"); + list_del(&req->node); + dma_req_free(req); + } + } + + put_dma_channel(priv); + wret = wait_for_completion_interruptible_timeout(&priv->comp, tmo); + + if (wret <= 0) { + rmcd_error("%s(%d) failed waiting for DMA release err=%ld", + current->comm, task_pid_nr(current), wret); + } + + spin_lock(&priv->req_lock); + + if (!list_empty(&priv->pend_list)) { + rmcd_debug(EXIT, "ATTN: pending DMA requests, filp=%p %s(%d)", + filp, current->comm, task_pid_nr(current)); + } + + spin_unlock(&priv->req_lock); + + if (priv->dmach != priv->md->dma_chan) { + rmcd_debug(EXIT, "Release DMA channel for filp=%p %s(%d)", + filp, current->comm, task_pid_nr(current)); + rio_release_dma(priv->dmach); + } else { + rmcd_debug(EXIT, "Adjust default DMA channel refcount"); + kref_put(&md->dma_ref, mport_release_def_dma); + } + + priv->dmach = NULL; +} +#else +#define mport_cdev_release_dma(priv) do {} while (0) +#endif + +/* + * mport_cdev_release() - Release character device + */ +static int mport_cdev_release(struct inode *inode, struct file *filp) +{ + struct mport_cdev_priv *priv = filp->private_data; + struct mport_dev *chdev; + struct rio_mport_pw_filter *pw_filter, *pw_filter_next; + struct rio_mport_db_filter *db_filter, *db_filter_next; + struct rio_mport_mapping *map, *_map; + unsigned long flags; + + rmcd_debug(EXIT, "%s filp=%p", dev_name(&priv->md->dev), filp); + + chdev = priv->md; + mport_cdev_release_dma(filp); + + priv->event_mask = 0; + + spin_lock_irqsave(&chdev->pw_lock, flags); + if (!list_empty(&priv->pw_filters)) { + list_for_each_entry_safe(pw_filter, pw_filter_next, + &priv->pw_filters, priv_node) + rio_mport_delete_pw_filter(pw_filter); + } + spin_unlock_irqrestore(&chdev->pw_lock, flags); + + spin_lock_irqsave(&chdev->db_lock, flags); + list_for_each_entry_safe(db_filter, db_filter_next, + &priv->db_filters, priv_node) { + rio_mport_delete_db_filter(db_filter); + } + spin_unlock_irqrestore(&chdev->db_lock, flags); + + kfifo_free(&priv->event_fifo); + + mutex_lock(&chdev->buf_mutex); + 
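+	/* drop the references held by this file on any mappings it still owns */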
list_for_each_entry_safe(map, _map, &chdev->mappings, node) { + if (map->filp == filp) { + rmcd_debug(EXIT, "release mapping %p filp=%p", + map->virt_addr, filp); + kref_put(&map->ref, mport_release_mapping); + } + } + mutex_unlock(&chdev->buf_mutex); + + mport_cdev_fasync(-1, filp, 0); + filp->private_data = NULL; + mutex_lock(&chdev->file_mutex); + list_del(&priv->list); + mutex_unlock(&chdev->file_mutex); + put_device(&chdev->dev); + kfree(priv); + return 0; +} + +/* + * mport_cdev_ioctl() - IOCTLs for character device + */ +static long mport_cdev_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + int err = -EINVAL; + struct mport_cdev_priv *data = filp->private_data; + struct mport_dev *md = data->md; + + if (atomic_read(&md->active) == 0) + return -ENODEV; + + switch (cmd) { + case RIO_MPORT_MAINT_READ_LOCAL: + return rio_mport_maint_rd(data, (void __user *)arg, 1); + case RIO_MPORT_MAINT_WRITE_LOCAL: + return rio_mport_maint_wr(data, (void __user *)arg, 1); + case RIO_MPORT_MAINT_READ_REMOTE: + return rio_mport_maint_rd(data, (void __user *)arg, 0); + case RIO_MPORT_MAINT_WRITE_REMOTE: + return rio_mport_maint_wr(data, (void __user *)arg, 0); + case RIO_MPORT_MAINT_HDID_SET: + return maint_hdid_set(data, (void __user *)arg); + case RIO_MPORT_MAINT_COMPTAG_SET: + return maint_comptag_set(data, (void __user *)arg); + case RIO_MPORT_MAINT_PORT_IDX_GET: + return maint_port_idx_get(data, (void __user *)arg); + case RIO_MPORT_GET_PROPERTIES: + md->properties.hdid = md->mport->host_deviceid; + if (copy_to_user((void __user *)arg, &(data->md->properties), + sizeof(data->md->properties))) + return -EFAULT; + return 0; + case RIO_ENABLE_DOORBELL_RANGE: + return rio_mport_add_db_filter(data, (void __user *)arg); + case RIO_DISABLE_DOORBELL_RANGE: + return rio_mport_remove_db_filter(data, (void __user *)arg); + case RIO_ENABLE_PORTWRITE_RANGE: + return rio_mport_add_pw_filter(data, (void __user *)arg); + case RIO_DISABLE_PORTWRITE_RANGE: + return rio_mport_remove_pw_filter(data, (void __user *)arg); + case RIO_SET_EVENT_MASK: + data->event_mask = arg; + return 0; + case RIO_GET_EVENT_MASK: + if (copy_to_user((void __user *)arg, &data->event_mask, + sizeof(data->event_mask))) + return -EFAULT; + return 0; + case RIO_MAP_OUTBOUND: + return rio_mport_obw_map(filp, (void __user *)arg); + case RIO_MAP_INBOUND: + return rio_mport_map_inbound(filp, (void __user *)arg); + case RIO_UNMAP_OUTBOUND: + return rio_mport_obw_free(filp, (void __user *)arg); + case RIO_UNMAP_INBOUND: + return rio_mport_inbound_free(filp, (void __user *)arg); + case RIO_ALLOC_DMA: + return rio_mport_alloc_dma(filp, (void __user *)arg); + case RIO_FREE_DMA: + return rio_mport_free_dma(filp, (void __user *)arg); + case RIO_WAIT_FOR_ASYNC: + return rio_mport_wait_for_async_dma(filp, (void __user *)arg); + case RIO_TRANSFER: + return rio_mport_transfer_ioctl(filp, (void __user *)arg); + case RIO_DEV_ADD: + return rio_mport_add_riodev(data, (void __user *)arg); + case RIO_DEV_DEL: + return rio_mport_del_riodev(data, (void __user *)arg); + default: + break; + } + + return err; +} + +/* + * mport_release_mapping - free mapping resources and info structure + * @ref: a pointer to the kref within struct rio_mport_mapping + * + * NOTE: Shall be called while holding buf_mutex. 
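+ * buf_mutex serializes access to md->mappings, from which the mapping
+ * is unlinked here before its resources are released.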
+ */ +static void mport_release_mapping(struct kref *ref) +{ + struct rio_mport_mapping *map = + container_of(ref, struct rio_mport_mapping, ref); + struct rio_mport *mport = map->md->mport; + + rmcd_debug(MMAP, "type %d mapping @ %p (phys = %pad) for %s", + map->dir, map->virt_addr, + &map->phys_addr, mport->name); + + list_del(&map->node); + + switch (map->dir) { + case MAP_INBOUND: + rio_unmap_inb_region(mport, map->phys_addr); + case MAP_DMA: + dma_free_coherent(mport->dev.parent, map->size, + map->virt_addr, map->phys_addr); + break; + case MAP_OUTBOUND: + rio_unmap_outb_region(mport, map->rioid, map->rio_addr); + break; + } + kfree(map); +} + +static void mport_mm_open(struct vm_area_struct *vma) +{ + struct rio_mport_mapping *map = vma->vm_private_data; + +rmcd_debug(MMAP, "0x%pad", &map->phys_addr); + kref_get(&map->ref); +} + +static void mport_mm_close(struct vm_area_struct *vma) +{ + struct rio_mport_mapping *map = vma->vm_private_data; + +rmcd_debug(MMAP, "0x%pad", &map->phys_addr); + mutex_lock(&map->md->buf_mutex); + kref_put(&map->ref, mport_release_mapping); + mutex_unlock(&map->md->buf_mutex); +} + +static const struct vm_operations_struct vm_ops = { + .open = mport_mm_open, + .close = mport_mm_close, +}; + +static int mport_cdev_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct mport_cdev_priv *priv = filp->private_data; + struct mport_dev *md; + size_t size = vma->vm_end - vma->vm_start; + dma_addr_t baddr; + unsigned long offset; + int found = 0, ret; + struct rio_mport_mapping *map; + + rmcd_debug(MMAP, "0x%x bytes at offset 0x%lx", + (unsigned int)size, vma->vm_pgoff); + + md = priv->md; + baddr = ((dma_addr_t)vma->vm_pgoff << PAGE_SHIFT); + + mutex_lock(&md->buf_mutex); + list_for_each_entry(map, &md->mappings, node) { + if (baddr >= map->phys_addr && + baddr < (map->phys_addr + map->size)) { + found = 1; + break; + } + } + mutex_unlock(&md->buf_mutex); + + if (!found) + return -ENOMEM; + + offset = baddr - map->phys_addr; + + if (size + offset > map->size) + return -EINVAL; + + vma->vm_pgoff = offset >> PAGE_SHIFT; + rmcd_debug(MMAP, "MMAP adjusted offset = 0x%lx", vma->vm_pgoff); + + if (map->dir == MAP_INBOUND || map->dir == MAP_DMA) + ret = dma_mmap_coherent(md->mport->dev.parent, vma, + map->virt_addr, map->phys_addr, map->size); + else if (map->dir == MAP_OUTBOUND) { + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + ret = vm_iomap_memory(vma, map->phys_addr, map->size); + } else { + rmcd_error("Attempt to mmap unsupported mapping type"); + ret = -EIO; + } + + if (!ret) { + vma->vm_private_data = map; + vma->vm_ops = &vm_ops; + mport_mm_open(vma); + } else { + rmcd_error("MMAP exit with err=%d", ret); + } + + return ret; +} + +static unsigned int mport_cdev_poll(struct file *filp, poll_table *wait) +{ + struct mport_cdev_priv *priv = filp->private_data; + + poll_wait(filp, &priv->event_rx_wait, wait); + if (kfifo_len(&priv->event_fifo)) + return POLLIN | POLLRDNORM; + + return 0; +} + +static ssize_t mport_read(struct file *filp, char __user *buf, size_t count, + loff_t *ppos) +{ + struct mport_cdev_priv *priv = filp->private_data; + int copied; + ssize_t ret; + + if (!count) + return 0; + + if (kfifo_is_empty(&priv->event_fifo) && + (filp->f_flags & O_NONBLOCK)) + return -EAGAIN; + + if (count % sizeof(struct rio_event)) + return -EINVAL; + + ret = wait_event_interruptible(priv->event_rx_wait, + kfifo_len(&priv->event_fifo) != 0); + if (ret) + return ret; + + while (ret < count) { + if (kfifo_to_user(&priv->event_fifo, buf, + 
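+				/* one event per pass; 'copied' reports
+				 * the bytes actually moved to userspace */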
sizeof(struct rio_event), &copied)) + return -EFAULT; + ret += copied; + buf += copied; + } + + return ret; +} + +static ssize_t mport_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct mport_cdev_priv *priv = filp->private_data; + struct rio_mport *mport = priv->md->mport; + struct rio_event event; + int len, ret; + + if (!count) + return 0; + + if (count % sizeof(event)) + return -EINVAL; + + len = 0; + while ((count - len) >= (int)sizeof(event)) { + if (copy_from_user(&event, buf, sizeof(event))) + return -EFAULT; + + if (event.header != RIO_DOORBELL) + return -EINVAL; + + ret = rio_mport_send_doorbell(mport, + (u16)event.u.doorbell.rioid, + event.u.doorbell.payload); + if (ret < 0) + return ret; + + len += sizeof(event); + buf += sizeof(event); + } + + return len; +} + +static const struct file_operations mport_fops = { + .owner = THIS_MODULE, + .open = mport_cdev_open, + .release = mport_cdev_release, + .poll = mport_cdev_poll, + .read = mport_read, + .write = mport_write, + .mmap = mport_cdev_mmap, + .fasync = mport_cdev_fasync, + .unlocked_ioctl = mport_cdev_ioctl +}; + +/* + * Character device management + */ + +static void mport_device_release(struct device *dev) +{ + struct mport_dev *md; + + rmcd_debug(EXIT, "%s", dev_name(dev)); + md = container_of(dev, struct mport_dev, dev); + kfree(md); +} + +/* + * mport_cdev_add() - Create mport_dev from rio_mport + * @mport: RapidIO master port + */ +static struct mport_dev *mport_cdev_add(struct rio_mport *mport) +{ + int ret = 0; + struct mport_dev *md; + struct rio_mport_attr attr; + + md = kzalloc(sizeof(struct mport_dev), GFP_KERNEL); + if (!md) { + rmcd_error("Unable allocate a device object"); + return NULL; + } + + md->mport = mport; + mutex_init(&md->buf_mutex); + mutex_init(&md->file_mutex); + INIT_LIST_HEAD(&md->file_list); + cdev_init(&md->cdev, &mport_fops); + md->cdev.owner = THIS_MODULE; + ret = cdev_add(&md->cdev, MKDEV(MAJOR(dev_number), mport->id), 1); + if (ret < 0) { + kfree(md); + rmcd_error("Unable to register a device, err=%d", ret); + return NULL; + } + + md->dev.devt = md->cdev.dev; + md->dev.class = dev_class; + md->dev.parent = &mport->dev; + md->dev.release = mport_device_release; + dev_set_name(&md->dev, DEV_NAME "%d", mport->id); + atomic_set(&md->active, 1); + + ret = device_register(&md->dev); + if (ret) { + rmcd_error("Failed to register mport %d (err=%d)", + mport->id, ret); + goto err_cdev; + } + + get_device(&md->dev); + + INIT_LIST_HEAD(&md->doorbells); + spin_lock_init(&md->db_lock); + INIT_LIST_HEAD(&md->portwrites); + spin_lock_init(&md->pw_lock); + INIT_LIST_HEAD(&md->mappings); + + md->properties.id = mport->id; + md->properties.sys_size = mport->sys_size; + md->properties.hdid = mport->host_deviceid; + md->properties.index = mport->index; + + /* The transfer_mode property will be returned through mport query + * interface + */ +#ifdef CONFIG_PPC /* for now: only on Freescale's SoCs */ + md->properties.transfer_mode |= RIO_TRANSFER_MODE_MAPPED; +#else + md->properties.transfer_mode |= RIO_TRANSFER_MODE_TRANSFER; +#endif + ret = rio_query_mport(mport, &attr); + if (!ret) { + md->properties.flags = attr.flags; + md->properties.link_speed = attr.link_speed; + md->properties.link_width = attr.link_width; + md->properties.dma_max_sge = attr.dma_max_sge; + md->properties.dma_max_size = attr.dma_max_size; + md->properties.dma_align = attr.dma_align; + md->properties.cap_sys_size = 0; + md->properties.cap_transfer_mode = 0; + md->properties.cap_addr_size = 0; + } 
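+	/* the attribute query is optional: on failure keep the
+	 * zero-initialized capability fields and just log it */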
else + pr_info(DRV_PREFIX "Failed to obtain info for %s cdev(%d:%d)\n", + mport->name, MAJOR(dev_number), mport->id); + + mutex_lock(&mport_devs_lock); + list_add_tail(&md->node, &mport_devs); + mutex_unlock(&mport_devs_lock); + + pr_info(DRV_PREFIX "Added %s cdev(%d:%d)\n", + mport->name, MAJOR(dev_number), mport->id); + + return md; + +err_cdev: + cdev_del(&md->cdev); + kfree(md); + return NULL; +} + +/* + * mport_cdev_terminate_dma() - Stop all active DMA data transfers and release + * associated DMA channels. + */ +static void mport_cdev_terminate_dma(struct mport_dev *md) +{ +#ifdef CONFIG_RAPIDIO_DMA_ENGINE + struct mport_cdev_priv *client; + + rmcd_debug(DMA, "%s", dev_name(&md->dev)); + + mutex_lock(&md->file_mutex); + list_for_each_entry(client, &md->file_list, list) { + if (client->dmach) { + dmaengine_terminate_all(client->dmach); + rio_release_dma(client->dmach); + } + } + mutex_unlock(&md->file_mutex); + + if (md->dma_chan) { + dmaengine_terminate_all(md->dma_chan); + rio_release_dma(md->dma_chan); + md->dma_chan = NULL; + } +#endif +} + + +/* + * mport_cdev_kill_fasync() - Send SIGIO signal to all processes with open + * mport_cdev files. + */ +static int mport_cdev_kill_fasync(struct mport_dev *md) +{ + unsigned int files = 0; + struct mport_cdev_priv *client; + + mutex_lock(&md->file_mutex); + list_for_each_entry(client, &md->file_list, list) { + if (client->async_queue) + kill_fasync(&client->async_queue, SIGIO, POLL_HUP); + files++; + } + mutex_unlock(&md->file_mutex); + return files; +} + +/* + * mport_cdev_remove() - Remove mport character device + * @dev: Mport device to remove + */ +static void mport_cdev_remove(struct mport_dev *md) +{ + struct rio_mport_mapping *map, *_map; + + rmcd_debug(EXIT, "Remove %s cdev", md->mport->name); + atomic_set(&md->active, 0); + mport_cdev_terminate_dma(md); + rio_del_mport_pw_handler(md->mport, md, rio_mport_pw_handler); + cdev_del(&(md->cdev)); + mport_cdev_kill_fasync(md); + + flush_workqueue(dma_wq); + + /* TODO: do we need to give clients some time to close file + * descriptors? Simple wait for XX, or kref? + */ + + /* + * Release DMA buffers allocated for the mport device. + * Disable associated inbound Rapidio requests mapping if applicable. 
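+	 * Entries that survive this pass are still referenced elsewhere;
+	 * the check below warns about them.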
+ */ + mutex_lock(&md->buf_mutex); + list_for_each_entry_safe(map, _map, &md->mappings, node) { + kref_put(&map->ref, mport_release_mapping); + } + mutex_unlock(&md->buf_mutex); + + if (!list_empty(&md->mappings)) + rmcd_warn("WARNING: %s pending mappings on removal", + md->mport->name); + + rio_release_inb_dbell(md->mport, 0, 0x0fff); + + device_unregister(&md->dev); + put_device(&md->dev); +} + +/* + * RIO rio_mport_interface driver + */ + +/* + * mport_add_mport() - Add rio_mport from LDM device struct + * @dev: Linux device model struct + * @class_intf: Linux class_interface + */ +static int mport_add_mport(struct device *dev, + struct class_interface *class_intf) +{ + struct rio_mport *mport = NULL; + struct mport_dev *chdev = NULL; + + mport = to_rio_mport(dev); + if (!mport) + return -ENODEV; + + chdev = mport_cdev_add(mport); + if (!chdev) + return -ENODEV; + + return 0; +} + +/* + * mport_remove_mport() - Remove rio_mport from global list + * TODO remove device from global mport_dev list + */ +static void mport_remove_mport(struct device *dev, + struct class_interface *class_intf) +{ + struct rio_mport *mport = NULL; + struct mport_dev *chdev; + int found = 0; + + mport = to_rio_mport(dev); + rmcd_debug(EXIT, "Remove %s", mport->name); + + mutex_lock(&mport_devs_lock); + list_for_each_entry(chdev, &mport_devs, node) { + if (chdev->mport->id == mport->id) { + atomic_set(&chdev->active, 0); + list_del(&chdev->node); + found = 1; + break; + } + } + mutex_unlock(&mport_devs_lock); + + if (found) + mport_cdev_remove(chdev); +} + +/* the rio_mport_interface is used to handle local mport devices */ +static struct class_interface rio_mport_interface __refdata = { + .class = &rio_mport_class, + .add_dev = mport_add_mport, + .remove_dev = mport_remove_mport, +}; + +/* + * Linux kernel module + */ + +/* + * mport_init - Driver module loading + */ +static int __init mport_init(void) +{ + int ret; + + /* Create device class needed by udev */ + dev_class = class_create(THIS_MODULE, DRV_NAME); + if (!dev_class) { + rmcd_error("Unable to create " DRV_NAME " class"); + return -EINVAL; + } + + ret = alloc_chrdev_region(&dev_number, 0, RIO_MAX_MPORTS, DRV_NAME); + if (ret < 0) + goto err_chr; + + rmcd_debug(INIT, "Registered class with major=%d", MAJOR(dev_number)); + + /* Register to rio_mport_interface */ + ret = class_interface_register(&rio_mport_interface); + if (ret) { + rmcd_error("class_interface_register() failed, err=%d", ret); + goto err_cli; + } + + dma_wq = create_singlethread_workqueue("dma_wq"); + if (!dma_wq) { + rmcd_error("failed to create DMA work queue"); + ret = -ENOMEM; + goto err_wq; + } + + return 0; + +err_wq: + class_interface_unregister(&rio_mport_interface); +err_cli: + unregister_chrdev_region(dev_number, RIO_MAX_MPORTS); +err_chr: + class_destroy(dev_class); + return ret; +} + +/** + * mport_exit - Driver module unloading + */ +static void __exit mport_exit(void) +{ + class_interface_unregister(&rio_mport_interface); + class_destroy(dev_class); + unregister_chrdev_region(dev_number, RIO_MAX_MPORTS); + destroy_workqueue(dma_wq); +} + +module_init(mport_init); +module_exit(mport_exit); diff --git a/include/linux/rio_mport_cdev.h b/include/linux/rio_mport_cdev.h new file mode 100644 index 000000000000..b65d19df76d2 --- /dev/null +++ b/include/linux/rio_mport_cdev.h @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2015-2016, Integrated Device Technology Inc. 
+ * Copyright (c) 2015, Prodrive Technologies + * Copyright (c) 2015, Texas Instruments Incorporated + * Copyright (c) 2015, RapidIO Trade Association + * All rights reserved. + * + * This software is available to you under a choice of one of two licenses. + * You may choose to be licensed under the terms of the GNU General Public + * License(GPL) Version 2, or the BSD-3 Clause license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _RIO_MPORT_CDEV_H_ +#define _RIO_MPORT_CDEV_H_ + +#ifndef __user +#define __user +#endif + +struct rio_mport_maint_io { + uint32_t rioid; /* destID of remote device */ + uint32_t hopcount; /* hopcount to remote device */ + uint32_t offset; /* offset in register space */ + size_t length; /* length in bytes */ + void __user *buffer; /* data buffer */ +}; + +/* + * Definitions for RapidIO data transfers: + * - memory mapped (MAPPED) + * - packet generation from memory (TRANSFER) + */ +#define RIO_TRANSFER_MODE_MAPPED (1 << 0) +#define RIO_TRANSFER_MODE_TRANSFER (1 << 1) +#define RIO_CAP_DBL_SEND (1 << 2) +#define RIO_CAP_DBL_RECV (1 << 3) +#define RIO_CAP_PW_SEND (1 << 4) +#define RIO_CAP_PW_RECV (1 << 5) +#define RIO_CAP_MAP_OUTB (1 << 6) +#define RIO_CAP_MAP_INB (1 << 7) + +struct rio_mport_properties { + uint16_t hdid; + uint8_t id; /* Physical port ID */ + uint8_t index; + uint32_t flags; + uint32_t sys_size; /* Default addressing size */ + uint8_t port_ok; + uint8_t link_speed; + uint8_t link_width; + uint32_t dma_max_sge; + uint32_t dma_max_size; + uint32_t dma_align; + uint32_t transfer_mode; /* Default transfer mode */ + uint32_t cap_sys_size; /* Capable system sizes */ + uint32_t cap_addr_size; /* Capable addressing sizes */ + uint32_t cap_transfer_mode; /* Capable transfer modes */ + uint32_t cap_mport; /* Mport capabilities */ +}; + +/* + * Definitions for RapidIO events; + * - incoming port-writes + * - incoming doorbells + */ +#define RIO_DOORBELL (1 << 0) +#define RIO_PORTWRITE (1 << 1) + +struct rio_doorbell { + uint32_t rioid; + uint16_t payload; +}; + +struct rio_doorbell_filter { + uint32_t rioid; /* 0xffffffff to match all ids */ + uint16_t low; + uint16_t high; +}; + + +struct rio_portwrite { + uint32_t payload[16]; +}; + +struct rio_pw_filter { + uint32_t mask; + uint32_t low; + uint32_t high; +}; + +/* RapidIO base address for inbound requests set to value defined below + * indicates that no specific RIO-to-local address translation is requested + * and driver should use direct (one-to-one) address mapping. +*/ +#define RIO_MAP_ANY_ADDR (uint64_t)(~((uint64_t) 0)) + +struct rio_mmap { + uint32_t rioid; + uint64_t rio_addr; + uint64_t length; + uint64_t handle; + void *address; +}; + +struct rio_dma_mem { + uint64_t length; /* length of DMA memory */ + uint64_t dma_handle; /* handle associated with this memory */ + void *buffer; /* pointer to this memory */ +}; + + +struct rio_event { + unsigned int header; /* event type RIO_DOORBELL or RIO_PORTWRITE */ + union { + struct rio_doorbell doorbell; /* header for RIO_DOORBELL */ + struct rio_portwrite portwrite; /* header for RIO_PORTWRITE */ + } u; +}; + +enum rio_transfer_sync { + RIO_TRANSFER_SYNC, /* synchronous transfer */ + RIO_TRANSFER_ASYNC, /* asynchronous transfer */ + RIO_TRANSFER_FAF, /* fire-and-forget transfer */ +}; + +enum rio_transfer_dir { + RIO_TRANSFER_DIR_READ, /* Read operation */ + RIO_TRANSFER_DIR_WRITE, /* Write operation */ +}; + +/* + * RapidIO data exchange transactions are lists of individual transfers. Each + * transfer exchanges data between two RapidIO devices by remote direct memory + * access and has its own completion code. + * + * The RapidIO specification defines four types of data exchange requests: + * NREAD, NWRITE, SWRITE and NWRITE_R. The RapidIO DMA channel interface allows + * to specify the required type of write operation or combination of them when + * only the last data packet requires response. 
+ * + * NREAD: read up to 256 bytes from remote device memory into local memory + * NWRITE: write up to 256 bytes from local memory to remote device memory + * without confirmation + * SWRITE: as NWRITE, but all addresses and payloads must be 64-bit aligned + * NWRITE_R: as NWRITE, but expect acknowledgment from remote device. + * + * The default exchange is chosen from NREAD and any of the WRITE modes as the + * driver sees fit. For write requests the user can explicitly choose between + * any of the write modes for each transaction. + */ +enum rio_exchange { + RIO_EXCHANGE_DEFAULT, /* Default method */ + RIO_EXCHANGE_NWRITE, /* All packets using NWRITE */ + RIO_EXCHANGE_SWRITE, /* All packets using SWRITE */ + RIO_EXCHANGE_NWRITE_R, /* Last packet NWRITE_R, others NWRITE */ + RIO_EXCHANGE_SWRITE_R, /* Last packet NWRITE_R, others SWRITE */ + RIO_EXCHANGE_NWRITE_R_ALL, /* All packets using NWRITE_R */ +}; + +struct rio_transfer_io { + uint32_t rioid; /* Target destID */ + uint64_t rio_addr; /* Address in target's RIO mem space */ + enum rio_exchange method; /* Data exchange method */ + void __user *loc_addr; + uint64_t handle; + uint64_t offset; /* Offset in buffer */ + uint64_t length; /* Length in bytes */ + uint32_t completion_code; /* Completion code for this transfer */ +}; + +struct rio_transaction { + uint32_t transfer_mode; /* Data transfer mode */ + enum rio_transfer_sync sync; /* Synchronization method */ + enum rio_transfer_dir dir; /* Transfer direction */ + size_t count; /* Number of transfers */ + struct rio_transfer_io __user *block; /* Array of transfers */ +}; + +struct rio_async_tx_wait { + uint32_t token; /* DMA transaction ID token */ + uint32_t timeout; /* Wait timeout in msec, if 0 use default TO */ +}; + +#define RIO_MAX_DEVNAME_SZ 20 + +struct rio_rdev_info { + uint32_t destid; + uint8_t hopcount; + uint32_t comptag; + char name[RIO_MAX_DEVNAME_SZ + 1]; +}; + +/* Driver IOCTL codes */ +#define RIO_MPORT_DRV_MAGIC 'm' + +#define RIO_MPORT_MAINT_HDID_SET \ + _IOW(RIO_MPORT_DRV_MAGIC, 1, uint16_t) +#define RIO_MPORT_MAINT_COMPTAG_SET \ + _IOW(RIO_MPORT_DRV_MAGIC, 2, uint32_t) +#define RIO_MPORT_MAINT_PORT_IDX_GET \ + _IOR(RIO_MPORT_DRV_MAGIC, 3, uint32_t) +#define RIO_MPORT_GET_PROPERTIES \ + _IOR(RIO_MPORT_DRV_MAGIC, 4, struct rio_mport_properties) +#define RIO_MPORT_MAINT_READ_LOCAL \ + _IOR(RIO_MPORT_DRV_MAGIC, 5, struct rio_mport_maint_io) +#define RIO_MPORT_MAINT_WRITE_LOCAL \ + _IOW(RIO_MPORT_DRV_MAGIC, 6, struct rio_mport_maint_io) +#define RIO_MPORT_MAINT_READ_REMOTE \ + _IOR(RIO_MPORT_DRV_MAGIC, 7, struct rio_mport_maint_io) +#define RIO_MPORT_MAINT_WRITE_REMOTE \ + _IOW(RIO_MPORT_DRV_MAGIC, 8, struct rio_mport_maint_io) +#define RIO_ENABLE_DOORBELL_RANGE \ + _IOW(RIO_MPORT_DRV_MAGIC, 9, struct rio_doorbell_filter) +#define RIO_DISABLE_DOORBELL_RANGE \ + _IOW(RIO_MPORT_DRV_MAGIC, 10, struct rio_doorbell_filter) +#define RIO_ENABLE_PORTWRITE_RANGE \ + _IOW(RIO_MPORT_DRV_MAGIC, 11, struct rio_pw_filter) +#define RIO_DISABLE_PORTWRITE_RANGE \ + _IOW(RIO_MPORT_DRV_MAGIC, 12, struct rio_pw_filter) +#define RIO_SET_EVENT_MASK \ + _IOW(RIO_MPORT_DRV_MAGIC, 13, unsigned int) +#define RIO_GET_EVENT_MASK \ + _IOR(RIO_MPORT_DRV_MAGIC, 14, unsigned int) +#define RIO_MAP_OUTBOUND \ + _IOWR(RIO_MPORT_DRV_MAGIC, 15, struct rio_mmap) +#define RIO_UNMAP_OUTBOUND \ + _IOW(RIO_MPORT_DRV_MAGIC, 16, struct rio_mmap) +#define RIO_MAP_INBOUND \ + _IOWR(RIO_MPORT_DRV_MAGIC, 17, struct rio_mmap) +#define RIO_UNMAP_INBOUND \ + _IOW(RIO_MPORT_DRV_MAGIC, 18, uint64_t) +#define 
RIO_ALLOC_DMA \
+	_IOWR(RIO_MPORT_DRV_MAGIC, 19, struct rio_dma_mem)
+#define RIO_FREE_DMA \
+	_IOW(RIO_MPORT_DRV_MAGIC, 20, uint64_t)
+#define RIO_TRANSFER \
+	_IOWR(RIO_MPORT_DRV_MAGIC, 21, struct rio_transaction)
+#define RIO_WAIT_FOR_ASYNC \
+	_IOW(RIO_MPORT_DRV_MAGIC, 22, struct rio_async_tx_wait)
+#define RIO_DEV_ADD \
+	_IOW(RIO_MPORT_DRV_MAGIC, 23, struct rio_rdev_info)
+#define RIO_DEV_DEL \
+	_IOW(RIO_MPORT_DRV_MAGIC, 24, struct rio_rdev_info)
+
+#endif /* _RIO_MPORT_CDEV_H_ */

diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 0495884defc1..b71fd0b5cbad 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -354,6 +354,7 @@ header-y += reiserfs_fs.h
 header-y += reiserfs_xattr.h
 header-y += resource.h
 header-y += rfkill.h
+header-y += rio_mport_cdev.h
 header-y += romfs_fs.h
 header-y += rose.h
 header-y += route.h

From 0335695dfa4df01edff5bb102b9a82a0668ee51e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Tue, 22 Mar 2016 14:27:11 -0700
Subject: [PATCH 58/81] cred/userns: define current_user_ns() as a function

The current_user_ns() macro currently returns &init_user_ns when user
namespaces are disabled, and that causes several warnings when building
with gcc-6.0 in code that compares the result of the macro to
&init_user_ns itself:

fs/xfs/xfs_ioctl.c: In function 'xfs_ioctl_setattr_check_projid':
fs/xfs/xfs_ioctl.c:1249:22: error: self-comparison always evaluates to true [-Werror=tautological-compare]
  if (current_user_ns() == &init_user_ns)

This is a legitimate warning in principle, but here it isn't really
helpful, so I'm rephrasing the definition in a way that shuts up the
warning. Apparently gcc only warns when comparing identical literals:
it can still figure out that the result of an inline function is
identical to a constant expression in order to optimize a condition,
yet it does not warn about the fact that the condition is known at
compile time. This is exactly what we want here, and it looks
reasonable because we generally prefer inline functions over macros
anyway.

Signed-off-by: Arnd Bergmann
Acked-by: Serge Hallyn
Cc: David Howells
Cc: Yaowei Bai
Cc: James Morris
Cc: "Paul E. McKenney"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/capability.h | 2 --
 include/linux/cred.h       | 5 ++++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index f314275d4e3f..00690ff92edf 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -40,8 +40,6 @@ struct inode;
 struct dentry;
 struct user_namespace;
 
-struct user_namespace *current_user_ns(void);
-
 extern const kernel_cap_t __cap_empty_set;
 extern const kernel_cap_t __cap_init_eff_set;

diff --git a/include/linux/cred.h b/include/linux/cred.h
index 8d70e1361ecd..257db64562e5 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -377,7 +377,10 @@ extern struct user_namespace init_user_ns;
 #ifdef CONFIG_USER_NS
 #define current_user_ns()	(current_cred_xxx(user_ns))
 #else
-#define current_user_ns()	(&init_user_ns)
+static inline struct user_namespace *current_user_ns(void)
+{
+	return &init_user_ns;
+}
 #endif

From a484c3dd9426e1f450c7ed8ad5a2002ea1b309ea Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Tue, 22 Mar 2016 14:27:14 -0700
Subject: [PATCH 59/81] eventfd: document lockless access in eventfd_poll

Since commit e22553e2a25e ("eventfd: don't take the spinlock in
eventfd_poll", 2015-02-17), eventfd is reading ctx->count outside
ctx->wqh.lock.
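For reference, this is roughly what the relevant part of eventfd_poll()
looked like before this change (a sketch reconstructed from the removed
lines in the diff below):

	static unsigned int eventfd_poll(struct file *file, poll_table *wait)
	{
		struct eventfd_ctx *ctx = file->private_data;
		...
		poll_wait(file, &ctx->wqh, wait);
		smp_rmb();		/* unpaired and uncommented */
		count = ctx->count;	/* read outside ctx->wqh.lock */
		...
	}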
However, things aren't as simple as the read barrier in eventfd_poll would suggest. In fact, the read barrier, besides lacking a comment, is not paired in any obvious manner with another read barrier, and it is pointless because it is sitting between a write (deep in poll_wait) and the read of ctx->count. The read barrier is acting just as a compiler barrier, for which we can use READ_ONCE instead. This is what the code change in this patch does. The documentation change is just as important, however. The question, posed by Andrea Arcangeli, is then why the thing is safe on architectures where spin_unlock does not imply a store-load memory barrier. The answer is that it's safe because writes of ctx->count use the same lock as poll_wait, and hence an acquire barrier implicit in poll_wait provides the necessary synchronization between eventfd_poll and callers of wake_up_locked_poll. This is sort of mentioned in the commit message with respect to eventfd_ctx_read ("eventfd_read is similar, it will do a single decrement with the lock held") but it applies to all other callers too. It's tricky enough that it should be documented in the code. Signed-off-by: Paolo Bonzini Reviewed-by: Andrea Arcangeli Cc: Chris Mason Cc: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventfd.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index ed70cf9fdc7b..1231cd1999d8 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -121,8 +121,46 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait) u64 count; poll_wait(file, &ctx->wqh, wait); - smp_rmb(); - count = ctx->count; + + /* + * All writes to ctx->count occur within ctx->wqh.lock. This read + * can be done outside ctx->wqh.lock because we know that poll_wait + * takes that lock (through add_wait_queue) if our caller will sleep. + * + * The read _can_ therefore seep into add_wait_queue's critical + * section, but cannot move above it! add_wait_queue's spin_lock acts + * as an acquire barrier and ensures that the read be ordered properly + * against the writes. The following CAN happen and is safe: + * + * poll write + * ----------------- ------------ + * lock ctx->wqh.lock (in poll_wait) + * count = ctx->count + * __add_wait_queue + * unlock ctx->wqh.lock + * lock ctx->qwh.lock + * ctx->count += n + * if (waitqueue_active) + * wake_up_locked_poll + * unlock ctx->qwh.lock + * eventfd_poll returns 0 + * + * but the following, which would miss a wakeup, cannot happen: + * + * poll write + * ----------------- ------------ + * count = ctx->count (INVALID!) + * lock ctx->qwh.lock + * ctx->count += n + * **waitqueue_active is false** + * **no wake_up_locked_poll!** + * unlock ctx->qwh.lock + * lock ctx->wqh.lock (in poll_wait) + * __add_wait_queue + * unlock ctx->wqh.lock + * eventfd_poll returns 0 + */ + count = READ_ONCE(ctx->count); if (count > 0) events |= POLLIN; From ebc41f20d77f6ad91f1f2d2af5147dc9bb6b5eea Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Tue, 22 Mar 2016 14:27:17 -0700 Subject: [PATCH 60/81] panic: change nmi_panic from macro to function Commit 1717f2096b54 ("panic, x86: Fix re-entrance problem due to panic on NMI") and commit 58c5661f2144 ("panic, x86: Allow CPUs to save registers even if looping in NMI context") introduced nmi_panic() which prevents concurrent/recursive execution of panic(). It also saves registers for the crash dump on x86. 
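For context, the re-entrance guard that nmi_panic() provides works as
follows (a sketch; the actual code is in the diff below):

	int cpu = raw_smp_processor_id();
	int old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);

	if (old_cpu == PANIC_CPU_INVALID)
		panic("%s", msg);		/* this CPU panics first */
	else if (old_cpu != cpu)
		nmi_panic_self_stop(regs);	/* another CPU is already in panic() */
	/* else: this CPU is already panicking, simply return */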
However, there are some cases where NMI handlers still use panic().
This patch set partially replaces them with nmi_panic() in those cases.

Even after this patch set is applied, some NMI or similar handlers
(e.g. the MCE handler) continue to use panic(). This is because I can't
test them well and actual problems are unlikely to happen; for example,
the possibility that a normal panic and a panic on MCE happen
simultaneously is very low.

This patch (of 3):

Convert nmi_panic() to a proper function and export it instead of
exporting internal implementation details to modules, for obvious
reasons.

Signed-off-by: Hidehiro Kawai
Acked-by: Borislav Petkov
Acked-by: Michal Nazarewicz
Cc: Michal Hocko
Cc: Rasmus Villemoes
Cc: Nicolas Iooss
Cc: Javi Merino
Cc: Gobinda Charan Maji
Cc: "Steven Rostedt (Red Hat)"
Cc: Thomas Gleixner
Cc: Vitaly Kuznetsov
Cc: HATAYAMA Daisuke
Cc: Tejun Heo
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/kernel.h | 21 +--------------------
 kernel/panic.c         | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index b82646ee70eb..a13c52ccd8ac 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -255,7 +255,7 @@ extern long (*panic_blink)(int state);
 __printf(1, 2)
 void panic(const char *fmt, ...)
 	__noreturn __cold;
-void nmi_panic_self_stop(struct pt_regs *);
+void nmi_panic(struct pt_regs *regs, const char *msg);
 extern void oops_enter(void);
 extern void oops_exit(void);
 void print_oops_end_marker(void);
@@ -456,25 +456,6 @@ extern bool crash_kexec_post_notifiers;
 extern atomic_t panic_cpu;
 #define PANIC_CPU_INVALID	-1
 
-/*
- * A variant of panic() called from NMI context. We return if we've already
- * panicked on this CPU. If another CPU already panicked, loop in
- * nmi_panic_self_stop() which can provide architecture dependent code such
- * as saving register state for crash dump.
- */
-#define nmi_panic(regs, fmt, ...)					\
-do {									\
-	int old_cpu, cpu;						\
-									\
-	cpu = raw_smp_processor_id();					\
-	old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);	\
-									\
-	if (old_cpu == PANIC_CPU_INVALID)				\
-		panic(fmt, ##__VA_ARGS__);				\
-	else if (old_cpu != cpu)					\
-		nmi_panic_self_stop(regs);				\
-} while (0)
-
 /*
  * Only to be used by arch init code. If the user over-wrote the default
  * CONFIG_PANIC_TIMEOUT, honor it.

diff --git a/kernel/panic.c b/kernel/panic.c
index fa400852bf6c..535c96510a44 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -73,6 +73,26 @@ void __weak nmi_panic_self_stop(struct pt_regs *regs)
 
 atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
 
+/*
+ * A variant of panic() called from NMI context. We return if we've already
+ * panicked on this CPU. If another CPU already panicked, loop in
+ * nmi_panic_self_stop() which can provide architecture dependent code such
+ * as saving register state for crash dump.
+ */ +void nmi_panic(struct pt_regs *regs, const char *msg) +{ + int old_cpu, cpu; + + cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu); + + if (old_cpu == PANIC_CPU_INVALID) + panic("%s", msg); + else if (old_cpu != cpu) + nmi_panic_self_stop(regs); +} +EXPORT_SYMBOL(nmi_panic); + /** * panic - halt the system * @fmt: The text string to print From 73cbf4a1ddfab247f9018550637c28a7d8dd9108 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Tue, 22 Mar 2016 14:27:21 -0700 Subject: [PATCH 61/81] ipmi/watchdog: use nmi_panic() when kernel panics in NMI handler Commit 1717f2096b54 ("panic, x86: Fix re-entrance problem due to panic on NMI") introduced nmi_panic() which prevents concurrent and recursive execution of panic(). It also saves registers for the crash dump on x86 by later commit 58c5661f2144 ("panic, x86: Allow CPUs to save registers even if looping in NMI context"). ipmi_watchdog driver can call panic() from NMI handler, so replace it with nmi_panic(). Signed-off-by: Hidehiro Kawai Acked-by: Corey Minyard Acked-by: Guenter Roeck Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/ipmi/ipmi_watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c index 096f0cef4da1..4facc7517a6a 100644 --- a/drivers/char/ipmi/ipmi_watchdog.c +++ b/drivers/char/ipmi/ipmi_watchdog.c @@ -1140,7 +1140,7 @@ ipmi_nmi(unsigned int val, struct pt_regs *regs) the timer. So do so. */ pretimeout_since_last_heartbeat = 1; if (atomic_inc_and_test(&preop_panic_excl)) - panic(PFX "pre-timeout"); + nmi_panic(regs, PFX "pre-timeout"); } return NMI_HANDLED; From abc514c58059ca6f02df41798e828ffd864f1d21 Mon Sep 17 00:00:00 2001 From: Hidehiro Kawai Date: Tue, 22 Mar 2016 14:27:24 -0700 Subject: [PATCH 62/81] hpwdt: use nmi_panic() when kernel panics in NMI handler Commit 1717f2096b54 ("panic, x86: Fix re-entrance problem due to panic on NMI") introduced nmi_panic() which prevents concurrent and recursive execution of panic(). It also saves registers for the crash dump on x86 by later commit 58c5661f2144 ("panic, x86: Allow CPUs to save registers even if looping in NMI context"). hpwdt driver can call panic() from NMI handler, so replace it with nmi_panic(). Also, do some cleanups. Signed-off-by: Hidehiro Kawai Acked-by: Guenter Roeck Cc: Thomas Mingarelli Cc: Wim Van Sebroeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/watchdog/hpwdt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 8fc284cdce4e..8f89bd8a826a 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -484,7 +484,7 @@ static int hpwdt_pretimeout(unsigned int ulReason, struct pt_regs *regs) static int die_nmi_called; if (!hpwdt_nmi_decoding) - goto out; + return NMI_DONE; spin_lock_irqsave(&rom_lock, rom_pl); if (!die_nmi_called && !is_icru && !is_uefi) @@ -497,11 +497,11 @@ static int hpwdt_pretimeout(unsigned int ulReason, struct pt_regs *regs) if (!is_icru && !is_uefi) { if (cmn_regs.u1.ral == 0) { - panic("An NMI occurred, " - "but unable to determine source.\n"); + nmi_panic(regs, "An NMI occurred, but unable to determine source.\n"); + return NMI_HANDLED; } } - panic("An NMI occurred. Depending on your system the reason " + nmi_panic(regs, "An NMI occurred. 
Depending on your system the reason "
			"for the NMI is logged in any one of the following "
			"resources:\n"
			"1. Integrated Management Log (IML)\n"
@@ -509,8 +509,7 @@ static int hpwdt_pretimeout(unsigned int ulReason, struct pt_regs *regs)
			"3. OA Forward Progress Log\n"
			"4. iLO Event Log");
 
-out:
-	return NMI_DONE;
+	return NMI_HANDLED;
 }
 #endif /* CONFIG_HPWDT_NMI_DECODING */

From ade356b99a4187578609f2a91c4d2ed88e4e70dc Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Tue, 22 Mar 2016 14:27:26 -0700
Subject: [PATCH 63/81] profile: hide unused functions when !CONFIG_PROC_FS

A couple of functions and variables in the profile implementation are
used only on SMP systems by the procfs code, but are unused if either
procfs is disabled or in uniprocessor kernels. gcc prints a harmless
warning about the unused symbols:

kernel/profile.c:243:13: error: 'profile_flip_buffers' defined but not used [-Werror=unused-function]
 static void profile_flip_buffers(void)
             ^
kernel/profile.c:266:13: error: 'profile_discard_flip_buffers' defined but not used [-Werror=unused-function]
 static void profile_discard_flip_buffers(void)
             ^
kernel/profile.c:330:12: error: 'profile_cpu_callback' defined but not used [-Werror=unused-function]
 static int profile_cpu_callback(struct notifier_block *info,
            ^

This adds further #ifdefs to the file, to annotate exactly in which
cases they are used. I have built several thousand ARM randconfig
kernels with this patch applied and no longer get any warnings in this
file.

Signed-off-by: Arnd Bergmann
Cc: Vlastimil Babka
Cc: Robin Holt
Cc: Johannes Weiner
Cc: Christoph Lameter
Cc: Tejun Heo
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/profile.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/profile.c b/kernel/profile.c
index 51369697466e..c2199e9901c9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -44,7 +44,7 @@ int prof_on __read_mostly;
 EXPORT_SYMBOL_GPL(prof_on);
 
 static cpumask_var_t prof_cpu_mask;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
 static DEFINE_PER_CPU(int, cpu_profile_flip);
 static DEFINE_MUTEX(profile_flip_mutex);
@@ -202,7 +202,7 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)
 }
 EXPORT_SYMBOL_GPL(profile_event_unregister);
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
 /*
  * Each cpu has a pair of open-addressed hashtables for pending
  * profile hits. read_profile() IPI's all cpus to request them

From 5c9a8750a6409c63a0f01d51a9024861022f6593 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov
Date: Tue, 22 Mar 2016 14:27:30 -0700
Subject: [PATCH 64/81] kernel: add kcov code coverage

kcov provides code coverage collection for coverage-guided fuzzing
(randomized testing). Coverage-guided fuzzing is a testing technique
that uses coverage feedback to determine new interesting inputs to a
system. A notable user-space example is AFL
(http://lcamtuf.coredump.cx/afl/). However, this technique is not
widely used for kernel testing due to missing compiler and kernel
support.

kcov does not aim to collect as much coverage as possible. It aims to
collect more or less stable coverage that is a function of syscall
inputs. To achieve this goal it does not collect coverage in soft/hard
interrupts, and instrumentation of some inherently non-deterministic or
non-interesting parts of the kernel is disabled (e.g. scheduler,
locking).
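For illustration, the compiler side of this (enabled further below with
-fsanitize-coverage=trace-pc) inserts a callback at the start of every
basic block, roughly like this (a sketch of the transformation, not
actual gcc output):

	/* what gcc conceptually emits for an instrumented function */
	void foo(int x)
	{
		__sanitizer_cov_trace_pc();		/* function entry block */
		if (x) {
			__sanitizer_cov_trace_pc();	/* taken-branch block */
			bar();
		}
	}

The kernel-side callback, implemented in kernel/kcov.c below, records
_RET_IP_ into the per-task coverage buffer.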
Currently there is a single coverage collection mode (tracing), but the
API anticipates additional collection modes. Initially I also
implemented a second mode which exposes coverage in a fixed-size hash
table of counters (what Quentin used in his original patch). I've
dropped the second mode for simplicity.

This patch adds the necessary support on the kernel side. The
complementary compiler support was added in gcc revision 231296.

We've used this support to build the syzkaller system call fuzzer,
which has found 90 kernel bugs in just 2 months:
https://github.com/google/syzkaller/wiki/Found-Bugs
We've also found 30+ bugs in our internal systems with syzkaller.
Another (yet unexplored) direction where kcov coverage would greatly
help is more traditional "blob mutation", for example mounting a random
blob as a filesystem, or receiving a random blob over the wire.

Why not gcov? A typical fuzzing loop looks as follows: (1) reset
coverage, (2) execute a bit of code, (3) collect coverage, repeat.
Typical coverage can be just a dozen basic blocks (e.g. an invalid
input). In such a context gcov becomes prohibitively expensive, because
the reset/collect coverage steps depend on the total number of basic
blocks/edges in the program (for the kernel it is about 2M), while the
cost of kcov depends only on the number of executed basic blocks/edges.
On top of that, the kernel requires per-thread coverage, because there
are always background threads and unrelated processes that also produce
coverage. With inlined gcov instrumentation, per-thread coverage is not
possible.

kcov exposes kernel PCs and control flow to user space, which is
insecure; but debugfs should not be mapped as user-accessible anyway.

Based on a patch by Quentin Casasnovas.

[akpm@linux-foundation.org: make task_struct.kcov_mode have type `enum kcov_mode']
[akpm@linux-foundation.org: unbreak allmodconfig]
[akpm@linux-foundation.org: follow x86 Makefile layout standards]

Signed-off-by: Dmitry Vyukov
Reviewed-by: Kees Cook
Cc: syzkaller
Cc: Vegard Nossum
Cc: Catalin Marinas
Cc: Tavis Ormandy
Cc: Will Deacon
Cc: Quentin Casasnovas
Cc: Kostya Serebryany
Cc: Eric Dumazet
Cc: Alexander Potapenko
Cc: Kees Cook
Cc: Bjorn Helgaas
Cc: Sasha Levin
Cc: David Drysdale
Cc: Ard Biesheuvel
Cc: Andrey Ryabinin
Cc: Kirill A. Shutemov
Cc: Jiri Slaby
Cc: Ingo Molnar
Cc: Thomas Gleixner
Cc: "H.
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kcov.txt | 111 +++++++++++ Makefile | 11 +- arch/x86/Kconfig | 1 + arch/x86/boot/Makefile | 7 + arch/x86/boot/compressed/Makefile | 3 + arch/x86/entry/vdso/Makefile | 3 + arch/x86/kernel/Makefile | 6 + arch/x86/kernel/apic/Makefile | 4 + arch/x86/kernel/cpu/Makefile | 4 + arch/x86/lib/Makefile | 3 + arch/x86/mm/Makefile | 3 + arch/x86/realmode/rm/Makefile | 3 + drivers/firmware/efi/libstub/Makefile | 3 + include/linux/kcov.h | 29 +++ include/linux/sched.h | 11 ++ include/uapi/linux/kcov.h | 10 + kernel/Makefile | 12 ++ kernel/exit.c | 2 + kernel/fork.c | 3 + kernel/kcov.c | 273 ++++++++++++++++++++++++++ kernel/locking/Makefile | 3 + kernel/rcu/Makefile | 4 + kernel/sched/Makefile | 4 + lib/Kconfig.debug | 21 ++ lib/Makefile | 12 ++ mm/Makefile | 15 ++ mm/kasan/Makefile | 1 + scripts/Makefile.lib | 6 + 28 files changed, 567 insertions(+), 1 deletion(-) create mode 100644 Documentation/kcov.txt create mode 100644 include/linux/kcov.h create mode 100644 include/uapi/linux/kcov.h create mode 100644 kernel/kcov.c diff --git a/Documentation/kcov.txt b/Documentation/kcov.txt new file mode 100644 index 000000000000..779ff4ab1c1d --- /dev/null +++ b/Documentation/kcov.txt @@ -0,0 +1,111 @@ +kcov: code coverage for fuzzing +=============================== + +kcov exposes kernel code coverage information in a form suitable for coverage- +guided fuzzing (randomized testing). Coverage data of a running kernel is +exported via the "kcov" debugfs file. Coverage collection is enabled on a task +basis, and thus it can capture precise coverage of a single system call. + +Note that kcov does not aim to collect as much coverage as possible. It aims +to collect more or less stable coverage that is function of syscall inputs. +To achieve this goal it does not collect coverage in soft/hard interrupts +and instrumentation of some inherently non-deterministic parts of kernel is +disbled (e.g. scheduler, locking). + +Usage: +====== + +Configure kernel with: + + CONFIG_KCOV=y + +CONFIG_KCOV requires gcc built on revision 231296 or later. +Profiling data will only become accessible once debugfs has been mounted: + + mount -t debugfs none /sys/kernel/debug + +The following program demonstrates kcov usage from within a test program: + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) +#define KCOV_ENABLE _IO('c', 100) +#define KCOV_DISABLE _IO('c', 101) +#define COVER_SIZE (64<<10) + +int main(int argc, char **argv) +{ + int fd; + unsigned long *cover, n, i; + + /* A single fd descriptor allows coverage collection on a single + * thread. + */ + fd = open("/sys/kernel/debug/kcov", O_RDWR); + if (fd == -1) + perror("open"), exit(1); + /* Setup trace mode and trace size. */ + if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE)) + perror("ioctl"), exit(1); + /* Mmap buffer shared between kernel- and user-space. */ + cover = (unsigned long*)mmap(NULL, COVER_SIZE * sizeof(unsigned long), + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if ((void*)cover == MAP_FAILED) + perror("mmap"), exit(1); + /* Enable coverage collection on the current thread. */ + if (ioctl(fd, KCOV_ENABLE, 0)) + perror("ioctl"), exit(1); + /* Reset coverage from the tail of the ioctl() call. */ + __atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED); + /* That's the target syscal call. */ + read(-1, NULL, 0); + /* Read number of PCs collected. 
*/ + n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED); + for (i = 0; i < n; i++) + printf("0x%lx\n", cover[i + 1]); + /* Disable coverage collection for the current thread. After this call + * coverage can be enabled for a different thread. + */ + if (ioctl(fd, KCOV_DISABLE, 0)) + perror("ioctl"), exit(1); + /* Free resources. */ + if (munmap(cover, COVER_SIZE * sizeof(unsigned long))) + perror("munmap"), exit(1); + if (close(fd)) + perror("close"), exit(1); + return 0; +} + +After piping through addr2line output of the program looks as follows: + +SyS_read +fs/read_write.c:562 +__fdget_pos +fs/file.c:774 +__fget_light +fs/file.c:746 +__fget_light +fs/file.c:750 +__fget_light +fs/file.c:760 +__fdget_pos +fs/file.c:784 +SyS_read +fs/read_write.c:562 + +If a program needs to collect coverage from several threads (independently), +it needs to open /sys/kernel/debug/kcov in each thread separately. + +The interface is fine-grained to allow efficient forking of test processes. +That is, a parent process opens /sys/kernel/debug/kcov, enables trace mode, +mmaps coverage buffer and then forks child processes in a loop. Child processes +only need to enable coverage (disable happens automatically on thread end). diff --git a/Makefile b/Makefile index e055b969c325..b98a4f70d1b5 100644 --- a/Makefile +++ b/Makefile @@ -365,6 +365,7 @@ LDFLAGS_MODULE = CFLAGS_KERNEL = AFLAGS_KERNEL = CFLAGS_GCOV = -fprofile-arcs -ftest-coverage +CFLAGS_KCOV = -fsanitize-coverage=trace-pc # Use USERINCLUDE when you must reference the UAPI directories only. @@ -411,7 +412,7 @@ export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS -export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KASAN CFLAGS_UBSAN +export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KCOV CFLAGS_KASAN CFLAGS_UBSAN export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL @@ -673,6 +674,14 @@ endif endif KBUILD_CFLAGS += $(stackp-flag) +ifdef CONFIG_KCOV + ifeq ($(call cc-option, $(CFLAGS_KCOV)),) + $(warning Cannot use CONFIG_KCOV: \ + -fsanitize-coverage=trace-pc is not supported by compiler) + CFLAGS_KCOV = + endif +endif + ifeq ($(cc-name),clang) KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8b680a5cb25b..54478b7635de 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,6 +28,7 @@ config X86 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_SG_CHAIN diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 0bf6749522d9..b1ef9e489084 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -12,6 +12,13 @@ KASAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Kernel does not boot with kcov instrumentation here. +# One of the problems observed was insertion of __sanitizer_cov_trace_pc() +# callback into middle of per-cpu data enabling code. Thus the callback observed +# inconsistent state and crashed. We are interested mostly in syscall coverage, +# so boot code is not interesting anyway. 
+KCOV_INSTRUMENT := n + # If you want to preset the SVGA mode, uncomment the next line and # set SVGA_MODE to whatever number you want. # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 5e1d26e09407..6915ff2bd996 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -19,6 +19,9 @@ KASAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index f9fb859c98b9..6874da5f67fc 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -7,6 +7,9 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y VDSO32-$(CONFIG_X86_32) := y diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d5fb0871aba3..adaae2c781c1 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,6 +25,12 @@ OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y +# If instrumentation of this dir is enabled, boot hangs during first second. +# Probably could be more selective here, but note that files related to irqs, +# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to +# non-deterministic coverage. +KCOV_INSTRUMENT := n + CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 8bb12ddc5db8..8e63ebdcbd0b 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,6 +2,10 @@ # Makefile for local APIC drivers and for the IO-APIC code # +# Leads to non-deterministic coverage that is not a function of syscall inputs. +# In particualr, smp_apic_timer_interrupt() is called in random places. +KCOV_INSTRUMENT := n + obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o vector.o obj-y += hw_nmi.o diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 0d373d7affc8..4a8697f7d4ef 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -8,6 +8,10 @@ CFLAGS_REMOVE_common.o = -pg CFLAGS_REMOVE_perf_event.o = -pg endif +# If these files are instrumented, boot hangs during the first second. +KCOV_INSTRUMENT_common.o := n +KCOV_INSTRUMENT_perf_event.o := n + # Make sure load_percpu_segment has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index a501fa25da41..72a576752a7e 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -2,6 +2,9 @@ # Makefile for x86 specific library files. # +# Produces uninteresting flaky coverage. 
+KCOV_INSTRUMENT_delay.o := n + inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt quiet_cmd_inat_tables = GEN $@ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 67cf2e1e557b..f98913258c63 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,3 +1,6 @@ +# Kernel does not boot with instrumentation of tlb.c. +KCOV_INSTRUMENT_tlb.o := n + obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o physaddr.o gup.o setup_nx.o diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 053abe7b0ef7..b95964610ea7 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -9,6 +9,9 @@ KASAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + always := realmode.bin realmode.relocs wakeup-objs := wakeup_asm.o wakemain.o video-mode.o diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index a15841eced4e..da99bbb74aeb 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -25,6 +25,9 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y +# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. +KCOV_INSTRUMENT := n + lib-y := efi-stub-helper.o # include the stub's generic dependencies from lib/ when building for ARM/arm64 diff --git a/include/linux/kcov.h b/include/linux/kcov.h new file mode 100644 index 000000000000..2883ac98c280 --- /dev/null +++ b/include/linux/kcov.h @@ -0,0 +1,29 @@ +#ifndef _LINUX_KCOV_H +#define _LINUX_KCOV_H + +#include + +struct task_struct; + +#ifdef CONFIG_KCOV + +void kcov_task_init(struct task_struct *t); +void kcov_task_exit(struct task_struct *t); + +enum kcov_mode { + /* Coverage collection is not enabled yet. */ + KCOV_MODE_DISABLED = 0, + /* + * Tracing coverage collection mode. + * Covered PCs are collected in a per-task buffer. + */ + KCOV_MODE_TRACE = 1, +}; + +#else + +static inline void kcov_task_init(struct task_struct *t) {} +static inline void kcov_task_exit(struct task_struct *t) {} + +#endif /* CONFIG_KCOV */ +#endif /* _LINUX_KCOV_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 084ed9fba620..34495d2d2d7b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -51,6 +51,7 @@ struct sched_param { #include #include #include +#include #include #include #include @@ -1818,6 +1819,16 @@ struct task_struct { /* bitmask and counter of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ +#ifdef CONFIG_KCOV + /* Coverage collection mode enabled for this task (0 if disabled). */ + enum kcov_mode kcov_mode; + /* Size of the kcov_area. */ + unsigned kcov_size; + /* Buffer for coverage collection. */ + void *kcov_area; + /* kcov desciptor wired with this task or NULL. 
*/ + struct kcov *kcov; +#endif #ifdef CONFIG_MEMCG struct mem_cgroup *memcg_in_oom; gfp_t memcg_oom_gfp_mask; diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h new file mode 100644 index 000000000000..574e22ec640d --- /dev/null +++ b/include/uapi/linux/kcov.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_KCOV_IOCTLS_H +#define _LINUX_KCOV_IOCTLS_H + +#include + +#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) +#define KCOV_ENABLE _IO('c', 100) +#define KCOV_DISABLE _IO('c', 101) + +#endif /* _LINUX_KCOV_IOCTLS_H */ diff --git a/kernel/Makefile b/kernel/Makefile index baa55e50a315..f0c40bf49d9f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -18,6 +18,17 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif +# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() +# in coverage traces. +KCOV_INSTRUMENT_softirq.o := n +# These are called from save_stack_trace() on slub debug path, +# and produce insane amounts of uninteresting coverage. +KCOV_INSTRUMENT_module.o := n +KCOV_INSTRUMENT_extable.o := n +# Don't self-instrument. +KCOV_INSTRUMENT_kcov.o := n +KASAN_SANITIZE_kcov.o := n + # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) @@ -68,6 +79,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ +obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o diff --git a/kernel/exit.c b/kernel/exit.c index 10e088237fed..953d1a1c0387 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -655,6 +656,7 @@ void do_exit(long code) TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); + kcov_task_exit(tsk); WARN_ON(blk_needs_flush_plug(tsk)); diff --git a/kernel/fork.c b/kernel/fork.c index 5b8d1e7ceeea..d277e83ed3e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -392,6 +393,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) account_kernel_stack(ti, 1); + kcov_task_init(tsk); + return tsk; free_ti: diff --git a/kernel/kcov.c b/kernel/kcov.c new file mode 100644 index 000000000000..3efbee0834a8 --- /dev/null +++ b/kernel/kcov.c @@ -0,0 +1,273 @@ +#define pr_fmt(fmt) "kcov: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * kcov descriptor (one per opened debugfs file). + * State transitions of the descriptor: + * - initial state after open() + * - then there must be a single ioctl(KCOV_INIT_TRACE) call + * - then, mmap() call (several calls are allowed but not useful) + * - then, repeated enable/disable for a task (only one task a time allowed) + */ +struct kcov { + /* + * Reference counter. We keep one for: + * - opened file descriptor + * - task with enabled coverage (we can't unwire it from another task) + */ + atomic_t refcount; + /* The lock protects mode, size, area and t. */ + spinlock_t lock; + enum kcov_mode mode; + /* Size of arena (in long's for KCOV_MODE_TRACE). */ + unsigned size; + /* Coverage buffer shared with user space. */ + void *area; + /* Task for which we collect coverage, or NULL. */ + struct task_struct *t; +}; + +/* + * Entry point from instrumented code. + * This is called once per basic-block/edge. 
+ */ +void __sanitizer_cov_trace_pc(void) +{ + struct task_struct *t; + enum kcov_mode mode; + + t = current; + /* + * We are interested in code coverage as a function of a syscall inputs, + * so we ignore code executed in interrupts. + */ + if (!t || in_interrupt()) + return; + mode = READ_ONCE(t->kcov_mode); + if (mode == KCOV_MODE_TRACE) { + unsigned long *area; + unsigned long pos; + + /* + * There is some code that runs in interrupts but for which + * in_interrupt() returns false (e.g. preempt_schedule_irq()). + * READ_ONCE()/barrier() effectively provides load-acquire wrt + * interrupts, there are paired barrier()/WRITE_ONCE() in + * kcov_ioctl_locked(). + */ + barrier(); + area = t->kcov_area; + /* The first word is number of subsequent PCs. */ + pos = READ_ONCE(area[0]) + 1; + if (likely(pos < t->kcov_size)) { + area[pos] = _RET_IP_; + WRITE_ONCE(area[0], pos); + } + } +} +EXPORT_SYMBOL(__sanitizer_cov_trace_pc); + +static void kcov_get(struct kcov *kcov) +{ + atomic_inc(&kcov->refcount); +} + +static void kcov_put(struct kcov *kcov) +{ + if (atomic_dec_and_test(&kcov->refcount)) { + vfree(kcov->area); + kfree(kcov); + } +} + +void kcov_task_init(struct task_struct *t) +{ + t->kcov_mode = KCOV_MODE_DISABLED; + t->kcov_size = 0; + t->kcov_area = NULL; + t->kcov = NULL; +} + +void kcov_task_exit(struct task_struct *t) +{ + struct kcov *kcov; + + kcov = t->kcov; + if (kcov == NULL) + return; + spin_lock(&kcov->lock); + if (WARN_ON(kcov->t != t)) { + spin_unlock(&kcov->lock); + return; + } + /* Just to not leave dangling references behind. */ + kcov_task_init(t); + kcov->t = NULL; + spin_unlock(&kcov->lock); + kcov_put(kcov); +} + +static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) +{ + int res = 0; + void *area; + struct kcov *kcov = vma->vm_file->private_data; + unsigned long size, off; + struct page *page; + + area = vmalloc_user(vma->vm_end - vma->vm_start); + if (!area) + return -ENOMEM; + + spin_lock(&kcov->lock); + size = kcov->size * sizeof(unsigned long); + if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 || + vma->vm_end - vma->vm_start != size) { + res = -EINVAL; + goto exit; + } + if (!kcov->area) { + kcov->area = area; + vma->vm_flags |= VM_DONTEXPAND; + spin_unlock(&kcov->lock); + for (off = 0; off < size; off += PAGE_SIZE) { + page = vmalloc_to_page(kcov->area + off); + if (vm_insert_page(vma, vma->vm_start + off, page)) + WARN_ONCE(1, "vm_insert_page() failed"); + } + return 0; + } +exit: + spin_unlock(&kcov->lock); + vfree(area); + return res; +} + +static int kcov_open(struct inode *inode, struct file *filep) +{ + struct kcov *kcov; + + kcov = kzalloc(sizeof(*kcov), GFP_KERNEL); + if (!kcov) + return -ENOMEM; + atomic_set(&kcov->refcount, 1); + spin_lock_init(&kcov->lock); + filep->private_data = kcov; + return nonseekable_open(inode, filep); +} + +static int kcov_close(struct inode *inode, struct file *filep) +{ + kcov_put(filep->private_data); + return 0; +} + +static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, + unsigned long arg) +{ + struct task_struct *t; + unsigned long size, unused; + + switch (cmd) { + case KCOV_INIT_TRACE: + /* + * Enable kcov in trace mode and setup buffer size. + * Must happen before anything else. + */ + if (kcov->mode != KCOV_MODE_DISABLED) + return -EBUSY; + /* + * Size must be at least 2 to hold current position and one PC. + * Later we allocate size * sizeof(unsigned long) memory, + * that must not overflow. 
+ */ + size = arg; + if (size < 2 || size > INT_MAX / sizeof(unsigned long)) + return -EINVAL; + kcov->size = size; + kcov->mode = KCOV_MODE_TRACE; + return 0; + case KCOV_ENABLE: + /* + * Enable coverage for the current task. + * At this point user must have been enabled trace mode, + * and mmapped the file. Coverage collection is disabled only + * at task exit or voluntary by KCOV_DISABLE. After that it can + * be enabled for another task. + */ + unused = arg; + if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED || + kcov->area == NULL) + return -EINVAL; + if (kcov->t != NULL) + return -EBUSY; + t = current; + /* Cache in task struct for performance. */ + t->kcov_size = kcov->size; + t->kcov_area = kcov->area; + /* See comment in __sanitizer_cov_trace_pc(). */ + barrier(); + WRITE_ONCE(t->kcov_mode, kcov->mode); + t->kcov = kcov; + kcov->t = t; + /* This is put either in kcov_task_exit() or in KCOV_DISABLE. */ + kcov_get(kcov); + return 0; + case KCOV_DISABLE: + /* Disable coverage for the current task. */ + unused = arg; + if (unused != 0 || current->kcov != kcov) + return -EINVAL; + t = current; + if (WARN_ON(kcov->t != t)) + return -EINVAL; + kcov_task_init(t); + kcov->t = NULL; + kcov_put(kcov); + return 0; + default: + return -ENOTTY; + } +} + +static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct kcov *kcov; + int res; + + kcov = filep->private_data; + spin_lock(&kcov->lock); + res = kcov_ioctl_locked(kcov, cmd, arg); + spin_unlock(&kcov->lock); + return res; +} + +static const struct file_operations kcov_fops = { + .open = kcov_open, + .unlocked_ioctl = kcov_ioctl, + .mmap = kcov_mmap, + .release = kcov_close, +}; + +static int __init kcov_init(void) +{ + if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { + pr_err("failed to create kcov in debugfs\n"); + return -ENOMEM; + } + return 0; +} + +device_initcall(kcov_init); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8e96f6cc2a4a..31322a4275cd 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,3 +1,6 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. +KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 61a16569ffbf..032b2c015beb 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,3 +1,7 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. +KCOV_INSTRUMENT := n + obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 302d6ebd64f7..414d9c16da42 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) endif +# These files are disabled because they produce non-interesting flaky coverage +# that is not a function of syscall inputs. E.g. involuntary context switches. +KCOV_INSTRUMENT := n + ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is # needed for x86 only. 
Why this used to be enabled for all architectures is beyond diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5a60f45cd9bb..532d4d52d1df 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -696,6 +696,27 @@ source "lib/Kconfig.kasan" endmenu # "Memory Debugging" +config ARCH_HAS_KCOV + bool + help + KCOV does not have any arch-specific code, but currently it is enabled + only for x86_64. KCOV requires testing on other archs, and most likely + disabling of instrumentation for some early boot code. + +config KCOV + bool "Code coverage for fuzzing" + depends on ARCH_HAS_KCOV + select DEBUG_FS + help + KCOV exposes kernel code coverage information in a form suitable + for coverage-guided fuzzing (randomized testing). + + If RANDOMIZE_BASE is enabled, PC values will not be stable across + different machines and across reboots. If you need stable PC values, + disable RANDOMIZE_BASE. + + For more details, see Documentation/kcov.txt. + config DEBUG_SHIRQ bool "Debug shared IRQ handlers" depends on DEBUG_KERNEL diff --git a/lib/Makefile b/lib/Makefile index 4962d14c450f..a1de5b61ff40 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -7,6 +7,18 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) endif +# These files are disabled because they produce lots of non-interesting and/or +# flaky coverage that is not a function of syscall inputs. For example, +# rbtree can be global and individual rotations don't correlate with inputs. +KCOV_INSTRUMENT_string.o := n +KCOV_INSTRUMENT_rbtree.o := n +KCOV_INSTRUMENT_list_debug.o := n +KCOV_INSTRUMENT_debugobjects.o := n +KCOV_INSTRUMENT_dynamic_debug.o := n +# Kernel does not boot if we instrument this file as it uses custom calling +# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS). +KCOV_INSTRUMENT_hweight.o := n + lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ diff --git a/mm/Makefile b/mm/Makefile index 6da300a1414b..f5e797cbd128 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,6 +5,21 @@ KASAN_SANITIZE_slab_common.o := n KASAN_SANITIZE_slub.o := n +# These files are disabled because they produce non-interesting and/or +# flaky coverage that is not a function of syscall inputs. E.g. slab is out of +# free pages, or a task is migrated between nodes. 
+KCOV_INSTRUMENT_slab_common.o := n
+KCOV_INSTRUMENT_slob.o := n
+KCOV_INSTRUMENT_slab.o := n
+KCOV_INSTRUMENT_slub.o := n
+KCOV_INSTRUMENT_page_alloc.o := n
+KCOV_INSTRUMENT_debug-pagealloc.o := n
+KCOV_INSTRUMENT_kmemleak.o := n
+KCOV_INSTRUMENT_kmemcheck.o := n
+KCOV_INSTRUMENT_memcontrol.o := n
+KCOV_INSTRUMENT_mmzone.o := n
+KCOV_INSTRUMENT_vmstat.o := n
+
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= gup.o highmem.o memory.o mincore.o \
			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \

diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index a61460d9f5b0..131daadf40e4 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,5 +1,6 @@
 KASAN_SANITIZE := n
 UBSAN_SANITIZE_kasan.o := n
+KCOV_INSTRUMENT := n
 
 CFLAGS_REMOVE_kasan.o = -pg
 # Function splitter causes unnecessary splits in __asan_load1/__asan_store1

diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index ad50d5859ac4..ddf83d0181e7 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -136,6 +136,12 @@ _c_flags += $(if $(patsubst n%,, \
 		$(CFLAGS_UBSAN))
 endif
 
+ifeq ($(CONFIG_KCOV),y)
+_c_flags += $(if $(patsubst n%,, \
+	$(KCOV_INSTRUMENT_$(basetarget).o)$(KCOV_INSTRUMENT)y), \
+	$(CFLAGS_KCOV))
+endif
+
 # If building the kernel in a separate objtree expand all occurrences
 # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/').

From 2d061d999424efd99e30fa4115c7dced07533223 Mon Sep 17 00:00:00 2001
From: Kieran Bingham
Date: Tue, 22 Mar 2016 14:27:33 -0700
Subject: [PATCH 65/81] scripts/gdb: add version command lx-version

Report the Linux version of the current kernel.

Add a command to identify the version specified by the banner in the
debugged kernel. This lets the user identify the version of the running
kernel, and will let later scripts compare the banner of the attached
kernel against the banner in the vmlinux symbol files to verify that
the files are correct.

[jan.kiszka@siemens.com: remove blank line from help output and fix pep8 warning]
Signed-off-by: Kieran Bingham
Signed-off-by: Jan Kiszka
Cc: Jason Wessel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 scripts/gdb/linux/proc.py  | 28 ++++++++++++++++++++++++++++
 scripts/gdb/vmlinux-gdb.py |  1 +
 2 files changed, 29 insertions(+)
 create mode 100644 scripts/gdb/linux/proc.py

diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py
new file mode 100644
index 000000000000..8a733dd99580
--- /dev/null
+++ b/scripts/gdb/linux/proc.py
@@ -0,0 +1,28 @@
+#
+# gdb helper commands and functions for Linux kernel debugging
+#
+#  Kernel proc information reader
+#
+# Copyright (c) 2016 Linaro Ltd
+#
+# Authors:
+#  Kieran Bingham
+#
+# This work is licensed under the terms of the GNU GPL version 2.
+#
+
+import gdb
+
+
+class LxVersion(gdb.Command):
+    """ Report the Linux Version of the current kernel.
+ Equivalent to cat /proc/version on a running target""" + + def __init__(self): + super(LxVersion, self).__init__("lx-version", gdb.COMMAND_DATA) + + def invoke(self, arg, from_tty): + # linux_banner should contain a newline + gdb.write(gdb.parse_and_eval("linux_banner").string()) + +LxVersion() diff --git a/scripts/gdb/vmlinux-gdb.py b/scripts/gdb/vmlinux-gdb.py index ce82bf5c3943..d5943eca19cd 100644 --- a/scripts/gdb/vmlinux-gdb.py +++ b/scripts/gdb/vmlinux-gdb.py @@ -29,3 +29,4 @@ else: import linux.tasks import linux.cpus import linux.lists + import linux.proc From 72bf92ec29ac052e950dcb2cc7f8820a2e1f0a02 Mon Sep 17 00:00:00 2001 From: Kieran Bingham Date: Tue, 22 Mar 2016 14:27:36 -0700 Subject: [PATCH 66/81] scripts/gdb: add cmdline reader command lx-cmdline Report the Linux Commandline used in the current kernel [jan.kiszka@siemens.com: remove blank line from help output and fix pep8 warning] Signed-off-by: Kieran Bingham Signed-off-by: Jan Kiszka Cc: Jason Wessel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/gdb/linux/proc.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py index 8a733dd99580..6e6709c1830c 100644 --- a/scripts/gdb/linux/proc.py +++ b/scripts/gdb/linux/proc.py @@ -14,6 +14,19 @@ import gdb +class LxCmdLine(gdb.Command): + """ Report the Linux Commandline used in the current kernel. + Equivalent to cat /proc/cmdline on a running target""" + + def __init__(self): + super(LxCmdLine, self).__init__("lx-cmdline", gdb.COMMAND_DATA) + + def invoke(self, arg, from_tty): + gdb.write(gdb.parse_and_eval("saved_command_line").string() + "\n") + +LxCmdLine() + + class LxVersion(gdb.Command): """ Report the Linux Version of the current kernel. Equivalent to cat /proc/version on a running target""" From ad4db3b24a93e52a92ad8f9b0273a9416f202c23 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 22 Mar 2016 14:27:39 -0700 Subject: [PATCH 67/81] scripts/gdb: account for changes in module data structure Commit 7523e4dc5057 ("module: use a structure to encapsulate layout.") factored out the module_layout structure. Adjust the symbol loader and the lsmod command to this. 
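For context, commit 7523e4dc5057 reshaped struct module roughly like
this (a sketch; only the fields these scripts touch are shown):

	/* before */
	struct module {
		...
		void *module_core;		/* base of the core section */
		unsigned int core_size;
		...
	};

	/* after */
	struct module {
		...
		struct module_layout core_layout;	/* ->base, ->size */
		...
	};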
Signed-off-by: Jan Kiszka
Reviewed-by: Kieran Bingham
Tested-by: Kieran Bingham (qemu-{ARM,x86})
Cc: Rusty Russell
Cc: Jiri Kosina
Cc: Jason Wessel
Cc: [4.4+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 scripts/gdb/linux/modules.py | 5 +++--
 scripts/gdb/linux/symbols.py | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/gdb/linux/modules.py b/scripts/gdb/linux/modules.py
index 25db8cff44a2..0a35d6dbfb80 100644
--- a/scripts/gdb/linux/modules.py
+++ b/scripts/gdb/linux/modules.py
@@ -73,10 +73,11 @@ class LxLsmod(gdb.Command):
             "        " if utils.get_long_type().sizeof == 8 else ""))
 
         for module in module_list():
+            layout = module['core_layout']
             gdb.write("{address} {name:<19} {size:>8}  {ref}".format(
-                address=str(module['module_core']).split()[0],
+                address=str(layout['base']).split()[0],
                 name=module['name'].string(),
-                size=str(module['core_size']),
+                size=str(layout['size']),
                 ref=str(module['refcnt']['counter'])))
 
             source_list = module['source_list']

diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py
index 627750cb420d..9a0f8923f67c 100644
--- a/scripts/gdb/linux/symbols.py
+++ b/scripts/gdb/linux/symbols.py
@@ -108,7 +108,7 @@ lx-symbols command."""
 
     def load_module_symbols(self, module):
         module_name = module['name'].string()
-        module_addr = str(module['module_core']).split()[0]
+        module_addr = str(module['core_layout']['base']).split()[0]
 
         module_file = self._get_module_file(module_name)
         if not module_file and not self.module_files_updated:

From 21b2f44315d735345f02356239a819debdd89462 Mon Sep 17 00:00:00 2001
From: Stefani Seibold
Date: Tue, 22 Mar 2016 14:27:42 -0700
Subject: [PATCH 68/81] kfifo: fix sparse complaints

This patch fixes complaints by the sparse tool when using kfifo_put()
with non-scalar types like structures (e.g. in
drivers/iio/industrialio-event.c). Casting a pointer to the value and
reading through this pointer, instead of directly casting the value,
fixes this. The generated code is identical.

Signed-off-by: Stefani Seibold
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/kfifo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index 473b43678ad1..41eb6fdf87a8 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -401,7 +401,7 @@ __kfifo_int_must_check_helper( \
 	((typeof(__tmp->type))__kfifo->data) : \
 	(__tmp->buf) \
 	)[__kfifo->in & __tmp->kfifo.mask] = \
-		(typeof(*__tmp->type))__val; \
+		*(typeof(__tmp->type))&__val; \
 	smp_wmb(); \
 	__kfifo->in++; \
 } \

From dde5cf39d4d2cce71f2997c37210dd624d0e4bf6 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin
Date: Tue, 22 Mar 2016 14:27:45 -0700
Subject: [PATCH 69/81] ubsan: fix tree-wide -Wmaybe-uninitialized false positives

-fsanitize=* options make GCC less smart than usual and increase the
number of 'maybe-uninitialized' false positives. So this patch does two
things:

* Add -Wno-maybe-uninitialized to CFLAGS_UBSAN, which will disable all
  such warnings for instrumented files.

* Remove CONFIG_UBSAN_SANITIZE_ALL from all[yes|mod]config builds, so
  the all[yes|mod]config build goes without -fsanitize=* and still with
  -Wmaybe-uninitialized.
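The kind of false positive being silenced looks roughly like this (a
minimal, hypothetical example; f(), compute() and do_other_work() are
made up for illustration):

	int f(int x)
	{
		int y;			/* assigned only on some paths */

		if (x)
			y = compute();
		do_other_work();
		if (x)
			return y;	/* with -fsanitize=* instrumentation,
					 * gcc may warn that 'y' may be used
					 * uninitialized here, although the
					 * path is guarded by the same
					 * condition */
		return 0;
	}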
Signed-off-by: Andrey Ryabinin
Reported-by: Fengguang Wu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/Kconfig.ubsan      | 5 +++++
 scripts/Makefile.ubsan | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan
index e07c1ba9ba13..39494af9a84a 100644
--- a/lib/Kconfig.ubsan
+++ b/lib/Kconfig.ubsan
@@ -13,6 +13,11 @@ config UBSAN_SANITIZE_ALL
 	bool "Enable instrumentation for the entire kernel"
 	depends on UBSAN
 	depends on ARCH_HAS_UBSAN_SANITIZE_ALL
+
+	# We build with -Wno-maybe-uninitialized, but we still want to
+	# use -Wmaybe-uninitialized in allmodconfig builds.
+	# So the dependency below is used to disable this option in allmodconfig.
+	depends on !COMPILE_TEST
 	default y
 	help
 	  This option activates instrumentation for the entire kernel.

diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan
index 8ab68679cfb5..77ce538268b5 100644
--- a/scripts/Makefile.ubsan
+++ b/scripts/Makefile.ubsan
@@ -14,4 +14,8 @@ ifdef CONFIG_UBSAN
 ifdef CONFIG_UBSAN_ALIGNMENT
 CFLAGS_UBSAN += $(call cc-option, -fsanitize=alignment)
 endif
+
+# -fsanitize=* options make GCC less smart than usual and
+# increase the number of 'maybe-uninitialized' false positives.
+CFLAGS_UBSAN += $(call cc-option, -Wno-maybe-uninitialized)
 endif

From a5f4db877177d2a3d7ae62a7bac3a5a27e083d7f Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Tue, 22 Mar 2016 14:27:48 -0700
Subject: [PATCH 70/81] ipc/sem: make semctl setting sempid consistent

As indicated by bug#112271, Linux sets the sempid value upon semctl,
and not only for semop calls. However, within semctl we only do this
for SETVAL, leaving SETALL without updating the field, and therefore
rather inconsistent behavior when compared to other Unices.

There is really no documentation regarding this, and therefore users
should not make assumptions. With this patch, along with updating the
semctl(2) manpage, this scenario should become less ambiguous. As such,
set sempid on the SETALL cmd.

Also update some in-code documentation, specifying where the sempid is
set.

Passes ltp and a custom testcase where a child (fork) does SETALL on
the set.

Signed-off-by: Davidlohr Bueso
Reported-by: Philip Semanchuk
Cc: Michael Kerrisk
Cc: PrasannaKumar Muralidharan
Cc: Manfred Spraul
Cc: Herton R. Krzesinski
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 ipc/sem.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/ipc/sem.c b/ipc/sem.c
index cddd5b5fde51..b3757ea0694b 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -92,7 +92,14 @@
 /* One semaphore structure for each semaphore in the system. */
 struct sem {
 	int	semval;		/* current value */
-	int	sempid;		/* pid of last operation */
+	/*
+	 * PID of the process that last modified the semaphore. For
+	 * Linux, specifically these are:
+	 *  - semop
+	 *  - semctl, via SETVAL and SETALL.
+	 *  - at task exit when performing undo adjustments (see exit_sem).
+ */ + int sempid; spinlock_t lock; /* spinlock for fine-grained semtimedop */ struct list_head pending_alter; /* pending single-sop operations */ /* that alter the semaphore */ @@ -1444,8 +1451,10 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, goto out_unlock; } - for (i = 0; i < nsems; i++) + for (i = 0; i < nsems; i++) { sma->sem_base[i].semval = sem_io[i]; + sma->sem_base[i].sempid = task_tgid_vnr(current); + } ipc_assert_locked_object(&sma->sem_perm); list_for_each_entry(un, &sma->list_id, list_id) { From f138556daf62665f19178732ec4daf86c4ca13f4 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Tue, 22 Mar 2016 14:27:51 -0700 Subject: [PATCH 71/81] mm/mprotect.c: don't imply PROT_EXEC on non-exec fs mprotect(PROT_READ) fails when called by a READ_IMPLIES_EXEC binary on a memory-mapped file located on a non-exec fs. mprotect does not check whether the fs is _executable_ or not. The PROT_EXEC flag is set automatically even if a memory-mapped file is located on a non-exec fs. Fix it by checking whether the memory-mapped file is located on a non-exec fs. If so, PROT_EXEC is not implied by PROT_READ. The implementation uses the VM_MAYEXEC flag, which is set properly in mmap. Now it is consistent with mmap. I did the isolated tests (PT_GNU_STACK X/NX, multiple VMAs, X/NX fs). I also patched the official 3.19.0-47-generic Ubuntu 14.04 kernel and it seems to work. Signed-off-by: Piotr Kwapulinski Cc: Mel Gorman Cc: Kirill A. Shutemov Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Konstantin Khlebnikov Cc: Dan Williams Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index fa37c4cd973a..b650c5412f58 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -359,6 +359,9 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, struct vm_area_struct *vma, *prev; int error = -EINVAL; const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); + const bool rier = (current->personality & READ_IMPLIES_EXEC) && + (prot & PROT_READ); + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ return -EINVAL; @@ -375,11 +378,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, return -EINVAL; reqprot = prot; - /* - * Does the application expect PROT_READ to imply PROT_EXEC: - */ - if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) - prot |= PROT_EXEC; down_write(&current->mm->mmap_sem); @@ -414,6 +412,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, /* Here we know that vma->vm_start <= nstart < vma->vm_end.
*/ + /* Does the application expect PROT_READ to imply PROT_EXEC */ + if (rier && (vma->vm_flags & VM_MAYEXEC)) + prot |= PROT_EXEC; + newflags = calc_vm_prot_bits(prot, pkey); newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); @@ -445,6 +447,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = -ENOMEM; goto out; } + prot = reqprot; } out: up_write(&current->mm->mmap_sem); From 41b27154874b3a40d6673052d08c8e9fd0c6404f Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 22 Mar 2016 14:27:54 -0700 Subject: [PATCH 72/81] kernel/signal.c: add compile-time check for __ARCH_SI_PREAMBLE_SIZE The value of __ARCH_SI_PREAMBLE_SIZE defines the size (including padding) of the part of the struct siginfo that is before the union, and it is then used to calculate the needed padding (SI_PAD_SIZE) to make the size of struct siginfo equal to 128 (SI_MAX_SIZE) bytes. Depending on the target architecture and word width it equals either 3 or 4 times sizeof(int). Since the very beginning we had __ARCH_SI_PREAMBLE_SIZE wrong on the parisc architecture for the 64bit kernel build. It's even more frustrating because it can easily be checked at compile time whether the value was defined correctly. This patch adds such a check for the correctness of __ARCH_SI_PREAMBLE_SIZE in the hope that it will prevent existing and future architectures from running into the same problem. I refrained from replacing __ARCH_SI_PREAMBLE_SIZE by offsetof() in copy_siginfo() in include/asm-generic/siginfo.h, because a) it doesn't make any difference and b) it's used in the Documentation/kmemcheck.txt example. I ran this patch through the 0-DAY kernel test infrastructure and only the parisc architecture triggered as expected. That means that this patch should be OK for all major architectures. Signed-off-by: Helge Deller Cc: Stephen Rothwell Cc: Michael Ellerman Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/signal.c b/kernel/signal.c index fe8ed298373c..aa9bf00749c1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3585,6 +3585,10 @@ __weak const char *arch_vma_name(struct vm_area_struct *vma) void __init signals_init(void) { + /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */ + BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE + != offsetof(struct siginfo, _sifields._pad)); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } From cf61e2a1487d833e4748dead4096584de70bf742 Mon Sep 17 00:00:00 2001 From: Brian Starkey Date: Tue, 22 Mar 2016 14:27:57 -0700 Subject: [PATCH 73/81] memremap: don't modify flags These patches implement a MEMREMAP_WC flag for memremap(), which can be used to obtain writecombine mappings. This is then used for setting up dma_coherent_mem regions which use the DMA_MEMORY_MAP flag. The motivation is to fix an alignment fault on arm64, and the suggestion to implement MEMREMAP_WC for this case was made at [1]. That particular issue is handled in patch 4, which makes sure that the appropriate memset function is used when zeroing allocations mapped as IO memory. This patch (of 4): Don't modify the flags input argument to memremap(). MEMREMAP_WB is already a special case, so we can check for it directly instead of clearing flag bits in each mapper.
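As a usage sketch (hypothetical caller; the names phys_base and region_size are made up for illustration), several mapping types can be requested at once and memremap() will try them in order, now without clobbering its flags argument:

	static int example_map_region(resource_size_t phys_base, size_t region_size)
	{
		void *addr;

		/* prefer a cacheable mapping, fall back to write-through */
		addr = memremap(phys_base, region_size,
				MEMREMAP_WB | MEMREMAP_WT);
		if (!addr)
			return -ENOMEM;

		/* ... use the mapping ... */

		memunmap(addr);
		return 0;
	}

Because flags stays intact, the final "aliases System RAM" check can simply compare flags against MEMREMAP_WB instead of testing whether any unhandled flag bits remain.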
Signed-off-by: Brian Starkey Cc: Catalin Marinas Cc: Dan Williams Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/memremap.c b/kernel/memremap.c index 584febd13e2e..e5e685e6ff2a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -64,6 +64,9 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); void *addr = NULL; + if (!flags) + return NULL; + if (is_ram == REGION_MIXED) { WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", &offset, (unsigned long) size); @@ -72,7 +75,6 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) /* Try all mapping types requested until one returns non-NULL */ if (flags & MEMREMAP_WB) { - flags &= ~MEMREMAP_WB; /* * MEMREMAP_WB is special in that it can be satisifed * from the direct map. Some archs depend on the @@ -86,21 +88,19 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) } /* - * If we don't have a mapping yet and more request flags are - * pending then we will be attempting to establish a new virtual + * If we don't have a mapping yet and other request flags are + * present then we will be attempting to establish a new virtual * address mapping. Enforce that this mapping is not aliasing * System RAM. */ - if (!addr && is_ram == REGION_INTERSECTS && flags) { + if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", &offset, (unsigned long) size); return NULL; } - if (!addr && (flags & MEMREMAP_WT)) { - flags &= ~MEMREMAP_WT; + if (!addr && (flags & MEMREMAP_WT)) addr = ioremap_wt(offset, size); - } return addr; } From c907e0eb43a522de60fb651c011c553f87273222 Mon Sep 17 00:00:00 2001 From: Brian Starkey Date: Tue, 22 Mar 2016 14:28:00 -0700 Subject: [PATCH 74/81] memremap: add MEMREMAP_WC flag Add a flag to memremap() for writecombine mappings. Mappings satisfied by this flag will not be cached, however writes may be delayed or combined into more efficient bursts. This is most suitable for buffers written sequentially by the CPU for use by other DMA devices. Signed-off-by: Brian Starkey Reviewed-by: Catalin Marinas Cc: Dan Williams Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/io.h | 1 + kernel/memremap.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/io.h b/include/linux/io.h index 32403b5716e5..e2c8419278c1 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -135,6 +135,7 @@ enum { /* See memremap() kernel-doc for usage description... */ MEMREMAP_WB = 1 << 0, MEMREMAP_WT = 1 << 1, + MEMREMAP_WC = 1 << 2, }; void *memremap(resource_size_t offset, size_t size, unsigned long flags); diff --git a/kernel/memremap.c b/kernel/memremap.c index e5e685e6ff2a..a6d382312e6f 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -41,11 +41,13 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * memremap() - remap an iomem_resource as cacheable memory * @offset: iomem resource start address * @size: size of remap - * @flags: either MEMREMAP_WB or MEMREMAP_WT + * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC * * memremap() is "ioremap" for cases where it is known that the resource * being mapped does not have i/o side effects and the __iomem - * annotation is not applicable. 
+ * annotation is not applicable. In the case of multiple flags, the different + * mapping types will be attempted in the order listed below until one of + * them succeeds. * * MEMREMAP_WB - matches the default mapping for System RAM on * the architecture. This is usually a read-allocate write-back cache. @@ -57,6 +59,10 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * cache or are written through to memory and never exist in a * cache-dirty state with respect to program visibility. Attempts to * map System RAM with this mapping type will fail. + * + * MEMREMAP_WC - establish a writecombine mapping, whereby writes may + * be coalesced together (e.g. in the CPU's write buffers), but is otherwise + * uncached. Attempts to map System RAM with this mapping type will fail. */ void *memremap(resource_size_t offset, size_t size, unsigned long flags) { @@ -102,6 +108,9 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) if (!addr && (flags & MEMREMAP_WT)) addr = ioremap_wt(offset, size); + if (!addr && (flags & MEMREMAP_WC)) + addr = ioremap_wc(offset, size); + return addr; } EXPORT_SYMBOL(memremap); From 6b03ae0d42bfecf2536f7967d37e471d98c0b3d2 Mon Sep 17 00:00:00 2001 From: Brian Starkey Date: Tue, 22 Mar 2016 14:28:03 -0700 Subject: [PATCH 75/81] drivers: dma-coherent: use MEMREMAP_WC for DMA_MEMORY_MAP When the DMA_MEMORY_MAP flag is used, memory which can be accessed directly should be returned, so use memremap(..., MEMREMAP_WC) to provide a writecombine mapping. Signed-off-by: Brian Starkey Reviewed-by: Catalin Marinas Cc: Dan Williams Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/dma-coherent.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/base/dma-coherent.c b/drivers/base/dma-coherent.c index 87b808374888..25bb398ec7a1 100644 --- a/drivers/base/dma-coherent.c +++ b/drivers/base/dma-coherent.c @@ -2,6 +2,7 @@ * Coherent per-device memory handling. * Borrowed from i386 */ +#include #include #include #include @@ -31,7 +32,10 @@ static bool dma_init_coherent_memory( if (!size) goto out; - mem_base = ioremap(phys_addr, size); + if (flags & DMA_MEMORY_MAP) + mem_base = memremap(phys_addr, size, MEMREMAP_WC); + else + mem_base = ioremap(phys_addr, size); if (!mem_base) goto out; @@ -54,8 +58,12 @@ static bool dma_init_coherent_memory( out: kfree(dma_mem); - if (mem_base) - iounmap(mem_base); + if (mem_base) { + if (flags & DMA_MEMORY_MAP) + memunmap(mem_base); + else + iounmap(mem_base); + } return false; } @@ -63,7 +71,11 @@ static void dma_release_coherent_memory(struct dma_coherent_mem *mem) { if (!mem) return; - iounmap(mem->virt_base); + + if (mem->flags & DMA_MEMORY_MAP) + memunmap(mem->virt_base); + else + iounmap(mem->virt_base); kfree(mem->bitmap); kfree(mem); } From 20d7a35bc8be06dd1350de0454b0e24cca1d4354 Mon Sep 17 00:00:00 2001 From: Brian Starkey Date: Tue, 22 Mar 2016 14:28:06 -0700 Subject: [PATCH 76/81] drivers: dma-coherent: use memset_io for DMA_MEMORY_IO mappings Use memset_io() for DMA_MEMORY_IO mappings which are mapped as I/O memory, and regular memset() for DMA_MEMORY_MAP mappings. This fixes the below alignment fault on arm64 for DMA_MEMORY_IO mappings, where memset() uses the DC ZVA instruction which is invalid on device memory. 
Unhandled fault: alignment fault (0x96000061) at 0xffffff8000380000 Internal error: : 96000061 [#1] PREEMPT SMP Modules linked in: hdlcd(+) clk_scpi CPU: 4 PID: 1355 Comm: systemd-udevd Not tainted 4.4.0-rc1+ #5 Hardware name: ARM Juno development board (r0) (DT) task: ffffffc9763eee00 ti: ffffffc9758c4000 task.ti: ffffffc9758c4000 PC is at __efistub_memset+0x1ac/0x200 LR is at dma_alloc_from_coherent+0xb0/0x120 pc : [] lr : [] pstate: 400001c5 sp : ffffffc9758c79a0 x29: ffffffc9758c79a0 x28: ffffffc000635cd0 x27: 0000000000000124 x26: ffffffc000119ef4 x25: 0000000000010000 x24: 0000000000000140 x23: ffffffc07e9ac3a8 x22: ffffffc9758c7a58 x21: ffffffc9758c7a68 x20: 0000000000000004 x19: ffffffc07e9ac380 x18: 0000000000000001 x17: 0000007fae1bbba8 x16: ffffffc0001b2d1c x15: ffffffffffffffff x14: 0ffffffffffffffe x13: 0000000000000010 x12: ffffff800837ffff x11: ffffff800837ffff x10: 0000000040000000 x9 : 0000000000000000 x8 : ffffff8000380000 x7 : 0000000000000000 x6 : 000000000000003f x5 : 0000000000000040 x4 : 0000000000000000 x3 : 0000000000000004 x2 : 000000000000ffc0 x1 : 0000000000000000 x0 : ffffff8000380000 Signed-off-by: Brian Starkey Reviewed-by: Catalin Marinas Cc: Dan Williams Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/dma-coherent.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/base/dma-coherent.c b/drivers/base/dma-coherent.c index 25bb398ec7a1..bdf28f7dd5e8 100644 --- a/drivers/base/dma-coherent.c +++ b/drivers/base/dma-coherent.c @@ -187,7 +187,10 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, */ *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); *ret = mem->virt_base + (pageno << PAGE_SHIFT); - memset(*ret, 0, size); + if (mem->flags & DMA_MEMORY_MAP) + memset(*ret, 0, size); + else + memset_io(*ret, 0, size); spin_unlock_irqrestore(&mem->spinlock, flags); return 1; From a395d6a7e3d6e3d1d316376db0c4c8b5d2995930 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 22 Mar 2016 14:28:09 -0700 Subject: [PATCH 77/81] kernel/...: convert pr_warning to pr_warn Use the more common logging method with the eventual goal of removing pr_warning altogether. Miscellanea: - Realign arguments - Coalesce formats - Add missing space between a few coalesced formats Signed-off-by: Joe Perches Acked-by: Rafael J. 
Wysocki [kernel/power/suspend.c] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 4 +-- kernel/power/suspend.c | 3 +-- kernel/time/tick-sched.c | 8 +++--- kernel/trace/blktrace.c | 4 +-- kernel/trace/ftrace.c | 7 +++-- kernel/trace/trace.c | 40 ++++++++++++++-------------- kernel/trace/trace_functions_graph.c | 6 ++--- kernel/trace/trace_kprobe.c | 27 ++++++++----------- kernel/trace/trace_mmiotrace.c | 2 +- kernel/trace/trace_probe.c | 4 +-- kernel/trace/trace_stat.c | 3 +-- kernel/trace/trace_uprobe.c | 2 +- kernel/tracepoint.c | 2 +- 13 files changed, 52 insertions(+), 60 deletions(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 64731e84c982..cc1cc641d653 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1322,8 +1322,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (nmsk != omsk) /* hope the handler works with current trigger mode */ - pr_warning("irq %d uses trigger mode %u; requested %u\n", - irq, nmsk, omsk); + pr_warn("irq %d uses trigger mode %u; requested %u\n", + irq, nmsk, omsk); } *old_ptr = new; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 230a77225e2e..5b70d64b871e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -473,8 +473,7 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { - pr_warning("PM: Unsupported test mode for suspend to idle," - "please choose none/freezer/devices/platform.\n"); + pr_warn("PM: Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n"); return -EAGAIN; } #endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 195fe7d2caad..084b79f5917e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -378,7 +378,7 @@ static int __init tick_nohz_full_setup(char *str) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); if (cpulist_parse(str, tick_nohz_full_mask) < 0) { - pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + pr_warn("NO_HZ: Incorrect nohz_full cpumask\n"); free_bootmem_cpumask_var(tick_nohz_full_mask); return 1; } @@ -446,8 +446,7 @@ void __init tick_nohz_init(void) * interrupts to avoid circular dependency on the tick */ if (!arch_irq_work_has_interrupt()) { - pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " - "support irq work self-IPIs\n"); + pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); cpumask_copy(housekeeping_mask, cpu_possible_mask); tick_nohz_full_running = false; @@ -457,7 +456,8 @@ void __init tick_nohz_init(void) cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", + cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2aeb6ffc0a1e..f94e7a21f52d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1437,12 +1437,12 @@ static struct trace_event trace_blk_event = { static int __init init_blk_tracer(void) { if (!register_trace_event(&trace_blk_event)) { - pr_warning("Warning: could not register block events\n"); + pr_warn("Warning: could not register block events\n"); return 1; } if (register_tracer(&blk_tracer) != 0) { - pr_warning("Warning: could not register 
the block tracer\n"); + pr_warn("Warning: could not register the block tracer\n"); unregister_trace_event(&trace_blk_event); return 1; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 57a6eea84694..2ece9f1a3e5a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1058,8 +1058,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) entry = tracefs_create_file("function_profile_enabled", 0644, d_tracer, NULL, &ftrace_profile_fops); if (!entry) - pr_warning("Could not create tracefs " - "'function_profile_enabled' entry\n"); + pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); } #else /* CONFIG_FUNCTION_PROFILER */ @@ -2314,8 +2313,8 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_curr(rec); if (FTRACE_WARN_ON(!ops)) { - pr_warning("Bad trampoline accounting at: %p (%pS)\n", - (void *)rec->ip, (void *)rec->ip); + pr_warn("Bad trampoline accounting at: %p (%pS)\n", + (void *)rec->ip, (void *)rec->ip); /* Ftrace is shutting down, return anything */ return (unsigned long)FTRACE_ADDR; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d9293402ee68..032b388bea66 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2071,20 +2071,20 @@ void trace_printk_init_buffers(void) /* trace_printk() is for debug use only. Don't use it in production. */ - pr_warning("\n"); - pr_warning("**********************************************************\n"); - pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warning("** **\n"); - pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); - pr_warning("** **\n"); - pr_warning("** This means that this is a DEBUG kernel and it is **\n"); - pr_warning("** unsafe for production use. **\n"); - pr_warning("** **\n"); - pr_warning("** If you see this message and you are not debugging **\n"); - pr_warning("** the kernel, report this immediately to your vendor! **\n"); - pr_warning("** **\n"); - pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warning("**********************************************************\n"); + pr_warn("\n"); + pr_warn("**********************************************************\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("** **\n"); + pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); + pr_warn("** **\n"); + pr_warn("** This means that this is a DEBUG kernel and it is **\n"); + pr_warn("** unsafe for production use. **\n"); + pr_warn("** **\n"); + pr_warn("** If you see this message and you are not debugging **\n"); + pr_warn("** the kernel, report this immediately to your vendor! 
**\n"); + pr_warn("** **\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(); @@ -4101,7 +4101,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, */ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); if (!map_array) { - pr_warning("Unable to allocate trace enum mapping\n"); + pr_warn("Unable to allocate trace enum mapping\n"); return; } @@ -6131,7 +6131,7 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { - pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); + pr_warn("Could not create tracefs '%s' entry\n", cpu_dir); return; } @@ -6318,7 +6318,7 @@ struct dentry *trace_create_file(const char *name, ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) - pr_warning("Could not create tracefs '%s' entry\n", name); + pr_warn("Could not create tracefs '%s' entry\n", name); return ret; } @@ -6337,7 +6337,7 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) tr->options = tracefs_create_dir("options", d_tracer); if (!tr->options) { - pr_warning("Could not create tracefs directory 'options'\n"); + pr_warn("Could not create tracefs directory 'options'\n"); return NULL; } @@ -7248,8 +7248,8 @@ __init static int tracer_alloc_buffers(void) if (trace_boot_clock) { ret = tracing_set_clock(&global_trace, trace_boot_clock); if (ret < 0) - pr_warning("Trace clock %s not defined, going back to default\n", - trace_boot_clock); + pr_warn("Trace clock %s not defined, going back to default\n", + trace_boot_clock); } /* diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a663cbb84107..91d6a63a2ea7 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1350,7 +1350,7 @@ void graph_trace_open(struct trace_iterator *iter) out_err_free: kfree(data); out_err: - pr_warning("function graph tracer: not enough memory\n"); + pr_warn("function graph tracer: not enough memory\n"); } void graph_trace_close(struct trace_iterator *iter) @@ -1468,12 +1468,12 @@ static __init int init_graph_trace(void) max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { - pr_warning("Warning: could not register graph trace events\n"); + pr_warn("Warning: could not register graph trace events\n"); return 1; } if (!register_trace_event(&graph_trace_ret_event)) { - pr_warning("Warning: could not register graph trace events\n"); + pr_warn("Warning: could not register graph trace events\n"); return 1; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 21b81a41dae5..919e0ddd8fcc 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -459,16 +459,14 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; else { - pr_warning("Could not insert probe at %s+%lu: %d\n", - trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); + pr_warn("Could not insert probe at %s+%lu: %d\n", + trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { - pr_warning("This probe might be able to register after" - "target module is loaded. 
Continue.\n"); + pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); ret = 0; } else if (ret == -EILSEQ) { - pr_warning("Probing address(0x%p) is not an " - "instruction boundary.\n", - tk->rp.kp.addr); + pr_warn("Probing address(0x%p) is not an instruction boundary.\n", + tk->rp.kp.addr); ret = -EINVAL; } } @@ -529,7 +527,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register new event */ ret = register_kprobe_event(tk); if (ret) { - pr_warning("Failed to register probe event(%d)\n", ret); + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } @@ -564,10 +562,9 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, __unregister_trace_kprobe(tk); ret = __register_trace_kprobe(tk); if (ret) - pr_warning("Failed to re-register probe %s on" - "%s: %d\n", - trace_event_name(&tk->tp.call), - mod->name, ret); + pr_warn("Failed to re-register probe %s on %s: %d\n", + trace_event_name(&tk->tp.call), + mod->name, ret); } } mutex_unlock(&probe_lock); @@ -1336,16 +1333,14 @@ static __init int init_kprobe_trace(void) /* Event list interface */ if (!entry) - pr_warning("Could not create tracefs " - "'kprobe_events' entry\n"); + pr_warn("Could not create tracefs 'kprobe_events' entry\n"); /* Profile interface */ entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, NULL, &kprobe_profile_ops); if (!entry) - pr_warning("Could not create tracefs " - "'kprobe_profile' entry\n"); + pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); return 0; } fs_initcall(init_kprobe_trace); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 2be8c4f2403d..68f376ca6d3f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -146,7 +146,7 @@ static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, /* XXX: This is later than where events were lost. 
*/ trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); if (!overrun_detected) - pr_warning("mmiotrace has lost events.\n"); + pr_warn("mmiotrace has lost events\n"); overrun_detected = true; goto print_out; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1769a81da8a7..1d372fa6fefb 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -636,8 +636,8 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, *tmp = '\0'; size = tmp - kbuf + 1; } else if (done + size < count) { - pr_warning("Line length is too long: " - "Should be less than %d.", WRITE_BUFSIZE); + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE); ret = -EINVAL; goto out; } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 6cf935316769..413ff108fbd0 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -281,8 +281,7 @@ static int tracing_stat_init(void) stat_dir = tracefs_create_dir("trace_stat", d_tracing); if (!stat_dir) - pr_warning("Could not create tracefs " - "'trace_stat' entry\n"); + pr_warn("Could not create tracefs 'trace_stat' entry\n"); return 0; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d2f6d0be3503..7915142c89e4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -334,7 +334,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) ret = register_uprobe_event(tu); if (ret) { - pr_warning("Failed to register probe event(%d)\n", ret); + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index ecd536de603a..d0639d917899 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -491,7 +491,7 @@ static __init int init_tracepoints(void) ret = register_module_notifier(&tracepoint_module_nb); if (ret) - pr_warning("Failed to register tracepoint module enter notifier\n"); + pr_warn("Failed to register tracepoint module enter notifier\n"); return ret; } From e77986b560741f40ac30755afc57074898da31e8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Mar 2016 14:28:12 -0700 Subject: [PATCH 78/81] alpha/extable: use generic search and sort routines Replace the arch specific versions of search_extable() and sort_extable() with calls to the generic ones, which now support relative exception tables as well. 
Signed-off-by: Ard Biesheuvel Acked-by: Richard Henderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/uaccess.h | 10 +++- arch/alpha/mm/Makefile | 2 +- arch/alpha/mm/extable.c | 92 -------------------------------- 3 files changed, 9 insertions(+), 95 deletions(-) delete mode 100644 arch/alpha/mm/extable.c diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h index 9b0d40093c9a..c419b43c461d 100644 --- a/arch/alpha/include/asm/uaccess.h +++ b/arch/alpha/include/asm/uaccess.h @@ -483,7 +483,13 @@ struct exception_table_entry (pc) + (_fixup)->fixup.bits.nextinsn; \ }) -#define ARCH_HAS_SORT_EXTABLE -#define ARCH_HAS_SEARCH_EXTABLE +#define ARCH_HAS_RELATIVE_EXTABLE + +#define swap_ex_entry_fixup(a, b, tmp, delta) \ + do { \ + (a)->fixup.unit = (b)->fixup.unit; \ + (b)->fixup.unit = (tmp).fixup.unit; \ + } while (0) + #endif /* __ALPHA_UACCESS_H */ diff --git a/arch/alpha/mm/Makefile b/arch/alpha/mm/Makefile index c993d3f93cf6..5a9807936411 100644 --- a/arch/alpha/mm/Makefile +++ b/arch/alpha/mm/Makefile @@ -4,6 +4,6 @@ ccflags-y := -Werror -obj-y := init.o fault.o extable.o +obj-y := init.o fault.o obj-$(CONFIG_DISCONTIGMEM) += numa.o diff --git a/arch/alpha/mm/extable.c b/arch/alpha/mm/extable.c deleted file mode 100644 index 813c9b63c0e1..000000000000 --- a/arch/alpha/mm/extable.c +++ /dev/null @@ -1,92 +0,0 @@ -/* - * linux/arch/alpha/mm/extable.c - */ - -#include -#include -#include - -static inline unsigned long ex_to_addr(const struct exception_table_entry *x) -{ - return (unsigned long)&x->insn + x->insn; -} - -static void swap_ex(void *a, void *b, int size) -{ - struct exception_table_entry *ex_a = a, *ex_b = b; - unsigned long addr_a = ex_to_addr(ex_a), addr_b = ex_to_addr(ex_b); - unsigned int t = ex_a->fixup.unit; - - ex_a->fixup.unit = ex_b->fixup.unit; - ex_b->fixup.unit = t; - ex_a->insn = (int)(addr_b - (unsigned long)&ex_a->insn); - ex_b->insn = (int)(addr_a - (unsigned long)&ex_b->insn); -} - -/* - * The exception table needs to be sorted so that the binary - * search that we use to find entries in it works properly. - * This is used both for the kernel exception table and for - * the exception tables of modules that get loaded. - */ -static int cmp_ex(const void *a, const void *b) -{ - const struct exception_table_entry *x = a, *y = b; - - /* avoid overflow */ - if (ex_to_addr(x) > ex_to_addr(y)) - return 1; - if (ex_to_addr(x) < ex_to_addr(y)) - return -1; - return 0; -} - -void sort_extable(struct exception_table_entry *start, - struct exception_table_entry *finish) -{ - sort(start, finish - start, sizeof(struct exception_table_entry), - cmp_ex, swap_ex); -} - -#ifdef CONFIG_MODULES -/* - * Any entry referring to the module init will be at the beginning or - * the end. 
- */ -void trim_init_extable(struct module *m) -{ - /*trim the beginning*/ - while (m->num_exentries && - within_module_init(ex_to_addr(&m->extable[0]), m)) { - m->extable++; - m->num_exentries--; - } - /*trim the end*/ - while (m->num_exentries && - within_module_init(ex_to_addr(&m->extable[m->num_exentries-1]), - m)) - m->num_exentries--; -} -#endif /* CONFIG_MODULES */ - -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - while (first <= last) { - const struct exception_table_entry *mid; - unsigned long mid_value; - - mid = (last - first) / 2 + first; - mid_value = ex_to_addr(mid); - if (mid_value == value) - return mid; - else if (mid_value < value) - first = mid+1; - else - last = mid-1; - } - - return NULL; -} From c352e8b6de9808ec5edc792d54f8785de6626d02 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Mar 2016 14:28:14 -0700 Subject: [PATCH 79/81] s390/extable: use generic search and sort routines Replace the arch specific versions of search_extable() and sort_extable() with calls to the generic ones, which now support relative exception tables as well. Signed-off-by: Ard Biesheuvel Acked-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/uaccess.h | 8 +--- arch/s390/mm/Makefile | 2 +- arch/s390/mm/extable.c | 85 --------------------------------- 3 files changed, 2 insertions(+), 93 deletions(-) delete mode 100644 arch/s390/mm/extable.c diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 9dd4cc47ddc7..e0900ddf91dd 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -79,18 +79,12 @@ struct exception_table_entry int insn, fixup; }; -static inline unsigned long extable_insn(const struct exception_table_entry *x) -{ - return (unsigned long)&x->insn + x->insn; -} - static inline unsigned long extable_fixup(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; } -#define ARCH_HAS_SORT_EXTABLE -#define ARCH_HAS_SEARCH_EXTABLE +#define ARCH_HAS_RELATIVE_EXTABLE /** * __copy_from_user: - Copy a block of data from user space, with less checking. diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 2ae54cad2b6a..0aa0ad165d8b 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -3,7 +3,7 @@ # obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o -obj-y += page-states.o gup.o extable.o pageattr.o mem_detect.o +obj-y += page-states.o gup.o pageattr.o mem_detect.o obj-y += pgtable.o pgalloc.o obj-$(CONFIG_CMM) += cmm.o diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c deleted file mode 100644 index 18c8b819b0aa..000000000000 --- a/arch/s390/mm/extable.c +++ /dev/null @@ -1,85 +0,0 @@ -#include -#include -#include - -/* - * Search one exception table for an entry corresponding to the - * given instruction address, and return the address of the entry, - * or NULL if none is found. - * We use a binary search, and thus we assume that the table is - * already sorted. 
- */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - const struct exception_table_entry *mid; - unsigned long addr; - - while (first <= last) { - mid = ((last - first) >> 1) + first; - addr = extable_insn(mid); - if (addr < value) - first = mid + 1; - else if (addr > value) - last = mid - 1; - else - return mid; - } - return NULL; -} - -/* - * The exception table needs to be sorted so that the binary - * search that we use to find entries in it works properly. - * This is used both for the kernel exception table and for - * the exception tables of modules that get loaded. - * - */ -static int cmp_ex(const void *a, const void *b) -{ - const struct exception_table_entry *x = a, *y = b; - - /* This compare is only valid after normalization. */ - return x->insn - y->insn; -} - -void sort_extable(struct exception_table_entry *start, - struct exception_table_entry *finish) -{ - struct exception_table_entry *p; - int i; - - /* Normalize entries to being relative to the start of the section */ - for (p = start, i = 0; p < finish; p++, i += 8) { - p->insn += i; - p->fixup += i + 4; - } - sort(start, finish - start, sizeof(*start), cmp_ex, NULL); - /* Denormalize all entries */ - for (p = start, i = 0; p < finish; p++, i += 8) { - p->insn -= i; - p->fixup -= i + 4; - } -} - -#ifdef CONFIG_MODULES -/* - * If the exception table is sorted, any referring to the module init - * will be at the beginning or the end. - */ -void trim_init_extable(struct module *m) -{ - /* Trim the beginning */ - while (m->num_exentries && - within_module_init(extable_insn(&m->extable[0]), m)) { - m->extable++; - m->num_exentries--; - } - /* Trim the end */ - while (m->num_exentries && - within_module_init(extable_insn(&m->extable[m->num_exentries-1]), m)) - m->num_exentries--; -} -#endif /* CONFIG_MODULES */ From 29934b0fb8ff5afa61832ea56aec65befab30511 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Mar 2016 14:28:17 -0700 Subject: [PATCH 80/81] x86/extable: use generic search and sort routines Replace the arch specific versions of search_extable() and sort_extable() with calls to the generic ones, which now support relative exception tables as well. Signed-off-by: Ard Biesheuvel Acked-by: H. 
Peter Anvin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess.h | 5 +- arch/x86/mm/extable.c | 108 --------------------------------- 2 files changed, 2 insertions(+), 111 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 88bff6dd23ad..a969ae607be8 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -105,9 +105,8 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un struct exception_table_entry { int insn, fixup, handler; }; -/* This is not the generic standard exception_table_entry format */ -#define ARCH_HAS_SORT_EXTABLE -#define ARCH_HAS_SEARCH_EXTABLE + +#define ARCH_HAS_RELATIVE_EXTABLE extern int fixup_exception(struct pt_regs *regs, int trapnr); extern bool ex_has_fault_handler(unsigned long ip); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 9dd7e4b7fcde..82447b3fba38 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -1,16 +1,9 @@ #include -#include -#include #include typedef bool (*ex_handler_t)(const struct exception_table_entry *, struct pt_regs *, int); -static inline unsigned long -ex_insn_addr(const struct exception_table_entry *x) -{ - return (unsigned long)&x->insn + x->insn; -} static inline unsigned long ex_fixup_addr(const struct exception_table_entry *x) { @@ -110,104 +103,3 @@ int __init early_fixup_exception(unsigned long *ip) *ip = new_ip; return 1; } - -/* - * Search one exception table for an entry corresponding to the - * given instruction address, and return the address of the entry, - * or NULL if none is found. - * We use a binary search, and thus we assume that the table is - * already sorted. - */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - while (first <= last) { - const struct exception_table_entry *mid; - unsigned long addr; - - mid = ((last - first) >> 1) + first; - addr = ex_insn_addr(mid); - if (addr < value) - first = mid + 1; - else if (addr > value) - last = mid - 1; - else - return mid; - } - return NULL; -} - -/* - * The exception table needs to be sorted so that the binary - * search that we use to find entries in it works properly. - * This is used both for the kernel exception table and for - * the exception tables of modules that get loaded. - * - */ -static int cmp_ex(const void *a, const void *b) -{ - const struct exception_table_entry *x = a, *y = b; - - /* - * This value will always end up fittin in an int, because on - * both i386 and x86-64 the kernel symbol-reachable address - * space is < 2 GiB. - * - * This compare is only valid after normalization. - */ - return x->insn - y->insn; -} - -void sort_extable(struct exception_table_entry *start, - struct exception_table_entry *finish) -{ - struct exception_table_entry *p; - int i; - - /* Convert all entries to being relative to the start of the section */ - i = 0; - for (p = start; p < finish; p++) { - p->insn += i; - i += 4; - p->fixup += i; - i += 4; - p->handler += i; - i += 4; - } - - sort(start, finish - start, sizeof(struct exception_table_entry), - cmp_ex, NULL); - - /* Denormalize all entries */ - i = 0; - for (p = start; p < finish; p++) { - p->insn -= i; - i += 4; - p->fixup -= i; - i += 4; - p->handler -= i; - i += 4; - } -} - -#ifdef CONFIG_MODULES -/* - * If the exception table is sorted, any referring to the module init - * will be at the beginning or the end. 
- */ -void trim_init_extable(struct module *m) -{ - /*trim the beginning*/ - while (m->num_exentries && - within_module_init(ex_insn_addr(&m->extable[0]), m)) { - m->extable++; - m->num_exentries--; - } - /*trim the end*/ - while (m->num_exentries && - within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m)) - m->num_exentries--; -} -#endif /* CONFIG_MODULES */ From 8fe9752ef10343a8edb603cb93abc2bfae34e748 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 22 Mar 2016 14:28:20 -0700 Subject: [PATCH 81/81] ia64/extable: use generic search and sort routines Replace the arch specific versions of search_extable() and sort_extable() with calls to the generic ones, which now support relative exception tables as well. Signed-off-by: Ard Biesheuvel Acked-by: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/uaccess.h | 8 +-- arch/ia64/mm/extable.c | 97 +-------------------------------- 2 files changed, 4 insertions(+), 101 deletions(-) diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h index 4f3fb6ccbf21..2189d5ddc1ee 100644 --- a/arch/ia64/include/asm/uaccess.h +++ b/arch/ia64/include/asm/uaccess.h @@ -341,13 +341,11 @@ extern unsigned long __strnlen_user (const char __user *, long); __su_ret; \ }) -/* Generic code can't deal with the location-relative format that we use for compactness. */ -#define ARCH_HAS_SORT_EXTABLE -#define ARCH_HAS_SEARCH_EXTABLE +#define ARCH_HAS_RELATIVE_EXTABLE struct exception_table_entry { - int addr; /* location-relative address of insn this fixup is for */ - int cont; /* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */ + int insn; /* location-relative address of insn this fixup is for */ + int fixup; /* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */ }; extern void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e); diff --git a/arch/ia64/mm/extable.c b/arch/ia64/mm/extable.c index c99a41e29fe8..8f70bb2d0c37 100644 --- a/arch/ia64/mm/extable.c +++ b/arch/ia64/mm/extable.c @@ -5,107 +5,12 @@ * David Mosberger-Tang */ -#include - #include -#include - -static int cmp_ex(const void *a, const void *b) -{ - const struct exception_table_entry *l = a, *r = b; - u64 lip = (u64) &l->addr + l->addr; - u64 rip = (u64) &r->addr + r->addr; - - /* avoid overflow */ - if (lip > rip) - return 1; - if (lip < rip) - return -1; - return 0; -} - -static void swap_ex(void *a, void *b, int size) -{ - struct exception_table_entry *l = a, *r = b, tmp; - u64 delta = (u64) r - (u64) l; - - tmp = *l; - l->addr = r->addr + delta; - l->cont = r->cont + delta; - r->addr = tmp.addr - delta; - r->cont = tmp.cont - delta; -} - -/* - * Sort the exception table. It's usually already sorted, but there - * may be unordered entries due to multiple text sections (such as the - * .init text section). Note that the exception-table-entries contain - * location-relative addresses, which requires a bit of care during - * sorting to avoid overflows in the offset members (e.g., it would - * not be safe to make a temporary copy of an exception-table entry on - * the stack, because the stack may be more than 2GB away from the - * exception-table). 
- */ -void sort_extable (struct exception_table_entry *start, - struct exception_table_entry *finish) -{ - sort(start, finish - start, sizeof(struct exception_table_entry), - cmp_ex, swap_ex); -} - -static inline unsigned long ex_to_addr(const struct exception_table_entry *x) -{ - return (unsigned long)&x->addr + x->addr; -} - -#ifdef CONFIG_MODULES -/* - * Any entry referring to the module init will be at the beginning or - * the end. - */ -void trim_init_extable(struct module *m) -{ - /*trim the beginning*/ - while (m->num_exentries && - within_module_init(ex_to_addr(&m->extable[0]), m)) { - m->extable++; - m->num_exentries--; - } - /*trim the end*/ - while (m->num_exentries && - within_module_init(ex_to_addr(&m->extable[m->num_exentries-1]), - m)) - m->num_exentries--; -} -#endif /* CONFIG_MODULES */ - -const struct exception_table_entry * -search_extable (const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long ip) -{ - const struct exception_table_entry *mid; - unsigned long mid_ip; - long diff; - - while (first <= last) { - mid = &first[(last - first)/2]; - mid_ip = (u64) &mid->addr + mid->addr; - diff = mid_ip - ip; - if (diff == 0) - return mid; - else if (diff < 0) - first = mid + 1; - else - last = mid - 1; - } - return NULL; -} void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e) { - long fix = (u64) &e->cont + e->cont; + long fix = (u64) &e->fixup + e->fixup; regs->r8 = -EFAULT; if (fix & 4)