aboutsummaryrefslogtreecommitdiff
path: root/arch/riscv/lib
diff options
context:
space:
mode:
authorLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
committerLibravatar Linus Torvalds <torvalds@linux-foundation.org>2023-02-21 18:24:12 -0800
commit5b7c4cabbb65f5c469464da6c5f614cbd7f730f2 (patch)
treecc5c2d0a898769fd59549594fedb3ee6f84e59a0 /arch/riscv/lib
downloadlinux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.tar.gz
linux-5b7c4cabbb65f5c469464da6c5f614cbd7f730f2.zip
Merge tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-nextgrafted
Pull networking updates from Jakub Kicinski: "Core: - Add dedicated kmem_cache for typical/small skb->head, avoid having to access struct page at kfree time, and improve memory use. - Introduce sysctl to set default RPS configuration for new netdevs. - Define Netlink protocol specification format which can be used to describe messages used by each family and auto-generate parsers. Add tools for generating kernel data structures and uAPI headers. - Expose all net/core sysctls inside netns. - Remove 4s sleep in netpoll if carrier is instantly detected on boot. - Add configurable limit of MDB entries per port, and port-vlan. - Continue populating drop reasons throughout the stack. - Retire a handful of legacy Qdiscs and classifiers. Protocols: - Support IPv4 big TCP (TSO frames larger than 64kB). - Add IP_LOCAL_PORT_RANGE socket option, to control local port range on socket by socket basis. - Track and report in procfs number of MPTCP sockets used. - Support mixing IPv4 and IPv6 flows in the in-kernel MPTCP path manager. - IPv6: don't check net.ipv6.route.max_size and rely on garbage collection to free memory (similarly to IPv4). - Support Penultimate Segment Pop (PSP) flavor in SRv6 (RFC8986). - ICMP: add per-rate limit counters. - Add support for user scanning requests in ieee802154. - Remove static WEP support. - Support minimal Wi-Fi 7 Extremely High Throughput (EHT) rate reporting. - WiFi 7 EHT channel puncturing support (client & AP). BPF: - Add a rbtree data structure following the "next-gen data structure" precedent set by recently added linked list, that is, by using kfunc + kptr instead of adding a new BPF map type. - Expose XDP hints via kfuncs with initial support for RX hash and timestamp metadata. - Add BPF_F_NO_TUNNEL_KEY extension to bpf_skb_set_tunnel_key to better support decap on GRE tunnel devices not operating in collect metadata. - Improve x86 JIT's codegen for PROBE_MEM runtime error checks. - Remove the need for trace_printk_lock for bpf_trace_printk and bpf_trace_vprintk helpers. - Extend libbpf's bpf_tracing.h support for tracing arguments of kprobes/uprobes and syscall as a special case. - Significantly reduce the search time for module symbols by livepatch and BPF. - Enable cpumasks to be used as kptrs, which is useful for tracing programs tracking which tasks end up running on which CPUs in different time intervals. - Add support for BPF trampoline on s390x and riscv64. - Add capability to export the XDP features supported by the NIC. - Add __bpf_kfunc tag for marking kernel functions as kfuncs. - Add cgroup.memory=nobpf kernel parameter option to disable BPF memory accounting for container environments. Netfilter: - Remove the CLUSTERIP target. It has been marked as obsolete for years, and we still have WARN splats wrt races of the out-of-band /proc interface installed by this target. - Add 'destroy' commands to nf_tables. They are identical to the existing 'delete' commands, but do not return an error if the referenced object (set, chain, rule...) did not exist. Driver API: - Improve cpumask_local_spread() locality to help NICs set the right IRQ affinity on AMD platforms. - Separate C22 and C45 MDIO bus transactions more clearly. - Introduce new DCB table to control DSCP rewrite on egress. - Support configuration of Physical Layer Collision Avoidance (PLCA) Reconciliation Sublayer (RS) (802.3cg-2019). Modern version of shared medium Ethernet. - Support for MAC Merge layer (IEEE 802.3-2018 clause 99). Allowing preemption of low priority frames by high priority frames. - Add support for controlling MACSec offload using netlink SET. - Rework devlink instance refcounts to allow registration and de-registration under the instance lock. Split the code into multiple files, drop some of the unnecessarily granular locks and factor out common parts of netlink operation handling. - Add TX frame aggregation parameters (for USB drivers). - Add a new attr TCA_EXT_WARN_MSG to report TC (offload) warning messages with notifications for debug. - Allow offloading of UDP NEW connections via act_ct. - Add support for per action HW stats in TC. - Support hardware miss to TC action (continue processing in SW from a specific point in the action chain). - Warn if old Wireless Extension user space interface is used with modern cfg80211/mac80211 drivers. Do not support Wireless Extensions for Wi-Fi 7 devices at all. Everyone should switch to using nl80211 interface instead. - Improve the CAN bit timing configuration. Use extack to return error messages directly to user space, update the SJW handling, including the definition of a new default value that will benefit CAN-FD controllers, by increasing their oscillator tolerance. New hardware / drivers: - Ethernet: - nVidia BlueField-3 support (control traffic driver) - Ethernet support for imx93 SoCs - Motorcomm yt8531 gigabit Ethernet PHY - onsemi NCN26000 10BASE-T1S PHY (with support for PLCA) - Microchip LAN8841 PHY (incl. cable diagnostics and PTP) - Amlogic gxl MDIO mux - WiFi: - RealTek RTL8188EU (rtl8xxxu) - Qualcomm Wi-Fi 7 devices (ath12k) - CAN: - Renesas R-Car V4H Drivers: - Bluetooth: - Set Per Platform Antenna Gain (PPAG) for Intel controllers. - Ethernet NICs: - Intel (1G, igc): - support TSN / Qbv / packet scheduling features of i226 model - Intel (100G, ice): - use GNSS subsystem instead of TTY - multi-buffer XDP support - extend support for GPIO pins to E823 devices - nVidia/Mellanox: - update the shared buffer configuration on PFC commands - implement PTP adjphase function for HW offset control - TC support for Geneve and GRE with VF tunnel offload - more efficient crypto key management method - multi-port eswitch support - Netronome/Corigine: - add DCB IEEE support - support IPsec offloading for NFP3800 - Freescale/NXP (enetc): - support XDP_REDIRECT for XDP non-linear buffers - improve reconfig, avoid link flap and waiting for idle - support MAC Merge layer - Other NICs: - sfc/ef100: add basic devlink support for ef100 - ionic: rx_push mode operation (writing descriptors via MMIO) - bnxt: use the auxiliary bus abstraction for RDMA - r8169: disable ASPM and reset bus in case of tx timeout - cpsw: support QSGMII mode for J721e CPSW9G - cpts: support pulse-per-second output - ngbe: add an mdio bus driver - usbnet: optimize usbnet_bh() by avoiding unnecessary queuing - r8152: handle devices with FW with NCM support - amd-xgbe: support 10Mbps, 2.5GbE speeds and rx-adaptation - virtio-net: support multi buffer XDP - virtio/vsock: replace virtio_vsock_pkt with sk_buff - tsnep: XDP support - Ethernet high-speed switches: - nVidia/Mellanox (mlxsw): - add support for latency TLV (in FW control messages) - Microchip (sparx5): - separate explicit and implicit traffic forwarding rules, make the implicit rules always active - add support for egress DSCP rewrite - IS0 VCAP support (Ingress Classification) - IS2 VCAP filters (protos, L3 addrs, L4 ports, flags, ToS etc.) - ES2 VCAP support (Egress Access Control) - support for Per-Stream Filtering and Policing (802.1Q, 8.6.5.1) - Ethernet embedded switches: - Marvell (mv88e6xxx): - add MAB (port auth) offload support - enable PTP receive for mv88e6390 - NXP (ocelot): - support MAC Merge layer - support for the the vsc7512 internal copper phys - Microchip: - lan9303: convert to PHYLINK - lan966x: support TC flower filter statistics - lan937x: PTP support for KSZ9563/KSZ8563 and LAN937x - lan937x: support Credit Based Shaper configuration - ksz9477: support Energy Efficient Ethernet - other: - qca8k: convert to regmap read/write API, use bulk operations - rswitch: Improve TX timestamp accuracy - Intel WiFi (iwlwifi): - EHT (Wi-Fi 7) rate reporting - STEP equalizer support: transfer some STEP (connection to radio on platforms with integrated wifi) related parameters from the BIOS to the firmware. - Qualcomm 802.11ax WiFi (ath11k): - IPQ5018 support - Fine Timing Measurement (FTM) responder role support - channel 177 support - MediaTek WiFi (mt76): - per-PHY LED support - mt7996: EHT (Wi-Fi 7) support - Wireless Ethernet Dispatch (WED) reset support - switch to using page pool allocator - RealTek WiFi (rtw89): - support new version of Bluetooth co-existance - Mobile: - rmnet: support TX aggregation" * tag 'net-next-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1872 commits) page_pool: add a comment explaining the fragment counter usage net: ethtool: fix __ethtool_dev_mm_supported() implementation ethtool: pse-pd: Fix double word in comments xsk: add linux/vmalloc.h to xsk.c sefltests: netdevsim: wait for devlink instance after netns removal selftest: fib_tests: Always cleanup before exit net/mlx5e: Align IPsec ASO result memory to be as required by hardware net/mlx5e: TC, Set CT miss to the specific ct action instance net/mlx5e: Rename CHAIN_TO_REG to MAPPED_OBJ_TO_REG net/mlx5: Refactor tc miss handling to a single function net/mlx5: Kconfig: Make tc offload depend on tc skb extension net/sched: flower: Support hardware miss to tc action net/sched: flower: Move filter handle initialization earlier net/sched: cls_api: Support hardware miss to tc action net/sched: Rename user cookie and act cookie sfc: fix builds without CONFIG_RTC_LIB sfc: clean up some inconsistent indentings net/mlx4_en: Introduce flexible array to silence overflow warning net: lan966x: Fix possible deadlock inside PTP net/ulp: Remove redundant ->clone() test in inet_clone_ulp(). ...
Diffstat (limited to 'arch/riscv/lib')
-rw-r--r--arch/riscv/lib/Makefile9
-rw-r--r--arch/riscv/lib/delay.c111
-rw-r--r--arch/riscv/lib/error-inject.c10
-rw-r--r--arch/riscv/lib/memcpy.S108
-rw-r--r--arch/riscv/lib/memmove.S316
-rw-r--r--arch/riscv/lib/memset.S113
-rw-r--r--arch/riscv/lib/tishift.S76
-rw-r--r--arch/riscv/lib/uaccess.S234
8 files changed, 977 insertions, 0 deletions
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
new file mode 100644
index 000000000..25d5c9664
--- /dev/null
+++ b/arch/riscv/lib/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+lib-y += delay.o
+lib-y += memcpy.o
+lib-y += memset.o
+lib-y += memmove.o
+lib-$(CONFIG_MMU) += uaccess.o
+lib-$(CONFIG_64BIT) += tishift.o
+
+obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
diff --git a/arch/riscv/lib/delay.c b/arch/riscv/lib/delay.c
new file mode 100644
index 000000000..49d510ba7
--- /dev/null
+++ b/arch/riscv/lib/delay.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 Regents of the University of California
+ */
+
+#include <linux/delay.h>
+#include <linux/math.h>
+#include <linux/param.h>
+#include <linux/timex.h>
+#include <linux/types.h>
+#include <linux/export.h>
+
+#include <asm/processor.h>
+
+/*
+ * This is copies from arch/arm/include/asm/delay.h
+ *
+ * Loop (or tick) based delay:
+ *
+ * loops = loops_per_jiffy * jiffies_per_sec * delay_us / us_per_sec
+ *
+ * where:
+ *
+ * jiffies_per_sec = HZ
+ * us_per_sec = 1000000
+ *
+ * Therefore the constant part is HZ / 1000000 which is a small
+ * fractional number. To make this usable with integer math, we
+ * scale up this constant by 2^31, perform the actual multiplication,
+ * and scale the result back down by 2^31 with a simple shift:
+ *
+ * loops = (loops_per_jiffy * delay_us * UDELAY_MULT) >> 31
+ *
+ * where:
+ *
+ * UDELAY_MULT = 2^31 * HZ / 1000000
+ * = (2^31 / 1000000) * HZ
+ * = 2147.483648 * HZ
+ * = 2147 * HZ + 483648 * HZ / 1000000
+ *
+ * 31 is the biggest scale shift value that won't overflow 32 bits for
+ * delay_us * UDELAY_MULT assuming HZ <= 1000 and delay_us <= 2000.
+ */
+#define MAX_UDELAY_US 2000
+#define MAX_UDELAY_HZ 1000
+#define UDELAY_MULT (2147UL * HZ + 483648UL * HZ / 1000000UL)
+#define UDELAY_SHIFT 31
+
+#if HZ > MAX_UDELAY_HZ
+#error "HZ > MAX_UDELAY_HZ"
+#endif
+
+/*
+ * RISC-V supports both UDELAY and NDELAY. This is largely the same as above,
+ * but with different constants. I added 10 bits to the shift to get this, but
+ * the result is that I need a 64-bit multiply, which is slow on 32-bit
+ * platforms.
+ *
+ * NDELAY_MULT = 2^41 * HZ / 1000000000
+ * = (2^41 / 1000000000) * HZ
+ * = 2199.02325555 * HZ
+ * = 2199 * HZ + 23255550 * HZ / 1000000000
+ *
+ * The maximum here is to avoid 64-bit overflow, but it isn't checked as it
+ * won't happen.
+ */
+#define MAX_NDELAY_NS (1ULL << 42)
+#define MAX_NDELAY_HZ MAX_UDELAY_HZ
+#define NDELAY_MULT ((unsigned long long)(2199ULL * HZ + 23255550ULL * HZ / 1000000000ULL))
+#define NDELAY_SHIFT 41
+
+#if HZ > MAX_NDELAY_HZ
+#error "HZ > MAX_NDELAY_HZ"
+#endif
+
+void __delay(unsigned long cycles)
+{
+ u64 t0 = get_cycles();
+
+ while ((unsigned long)(get_cycles() - t0) < cycles)
+ cpu_relax();
+}
+EXPORT_SYMBOL(__delay);
+
+void udelay(unsigned long usecs)
+{
+ u64 ucycles = (u64)usecs * lpj_fine * UDELAY_MULT;
+ u64 n;
+
+ if (unlikely(usecs > MAX_UDELAY_US)) {
+ n = (u64)usecs * riscv_timebase;
+ do_div(n, 1000000);
+
+ __delay(n);
+ return;
+ }
+
+ __delay(ucycles >> UDELAY_SHIFT);
+}
+EXPORT_SYMBOL(udelay);
+
+void ndelay(unsigned long nsecs)
+{
+ /*
+ * This doesn't bother checking for overflow, as it won't happen (it's
+ * an hour) of delay.
+ */
+ unsigned long long ncycles = nsecs * lpj_fine * NDELAY_MULT;
+ __delay(ncycles >> NDELAY_SHIFT);
+}
+EXPORT_SYMBOL(ndelay);
diff --git a/arch/riscv/lib/error-inject.c b/arch/riscv/lib/error-inject.c
new file mode 100644
index 000000000..d667ade2b
--- /dev/null
+++ b/arch/riscv/lib/error-inject.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/error-injection.h>
+#include <linux/kprobes.h>
+
+void override_function_with_return(struct pt_regs *regs)
+{
+ instruction_pointer_set(regs, regs->ra);
+}
+NOKPROBE_SYMBOL(override_function_with_return);
diff --git a/arch/riscv/lib/memcpy.S b/arch/riscv/lib/memcpy.S
new file mode 100644
index 000000000..51ab71625
--- /dev/null
+++ b/arch/riscv/lib/memcpy.S
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Regents of the University of California
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+/* void *memcpy(void *, const void *, size_t) */
+ENTRY(__memcpy)
+WEAK(memcpy)
+ move t6, a0 /* Preserve return value */
+
+ /* Defer to byte-oriented copy for small sizes */
+ sltiu a3, a2, 128
+ bnez a3, 4f
+ /* Use word-oriented copy only if low-order bits match */
+ andi a3, t6, SZREG-1
+ andi a4, a1, SZREG-1
+ bne a3, a4, 4f
+
+ beqz a3, 2f /* Skip if already aligned */
+ /*
+ * Round to nearest double word-aligned address
+ * greater than or equal to start address
+ */
+ andi a3, a1, ~(SZREG-1)
+ addi a3, a3, SZREG
+ /* Handle initial misalignment */
+ sub a4, a3, a1
+1:
+ lb a5, 0(a1)
+ addi a1, a1, 1
+ sb a5, 0(t6)
+ addi t6, t6, 1
+ bltu a1, a3, 1b
+ sub a2, a2, a4 /* Update count */
+
+2:
+ andi a4, a2, ~((16*SZREG)-1)
+ beqz a4, 4f
+ add a3, a1, a4
+3:
+ REG_L a4, 0(a1)
+ REG_L a5, SZREG(a1)
+ REG_L a6, 2*SZREG(a1)
+ REG_L a7, 3*SZREG(a1)
+ REG_L t0, 4*SZREG(a1)
+ REG_L t1, 5*SZREG(a1)
+ REG_L t2, 6*SZREG(a1)
+ REG_L t3, 7*SZREG(a1)
+ REG_L t4, 8*SZREG(a1)
+ REG_L t5, 9*SZREG(a1)
+ REG_S a4, 0(t6)
+ REG_S a5, SZREG(t6)
+ REG_S a6, 2*SZREG(t6)
+ REG_S a7, 3*SZREG(t6)
+ REG_S t0, 4*SZREG(t6)
+ REG_S t1, 5*SZREG(t6)
+ REG_S t2, 6*SZREG(t6)
+ REG_S t3, 7*SZREG(t6)
+ REG_S t4, 8*SZREG(t6)
+ REG_S t5, 9*SZREG(t6)
+ REG_L a4, 10*SZREG(a1)
+ REG_L a5, 11*SZREG(a1)
+ REG_L a6, 12*SZREG(a1)
+ REG_L a7, 13*SZREG(a1)
+ REG_L t0, 14*SZREG(a1)
+ REG_L t1, 15*SZREG(a1)
+ addi a1, a1, 16*SZREG
+ REG_S a4, 10*SZREG(t6)
+ REG_S a5, 11*SZREG(t6)
+ REG_S a6, 12*SZREG(t6)
+ REG_S a7, 13*SZREG(t6)
+ REG_S t0, 14*SZREG(t6)
+ REG_S t1, 15*SZREG(t6)
+ addi t6, t6, 16*SZREG
+ bltu a1, a3, 3b
+ andi a2, a2, (16*SZREG)-1 /* Update count */
+
+4:
+ /* Handle trailing misalignment */
+ beqz a2, 6f
+ add a3, a1, a2
+
+ /* Use word-oriented copy if co-aligned to word boundary */
+ or a5, a1, t6
+ or a5, a5, a3
+ andi a5, a5, 3
+ bnez a5, 5f
+7:
+ lw a4, 0(a1)
+ addi a1, a1, 4
+ sw a4, 0(t6)
+ addi t6, t6, 4
+ bltu a1, a3, 7b
+
+ ret
+
+5:
+ lb a4, 0(a1)
+ addi a1, a1, 1
+ sb a4, 0(t6)
+ addi t6, t6, 1
+ bltu a1, a3, 5b
+6:
+ ret
+END(__memcpy)
diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S
new file mode 100644
index 000000000..e0609e1f0
--- /dev/null
+++ b/arch/riscv/lib/memmove.S
@@ -0,0 +1,316 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+SYM_FUNC_START(__memmove)
+SYM_FUNC_START_WEAK(memmove)
+ /*
+ * Returns
+ * a0 - dest
+ *
+ * Parameters
+ * a0 - Inclusive first byte of dest
+ * a1 - Inclusive first byte of src
+ * a2 - Length of copy n
+ *
+ * Because the return matches the parameter register a0,
+ * we will not clobber or modify that register.
+ *
+ * Note: This currently only works on little-endian.
+ * To port to big-endian, reverse the direction of shifts
+ * in the 2 misaligned fixup copy loops.
+ */
+
+ /* Return if nothing to do */
+ beq a0, a1, return_from_memmove
+ beqz a2, return_from_memmove
+
+ /*
+ * Register Uses
+ * Forward Copy: a1 - Index counter of src
+ * Reverse Copy: a4 - Index counter of src
+ * Forward Copy: t3 - Index counter of dest
+ * Reverse Copy: t4 - Index counter of dest
+ * Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
+ * Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest
+ * Both Copy Modes: t0 - Link / Temporary for load-store
+ * Both Copy Modes: t1 - Temporary for load-store
+ * Both Copy Modes: t2 - Temporary for load-store
+ * Both Copy Modes: a5 - dest to src alignment offset
+ * Both Copy Modes: a6 - Shift ammount
+ * Both Copy Modes: a7 - Inverse Shift ammount
+ * Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
+ */
+
+ /*
+ * Solve for some register values now.
+ * Byte copy does not need t5 or t6.
+ */
+ mv t3, a0
+ add t4, a0, a2
+ add a4, a1, a2
+
+ /*
+ * Byte copy if copying less than (2 * SZREG) bytes. This can
+ * cause problems with the bulk copy implementation and is
+ * small enough not to bother.
+ */
+ andi t0, a2, -(2 * SZREG)
+ beqz t0, byte_copy
+
+ /*
+ * Now solve for t5 and t6.
+ */
+ andi t5, t3, -SZREG
+ andi t6, t4, -SZREG
+ /*
+ * If dest(Register t3) rounded down to the nearest naturally
+ * aligned SZREG address, does not equal dest, then add SZREG
+ * to find the low-bound of SZREG alignment in the dest memory
+ * region. Note that this could overshoot the dest memory
+ * region if n is less than SZREG. This is one reason why
+ * we always byte copy if n is less than SZREG.
+ * Otherwise, dest is already naturally aligned to SZREG.
+ */
+ beq t5, t3, 1f
+ addi t5, t5, SZREG
+ 1:
+
+ /*
+ * If the dest and src are co-aligned to SZREG, then there is
+ * no need for the full rigmarole of a full misaligned fixup copy.
+ * Instead, do a simpler co-aligned copy.
+ */
+ xor t0, a0, a1
+ andi t1, t0, (SZREG - 1)
+ beqz t1, coaligned_copy
+ /* Fall through to misaligned fixup copy */
+
+misaligned_fixup_copy:
+ bltu a1, a0, misaligned_fixup_copy_reverse
+
+misaligned_fixup_copy_forward:
+ jal t0, byte_copy_until_aligned_forward
+
+ andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
+ slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
+ sub a5, a1, t3 /* Find the difference between src and dest */
+ andi a1, a1, -SZREG /* Align the src pointer */
+ addi a2, t6, SZREG /* The other breakpoint for the unrolled loop*/
+
+ /*
+ * Compute The Inverse Shift
+ * a7 = XLEN - a6 = XLEN + -a6
+ * 2s complement negation to find the negative: -a6 = ~a6 + 1
+ * Add that to XLEN. XLEN = SZREG * 8.
+ */
+ not a7, a6
+ addi a7, a7, (SZREG * 8 + 1)
+
+ /*
+ * Fix Misalignment Copy Loop - Forward
+ * load_val0 = load_ptr[0];
+ * do {
+ * load_val1 = load_ptr[1];
+ * store_ptr += 2;
+ * store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
+ *
+ * if (store_ptr == {a2})
+ * break;
+ *
+ * load_val0 = load_ptr[2];
+ * load_ptr += 2;
+ * store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
+ *
+ * } while (store_ptr != store_ptr_end);
+ * store_ptr = store_ptr_end;
+ */
+
+ REG_L t0, (0 * SZREG)(a1)
+ 1:
+ REG_L t1, (1 * SZREG)(a1)
+ addi t3, t3, (2 * SZREG)
+ srl t0, t0, a6
+ sll t2, t1, a7
+ or t2, t0, t2
+ REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)
+
+ beq t3, a2, 2f
+
+ REG_L t0, (2 * SZREG)(a1)
+ addi a1, a1, (2 * SZREG)
+ srl t1, t1, a6
+ sll t2, t0, a7
+ or t2, t1, t2
+ REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)
+
+ bne t3, t6, 1b
+ 2:
+ mv t3, t6 /* Fix the dest pointer in case the loop was broken */
+
+ add a1, t3, a5 /* Restore the src pointer */
+ j byte_copy_forward /* Copy any remaining bytes */
+
+misaligned_fixup_copy_reverse:
+ jal t0, byte_copy_until_aligned_reverse
+
+ andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
+ slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
+ sub a5, a4, t4 /* Find the difference between src and dest */
+ andi a4, a4, -SZREG /* Align the src pointer */
+ addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop*/
+
+ /*
+ * Compute The Inverse Shift
+ * a7 = XLEN - a6 = XLEN + -a6
+ * 2s complement negation to find the negative: -a6 = ~a6 + 1
+ * Add that to XLEN. XLEN = SZREG * 8.
+ */
+ not a7, a6
+ addi a7, a7, (SZREG * 8 + 1)
+
+ /*
+ * Fix Misalignment Copy Loop - Reverse
+ * load_val1 = load_ptr[0];
+ * do {
+ * load_val0 = load_ptr[-1];
+ * store_ptr -= 2;
+ * store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
+ *
+ * if (store_ptr == {a2})
+ * break;
+ *
+ * load_val1 = load_ptr[-2];
+ * load_ptr -= 2;
+ * store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
+ *
+ * } while (store_ptr != store_ptr_end);
+ * store_ptr = store_ptr_end;
+ */
+
+ REG_L t1, ( 0 * SZREG)(a4)
+ 1:
+ REG_L t0, (-1 * SZREG)(a4)
+ addi t4, t4, (-2 * SZREG)
+ sll t1, t1, a7
+ srl t2, t0, a6
+ or t2, t1, t2
+ REG_S t2, ( 1 * SZREG)(t4)
+
+ beq t4, a2, 2f
+
+ REG_L t1, (-2 * SZREG)(a4)
+ addi a4, a4, (-2 * SZREG)
+ sll t0, t0, a7
+ srl t2, t1, a6
+ or t2, t0, t2
+ REG_S t2, ( 0 * SZREG)(t4)
+
+ bne t4, t5, 1b
+ 2:
+ mv t4, t5 /* Fix the dest pointer in case the loop was broken */
+
+ add a4, t4, a5 /* Restore the src pointer */
+ j byte_copy_reverse /* Copy any remaining bytes */
+
+/*
+ * Simple copy loops for SZREG co-aligned memory locations.
+ * These also make calls to do byte copies for any unaligned
+ * data at their terminations.
+ */
+coaligned_copy:
+ bltu a1, a0, coaligned_copy_reverse
+
+coaligned_copy_forward:
+ jal t0, byte_copy_until_aligned_forward
+
+ 1:
+ REG_L t1, ( 0 * SZREG)(a1)
+ addi a1, a1, SZREG
+ addi t3, t3, SZREG
+ REG_S t1, (-1 * SZREG)(t3)
+ bne t3, t6, 1b
+
+ j byte_copy_forward /* Copy any remaining bytes */
+
+coaligned_copy_reverse:
+ jal t0, byte_copy_until_aligned_reverse
+
+ 1:
+ REG_L t1, (-1 * SZREG)(a4)
+ addi a4, a4, -SZREG
+ addi t4, t4, -SZREG
+ REG_S t1, ( 0 * SZREG)(t4)
+ bne t4, t5, 1b
+
+ j byte_copy_reverse /* Copy any remaining bytes */
+
+/*
+ * These are basically sub-functions within the function. They
+ * are used to byte copy until the dest pointer is in alignment.
+ * At which point, a bulk copy method can be used by the
+ * calling code. These work on the same registers as the bulk
+ * copy loops. Therefore, the register values can be picked
+ * up from where they were left and we avoid code duplication
+ * without any overhead except the call in and return jumps.
+ */
+byte_copy_until_aligned_forward:
+ beq t3, t5, 2f
+ 1:
+ lb t1, 0(a1)
+ addi a1, a1, 1
+ addi t3, t3, 1
+ sb t1, -1(t3)
+ bne t3, t5, 1b
+ 2:
+ jalr zero, 0x0(t0) /* Return to multibyte copy loop */
+
+byte_copy_until_aligned_reverse:
+ beq t4, t6, 2f
+ 1:
+ lb t1, -1(a4)
+ addi a4, a4, -1
+ addi t4, t4, -1
+ sb t1, 0(t4)
+ bne t4, t6, 1b
+ 2:
+ jalr zero, 0x0(t0) /* Return to multibyte copy loop */
+
+/*
+ * Simple byte copy loops.
+ * These will byte copy until they reach the end of data to copy.
+ * At that point, they will call to return from memmove.
+ */
+byte_copy:
+ bltu a1, a0, byte_copy_reverse
+
+byte_copy_forward:
+ beq t3, t4, 2f
+ 1:
+ lb t1, 0(a1)
+ addi a1, a1, 1
+ addi t3, t3, 1
+ sb t1, -1(t3)
+ bne t3, t4, 1b
+ 2:
+ ret
+
+byte_copy_reverse:
+ beq t4, t3, 2f
+ 1:
+ lb t1, -1(a4)
+ addi a4, a4, -1
+ addi t4, t4, -1
+ sb t1, 0(t4)
+ bne t4, t3, 1b
+ 2:
+
+return_from_memmove:
+ ret
+
+SYM_FUNC_END(memmove)
+SYM_FUNC_END(__memmove)
diff --git a/arch/riscv/lib/memset.S b/arch/riscv/lib/memset.S
new file mode 100644
index 000000000..34c5360c6
--- /dev/null
+++ b/arch/riscv/lib/memset.S
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Regents of the University of California
+ */
+
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+/* void *memset(void *, int, size_t) */
+ENTRY(__memset)
+WEAK(memset)
+ move t0, a0 /* Preserve return value */
+
+ /* Defer to byte-oriented fill for small sizes */
+ sltiu a3, a2, 16
+ bnez a3, 4f
+
+ /*
+ * Round to nearest XLEN-aligned address
+ * greater than or equal to start address
+ */
+ addi a3, t0, SZREG-1
+ andi a3, a3, ~(SZREG-1)
+ beq a3, t0, 2f /* Skip if already aligned */
+ /* Handle initial misalignment */
+ sub a4, a3, t0
+1:
+ sb a1, 0(t0)
+ addi t0, t0, 1
+ bltu t0, a3, 1b
+ sub a2, a2, a4 /* Update count */
+
+2: /* Duff's device with 32 XLEN stores per iteration */
+ /* Broadcast value into all bytes */
+ andi a1, a1, 0xff
+ slli a3, a1, 8
+ or a1, a3, a1
+ slli a3, a1, 16
+ or a1, a3, a1
+#ifdef CONFIG_64BIT
+ slli a3, a1, 32
+ or a1, a3, a1
+#endif
+
+ /* Calculate end address */
+ andi a4, a2, ~(SZREG-1)
+ add a3, t0, a4
+
+ andi a4, a4, 31*SZREG /* Calculate remainder */
+ beqz a4, 3f /* Shortcut if no remainder */
+ neg a4, a4
+ addi a4, a4, 32*SZREG /* Calculate initial offset */
+
+ /* Adjust start address with offset */
+ sub t0, t0, a4
+
+ /* Jump into loop body */
+ /* Assumes 32-bit instruction lengths */
+ la a5, 3f
+#ifdef CONFIG_64BIT
+ srli a4, a4, 1
+#endif
+ add a5, a5, a4
+ jr a5
+3:
+ REG_S a1, 0(t0)
+ REG_S a1, SZREG(t0)
+ REG_S a1, 2*SZREG(t0)
+ REG_S a1, 3*SZREG(t0)
+ REG_S a1, 4*SZREG(t0)
+ REG_S a1, 5*SZREG(t0)
+ REG_S a1, 6*SZREG(t0)
+ REG_S a1, 7*SZREG(t0)
+ REG_S a1, 8*SZREG(t0)
+ REG_S a1, 9*SZREG(t0)
+ REG_S a1, 10*SZREG(t0)
+ REG_S a1, 11*SZREG(t0)
+ REG_S a1, 12*SZREG(t0)
+ REG_S a1, 13*SZREG(t0)
+ REG_S a1, 14*SZREG(t0)
+ REG_S a1, 15*SZREG(t0)
+ REG_S a1, 16*SZREG(t0)
+ REG_S a1, 17*SZREG(t0)
+ REG_S a1, 18*SZREG(t0)
+ REG_S a1, 19*SZREG(t0)
+ REG_S a1, 20*SZREG(t0)
+ REG_S a1, 21*SZREG(t0)
+ REG_S a1, 22*SZREG(t0)
+ REG_S a1, 23*SZREG(t0)
+ REG_S a1, 24*SZREG(t0)
+ REG_S a1, 25*SZREG(t0)
+ REG_S a1, 26*SZREG(t0)
+ REG_S a1, 27*SZREG(t0)
+ REG_S a1, 28*SZREG(t0)
+ REG_S a1, 29*SZREG(t0)
+ REG_S a1, 30*SZREG(t0)
+ REG_S a1, 31*SZREG(t0)
+ addi t0, t0, 32*SZREG
+ bltu t0, a3, 3b
+ andi a2, a2, SZREG-1 /* Update count */
+
+4:
+ /* Handle trailing misalignment */
+ beqz a2, 6f
+ add a3, t0, a2
+5:
+ sb a1, 0(t0)
+ addi t0, t0, 1
+ bltu t0, a3, 5b
+6:
+ ret
+END(__memset)
diff --git a/arch/riscv/lib/tishift.S b/arch/riscv/lib/tishift.S
new file mode 100644
index 000000000..ef90075c4
--- /dev/null
+++ b/arch/riscv/lib/tishift.S
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2018 Free Software Foundation, Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm-generic/export.h>
+
+SYM_FUNC_START(__lshrti3)
+ beqz a2, .L1
+ li a5,64
+ sub a5,a5,a2
+ sext.w a4,a5
+ blez a5, .L2
+ sext.w a2,a2
+ srl a0,a0,a2
+ sll a4,a1,a4
+ srl a2,a1,a2
+ or a0,a0,a4
+ mv a1,a2
+.L1:
+ ret
+.L2:
+ negw a0,a4
+ li a2,0
+ srl a0,a1,a0
+ mv a1,a2
+ ret
+SYM_FUNC_END(__lshrti3)
+EXPORT_SYMBOL(__lshrti3)
+
+SYM_FUNC_START(__ashrti3)
+ beqz a2, .L3
+ li a5,64
+ sub a5,a5,a2
+ sext.w a4,a5
+ blez a5, .L4
+ sext.w a2,a2
+ srl a0,a0,a2
+ sll a4,a1,a4
+ sra a2,a1,a2
+ or a0,a0,a4
+ mv a1,a2
+.L3:
+ ret
+.L4:
+ negw a0,a4
+ srai a2,a1,0x3f
+ sra a0,a1,a0
+ mv a1,a2
+ ret
+SYM_FUNC_END(__ashrti3)
+EXPORT_SYMBOL(__ashrti3)
+
+SYM_FUNC_START(__ashlti3)
+ beqz a2, .L5
+ li a5,64
+ sub a5,a5,a2
+ sext.w a4,a5
+ blez a5, .L6
+ sext.w a2,a2
+ sll a1,a1,a2
+ srl a4,a0,a4
+ sll a2,a0,a2
+ or a1,a1,a4
+ mv a0,a2
+.L5:
+ ret
+.L6:
+ negw a1,a4
+ li a2,0
+ sll a1,a0,a1
+ mv a0,a2
+ ret
+SYM_FUNC_END(__ashlti3)
+EXPORT_SYMBOL(__ashlti3)
diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S
new file mode 100644
index 000000000..ec486e536
--- /dev/null
+++ b/arch/riscv/lib/uaccess.S
@@ -0,0 +1,234 @@
+#include <linux/linkage.h>
+#include <asm-generic/export.h>
+#include <asm/asm.h>
+#include <asm/asm-extable.h>
+#include <asm/csr.h>
+
+ .macro fixup op reg addr lbl
+100:
+ \op \reg, \addr
+ _asm_extable 100b, \lbl
+ .endm
+
+ENTRY(__asm_copy_to_user)
+ENTRY(__asm_copy_from_user)
+
+ /* Enable access to user memory */
+ li t6, SR_SUM
+ csrs CSR_STATUS, t6
+
+ /* Save for return value */
+ mv t5, a2
+
+ /*
+ * Register allocation for code below:
+ * a0 - start of uncopied dst
+ * a1 - start of uncopied src
+ * a2 - size
+ * t0 - end of uncopied dst
+ */
+ add t0, a0, a2
+
+ /*
+ * Use byte copy only if too small.
+ * SZREG holds 4 for RV32 and 8 for RV64
+ */
+ li a3, 9*SZREG /* size must be larger than size in word_copy */
+ bltu a2, a3, .Lbyte_copy_tail
+
+ /*
+ * Copy first bytes until dst is aligned to word boundary.
+ * a0 - start of dst
+ * t1 - start of aligned dst
+ */
+ addi t1, a0, SZREG-1
+ andi t1, t1, ~(SZREG-1)
+ /* dst is already aligned, skip */
+ beq a0, t1, .Lskip_align_dst
+1:
+ /* a5 - one byte for copying data */
+ fixup lb a5, 0(a1), 10f
+ addi a1, a1, 1 /* src */
+ fixup sb a5, 0(a0), 10f
+ addi a0, a0, 1 /* dst */
+ bltu a0, t1, 1b /* t1 - start of aligned dst */
+
+.Lskip_align_dst:
+ /*
+ * Now dst is aligned.
+ * Use shift-copy if src is misaligned.
+ * Use word-copy if both src and dst are aligned because
+ * can not use shift-copy which do not require shifting
+ */
+ /* a1 - start of src */
+ andi a3, a1, SZREG-1
+ bnez a3, .Lshift_copy
+
+.Lword_copy:
+ /*
+ * Both src and dst are aligned, unrolled word copy
+ *
+ * a0 - start of aligned dst
+ * a1 - start of aligned src
+ * t0 - end of aligned dst
+ */
+ addi t0, t0, -(8*SZREG) /* not to over run */
+2:
+ fixup REG_L a4, 0(a1), 10f
+ fixup REG_L a5, SZREG(a1), 10f
+ fixup REG_L a6, 2*SZREG(a1), 10f
+ fixup REG_L a7, 3*SZREG(a1), 10f
+ fixup REG_L t1, 4*SZREG(a1), 10f
+ fixup REG_L t2, 5*SZREG(a1), 10f
+ fixup REG_L t3, 6*SZREG(a1), 10f
+ fixup REG_L t4, 7*SZREG(a1), 10f
+ fixup REG_S a4, 0(a0), 10f
+ fixup REG_S a5, SZREG(a0), 10f
+ fixup REG_S a6, 2*SZREG(a0), 10f
+ fixup REG_S a7, 3*SZREG(a0), 10f
+ fixup REG_S t1, 4*SZREG(a0), 10f
+ fixup REG_S t2, 5*SZREG(a0), 10f
+ fixup REG_S t3, 6*SZREG(a0), 10f
+ fixup REG_S t4, 7*SZREG(a0), 10f
+ addi a0, a0, 8*SZREG
+ addi a1, a1, 8*SZREG
+ bltu a0, t0, 2b
+
+ addi t0, t0, 8*SZREG /* revert to original value */
+ j .Lbyte_copy_tail
+
+.Lshift_copy:
+
+ /*
+ * Word copy with shifting.
+ * For misaligned copy we still perform aligned word copy, but
+ * we need to use the value fetched from the previous iteration and
+ * do some shifts.
+ * This is safe because reading is less than a word size.
+ *
+ * a0 - start of aligned dst
+ * a1 - start of src
+ * a3 - a1 & mask:(SZREG-1)
+ * t0 - end of uncopied dst
+ * t1 - end of aligned dst
+ */
+ /* calculating aligned word boundary for dst */
+ andi t1, t0, ~(SZREG-1)
+ /* Converting unaligned src to aligned src */
+ andi a1, a1, ~(SZREG-1)
+
+ /*
+ * Calculate shifts
+ * t3 - prev shift
+ * t4 - current shift
+ */
+ slli t3, a3, 3 /* converting bytes in a3 to bits */
+ li a5, SZREG*8
+ sub t4, a5, t3
+
+ /* Load the first word to combine with second word */
+ fixup REG_L a5, 0(a1), 10f
+
+3:
+ /* Main shifting copy
+ *
+ * a0 - start of aligned dst
+ * a1 - start of aligned src
+ * t1 - end of aligned dst
+ */
+
+ /* At least one iteration will be executed */
+ srl a4, a5, t3
+ fixup REG_L a5, SZREG(a1), 10f
+ addi a1, a1, SZREG
+ sll a2, a5, t4
+ or a2, a2, a4
+ fixup REG_S a2, 0(a0), 10f
+ addi a0, a0, SZREG
+ bltu a0, t1, 3b
+
+ /* Revert src to original unaligned value */
+ add a1, a1, a3
+
+.Lbyte_copy_tail:
+ /*
+ * Byte copy anything left.
+ *
+ * a0 - start of remaining dst
+ * a1 - start of remaining src
+ * t0 - end of remaining dst
+ */
+ bgeu a0, t0, .Lout_copy_user /* check if end of copy */
+4:
+ fixup lb a5, 0(a1), 10f
+ addi a1, a1, 1 /* src */
+ fixup sb a5, 0(a0), 10f
+ addi a0, a0, 1 /* dst */
+ bltu a0, t0, 4b /* t0 - end of dst */
+
+.Lout_copy_user:
+ /* Disable access to user memory */
+ csrc CSR_STATUS, t6
+ li a0, 0
+ ret
+
+ /* Exception fixup code */
+10:
+ /* Disable access to user memory */
+ csrc CSR_STATUS, t6
+ mv a0, t5
+ ret
+ENDPROC(__asm_copy_to_user)
+ENDPROC(__asm_copy_from_user)
+EXPORT_SYMBOL(__asm_copy_to_user)
+EXPORT_SYMBOL(__asm_copy_from_user)
+
+
+ENTRY(__clear_user)
+
+ /* Enable access to user memory */
+ li t6, SR_SUM
+ csrs CSR_STATUS, t6
+
+ add a3, a0, a1
+ addi t0, a0, SZREG-1
+ andi t1, a3, ~(SZREG-1)
+ andi t0, t0, ~(SZREG-1)
+ /*
+ * a3: terminal address of target region
+ * t0: lowest doubleword-aligned address in target region
+ * t1: highest doubleword-aligned address in target region
+ */
+ bgeu t0, t1, 2f
+ bltu a0, t0, 4f
+1:
+ fixup REG_S, zero, (a0), 11f
+ addi a0, a0, SZREG
+ bltu a0, t1, 1b
+2:
+ bltu a0, a3, 5f
+
+3:
+ /* Disable access to user memory */
+ csrc CSR_STATUS, t6
+ li a0, 0
+ ret
+4: /* Edge case: unalignment */
+ fixup sb, zero, (a0), 11f
+ addi a0, a0, 1
+ bltu a0, t0, 4b
+ j 1b
+5: /* Edge case: remainder */
+ fixup sb, zero, (a0), 11f
+ addi a0, a0, 1
+ bltu a0, a3, 5b
+ j 3b
+
+ /* Exception fixup code */
+11:
+ /* Disable access to user memory */
+ csrc CSR_STATUS, t6
+ mv a0, a1
+ ret
+ENDPROC(__clear_user)
+EXPORT_SYMBOL(__clear_user)