[PATCH] Improve endian conversion in umac.c

rapier rapier at psc.edu
Fri Mar 11 07:22:01 AEDT 2022


Please forgive the overly formalized way I wrote this up. This is what I 
use for the notes I make as I need to report on a lot of this back to 
the NSF. So I just copied and pasted it here.

Test subject: OpenSSH 8.8p1 commit bf944e37

Test environment: 2 AMD EPYC 7502P 32-Core servers connected via 10Gb 
Intel X710 NICs in LAN, RTT < 0.5ms. ESNet TCP tuning applied.

Test method: 50 iterations of 'dd if=/dev/zero bs=1M count=20000 | ssh 
-caes256-ctr remote.host "cat > /dev/null"'. 5 second pause between each 
iteration.

Versions tested:
Stock openssh 8.8p1 (stock)
Migrating all endian functions from misc.c (full)
Migrating only [get|put]_u32_le from misc.c (little)

Test Results:

Flavor	mean,	median,	min,	max,	adev
stock   669.68,	670,	653,	688,	4.9728
full 	800.32,	804.5,	762,	823,	11.9216
little  803.1,	806,	772,	831,	10.484

Interpretation:
Inlining the endian functions, via compiler optimization, in umac.c 
leads to an average 16.6% improvement in throughput, with a maximized 
gain of 21.4%, in this test environment. The more pronounced deviation 
is likely a result of the higher throughput rather than intrinsic 
instabilities in the code. This performance gain comes entirely from 
inlining the little endian conversion function [get|put]_u32_le. These 
functions are defined in misc.c but only used in umac.c. As such, 
migrating them to umac.c would impose minimal disruption on the rest of 
the code base.

Patch:
diff --git a/misc.c b/misc.c
index 0134d694..20a2a186 100644
--- a/misc.c
+++ b/misc.c
@@ -1533,20 +1533,6 @@ get_u32(const void *vp)
  	return (v);
  }

-u_int32_t
-get_u32_le(const void *vp)
-{
-	const u_char *p = (const u_char *)vp;
-	u_int32_t v;
-
-	v  = (u_int32_t)p[0];
-	v |= (u_int32_t)p[1] << 8;
-	v |= (u_int32_t)p[2] << 16;
-	v |= (u_int32_t)p[3] << 24;
-
-	return (v);
-}
-
  u_int16_t
  get_u16(const void *vp)
  {
@@ -1585,17 +1571,6 @@ put_u32(void *vp, u_int32_t v)
  	p[3] = (u_char)v & 0xff;
  }

-void
-put_u32_le(void *vp, u_int32_t v)
-{
-	u_char *p = (u_char *)vp;
-
-	p[0] = (u_char)v & 0xff;
-	p[1] = (u_char)(v >> 8) & 0xff;
-	p[2] = (u_char)(v >> 16) & 0xff;
-	p[3] = (u_char)(v >> 24) & 0xff;
-}
-
  void
  put_u16(void *vp, u_int16_t v)
  {
diff --git a/misc.h b/misc.h
index 2e2dca54..b0c64270 100644
--- a/misc.h
+++ b/misc.h
@@ -154,12 +154,6 @@ void		put_u32(void *, u_int32_t)
  void		put_u16(void *, u_int16_t)
      __attribute__((__bounded__( __minbytes__, 1, 2)));

-/* Little-endian store/load, used by umac.c */
-u_int32_t	get_u32_le(const void *)
-    __attribute__((__bounded__(__minbytes__, 1, 4)));
-void		put_u32_le(void *, u_int32_t)
-    __attribute__((__bounded__(__minbytes__, 1, 4)));
-
  struct bwlimit {
  	size_t buflen;
  	u_int64_t rate;		/* desired rate in kbit/s */
diff --git a/umac.c b/umac.c
index e5ec19f0..64502365 100644
--- a/umac.c
+++ b/umac.c
@@ -134,15 +134,56 @@ typedef unsigned int	UWORD;  /* Register */
  /* --- Endian Conversion --- Forcing assembly on some platforms 
     */
  /* 
---------------------------------------------------------------------- */

+
+/* Using local statically defined versions of the get/put functions
+ * found in misc.c allows them to be inlined. This improves throughput
+ * performance by 10% to 15% on well connected (10Gb/s+) systems.
+ * Forward declaration of the functions in order to maintain
+ * the attributes.
+ * Chris Rapier <rapier at psc.edu> 2022-03-09 */
+
+static u_int32_t umac_get_u32_le(const void *)
+    __attribute__((__bounded__(__minbytes__, 1, 4)));
+
+static u_int32_t
+umac_get_u32_le(const void *vp)
+{
+        const u_char *p = (const u_char *)vp;
+        u_int32_t v;
+
+        v  = (u_int32_t)p[0];
+        v |= (u_int32_t)p[1] << 8;
+        v |= (u_int32_t)p[2] << 16;
+        v |= (u_int32_t)p[3] << 24;
+
+        return (v);
+}
+
+#if (! __LITTLE_ENDIAN__) /* compile time warning thown otherwise */
+static void umac_put_u32_le(void *, u_int32_t)
+    __attribute__((__bounded__(__minbytes__, 1, 4)));
+
+static void
+umac_put_u32_le(void *vp, u_int32_t v)
+{
+        u_char *p = (u_char *)vp;
+
+        p[0] = (u_char)v & 0xff;
+        p[1] = (u_char)(v >> 8) & 0xff;
+        p[2] = (u_char)(v >> 16) & 0xff;
+        p[3] = (u_char)(v >> 24) & 0xff;
+}
+#endif
+
  #if (__LITTLE_ENDIAN__)
  #define LOAD_UINT32_REVERSED(p)		get_u32(p)
  #define STORE_UINT32_REVERSED(p,v)	put_u32(p,v)
  #else
-#define LOAD_UINT32_REVERSED(p)		get_u32_le(p)
-#define STORE_UINT32_REVERSED(p,v)	put_u32_le(p,v)
+#define LOAD_UINT32_REVERSED(p)		umac_get_u32_le(p)
+#define STORE_UINT32_REVERSED(p,v)	umac_put_u32_le(p,v)
  #endif

-#define LOAD_UINT32_LITTLE(p)		(get_u32_le(p))
+#define LOAD_UINT32_LITTLE(p)		(umac_get_u32_le(p))
  #define STORE_UINT32_BIG(p,v)		put_u32(p, v)

  /* 
---------------------------------------------------------------------- */



More information about the openssh-unix-dev mailing list